diff --git a/NEWS b/NEWS index 5f8dde354..dfc2e870d 100644 --- a/NEWS +++ b/NEWS @@ -52,7 +52,7 @@ a list). Those that are not bugfixes or code tidies are: matches the "fullwidth" versions of hex digits. PCRE2_EXTRA_ASCII_DIGIT can be used to keep it ASCII only. -* Make PCRE2_UCP the default in UTF mode in pcre2grep and add -no_ucp, +* Make PCRE2_UCP the default in UTF mode in pcre2grep and add --no-ucp, --case-restrict and --posix-digit. * Add --group-separator and --no-group-separator to pcre2grep. diff --git a/RunGrepTest b/RunGrepTest index 30a5b8157..1ba05f4c2 100755 --- a/RunGrepTest +++ b/RunGrepTest @@ -637,6 +637,8 @@ echo "RC=$?" >>testtrygrep echo "---------------------------- Test 120 ------------------------------" >>testtrygrep (cd $srcdir; $valgrind $vjs $pcre2grep -HO '$0:$2$1$3' '(\w+) binary (\w+)(\.)?' ./testdata/grepinput) >>testtrygrep echo "RC=$?" >>testtrygrep +(cd $srcdir; $valgrind $vjs $pcre2grep -HO '$&:$2$1$3' '(\w+) binary (\w+)(\.)?' ./testdata/grepinput) >>testtrygrep +echo "RC=$?" >>testtrygrep (cd $srcdir; $valgrind $vjs $pcre2grep -m 1 -O '$0:$a$b$e$f$r$t$v' '(\w+) binary (\w+)(\.)?' ./testdata/grepinput) >>testtrygrep echo "RC=$?" >>testtrygrep (cd $srcdir; $valgrind $vjs $pcre2grep -HO '${X}' '(\w+) binary (\w+)(\.)?' ./testdata/grepinput) >>testtrygrep 2>&1 diff --git a/doc/html/pcre2api.html b/doc/html/pcre2api.html index 6576e9440..df439c4c4 100644 --- a/doc/html/pcre2api.html +++ b/doc/html/pcre2api.html @@ -4082,7 +4082,7 @@
-The default case transformations applied by PCRE2 are reasonably complete, and, +The default case transformations applied by PCRE2 are reasonably complete, and, in UTF or UCP mode, perform the basic locale-invariant case transformations as specified by Unicode. This is suitable for the internal (invisible) case-equivalence procedures used during pattern matching, but an application diff --git a/doc/html/pcre2grep.html b/doc/html/pcre2grep.html index d0c7100b4..862a62b85 100644 --- a/doc/html/pcre2grep.html +++ b/doc/html/pcre2grep.html @@ -724,9 +724,9 @@
echo -e "abcde\n12345" | pcre2grep \ diff --git a/doc/pcre2.txt b/doc/pcre2.txt index 9958c6382..e0246f470 100644 --- a/doc/pcre2.txt +++ b/doc/pcre2.txt @@ -188,8 +188,8 @@ REVISION PCRE2 10.38 27 August 2021 PCRE2(3) ------------------------------------------------------------------------------ - - + + PCRE2API(3) Library Functions Manual PCRE2API(3) @@ -4209,8 +4209,8 @@ REVISION PCRE2 10.45 04 October 2024 PCRE2API(3) ------------------------------------------------------------------------------ - - + + PCRE2BUILD(3) Library Functions Manual PCRE2BUILD(3) @@ -4835,8 +4835,8 @@ REVISION PCRE2 10.44 15 April 2024 PCRE2BUILD(3) ------------------------------------------------------------------------------ - - + + PCRE2CALLOUT(3) Library Functions Manual PCRE2CALLOUT(3) @@ -5268,8 +5268,8 @@ REVISION PCRE2 10.43 19 January 2024 PCRE2CALLOUT(3) ------------------------------------------------------------------------------ - - + + PCRE2COMPAT(3) Library Functions Manual PCRE2COMPAT(3) @@ -5522,8 +5522,8 @@ REVISION PCRE2 10.45 01 September 2024 PCRE2COMPAT(3) ------------------------------------------------------------------------------ - - + + PCRE2JIT(3) Library Functions Manual PCRE2JIT(3) @@ -5977,8 +5977,8 @@ REVISION PCRE2 10.45 23 July 2024 PCRE2JIT(3) ------------------------------------------------------------------------------ - - + + PCRE2LIMITS(3) Library Functions Manual PCRE2LIMITS(3) @@ -6060,8 +6060,8 @@ REVISION PCRE2 10.43 1 August 2023 PCRE2LIMITS(3) ------------------------------------------------------------------------------ - - + + PCRE2MATCHING(3) Library Functions Manual PCRE2MATCHING(3) @@ -6293,8 +6293,8 @@ REVISION PCRE2 10.45 30 August 2024 PCRE2MATCHING(3) ------------------------------------------------------------------------------ - - + + PCRE2PARTIAL(3) Library Functions Manual PCRE2PARTIAL(3) @@ -6676,8 +6676,8 @@ REVISION PCRE2 10.34 04 September 2019 PCRE2PARTIAL(3) 
------------------------------------------------------------------------------ - - + + PCRE2PATTERN(3) Library Functions Manual PCRE2PATTERN(3) @@ -10368,8 +10368,8 @@ REVISION PCRE2 10.45 21 September 2024 PCRE2PATTERN(3) ------------------------------------------------------------------------------ - - + + PCRE2PERFORM(3) Library Functions Manual PCRE2PERFORM(3) @@ -10623,8 +10623,8 @@ REVISION PCRE2 10.41 27 July 2022 PCRE2PERFORM(3) ------------------------------------------------------------------------------ - - + + PCRE2POSIX(3) Library Functions Manual PCRE2POSIX(3) @@ -10981,8 +10981,8 @@ REVISION PCRE2 10.43 19 January 2024 PCRE2POSIX(3) ------------------------------------------------------------------------------ - - + + PCRE2SAMPLE(3) Library Functions Manual PCRE2SAMPLE(3) @@ -11265,8 +11265,8 @@ REVISION PCRE2 10.32 27 June 2018 PCRE2SERIALIZE(3) ------------------------------------------------------------------------------ - - + + PCRE2SYNTAX(3) Library Functions Manual PCRE2SYNTAX(3) @@ -11923,8 +11923,8 @@ REVISION PCRE2 10.45 24 September 2024 PCRE2SYNTAX(3) ------------------------------------------------------------------------------ - - + + PCRE2UNICODE(3) Library Functions Manual PCRE2UNICODE(3) @@ -12393,5 +12393,5 @@ REVISION PCRE2 10.45 22 July 2024 PCRE2UNICODE(3) ------------------------------------------------------------------------------ - - + + diff --git a/doc/pcre2api.3 b/doc/pcre2api.3 index af0590eac..beab4107f 100644 --- a/doc/pcre2api.3 +++ b/doc/pcre2api.3 @@ -4064,7 +4064,7 @@ processing a substitution such as: .sp pcre2_substitute(..., "\e\eU$1", ...) .P -The default case transformations applied by PCRE2 are reasonably complete, and, +The default case transformations applied by PCRE2 are reasonably complete, and, in UTF or UCP mode, perform the basic locale-invariant case transformations as specified by Unicode.
This is suitable for the internal (invisible) case-equivalence procedures used during pattern matching, but an application diff --git a/doc/pcre2demo.3 b/doc/pcre2demo.3 index 5fccfaa29..8d40a209c 100644 --- a/doc/pcre2demo.3 +++ b/doc/pcre2demo.3 @@ -1,4 +1,4 @@ -.TH PCRE2DEMO 3 " 4 October 2024" "PCRE2 10.44" +.TH PCRE2DEMO 3 " 8 October 2024" "PCRE2 10.44" .\"AUTOMATICALLY GENERATED BY PrepareRelease - do not EDIT! .SH NAME PCRE2DEMO - A demonstration C program for PCRE2 diff --git a/doc/pcre2grep.1 b/doc/pcre2grep.1 index f689aa7ee..cd34a12b0 100644 --- a/doc/pcre2grep.1 +++ b/doc/pcre2grep.1 @@ -629,9 +629,9 @@ contents of the matched part of the line and/or captured substrings into the text. .sp $<digits> or ${<digits>} is replaced by the captured substring of the given -decimal number; zero substitutes the whole match. If the number is greater than -the number of capturing substrings, or if the capture is unset, the replacement -is empty. +decimal number; $& (or the legacy $0) substitutes the whole match. If the +number is greater than the number of capturing substrings, or if the capture +is unset, the replacement is empty. .sp $a is replaced by bell; $b by backspace; $e by escape; $f by form feed; $n by newline; $r by carriage return; $t by tab; $v by vertical tab. @@ -914,9 +914,9 @@ available, provided that callouts were not completely disabled when zero-terminated string, which means it should not contain any internal binary zeros. It is written to the output, having first been passed through the same escape processing as text from the \fB--output\fP (\fB-O\fP) option (see -above). However, $0 cannot be used to insert a matched substring because the -match is still in progress. Instead, the single character '0' is inserted. Any -syntax errors in the string (for example, a dollar not followed by another +above). However, $0 or $& cannot be used to insert a matched substring because +the match is still in progress. Instead, the single character '0' is inserted.
+Any syntax errors in the string (for example, a dollar not followed by another character) causes the callout to be ignored. No terminator is added to the output string, so if you want a newline, you must include it explicitly using the escape $n. For example: @@ -945,9 +945,9 @@ arguments: .sp Any substring (including the executable name) may contain escape sequences started by a dollar character. These are the same as for the \fB--output\fP -(\fB-O\fP) option documented above, except that $0 cannot insert the matched -string because the match is still in progress. Instead, the character '0' -is inserted. If you need a literal dollar or pipe character in any +(\fB-O\fP) option documented above, except that $0 or $& cannot insert the +matched string because the match is still in progress. Instead, the +character '0' is inserted. If you need a literal dollar or pipe character in any substring, use $$ or $| respectively. Here is an example: .sp echo -e "abcde\en12345" | pcre2grep \e diff --git a/doc/pcre2grep.txt b/doc/pcre2grep.txt index ddf5391f8..55d44636b 100644 --- a/doc/pcre2grep.txt +++ b/doc/pcre2grep.txt @@ -702,188 +702,188 @@ OPTIONS captured substrings into the text. $<digits> or ${<digits>} is replaced by the captured sub- - string of the given decimal number; zero substitutes the - whole match. If the number is greater than the number of cap- - turing substrings, or if the capture is unset, the replace- - ment is empty. + string of the given decimal number; $& (or the legacy $0) + substitutes the whole match. If the number is greater than + the number of capturing substrings, or if the capture is un- + set, the replacement is empty. - $a is replaced by bell; $b by backspace; $e by escape; $f by - form feed; $n by newline; $r by carriage return; $t by tab; + $a is replaced by bell; $b by backspace; $e by escape; $f by + form feed; $n by newline; $r by carriage return; $t by tab; $v by vertical tab.
$o<digits> or $o{<digits>} is replaced by the character whose - code point is the given octal number. In the first form, up - to three octal digits are processed. When more digits are - needed in Unicode mode to specify a wide character, the sec- + code point is the given octal number. In the first form, up + to three octal digits are processed. When more digits are + needed in Unicode mode to specify a wide character, the sec- ond form must be used. - $x<digits> or $x{<digits>} is replaced by the character rep- - resented by the given hexadecimal number. In the first form, - up to two hexadecimal digits are processed. When more digits - are needed in Unicode mode to specify a wide character, the + $x<digits> or $x{<digits>} is replaced by the character rep- + resented by the given hexadecimal number. In the first form, + up to two hexadecimal digits are processed. When more digits + are needed in Unicode mode to specify a wide character, the second form must be used. - Any other character is substituted by itself. In particular, + Any other character is substituted by itself. In particular, $$ is replaced by a single dollar. -o, --only-matching Show only the part of the line that matched a pattern instead - of the whole line. In this mode, no context is shown. That - is, the -A, -B, and -C options are ignored. If there is more - than one match in a line, each of them is shown separately, - on a separate line of output. If -o is combined with -v (in- - vert the sense of the match to find non-matching lines), no - output is generated, but the return code is set appropri- - ately. If the matched portion of the line is empty, nothing - is output unless the file name or line number are being - printed, in which case they are shown on an otherwise empty + of the whole line. In this mode, no context is shown. That + is, the -A, -B, and -C options are ignored. If there is more + than one match in a line, each of them is shown separately, + on a separate line of output.
If -o is combined with -v (in- + vert the sense of the match to find non-matching lines), no + output is generated, but the return code is set appropri- + ately. If the matched portion of the line is empty, nothing + is output unless the file name or line number are being + printed, in which case they are shown on an otherwise empty line. This option is mutually exclusive with --output, --file-offsets and --line-offsets. -onumber, --only-matching=number - Show only the part of the line that matched the capturing + Show only the part of the line that matched the capturing parentheses of the given number. Up to 50 capturing parenthe- - ses are supported by default. This limit can be changed via - the --om-capture option. A pattern may contain any number of - capturing parentheses, but only those whose number is within - the limit can be accessed by -o. An error occurs if the num- + ses are supported by default. This limit can be changed via + the --om-capture option. A pattern may contain any number of + capturing parentheses, but only those whose number is within + the limit can be accessed by -o. An error occurs if the num- ber specified by -o is greater than the limit. -o0 is the same as -o without a number. Because these options - can be given without an argument (see above), if an argument - is present, it must be given in the same shell item, for ex- - ample, -o3 or --only-matching=2. The comments given for the - non-argument case above also apply to this option. If the - specified capturing parentheses do not exist in the pattern, - or were not set in the match, nothing is output unless the + can be given without an argument (see above), if an argument + is present, it must be given in the same shell item, for ex- + ample, -o3 or --only-matching=2. The comments given for the + non-argument case above also apply to this option. 
If the + specified capturing parentheses do not exist in the pattern, + or were not set in the match, nothing is output unless the file name or line number are being output. - If this option is given multiple times, multiple substrings - are output for each match, in the order the options are - given, and all on one line. For example, -o3 -o1 -o3 causes - the substrings matched by capturing parentheses 3 and 1 and - then 3 again to be output. By default, there is no separator + If this option is given multiple times, multiple substrings + are output for each match, in the order the options are + given, and all on one line. For example, -o3 -o1 -o3 causes + the substrings matched by capturing parentheses 3 and 1 and + then 3 again to be output. By default, there is no separator (but see the next but one option). --om-capture=number - Set the number of capturing parentheses that can be accessed + Set the number of capturing parentheses that can be accessed by -o. The default is 50. --om-separator=text - Specify a separating string for multiple occurrences of -o. - The default is an empty string. Separating strings are never + Specify a separating string for multiple occurrences of -o. + The default is an empty string. Separating strings are never coloured. -P, --no-ucp - Starting from release 10.43, when UTF/Unicode mode is speci- - fied with -u or -U, the PCRE2_UCP option is used by default. + Starting from release 10.43, when UTF/Unicode mode is speci- + fied with -u or -U, the PCRE2_UCP option is used by default. This means that the POSIX classes in patterns match more than - just ASCII characters. For example, [:digit:] matches any - Unicode decimal digit. The --no-ucp option suppresses - PCRE2_UCP, thus restricting the POSIX classes to ASCII char- - acters, as was the case in earlier releases. Note that there - are now more fine-grained option settings within patterns - that affect individual classes. For example, when in UCP + just ASCII characters. 
For example, [:digit:] matches any + Unicode decimal digit. The --no-ucp option suppresses + PCRE2_UCP, thus restricting the POSIX classes to ASCII char- + acters, as was the case in earlier releases. Note that there + are now more fine-grained option settings within patterns + that affect individual classes. For example, when in UCP mode, the sequence (?aP) restricts [:word:] to ASCII letters, while allowing \w to match Unicode letters and digits. --posix-pattern-file - When patterns are provided with the -f option, do not trim - trailing spaces or ignore empty lines in a similar way than + When patterns are provided with the -f option, do not trim + trailing spaces or ignore empty lines in a similar way than other grep tools. To keep the behaviour consistent with older - versions, if the pattern read was terminated with CRLF (as + versions, if the pattern read was terminated with CRLF (as character literals) then both characters won't be included as - part of it, so if you really need to have pattern ending in - '\r', use a escape sequence or provide it by a different + part of it, so if you really need to have pattern ending in + '\r', use a escape sequence or provide it by a different method. -q, --quiet Work quietly, that is, display nothing except error messages. - The exit status indicates whether or not any matches were + The exit status indicates whether or not any matches were found. -r, --recursive - If any given path is a directory, recursively scan the files - it contains, taking note of any --include and --exclude set- - tings. By default, a directory is read as a normal file; in - some operating systems this gives an immediate end-of-file. - This option is a shorthand for setting the -d option to "re- + If any given path is a directory, recursively scan the files + it contains, taking note of any --include and --exclude set- + tings. By default, a directory is read as a normal file; in + some operating systems this gives an immediate end-of-file. 
+ This option is a shorthand for setting the -d option to "re- curse". --recursion-limit=number - This is an obsolete synonym for --depth-limit. See --match- + This is an obsolete synonym for --depth-limit. See --match- limit above for details. -s, --no-messages - Suppress error messages about non-existent or unreadable - files. Such files are quietly skipped. However, the return + Suppress error messages about non-existent or unreadable + files. Such files are quietly skipped. However, the return code is still 2, even if matches were found in other files. -t, --total-count - This option is useful when scanning more than one file. If - used on its own, -t suppresses all output except for a grand - total number of matching lines (or non-matching lines if -v + This option is useful when scanning more than one file. If + used on its own, -t suppresses all output except for a grand + total number of matching lines (or non-matching lines if -v is used) in all the files. If -t is used with -c, a grand to- - tal is output except when the previous output is just one - line. In other words, it is not output when just one file's - count is listed. If file names are being output, the grand - total is preceded by "TOTAL:". Otherwise, it appears as just - another number. The -t option is ignored when used with -L - (list files without matches), because the grand total would + tal is output except when the previous output is just one + line. In other words, it is not output when just one file's + count is listed. If file names are being output, the grand + total is preceded by "TOTAL:". Otherwise, it appears as just + another number. The -t option is ignored when used with -L + (list files without matches), because the grand total would always be zero. -u, --utf Operate in UTF/Unicode mode. This option is available only if PCRE2 has been compiled with UTF-8 support. 
All patterns (in- - cluding those for any --exclude and --include options) and - all lines that are scanned must be valid strings of UTF-8 + cluding those for any --exclude and --include options) and + all lines that are scanned must be valid strings of UTF-8 characters. If an invalid UTF-8 string is encountered, an er- ror occurs. -U, --utf-allow-invalid - As --utf, but in addition subject lines may contain invalid - UTF-8 code unit sequences. These can never form part of any - pattern match. Patterns themselves, however, must still be + As --utf, but in addition subject lines may contain invalid + UTF-8 code unit sequences. These can never form part of any + pattern match. Patterns themselves, however, must still be valid UTF-8 strings. This facility allows valid UTF-8 strings to be sought within arbitrary byte sequences in executable or - other binary files. For more details about matching in non- + other binary files. For more details about matching in non- valid UTF-8 strings, see the pcre2unicode(3) documentation. -V, --version - Write the version numbers of pcre2grep and the PCRE2 library - to the standard output and then exit. Anything else on the + Write the version numbers of pcre2grep and the PCRE2 library + to the standard output and then exit. Anything else on the command line is ignored. -v, --invert-match - Invert the sense of the match, so that lines which do not - match any of the patterns are the ones that are found. When - this option is set, options such as --only-matching and - --output, which specify parts of a match that are to be out- + Invert the sense of the match, so that lines which do not + match any of the patterns are the ones that are found. When + this option is set, options such as --only-matching and + --output, which specify parts of a match that are to be out- put, are ignored. -w, --word-regex, --word-regexp Force the patterns only to match "words". 
That is, there must - be a word boundary at the start and end of each matched - string. This is equivalent to having "\b(?:" at the start of - each pattern, and ")\b" at the end. This option applies only - to the patterns that are matched against the contents of - files; it does not apply to patterns specified by any of the + be a word boundary at the start and end of each matched + string. This is equivalent to having "\b(?:" at the start of + each pattern, and ")\b" at the end. This option applies only + to the patterns that are matched against the contents of + files; it does not apply to patterns specified by any of the --include or --exclude options. -x, --line-regex, --line-regexp - Force the patterns to start matching only at the beginnings - of lines, and in addition, require them to match entire + Force the patterns to start matching only at the beginnings + of lines, and in addition, require them to match entire lines. In multiline mode the match may be more than one line. This is equivalent to having "^(?:" at the start of each pat- - tern and ")$" at the end. This option applies only to the - patterns that are matched against the contents of files; it - does not apply to patterns specified by any of the --include + tern and ")$" at the end. This option applies only to the + patterns that are matched against the contents of files; it + does not apply to patterns specified by any of the --include or --exclude options. -Z, --null - Terminate files names in the regular output with a zero byte - (the NUL character) instead of what would normally appear. - This is useful when file names contain unusual characters - such as colons, hyphens, or even newlines. The option does + Terminate files names in the regular output with a zero byte + (the NUL character) instead of what would normally appear. + This is useful when file names contain unusual characters + such as colons, hyphens, or even newlines. The option does not apply to file names in error messages. 
@@ -897,141 +897,141 @@ ENVIRONMENT VARIABLES NEWLINES - The -N (--newline) option allows pcre2grep to scan files with newline - conventions that differ from the default. This option affects only the - way scanned files are processed. It does not affect the interpretation - of files specified by the -f, --file-list, --exclude-from, or --in- + The -N (--newline) option allows pcre2grep to scan files with newline + conventions that differ from the default. This option affects only the + way scanned files are processed. It does not affect the interpretation + of files specified by the -f, --file-list, --exclude-from, or --in- clude-from options. - Any parts of the scanned input files that are written to the standard - output are copied with whatever newline sequences they have in the in- - put. However, if the final line of a file is output, and it does not - end with a newline sequence, a newline sequence is added. If the new- - line setting is CR, LF, CRLF or NUL, that line ending is output; for + Any parts of the scanned input files that are written to the standard + output are copied with whatever newline sequences they have in the in- + put. However, if the final line of a file is output, and it does not + end with a newline sequence, a newline sequence is added. If the new- + line setting is CR, LF, CRLF or NUL, that line ending is output; for the other settings (ANYCRLF or ANY) a single NL is used. - The newline setting does not affect the way in which pcre2grep writes - newlines in informational messages to the standard output and error - streams. Under Windows, the standard output is set to be binary, so - that "\r\n" at the ends of output lines that are copied from the input - is not converted to "\r\r\n" by the C I/O library. This means that any - messages written to the standard output must end with "\r\n". 
For all - other operating systems, and for all messages to the standard error + The newline setting does not affect the way in which pcre2grep writes + newlines in informational messages to the standard output and error + streams. Under Windows, the standard output is set to be binary, so + that "\r\n" at the ends of output lines that are copied from the input + is not converted to "\r\r\n" by the C I/O library. This means that any + messages written to the standard output must end with "\r\n". For all + other operating systems, and for all messages to the standard error stream, "\n" is used. OPTIONS COMPATIBILITY WITH GNU GREP Many of the short and long forms of pcre2grep's options are the same as - in the GNU grep program. Any long option of the form --xxx-regexp (GNU - terminology) is also available as --xxx-regex (PCRE2 terminology). - However, the --case-restrict, --depth-limit, -E, --file-list, --file- + in the GNU grep program. Any long option of the form --xxx-regexp (GNU + terminology) is also available as --xxx-regex (PCRE2 terminology). + However, the --case-restrict, --depth-limit, -E, --file-list, --file- offsets, --heap-limit, --include-dir, --line-offsets, --locale, - --match-limit, -M, --multiline, -N, --newline, --no-ucp, --om-separa- - tor, --output, -P, -u, --utf, -U, and --utf-allow-invalid options are + --match-limit, -M, --multiline, -N, --newline, --no-ucp, --om-separa- + tor, --output, -P, -u, --utf, -U, and --utf-allow-invalid options are specific to pcre2grep, as is the use of the --only-matching option with a capturing parentheses number. - Although most of the common options work the same way, a few are dif- - ferent in pcre2grep. For example, the --include option's argument is a + Although most of the common options work the same way, a few are dif- + ferent in pcre2grep. For example, the --include option's argument is a glob for GNU grep, but in pcre2grep it is a regular expression to which - the -i option applies. 
If both the -c and -l options are given, GNU - grep lists only file names, without counts, but pcre2grep gives the + the -i option applies. If both the -c and -l options are given, GNU + grep lists only file names, without counts, but pcre2grep gives the counts as well. OPTIONS WITH DATA There are four different ways in which an option with data can be spec- - ified. If a short form option is used, the data may follow immedi- + ified. If a short form option is used, the data may follow immedi- ately, or (with one exception) in the next command line item. For exam- ple: -f/some/file -f /some/file - The exception is the -o option, which may appear with or without data. - Because of this, if data is present, it must follow immediately in the + The exception is the -o option, which may appear with or without data. + Because of this, if data is present, it must follow immediately in the same item, for example -o3. - If a long form option is used, the data may appear in the same command - line item, separated by an equals character, or (with two exceptions) + If a long form option is used, the data may appear in the same command + line item, separated by an equals character, or (with two exceptions) it may appear in the next command line item. For example: --file=/some/file --file /some/file - Note, however, that if you want to supply a file name beginning with ~ - as data in a shell command, and have the shell expand ~ to a home di- - rectory, you must separate the file name from the option, because the + Note, however, that if you want to supply a file name beginning with ~ + as data in a shell command, and have the shell expand ~ to a home di- + rectory, you must separate the file name from the option, because the shell does not treat ~ specially unless it is at the start of an item. - The exceptions to the above are the --colour (or --color) and --only- - matching options, for which the data is optional. 
If one of these op- - tions does have data, it must be given in the first form, using an + The exceptions to the above are the --colour (or --color) and --only- + matching options, for which the data is optional. If one of these op- + tions does have data, it must be given in the first form, using an equals character. Otherwise pcre2grep will assume that it has no data. USING PCRE2'S CALLOUT FACILITY - pcre2grep has, by default, support for calling external programs or - scripts or echoing specific strings during matching by making use of - PCRE2's callout facility. However, this support can be completely or - partially disabled when pcre2grep is built. You can find out whether - your binary has support for callouts by running it with the --help op- - tion. If callout support is completely disabled, callouts in patterns - are forbidden by pcre2grep. If the facility is partially disabled, - calling external programs is not supported, and callouts that request + pcre2grep has, by default, support for calling external programs or + scripts or echoing specific strings during matching by making use of + PCRE2's callout facility. However, this support can be completely or + partially disabled when pcre2grep is built. You can find out whether + your binary has support for callouts by running it with the --help op- + tion. If callout support is completely disabled, callouts in patterns + are forbidden by pcre2grep. If the facility is partially disabled, + calling external programs is not supported, and callouts that request it are ignored. - A callout in a PCRE2 pattern is of the form (?C<arg>) where the argu- - ment is either a number or a quoted string (see the pcre2callout docu- - mentation for details). Numbered callouts are ignored by pcre2grep; + A callout in a PCRE2 pattern is of the form (?C<arg>) where the argu- + ment is either a number or a quoted string (see the pcre2callout docu- + mentation for details).
Numbered callouts are ignored by pcre2grep; only callouts with string arguments are useful. Echoing a specific string - Starting the callout string with a pipe character invokes an echoing + Starting the callout string with a pipe character invokes an echoing facility that avoids calling an external program or script. This facil- - ity is always available, provided that callouts were not completely - disabled when pcre2grep was built. The rest of the callout string is - processed as a zero-terminated string, which means it should not con- - tain any internal binary zeros. It is written to the output, having - first been passed through the same escape processing as text from the - --output (-O) option (see above). However, $0 cannot be used to insert - a matched substring because the match is still in progress. Instead, - the single character '0' is inserted. Any syntax errors in the string - (for example, a dollar not followed by another character) causes the - callout to be ignored. No terminator is added to the output string, so - if you want a newline, you must include it explicitly using the escape - $n. For example: + ity is always available, provided that callouts were not completely + disabled when pcre2grep was built. The rest of the callout string is + processed as a zero-terminated string, which means it should not con- + tain any internal binary zeros. It is written to the output, having + first been passed through the same escape processing as text from the + --output (-O) option (see above). However, $0 or $& cannot be used to + insert a matched substring because the match is still in progress. In- + stead, the single character '0' is inserted. Any syntax errors in the + string (for example, a dollar not followed by another character) causes + the callout to be ignored. No terminator is added to the output string, + so if you want a newline, you must include it explicitly using the es- + cape $n. 
For example: pcre2grep '(.)(..(.))(?C"|[$1] [$2] [$3]$n")' - Matching continues normally after the string is output. If you want to - see only the callout output but not any output from an actual match, + Matching continues normally after the string is output. If you want to + see only the callout output but not any output from an actual match, you should end the pattern with (*FAIL). Calling external programs or scripts This facility can be independently disabled when pcre2grep is built. It - is supported for Windows, where a call to _spawnvp() is used, for VMS, - where lib$spawn() is used, and for any Unix-like environment where + is supported for Windows, where a call to _spawnvp() is used, for VMS, + where lib$spawn() is used, and for any Unix-like environment where fork() and execv() are available. If the callout string does not start with a pipe (vertical bar) charac- - ter, it is parsed into a list of substrings separated by pipe charac- - ters. The first substring must be an executable name, with the follow- + ter, it is parsed into a list of substrings separated by pipe charac- + ters. The first substring must be an executable name, with the follow- ing substrings specifying arguments: executable_name|arg1|arg2|... - Any substring (including the executable name) may contain escape se- - quences started by a dollar character. These are the same as for the - --output (-O) option documented above, except that $0 cannot insert the - matched string because the match is still in progress. Instead, the - character '0' is inserted. If you need a literal dollar or pipe charac- - ter in any substring, use $$ or $| respectively. Here is an example: + Any substring (including the executable name) may contain escape se- + quences started by a dollar character. These are the same as for the + --output (-O) option documented above, except that $0 or $& cannot in- + sert the matched string because the match is still in progress. 
In- + stead, the character '0' is inserted. If you need a literal dollar or pipe character in any substring, use $$ or $| respectively. Here is an + example: echo -e "abcde\n12345" | pcre2grep \ '(?x)(.)(..(.)) @@ -1044,43 +1044,43 @@ USING PCRE2'S CALLOUT FACILITY Arg1: [1] [234] [4] Arg2: |1| () 12345 - The parameters for the system call that is used to run the program or + The parameters for the system call that is used to run the program or script are zero-terminated strings. This means that binary zero charac- - ters in the callout argument will cause premature termination of their - substrings, and therefore should not be present. Any syntax errors in - the string (for example, a dollar not followed by another character) + ters in the callout argument will cause premature termination of their + substrings, and therefore should not be present. Any syntax errors in + the string (for example, a dollar not followed by another character) causes the callout to be ignored. If running the program fails for any - reason (including the non-existence of the executable), a local match- + reason (including the non-existence of the executable), a local match- ing failure occurs and the matcher backtracks in the normal way. MATCHING ERRORS - It is possible to supply a regular expression that takes a very long - time to fail to match certain lines. Such patterns normally involve - nested indefinite repeats, for example: (a+)*\d when matched against a - line of a's with no final digit. The PCRE2 matching function has a re- - source limit that causes it to abort in these circumstances. If this - happens, pcre2grep outputs an error message and the line that caused - the problem to the standard error stream. If there are more than 20 + It is possible to supply a regular expression that takes a very long + time to fail to match certain lines. Such patterns normally involve + nested indefinite repeats, for example: (a+)*\d when matched against a + line of a's with no final digit. 
The PCRE2 matching function has a re- + source limit that causes it to abort in these circumstances. If this + happens, pcre2grep outputs an error message and the line that caused + the problem to the standard error stream. If there are more than 20 such errors, pcre2grep gives up. - The --match-limit option of pcre2grep can be used to set the overall - resource limit. There are also other limits that affect the amount of - memory used during matching; see the discussion of --heap-limit and + The --match-limit option of pcre2grep can be used to set the overall + resource limit. There are also other limits that affect the amount of + memory used during matching; see the discussion of --heap-limit and --depth-limit above. DIAGNOSTICS Exit status is 0 if any matches were found, 1 if no matches were found, - and 2 for syntax errors, overlong lines, non-existent or inaccessible - files (even if matches were found in other files) or too many matching + and 2 for syntax errors, overlong lines, non-existent or inaccessible + files (even if matches were found in other files) or too many matching errors. Using the -s option to suppress error messages about inaccessi- ble files does not affect the return code. - When run under VMS, the return code is placed in the symbol - PCRE2GREP_RC because VMS does not distinguish between exit(0) and + When run under VMS, the return code is placed in the symbol + PCRE2GREP_RC because VMS does not distinguish between exit(0) and exit(1). diff --git a/src/pcre2grep.c b/src/pcre2grep.c index a8ef2e4e7..1ad91da98 100644 --- a/src/pcre2grep.c +++ b/src/pcre2grep.c @@ -2024,11 +2024,23 @@ switch (*(++string)) *last = string; return DDE_ERROR; + case '&': + /* In a callout, no capture is available. Return the character '0' for + consistency with $0. 
*/ + + if (callout) *value = '0'; + else + { + *value = 0; + rc = DDE_CAPTURE; + } + break; + case '{': brace = TRUE; string++; - if (!isdigit((unsigned char)(*string))) /* Syntax error: a decimal number required. */ - { + if (!isdigit((unsigned char)(*string))) /* Syntax error: */ + { /* a decimal number required. */ if (!callout) fprintf(stderr, "pcre2grep: Error in output text at offset %d: %s\n", (int)(string - begin), "decimal number expected"); @@ -2105,9 +2117,9 @@ switch (*(++string)) { if (!isxdigit(*string)) break; if (*string >= '0' && *string <= '9') - c = c *16 + *string++ - '0'; + c = c *16 + (*string++ - '0'); else - c = c * 16 + (*string++ | 0x20) - 'a' + 10; + c = c * 16 + ((*string++ | 0x20) - 'a') + 10; } *value = c; string--; /* Point to last digit */ diff --git a/src/pcre2posix.c b/src/pcre2posix.c index 75d8cc7b0..933004c51 100644 --- a/src/pcre2posix.c +++ b/src/pcre2posix.c @@ -140,7 +140,7 @@ static const int eint2[] = { 92, REG_INVARG, /* invalid option bits with PCRE2_LITERAL */ 98, REG_EESCAPE, /* missing digit after \0 in NO_BS0 mode */ 99, REG_EESCAPE, /* \K in lookaround */ - 102, REG_EESCAPE /* \ddd octal > \377 in PYTHON_OCTAL mode */ + 102, REG_EESCAPE /* \ddd octal > \377 in PYTHON_OCTAL mode */ }; /* Table of texts corresponding to POSIX error codes */ diff --git a/testdata/grepoutput b/testdata/grepoutput index e4d7a3f5d..94b951264 100644 --- a/testdata/grepoutput +++ b/testdata/grepoutput @@ -876,6 +876,10 @@ RC=0 ./testdata/grepinput:a binary zero:zeroa ./testdata/grepinput:the binary zero.:zerothe. RC=0 +./testdata/grepinput:the binary zero.:zerothe. +./testdata/grepinput:a binary zero:zeroa +./testdata/grepinput:the binary zero.:zerothe. +RC=0 the binary zero.: RC=0 pcre2grep: Error in output text at offset 2: decimal number expected