// server/src/lexer/pest/c.pest

// Copyright (C) 2023 Bryan A. Jones.
//
// This file is part of the CodeChat Editor. The CodeChat Editor is free
// software: you can redistribute it and/or modify it under the terms of the GNU
// General Public License as published by the Free Software Foundation, either
// version 3 of the License, or (at your option) any later version.
//
// The CodeChat Editor is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
// details.
//
// You should have received a copy of the GNU General Public License along with
// the CodeChat Editor. If not, see
// [http://www.gnu.org/licenses](http://www.gnu.org/licenses).
//
// # `c.pest` - Pest parser definition for the C language
//
// ## Comments
//
// A doc block is either kind of comment. This rule is silent (`_`), so it
// produces no token of its own; any tokens come from the rule it matches.
// Pest's `|` is an ordered choice, so `inline_comment` is tried before
// `block_comment`. `inline_comment` isn't defined in this file; presumably the
// shared grammar provides it (TODO: confirm).
doc_block = _{ inline_comment | block_comment }

// Per the
// [C standard, section 6.4.3](https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3220.pdf#page=65),
// "white-space consists of: (space, horizontal tab, new-line, vertical tab, and
// form-feed)." Omit newlines, since the rest of this parser uses these.
//
// Vertical tab (0x0B) and form feed (0x0C), each as its own named (non-silent)
// rule.
vertical_tab = { "\x0B" }
form_feed = { "\x0C" }
// Zero or more whitespace characters, excluding newlines. Because of the `*`,
// this rule also matches the empty string.
white_space = {
    (" " | "\t" | vertical_tab | form_feed)*
}

// The
// [C standard, section 6.4.9](https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3220.pdf#page=65),
// defines inline and block comments.
//
// ### Inline comments
//
// C has a single inline comment delimiter, `//`. Only the first of the three
// delimiter slots is used; the other two are filled with `unused`, a rule
// defined outside this file.
inline_comment_delims  = _{ inline_comment_delim_0 }
inline_comment_delim_0 =  { "//" }
inline_comment_delim_1 =  { unused }
inline_comment_delim_2 =  { unused }
// The characters allowed inside an inline comment. `not_newline` is defined
// outside this file; judging by its name, it presumably matches any single
// character except a newline (TODO: confirm).
inline_comment_char    = _{ not_newline }

// ### Block comments
//
// C has a single style of block comment, so `block_comment` selects only
// `block_comment_0`, a rule defined outside this file.
block_comment = { block_comment_0 }
// Opening delimiters: only slot 0 is used; the other slots are filled with
// `unused`, which is also defined outside this file.
block_comment_opening_delim_0 = { "/*" }
block_comment_opening_delim_1 = { unused }
block_comment_opening_delim_2 = { unused }
// Closing delimiters, with the same slot layout as the opening delimiters.
block_comment_closing_delim_0 = { "*/" }
block_comment_closing_delim_1 = { unused }
block_comment_closing_delim_2 = { unused }

// ## Code
//
// Per the
// [C standard, section 5.1.1.2](https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3220.pdf#page=24),
// if a line of code ends with a backslash, it continues on the next line. This
// is a logical line; treat it as a single line. Therefore, consider a
// backslash-newline (or anything that's not a newline) a part of the current
// logical line. Note that this parser doesn't apply this rule to comments
// (which, per the spec, it should) for several reasons:
//
// 1.  Comments continued onto another line don't look like a comment; this
//     would confuse most developers.
// 2.  The backslash-newline in a comment creates a
//     [hard line break](https://spec.commonmark.org/0.31.2/#hard-line-breaks)
//     in Markdown, which means inserting a hard line break this way in an
//     inline comment requires the next line to omit the inline comment
//     delimiters. For example: 
//
//     ```C
//     // This is a hard line break\
//     followed by a comment which must not include the // inline comment
//     // delimiter on the line after the line break, but which must
//     // include them on following lines.
//     ```
// 3.  The CodeChat Editor web-to-code function produces incorrect results in
//     this case, adding a comment delimiter when it shouldn't. To fix this, it
//     would have to look for a backslash newline only in C/C++-like languages.
//
// One character of a logical line: either a line continuation (a backslash
// immediately followed by a newline), or whatever `not_newline` (defined
// outside this file) matches. The continuation is tried first.
logical_line_char = _{
    ("\\" ~ NEWLINE)
  | not_newline
}
// For C, a code line token is simply a logical line character.
code_line_token = _{ logical_line_char }

// ## Dedenter
//
// This parser runs separately; it dedents block comments. There are several
// cases:
//
// - A single line: `/* comment */`. No special handling needed.
// - Multiple lines, in one of three styles:
//   - Each line of the comment is not consistently whitespace-indented. No
//     special handling needed. For example:
//
//     ```C
//     /* This is
//       not
//        consistently indented. */
//     ```
//
//   - Each line of the comment is consistently whitespace-indented; for
//     example:
//
//     ```C
//     /* This is
//        consistently indented. */
//     ```
//
//     Consistently indented means the first non-whitespace character on a line
//     aligns with, but never comes before, the comment's start. Another
//     example:
//
//     ```C
//     /* This is
//        correct
//
//        indentation.
//      */
//     ```
//
//     Note that the third (blank) line doesn't have an indent; since that line
//     consists only of whitespace, this is OK. Likewise, the last line
//     (containing the closing comment delimiter of `*/`) consists only of
//     whitespace after the comment delimiters are removed.
//
//   - Each line of the comment is consistently asterisk-indented; for example:
//
//     ```C
//     /* This is
//      * correct
//      *
//      * indentation.
//      */
//     ```
//
//     Note that in this case, no whitespace-only lines are allowed. Instead,
//     the special case is lines which have a newline immediately after the `*`.
//
// To implement this dedenting, we must have two paths to accepting the contents
// of a block comment. Otherwise, this parser rejects the block (it cannot be
// dedented). The approach:
//
// 1.  The space-indented path. This requires:
//     1.  The first line ends with a newline. (`valid_first_line`)
//     2.  Non-first lines with contents must be properly indented. If a
//         non-first line ends in a newline, it must not be the last line.
//         (`dedented_line`)
//     3.  A whitespace-only line must not be the last line, unless it has
//         exactly the indent needed to align the closing comment delimiter
//         (`last_line`).
// 2.  The asterisk-indented path. The requirements are the same as the
//     space-indented path, though the proper indent includes an asterisk in the
//     correct location.
//
// The top-level rule. The entire input (from `SOI` to `EOI`) must consist of
// the indent line, the comment's first line, then either one or more
// space-indented lines or one or more asterisk-indented lines. `|` is an
// ordered choice, so the space-indented path is tried first. `DROP` then pops
// the indent that `indent` pushed onto the stack.
dedenter = {
    SOI ~ indent ~ valid_first_line ~ (valid_space_line+ | valid_star_line+) ~ DROP ~ EOI
}
// The first line of the input must contain only the whitespace (zero or more
// spaces) preceding either a " " or a "\* ", followed by a newline. `PUSH`
// saves these spaces on the stack, so that `PEEK` in the rules below can match
// the same indent at the start of later lines.
indent           =  _{ PUSH(" "*) ~ NEWLINE }
// The comment's first line: it must end with a newline, so a single-line
// comment doesn't match. `not_newline` is defined outside this file.
valid_first_line =   { not_newline* ~ NEWLINE }
// A non-first line on the space-indented path. The alternatives, in the order
// tried:
//
// 1.  The indent plus three spaces, then the line's contents.
// 2.  The final line, which holds only the indent plus one space.
// 3.  A whitespace-only line ending in a newline which is not the last
//     character of the input.
valid_space_line =  _{ (space_dedent ~ dedented_line) | last_line | (white_space ~ not_newline_eoi ~ vis_newline) }
// A non-first line on the asterisk-indented path. This has the same structure
// as `valid_space_line`, but the third alternative is a line containing only
// the indent and " \*", ending in a newline which is not the last character of
// the input.
valid_star_line  = _{ (star_dedent ~ dedented_line) | last_line | (PEEK ~ " *" ~ not_newline_eoi ~ vis_newline) }
// The text to remove from a space-indented line: the saved indent followed by
// three spaces.
space_dedent     = _{ PEEK ~ "   " }
// The contents of a line which remain after dedenting. The `not_newline_eoi`
// lookahead rejects a line whose newline is the last character of the input,
// so a line ending in a newline cannot be the last line. `newline_eoi` is
// defined outside this file; judging by its name, it presumably matches a
// newline or the end of input (TODO: confirm).
dedented_line    =  { not_newline* ~ not_newline_eoi ~ newline_eoi }
// The final line when it holds nothing but the saved indent plus exactly one
// space, followed by the end of input.
last_line        =  { PEEK ~ " " ~ EOI }
// A newline wrapped in a non-silent rule, so that it produces a token in the
// parse output.
vis_newline      =  { NEWLINE }
// A negative lookahead which consumes nothing: it succeeds unless the next
// thing in the input is a newline immediately followed by the end of input.
not_newline_eoi  = _{ !(NEWLINE ~ EOI) }
// The text to remove from an asterisk-indented line: the saved indent followed
// by " \* ".
star_dedent      = _{ PEEK ~ " * " }

// CodeChat Editor lexer: c_cpp.