Skip to content

Commit

Permalink
feat: implement an option for allowing invalid escape sequences in re…
Browse files Browse the repository at this point in the history
…gular expressions

Historically, YARA has accepted any character that is preceded by a backslash in a regular expression, even if the sequence is not a valid one. For instance, `\n`, `\t` and `\w` are valid escape sequences in a regexp, but `\N`, `\T` and `\j` are not. However, YARA accepts all of these sequences. The valid escape sequences are interpreted as their special meaning (`\n` is a new-line, `\w` is a word character, etc.), while invalid escape sequences are interpreted simply as the character that appears after the backslash. So, `\N` becomes `N`, and `\j` becomes `j`.

This change introduces the `Compiler::relaxed_regexp_escape_sequences` API, which turns on an option that makes YARA-X behave in the same way as YARA with respect to invalid escape sequences in regular expressions. This option is turned off by default.

Also, the option `--relaxed-escape-sequences` is added to the CLI.
  • Loading branch information
plusvic committed May 13, 2024
1 parent 6ebc57c commit 9aa4477
Show file tree
Hide file tree
Showing 14 changed files with 260 additions and 80 deletions.
11 changes: 10 additions & 1 deletion cli/src/commands/compile.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ pub fn compile() -> Command {
arg!(--"path-as-namespace")
.help("Use file path as rule namespace"),
)
.arg(
arg!(--"relaxed-escape-sequences")
.help("Allow invalid escape sequences in regular expressions"),
)
.arg(
Arg::new("define")
.short('d')
Expand All @@ -47,7 +51,12 @@ pub fn exec_compile(args: &ArgMatches) -> anyhow::Result<()> {
.get_many::<(String, serde_json::Value)>("define")
.map(|var| var.cloned().collect());

let rules = compile_rules(rules_path, path_as_namespace, external_vars)?;
let rules = compile_rules(
rules_path,
path_as_namespace,
external_vars,
args.get_flag("relaxed-escape-sequences"),
)?;

let output_file = File::create(output_path).with_context(|| {
format!("can not write `{}`", output_path.display())
Expand Down
6 changes: 4 additions & 2 deletions cli/src/commands/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,12 +82,14 @@ pub fn compile_rules<'a, P>(
paths: P,
path_as_namespace: bool,
external_vars: Option<Vec<(String, Value)>>,
relaxed_regexp_escape_sequences: bool,
) -> Result<Rules, anyhow::Error>
where
P: Iterator<Item = &'a PathBuf>,
{
let mut compiler: Compiler<'_> =
Compiler::new().colorize_errors(stdout().is_tty());
let mut compiler: Compiler<'_> = Compiler::new()
.relaxed_regexp_escape_sequences(relaxed_regexp_escape_sequences)
.colorize_errors(stdout().is_tty());

if let Some(vars) = external_vars {
for (ident, value) in vars {
Expand Down
22 changes: 20 additions & 2 deletions cli/src/commands/scan.rs
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,10 @@ pub fn scan() -> Command {
.required(false)
.value_parser(value_parser!(u64).range(1..))
)
.arg(
arg!(--"relaxed-escape-sequences")
.help("Allow invalid escape sequences in regular expressions")
)
.arg(
arg!(-d --"define")
.help("Define external variable")
Expand All @@ -104,6 +108,7 @@ pub fn exec_scan(args: &ArgMatches) -> anyhow::Result<()> {
let skip_larger = args.get_one::<u64>("skip-larger");
let negate = args.get_flag("negate");
let disable_console_logs = args.get_flag("disable-console-logs");

let timeout = args.get_one::<u64>("timeout");

let mut external_vars: Option<Vec<(String, serde_json::Value)>> = args
Expand All @@ -114,7 +119,15 @@ pub fn exec_scan(args: &ArgMatches) -> anyhow::Result<()> {
if rules_path.len() > 1 {
bail!(
"can't use '{}' with more than one RULES_PATH",
Paint::new("--compiled-rules").bold()
Paint::bold("--compiled-rules")
);
}

if args.get_flag("relaxed-escape-sequences") {
bail!(
"can't use '{}' together with '{}'",
Paint::bold("--relaxed-escape-sequences"),
Paint::bold("--compiled-rules")
);
}

Expand All @@ -140,7 +153,12 @@ pub fn exec_scan(args: &ArgMatches) -> anyhow::Result<()> {
// With `take()` we pass the external variables to `compile_rules`,
// while leaving a `None` in `external_vars`. This way external
// variables are not set again in the scanner.
compile_rules(rules_path, path_as_namespace, external_vars.take())?
compile_rules(
rules_path,
path_as_namespace,
external_vars.take(),
args.get_flag("relaxed-escape-sequences"),
)?
};

let rules_ref = &rules;
Expand Down
9 changes: 6 additions & 3 deletions lib/src/compiler/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@ pub(in crate::compiler) struct CompileContext<'a, 'src, 'sym> {
/// Information about the rules compiled so far.
pub rules: &'a Vec<RuleInfo>,

/// A slice that contains the IR for the patterns declared in the current
/// rule.
pub current_rule_patterns: &'a mut [ir::PatternInRule<'src>],
/// Reference to a vector that contains the IR for the patterns declared
/// in the current rule.
pub current_rule_patterns: &'a mut Vec<ir::PatternInRule<'src>>,

/// Warnings generated during the compilation.
pub warnings: &'a mut Warnings,
Expand All @@ -42,6 +42,9 @@ pub(in crate::compiler) struct CompileContext<'a, 'src, 'sym> {
/// Stack of variables. These are local variables used during the
/// evaluation of rule conditions, for example for storing loop variables.
pub vars: VarStack,

/// Allow invalid escape sequences in regular expressions.
pub relaxed_regexp_escape_sequences: bool,
}

impl<'a, 'src, 'sym> CompileContext<'a, 'src, 'sym> {
Expand Down
50 changes: 23 additions & 27 deletions lib/src/compiler/ir/ast2ir.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,47 +17,42 @@ use crate::compiler::ir::{
MatchAnchor, Of, OfItems, Pattern, PatternFlagSet, PatternFlags,
PatternIdx, PatternInRule, Quantifier, Range, RegexpPattern,
};
use crate::compiler::{CompileContext, CompileError, Warnings};
use crate::compiler::{CompileContext, CompileError};
use crate::modules::BUILTIN_MODULES;
use crate::re;
use crate::re::parser::Error;
use crate::symbols::{Symbol, SymbolKind, SymbolLookup, SymbolTable};
use crate::types::{Map, Regexp, Type, TypeValue, Value};

pub(in crate::compiler) fn patterns_from_ast<'src>(
report_builder: &ReportBuilder,
ctx: &mut CompileContext<'_, 'src, '_>,
patterns: Option<&Vec<ast::Pattern<'src>>>,
warnings: &mut Warnings,
) -> Result<Vec<PatternInRule<'src>>, Box<CompileError>> {
patterns
.into_iter()
.flatten()
.map(|p| pattern_from_ast(report_builder, p, warnings))
.collect::<Result<Vec<PatternInRule<'src>>, Box<CompileError>>>()
) -> Result<(), Box<CompileError>> {
for pattern_ast in patterns.into_iter().flatten() {
let pattern = pattern_from_ast(ctx, pattern_ast)?;
ctx.current_rule_patterns.push(pattern);
}
Ok(())
}

fn pattern_from_ast<'src>(
report_builder: &ReportBuilder,
ctx: &mut CompileContext,
pattern: &ast::Pattern<'src>,
warnings: &mut Warnings,
) -> Result<PatternInRule<'src>, Box<CompileError>> {
match pattern {
ast::Pattern::Text(pattern) => {
Ok(text_pattern_from_ast(report_builder, pattern, warnings)?)
}
ast::Pattern::Hex(pattern) => {
Ok(hex_pattern_from_ast(report_builder, pattern, warnings)?)
Ok(text_pattern_from_ast(ctx, pattern)?)
}
ast::Pattern::Hex(pattern) => Ok(hex_pattern_from_ast(ctx, pattern)?),
ast::Pattern::Regexp(pattern) => {
Ok(regexp_pattern_from_ast(report_builder, pattern, warnings)?)
Ok(regexp_pattern_from_ast(ctx, pattern)?)
}
}
}

pub(in crate::compiler) fn text_pattern_from_ast<'src>(
_report_builder: &ReportBuilder,
_ctx: &mut CompileContext,
pattern: &ast::TextPattern<'src>,
_warnings: &mut Warnings,
) -> Result<PatternInRule<'src>, Box<CompileError>> {
let mut flags = PatternFlagSet::none();

Expand Down Expand Up @@ -117,9 +112,8 @@ pub(in crate::compiler) fn text_pattern_from_ast<'src>(
}

pub(in crate::compiler) fn hex_pattern_from_ast<'src>(
_report_builder: &ReportBuilder,
_ctx: &mut CompileContext,
pattern: &ast::HexPattern<'src>,
_warnings: &mut Warnings,
) -> Result<PatternInRule<'src>, Box<CompileError>> {
Ok(PatternInRule {
identifier: pattern.identifier.name,
Expand All @@ -132,9 +126,8 @@ pub(in crate::compiler) fn hex_pattern_from_ast<'src>(
}

pub(in crate::compiler) fn regexp_pattern_from_ast<'src>(
report_builder: &ReportBuilder,
ctx: &mut CompileContext,
pattern: &ast::RegexpPattern<'src>,
warnings: &mut Warnings,
) -> Result<PatternInRule<'src>, Box<CompileError>> {
let mut flags = PatternFlagSet::none();

Expand Down Expand Up @@ -165,9 +158,9 @@ pub(in crate::compiler) fn regexp_pattern_from_ast<'src>(
{
let i_pos = pattern.regexp.literal.rfind('i').unwrap();

warnings.add(|| {
ctx.warnings.add(|| {
Warning::redundant_case_modifier(
report_builder,
&ctx.report_builder,
pattern.modifiers.nocase().unwrap().span(),
pattern.span().subspan(i_pos, i_pos + 1),
)
Expand Down Expand Up @@ -203,9 +196,10 @@ pub(in crate::compiler) fn regexp_pattern_from_ast<'src>(
let hir = re::parser::Parser::new()
.force_case_insensitive(flags.contains(PatternFlags::Nocase))
.allow_mixed_greediness(false)
.relaxed_escape_sequences(ctx.relaxed_regexp_escape_sequences)
.parse(&pattern.regexp)
.map_err(|err| {
re_error_to_compile_error(report_builder, &pattern.regexp, err)
re_error_to_compile_error(ctx.report_builder, &pattern.regexp, err)
})?;

// TODO: raise warning when .* used, propose using the non-greedy
Expand Down Expand Up @@ -253,8 +247,10 @@ pub(in crate::compiler) fn expr_from_ast(
ast::Expr::LiteralString(literal) => Ok(Expr::Const(TypeValue::const_string_from(literal.value.as_bytes()))),

ast::Expr::Regexp(regexp) => {
re::parser::Parser::new().parse(regexp.as_ref()).map_err(|err| {
re_error_to_compile_error(ctx.report_builder, regexp, err)
re::parser::Parser::new()
.relaxed_escape_sequences(ctx.relaxed_regexp_escape_sequences)
.parse(regexp.as_ref())
.map_err(|err| { re_error_to_compile_error(ctx.report_builder, regexp, err)
})?;

Ok(Expr::Const(TypeValue::Regexp(Some(Regexp::new(
Expand Down
2 changes: 1 addition & 1 deletion lib/src/compiler/ir/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ process goes like:
`source code -> CST -> AST -> IR -> compiled rules`
Contrary to the AST, the IR doesn't have an one-to-one correspondence to the
Contrary to the AST, the IR doesn't have a one-to-one correspondence to the
original source code, the compiler is free to transform the IR in ways that
maintain the semantics of the original source code but doesn't match the code
exactly. This could be done for example for optimization purposes. Another
Expand Down
84 changes: 58 additions & 26 deletions lib/src/compiler/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,9 @@ struct Namespace {
/// ```
///
pub struct Compiler<'a> {
/// Allow invalid escape sequences in regexps.
relaxed_regexp_escape_sequences: bool,

/// Used for generating error and warning reports.
report_builder: ReportBuilder,

Expand Down Expand Up @@ -304,6 +307,7 @@ impl<'a> Compiler<'a> {
wasm_mod,
wasm_symbols,
wasm_exports,
relaxed_regexp_escape_sequences: false,
next_pattern_id: PatternId(0),
current_pattern_id: PatternId(0),
current_namespace: default_namespace,
Expand Down Expand Up @@ -517,6 +521,8 @@ impl<'a> Compiler<'a> {

let mut rules = Rules {
serialized_globals,
relaxed_regexp_escape_sequences: self
.relaxed_regexp_escape_sequences,
wasm_mod: compiled_wasm_mod,
ac: None,
num_patterns: self.next_pattern_id.0 as usize,
Expand Down Expand Up @@ -552,8 +558,28 @@ impl<'a> Compiler<'a> {
///
/// Colorized error messages contain ANSI escape sequences that make them
/// look nicer on compatible consoles. The default setting is `false`.
pub fn colorize_errors(mut self, b: bool) -> Self {
self.report_builder.with_colors(b);
pub fn colorize_errors(mut self, yes: bool) -> Self {
self.report_builder.with_colors(yes);
self
}

/// Enables or disables relaxed handling of escape sequences in regular
/// expressions.
///
/// A backslash followed by a character with a special meaning, such as
/// `\n` (new-line), `\t` or `\w` (word character), is a valid escape
/// sequence. Sequences like `\N`, `\T` or `\j` are not valid, yet YARA has
/// historically accepted them anyway, treating each one as the literal
/// character that follows the backslash: `\N` is read as `N` and `\j` as
/// `j`.
///
/// Passing `true` makes the regexp parser accept such invalid sequences
/// and translate them to plain characters. Invalid sequences are not
/// accepted by default.
pub fn relaxed_regexp_escape_sequences(self, yes: bool) -> Self {
    let mut compiler = self;
    compiler.relaxed_regexp_escape_sequences = yes;
    compiler
}

Expand Down Expand Up @@ -686,13 +712,6 @@ impl<'a> Compiler<'a> {
// corresponding to failed rules.
let snapshot = self.take_snapshot();

// Convert the patterns from AST to IR.
let mut patterns_in_rule = patterns_from_ast(
&self.report_builder,
rule.patterns.as_ref(),
&mut self.warnings,
)?;

// The RuleId for the new rule is current length of `self.rules`. The
// first rule has RuleId = 0.
let rule_id = RuleId(self.rules.len() as i32);
Expand All @@ -715,22 +734,35 @@ impl<'a> Compiler<'a> {
is_private: rule.flags.contains(RuleFlag::Private),
});

let mut rule_patterns = Vec::new();

let mut ctx = CompileContext {
relaxed_regexp_escape_sequences: self
.relaxed_regexp_escape_sequences,
current_symbol_table: None,
symbol_table: &mut self.symbol_table,
ident_pool: &mut self.ident_pool,
report_builder: &self.report_builder,
rules: &self.rules,
current_rule_patterns: &mut rule_patterns,
warnings: &mut self.warnings,
vars: VarStack::new(),
};

// Convert the patterns from AST to IR. Populates the `rule_patterns`
// vector.
if let Err(err) = patterns_from_ast(&mut ctx, rule.patterns.as_ref()) {
drop(ctx);
self.restore_snapshot(snapshot);
return Err(Box::new(*err));
};

// Convert the rule condition's AST to the intermediate representation
// (IR). Also updates the patterns with information about whether they
// are anchored or not.
let condition = bool_expr_from_ast(
&mut CompileContext {
current_symbol_table: None,
symbol_table: &mut self.symbol_table,
ident_pool: &mut self.ident_pool,
report_builder: &self.report_builder,
rules: &self.rules,
current_rule_patterns: patterns_in_rule.as_mut_slice(),
warnings: &mut self.warnings,
vars: VarStack::new(),
},
&rule.condition,
);
let condition = bool_expr_from_ast(&mut ctx, &rule.condition);

drop(ctx);

// In case of error, restore the compiler to the state it was before
// entering this function. Also, if the error is due to an unknown
Expand Down Expand Up @@ -798,14 +830,14 @@ impl<'a> Compiler<'a> {
// No other symbol with the same identifier should exist.
assert!(existing_symbol.is_none());

let mut pattern_ids = Vec::with_capacity(patterns_in_rule.len());
let mut pattern_ids = Vec::with_capacity(rule_patterns.len());
let mut pending_patterns = HashSet::new();

let current_rule = self.rules.last_mut().unwrap();

for pattern in &patterns_in_rule {
for pattern in &rule_patterns {
// Check if this pattern has been declared before, in this rule or
// in some other rule. In such cases the pattern ID is re-used and
// in some other rule. In such cases the pattern ID is re-used, and
// we don't need to process (i.e: extract atoms and add them to
// Aho-Corasick automaton) the pattern again. Two patterns are
// considered equal if they are exactly the same, including any
Expand Down Expand Up @@ -839,7 +871,7 @@ impl<'a> Compiler<'a> {
// to `self.sub_patterns`
for (pattern_id, pattern, span) in izip!(
pattern_ids.iter(),
patterns_in_rule.into_iter(),
rule_patterns.into_iter(),
rule.patterns.iter().flatten().map(|p| p.span())
) {
if pending_patterns.contains(pattern_id) {
Expand Down
Loading

0 comments on commit 9aa4477

Please sign in to comment.