Skip to content

Commit

Permalink
preprocess default corpus and embed in binary
Browse files Browse the repository at this point in the history
though this does increase binary size by ~40MB
  • Loading branch information
evanrichter committed Jul 3, 2023
1 parent 2536493 commit 18ae02a
Show file tree
Hide file tree
Showing 5 changed files with 122 additions and 12 deletions.
67 changes: 67 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 9 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,18 @@ anyhow = "1.0.71"
clap = "4.3.3"
glob = "0.3.1"
log = "0.4.19"
postcard = { version = "1.0.4", features = ["use-std"], default-features = false }
serde = { version = "1.0.164", features = ["derive"] }
simple_logger = "4.1.0"
tablestream = "0.1.3"

[dev-dependencies]
assert_approx_eq = "1.1.0"
rand = "0.8.5"

[build-dependencies]
anyhow = "1.0.71"
glob = "0.3.1"
log = "0.4.19"
postcard = { version = "1.0.4", features = ["use-std"], default-features = false }
serde = { version = "1.0.164", features = ["derive"] }
20 changes: 20 additions & 0 deletions build.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
// Pull the crate's corpus module into the build script by textual inclusion,
// so that `main` below can call `corpus::load_corpus` at build time.
// NOTE(review): `dead_code` is presumably allowed because the build script
// uses only part of what src/corpus.rs defines — confirm.
#[allow(dead_code)]
mod corpus {
include!("src/corpus.rs");
}

/// Build-script entry point: loads the default corpus from `cpu_rec_corpus/`,
/// serializes it with postcard, and writes the bytes to `$OUT_DIR/default.pc`.
///
/// # Panics
///
/// Panics (and thereby fails the build) with a message naming the failing step
/// if the corpus cannot be loaded or serialized, if `OUT_DIR` is not set, or
/// if the output file cannot be written.
fn main() {
    // load default corpus
    let default = corpus::load_corpus("cpu_rec_corpus/*.corpus")
        .expect("failed to load default corpus from cpu_rec_corpus/*.corpus");
    // ask cargo to re-run this script when the corpus directory changes
    println!("cargo:rerun-if-changed=cpu_rec_corpus");

    // serialize to bytes
    let bytes = postcard::to_stdvec(&default)
        .expect("failed to serialize default corpus with postcard");

    // output path to target build folder; `var_os` keeps non-UTF-8 paths usable
    let out_dir = std::env::var_os("OUT_DIR")
        .expect("OUT_DIR is not set; this script must be run by cargo");
    let outfile = std::path::PathBuf::from(out_dir).join("default.pc");

    // write to file, reporting the destination path on failure
    std::fs::write(&outfile, bytes)
        .unwrap_or_else(|e| panic!("failed to write {}: {e}", outfile.display()));
}
3 changes: 2 additions & 1 deletion src/corpus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,13 @@
use anyhow::{Context, Error, Ok, Result};
use glob::glob;
use log::debug;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fmt::Debug;
use std::str::FromStr;
use std::string::String;

#[derive(Debug)]
#[derive(Debug, Deserialize, Serialize)]
pub struct CorpusStats {
pub arch: String,
bigrams_freq: HashMap<(u8, u8), f32>,
Expand Down
35 changes: 24 additions & 11 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ fn main() -> Result<()> {
.propagate_version(true)
.author("Raphaël Rigo <[email protected]>")
.about("Identifies CPU architectures in binaries")
.arg(arg!(--corpus <corpus_dir>).default_value("cpu_rec_corpus"))
.arg(arg!(--corpus <corpus_dir>))
.arg(arg!(-d - -debug))
.arg(arg!(-v - -verbose))
.arg(
Expand All @@ -223,17 +223,30 @@ fn main() -> Result<()> {
};
simple_logger::init_with_level(level)?;

let corpus_dir = args.get_one::<String>("corpus").unwrap().to_owned();
if !Path::new(&corpus_dir).is_dir() {
return Err(Error::msg(format!(
"{} is not a valid directory",
corpus_dir
)));
}
let corpus_files: String = args.get_one::<String>("corpus").unwrap().to_owned() + "/*.corpus";
println!("Loading corpus from {}", corpus_files);
let default_corpus: Vec<CorpusStats> = {
// serialized bytes embedded from build.rs
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/default.pc"));

// deserialize
postcard::from_bytes(bytes).unwrap()
};

let corpus_stats = load_corpus(&corpus_files)?;
let corpus_stats = match args.get_one::<&str>("corpus") {
// if no arg given, use embedded corpus
None => default_corpus,
// attempt to load the given corpus folder
Some(corpus_dir) => {
if !Path::new(corpus_dir).is_dir() {
return Err(Error::msg(format!(
"{} is not a valid directory",
corpus_dir
)));
}
let corpus_files = format!("{corpus_dir}/*.corpus");
println!("Loading corpus from {}", corpus_files);
load_corpus(&corpus_files)?
}
};

info!("Corpus size: {}", corpus_stats.len());

Expand Down

0 comments on commit 18ae02a

Please sign in to comment.