Skip to content

Commit

Permalink
add documentations
Browse files Browse the repository at this point in the history
  • Loading branch information
mahmudhera committed Nov 7, 2024
1 parent 398538e commit 42d1717
Show file tree
Hide file tree
Showing 2 changed files with 88 additions and 8 deletions.
22 changes: 20 additions & 2 deletions src/cpp/compute_similarity.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,17 @@
/*
* Author: Mahmudur Rahman Hera ([email protected])
* Date: November 1, 2024
* Description: yacht train core using indexing of sketches to do genome comparison
* Description: This code reads the query and target sketches from the files, builds an index from the target sketches, and computes the similarity matrix.
* All query vs all target pairs are written if containment(query,target) >= provided threshold.
*
* Output files are written in the output directory. Many output files are
* generated, in the form a_bcd.txt, where a is the pass id, and bcd is the thread id.
* By default, the output files are not combined. If you want to combine the output files,
* use the -C flag. The combined output file will be written to the output filename provided.
* Each line in the output file contains the query and target sketch ids, followed by the similarity values.
* A typical line in the output file looks like this: 12,34,0.2,0.3,0.4
* This means that the (12+1)-th query sketch is similar to the (34+1)-th target sketch,
* and the Jaccard, containment(query,target), and containment(target,query) values are 0.2, 0.3, and 0.4.
*/

#include "argparse.hpp"
Expand Down Expand Up @@ -49,7 +59,15 @@ typedef unsigned long long int hash_t;

void parse_arguments(int argc, char *argv[], Arguments &arguments) {

argparse::ArgumentParser parser("yacht train using indexing of sketches");
argparse::ArgumentParser parser("compute similarity of targets with queries");

parser.add_description("This code reads the query and target sketches from the files, builds an index from the target sketches, and computes the similarity matrix.\n"
"All query vs all target pairs are written if containment(query,target) >= provided threshold.\n"
"Output files are written in the output directory. Many output files are generated, in the form a_bcd.txt, where a is the pass id, and bcd is the thread id.\n"
"By default, the output files are not combined. If you want to combine the output files, use the -C flag. The combined output file will be written to the output filename provided.\n"
"Each line in the output file contains the query and target sketch ids, and the similarity value.\n"
"A typical line in the output file looks like this: 12,34,0.2,0.3,0.4\n"
"This means that the (12+1)-th query sketch is similar to the (34+1)-th target sketch, and the Jaccard, containment(query,target), and containment(target,query) values are 0.2, 0.3, and 0.4.");

parser.add_argument("file_list_query")
.help("file containing paths of query sketches")
Expand Down
74 changes: 68 additions & 6 deletions src/cpp/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,23 +27,85 @@ using json = nlohmann::json;

typedef unsigned long long int hash_t;

std::vector<hash_t> read_min_hashes(const std::string&);
/**
* @brief Read the min-hashes from an FMH sketch file
*
* Assumption: the file is a JSON file, and it is not gzipped.
*
* @param sketch_path The path to the sketch file
* @return The min-hash values read from the sketch file
*/
std::vector<hash_t> read_min_hashes(const std::string& sketch_path);

void compute_index_from_sketches(std::vector<std::vector<hash_t>>&, std::unordered_map<hash_t, std::vector<int>>&);

/*
Function to read the paths of the sketches from a file list
*/
void get_sketch_paths(const std::string&, std::vector<std::string>&);




/**
* @brief Build a hash index from a collection of sketches
*
* @param sketches The sketches to index; each sketch is a vector of hash values
* @param hash_index Output parameter: a map from a hash value to a list of ints, where the
*                   computed index is stored (presumably the ids of the sketches that contain
*                   that hash -- confirm against the implementation)
*/
void compute_index_from_sketches(std::vector<std::vector<hash_t>>& sketches, std::unordered_map<hash_t, std::vector<int>>& hash_index);






/**
* @brief Get the sketch paths listed in a file
*
* @param filelist The file containing the paths of the sketches
*                 (presumably one path per line -- confirm against the implementation)
* @param sketch_paths Output parameter: the vector in which the paths are stored
*/
void get_sketch_paths(const std::string& filelist, std::vector<std::string>& sketch_paths);






/**
* @brief Read the sketches from the given sketch paths
*
* @param sketch_paths The paths to the sketch files to read
* @param sketches Output parameter: the vector in which the sketches are stored
* @param empty_sketch_ids Output parameter: the vector in which the ids of empty sketches are stored
*                         (presumably an id is the sketch's position in sketch_paths -- confirm
*                         against the implementation)
* @param num_threads The number of threads to use
*/
void read_sketches(std::vector<std::string>& sketch_paths,
std::vector<std::vector<hash_t>>& sketches,
std::vector<int>& empty_sketch_ids,
const uint num_threads);





/**
* @brief Show the empty sketches
*
* The parameter is named in the declaration so that it matches the @param tag
* below and the naming used by the other declarations in this header.
*
* @param empty_sketch_ids The ids of the empty sketches
*/
void show_empty_sketches(const std::vector<int>& empty_sketch_ids);




/**
* @brief Compute the intersection matrix
*
* @param sketches_query The query sketches
* @param sketches_ref The reference (target) sketches
* @param hash_index_ref The index of the reference (target) sketches
* @param out_dir The output directory to store the results
* @param similars The vector to store the similar sketches
* @param containment_threshold The containment threshold
* @param num_passes The number of passes to use
* @param num_threads The number of threads to use
*/
void compute_intersection_matrix(const std::vector<std::vector<hash_t>>& sketches_query,
const std::vector<std::vector<hash_t>>& sketches_ref,
const std::unordered_map<hash_t, std::vector<int>>& hash_index_ref,
Expand Down

0 comments on commit 42d1717

Please sign in to comment.