From df84a6d5bcbc494a88afdc732c70b88443c0f696 Mon Sep 17 00:00:00 2001 From: "REDMOND\\ninchen" Date: Fri, 12 Jan 2024 10:05:18 -0800 Subject: [PATCH] BANN single file Save and Load. --- include/abstract_graph_store.h | 8 +++ include/defaults.h | 2 +- include/in_mem_graph_store.h | 12 +++-- include/index.h | 2 + include/parameters.h | 16 +++--- src/in_mem_graph_store.cpp | 15 +++++- src/index.cpp | 97 +++++++++++++++++++++++----------- src/pq_flash_index.cpp | 2 +- 8 files changed, 108 insertions(+), 46 deletions(-) diff --git a/include/abstract_graph_store.h b/include/abstract_graph_store.h index 750fec727..5c239da7e 100644 --- a/include/abstract_graph_store.h +++ b/include/abstract_graph_store.h @@ -7,6 +7,8 @@ #include #include "types.h" +class AlignedFileReader; + namespace diskann { @@ -21,8 +23,14 @@ class AbstractGraphStore virtual ~AbstractGraphStore() = default; // returns tuple of +#ifdef EXEC_ENV_OLS + virtual std::tuple load(AlignedFileReader &reader, const size_t num_points, + size_t offset) = 0; +#else virtual std::tuple load(const std::string &index_path_prefix, const size_t num_points, size_t offset) = 0; +#endif + virtual int store(const std::string &index_path_prefix, const size_t num_points, const size_t num_fz_points, const uint32_t start) = 0; diff --git a/include/defaults.h b/include/defaults.h index 5ea5af495..ef1750fcf 100644 --- a/include/defaults.h +++ b/include/defaults.h @@ -17,7 +17,7 @@ const uint32_t NUM_FROZEN_POINTS_STATIC = 0; const uint32_t NUM_FROZEN_POINTS_DYNAMIC = 1; // In-mem index related limits -const float GRAPH_SLACK_FACTOR = 1.3; +const float GRAPH_SLACK_FACTOR = 1.3f; // SSD Index related limits const uint64_t MAX_GRAPH_DEGREE = 512; diff --git a/include/in_mem_graph_store.h b/include/in_mem_graph_store.h index 95e4dbcce..543a0cca7 100644 --- a/include/in_mem_graph_store.h +++ b/include/in_mem_graph_store.h @@ -14,8 +14,13 @@ class InMemGraphStore : public AbstractGraphStore InMemGraphStore(const size_t total_pts, const size_t reserve_graph_degree); // returns tuple of - virtual std::tuple load(const std::string &index_path_prefix, const size_t num_points, +#ifdef EXEC_ENV_OLS + virtual std::tuple load(AlignedFileReader &reader, const size_t num_points, size_t offset) override; +#else + virtual std::tuple load(const std::string &filename, size_t expected_num_points, + size_t offset); +#endif virtual int store(const std::string &index_path_prefix, const size_t num_points, const size_t num_frozen_points, const uint32_t start) override; virtual int store(std::ofstream &writer, const size_t num_points, const size_t num_fz_points, const uint32_t start, @@ -34,11 +39,12 @@ class InMemGraphStore : public AbstractGraphStore virtual uint32_t get_max_observed_degree() override; protected: - virtual std::tuple load_impl(const std::string &filename, size_t expected_num_points, - size_t offset); #ifdef EXEC_ENV_OLS virtual std::tuple load_impl(AlignedFileReader &reader, size_t expected_num_points, size_t offset); +#else + virtual std::tuple load_impl(const std::string &filename, size_t expected_num_points, + size_t offset); #endif int save_graph(std::ofstream &writer, const size_t active_points, const size_t num_frozen_points, diff --git a/include/index.h b/include/index.h index 387a9ac07..fc42d6e0d 100644 --- a/include/index.h +++ b/include/index.h @@ -35,6 +35,8 @@ struct SaveLoadMetaDataV1 uint64_t delete_list_offset; uint64_t tags_offset; uint64_t graph_offset; + + SaveLoadMetaDataV1(); }; inline double estimate_ram_usage(size_t size, uint32_t dim, uint32_t datasize, uint32_t degree) diff --git a/include/parameters.h b/include/parameters.h index 2bba9aeca..3c771a730 100644 --- a/include/parameters.h +++ b/include/parameters.h @@ -16,15 +16,7 @@ class IndexWriteParameters { public: - const uint32_t search_list_size; // L - const uint32_t max_degree; // R - const bool saturate_graph; - const uint32_t max_occlusion_size; // C - const float alpha; - const uint32_t num_threads; - const uint32_t filter_list_size; // Lf - private: IndexWriteParameters(const uint32_t search_list_size, const uint32_t max_degree, const bool saturate_graph, const uint32_t max_occlusion_size, const float alpha, const uint32_t num_threads, const uint32_t filter_list_size) @@ -34,6 +26,14 @@ class IndexWriteParameters { } + const uint32_t search_list_size; // L + const uint32_t max_degree; // R + const bool saturate_graph; + const uint32_t max_occlusion_size; // C + const float alpha; + const uint32_t num_threads; + const uint32_t filter_list_size; // Lf + friend class IndexWriteParametersBuilder; }; diff --git a/src/in_mem_graph_store.cpp b/src/in_mem_graph_store.cpp index fe14c8a0d..fae35ced0 100644 --- a/src/in_mem_graph_store.cpp +++ b/src/in_mem_graph_store.cpp @@ -4,6 +4,7 @@ #include "in_mem_graph_store.h" #include "utils.h" + namespace diskann { InMemGraphStore::InMemGraphStore(const size_t total_pts, const size_t reserve_graph_degree) @@ -16,11 +17,21 @@ InMemGraphStore::InMemGraphStore(const size_t total_pts, const size_t reserve_gr } } +#ifdef EXEC_ENV_OLS +std::tuple InMemGraphStore::load(AlignedFileReader &reader, + const size_t num_points, size_t offset) +{ + + return load_impl(reader, num_points, offset); +} +#else std::tuple InMemGraphStore::load(const std::string &index_path_prefix, const size_t num_points, size_t offset) { + return load_impl(index_path_prefix, num_points, offset); } +#endif int InMemGraphStore::store(const std::string &index_path_prefix, const size_t num_points, const size_t num_frozen_points, const uint32_t start) { @@ -90,7 +101,6 @@ std::tuple InMemGraphStore::load_impl(AlignedFileRea size_t file_frozen_pts; uint32_t start; - auto max_points = get_max_points(); int header_size = 2 * sizeof(size_t) + 2 * sizeof(uint32_t); std::unique_ptr header = std::make_unique(header_size); read_array(reader, header.get(), header_size, offset); @@ -143,8 +153,8 @@ std::tuple InMemGraphStore::load_impl(AlignedFileRea << std::endl; return std::make_tuple(nodes_read, start, file_frozen_pts); } -#endif +#else std::tuple InMemGraphStore::load_impl(const std::string &filename, size_t expected_num_points, size_t offset) { @@ -208,6 +218,7 @@ std::tuple InMemGraphStore::load_impl(const std::str << std::endl; return std::make_tuple(nodes_read, start, file_frozen_pts); } +#endif int InMemGraphStore::save_graph(std::ofstream &writer, const size_t num_points, const size_t num_frozen_points, const uint32_t start, size_t offset) diff --git a/src/index.cpp b/src/index.cpp index 10db4dab6..7130b3d3d 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -25,6 +25,11 @@ namespace diskann { +SaveLoadMetaDataV1::SaveLoadMetaDataV1() : data_offset(0), delete_list_offset(0), tags_offset(0), graph_offset(0) +{ +} + + // Initialize an index with metric m, load the data of type T with filename // (bin), and initialize max_points template @@ -411,16 +416,16 @@ void Index::save(const char *filename, bool compact_before_save curr_pos += sizeof(SaveLoadMetaDataV1); // Save data. - metadata.data_offset = static_cast(curr_pos); - curr_pos += _data_store->save(writer, (location_t)(_nd + _num_frozen_pts), curr_pos); + { + metadata.data_offset = static_cast(curr_pos); + curr_pos += _data_store->save(writer, (location_t)(_nd + _num_frozen_pts), curr_pos); + } // Save delete list. { - if (_delete_set->size() == 0) - { - metadata.delete_list_offset = static_cast(curr_pos); - } - else + metadata.delete_list_offset = static_cast(curr_pos); + + if (_delete_set->size() != 0) { std::unique_ptr delete_list = std::make_unique(_delete_set->size()); uint32_t i = 0; @@ -434,12 +439,9 @@ void Index::save(const char *filename, bool compact_before_save // Save tags. { - if (!_enable_tags) - { - diskann::cout << "Not saving tags as they are not enabled." << std::endl; - metadata.tags_offset = static_cast(curr_pos); - } - else + metadata.tags_offset = static_cast(curr_pos); + + if (_enable_tags) { TagT *tag_data = new TagT[_nd + _num_frozen_pts]; for (uint32_t i = 0; i < _nd; i++) @@ -466,17 +468,24 @@ void Index::save(const char *filename, bool compact_before_save } // Save graph. - metadata.graph_offset = static_cast(curr_pos); - curr_pos += _graph_store->store(writer, _nd + _num_frozen_pts, _num_frozen_pts, _start, curr_pos); + { + metadata.graph_offset = static_cast(curr_pos); + _graph_store->store(writer, _nd + _num_frozen_pts, _num_frozen_pts, _start, curr_pos); - // Save metadata. - writer.seekp(meta_data_start, writer.beg); - writer.write((char *)&metadata, sizeof(SaveLoadMetaDataV1)); - writer.close(); + // Save metadata. + writer.seekp(meta_data_start, writer.beg); + writer.write((char *)&metadata, sizeof(SaveLoadMetaDataV1)); + writer.close(); + } + + std::cout << "Metadata Saved. data_offset: " << std::to_string(metadata.data_offset) + << " delete_list_offset: " << std::to_string(metadata.delete_list_offset) + << " tag_offset: " << std::to_string(metadata.tags_offset) + << " graph_offset: " << std::to_string(metadata.graph_offset) << std::endl; } else { - diskann::cout << "Save index in a single file currently only support _save_as_one_file_version = 1. " + std::cout << "Save index in a single file currently only support _save_as_one_file_version = 1. " "Not saving the index." << std::endl; } @@ -487,7 +496,7 @@ void Index::save(const char *filename, bool compact_before_save // _max_points. reposition_frozen_point_to_end(); - diskann::cout << "Time taken for save: " << timer.elapsed() / 1000000.0 << "s." << std::endl; + std::cout << "Time taken for save: " << timer.elapsed() / 1000000.0 << "s." << std::endl; } #ifdef EXEC_ENV_OLS @@ -647,6 +656,7 @@ void Index::load(const char *filename, uint32_t num_threads, ui #endif if (!_load_from_one_file) { + std::cout << "DLVS should not load multiple files." << std::endl; // For DLVS Store, we will not support saving the index in multiple // files. #ifndef EXEC_ENV_OLS @@ -670,15 +680,18 @@ void Index::load(const char *filename, uint32_t num_threads, ui { if (_filtered_index) { - diskann::cout << "Single index file saving/loading support for filtered index is not yet " + std::cout << "Single index file saving/loading support for filtered index is not yet " "enabled. Not loading the index." << std::endl; } else { - uint64_t version; + std::cout << "Start loading index from one file." << std::endl; + uint64_t version = 0; #ifdef EXEC_ENV_OLS + std::cout << "Start Version Check." << std::endl; + std::vector readReqs; AlignedRead readReq; uint64_t buf[1]; @@ -687,11 +700,24 @@ void Index::load(const char *filename, uint32_t num_threads, ui readReq.offset = 0; readReq.len = sizeof(uint64_t); readReqs.push_back(readReq); + std::cout << "Load Version request is ready." << std::endl; + reader.read(readReqs, ctx); // synchronous - if ((*(ctx.m_pRequestsStatus))[0] == IOContext::READ_SUCCESS) + std::cout << "Load Version processed." << std::endl; + + if ((*(ctx.m_pRequestsStatus.get()))[0] == IOContext::READ_SUCCESS) { version = buf[0]; + std::cout << "Load Version is " << std::to_string(version) << "." << std::endl; } + else + { + std::stringstream str; + str << "Could not read binary metadata from index file at offset: 0." << std::endl; + std::cout << str.str() << std::endl; + throw diskann::ANNException(str.str(), -1, __FUNCSIG__, __FILE__, __LINE__); + } + #else std::ifstream reader(filename, std::ios::binary); reader.read((char *)&version, sizeof(uint64_t)); @@ -699,34 +725,42 @@ void Index::load(const char *filename, uint32_t num_threads, ui if (version == _load_from_one_file_version) { + std::cout << "Version Check passed, start loading meta data." << std::endl; SaveLoadMetaDataV1 metadata; #ifdef EXEC_ENV_OLS std::vector metadata_readReqs; AlignedRead metadata_readReq; - uint64_t metadata_buf[1]; + uint64_t metadata_buf[sizeof(SaveLoadMetaDataV1)]; metadata_readReq.buf = metadata_buf; metadata_readReq.offset = sizeof(uint64_t); metadata_readReq.len = sizeof(SaveLoadMetaDataV1); - metadata_readReq.push_back(readReq); + metadata_readReqs.push_back(metadata_readReq); reader.read(metadata_readReqs, ctx); // synchronous if ((*(ctx.m_pRequestsStatus))[0] == IOContext::READ_SUCCESS) { memcpy((void *)&metadata, (void *)buf, sizeof(SaveLoadMetaDataV1)); } + + std::cout << "Metadata loaded. data_offset: " << std::to_string(metadata.data_offset) + << " delete_list_offset: " << std::to_string(metadata.delete_list_offset) + << " tag_offset: " << std::to_string(metadata.tags_offset) + << " graph_offset: " << std::to_string(metadata.graph_offset) + << std::endl; + #else reader.read((char *)&metadata, sizeof(SaveLoadMetaDataV1)); #endif // Load data #ifdef EXEC_ENV_OLS - load_data(reader, metadata.data_offset) + load_data(reader, metadata.data_offset); #else load_data(filename, metadata.data_offset); #endif // Load delete list when presents. - if (metadata.data_offset != metadata.delete_list_offset) + if (metadata.data_offset != metadata.delete_list_offset) { #ifdef EXEC_ENV_OLS load_delete_set(reader, metadata.delete_list_offset); @@ -752,12 +786,11 @@ void Index::load(const char *filename, uint32_t num_threads, ui } else { - diskann::cout << "load index from a single file currently only support _save_as_one_file_version = 1. " + std::cout << "load index from a single file currently only support _save_as_one_file_version = 1. " "Not loading the index." << std::endl; } } - return; } if (data_file_num_pts != graph_num_pts || (data_file_num_pts != tags_file_num_pts && _enable_tags)) @@ -866,13 +899,15 @@ size_t Index::get_graph_num_frozen_points(const std::string &gr template size_t Index::load_graph(AlignedFileReader &reader, size_t expected_num_points, size_t offset) { + auto res = _graph_store->load(reader, expected_num_points, offset); + #else template size_t Index::load_graph(std::string filename, size_t expected_num_points, size_t offset) { -#endif auto res = _graph_store->load(filename, expected_num_points, offset); +#endif _start = std::get<1>(res); _num_frozen_pts = std::get<2>(res); return std::get<0>(res); diff --git a/src/pq_flash_index.cpp b/src/pq_flash_index.cpp index c9b2c0ebb..33867d4be 100644 --- a/src/pq_flash_index.cpp +++ b/src/pq_flash_index.cpp @@ -1123,7 +1123,7 @@ int PQFlashIndex::load_from_separate_paths(uint32_t num_threads, cons { uint64_t dumr, dumc; float *norm_val; - diskann::load_bin(files, norm_val, dumr, dumc); + diskann::load_bin(files, norm_file, norm_val, dumr, dumc); #else if (file_exists(norm_file) && metric == diskann::Metric::INNER_PRODUCT) {