diff --git a/CMakeLists.txt b/CMakeLists.txt
index a8c0a43..93d94ac 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -73,6 +73,8 @@ if(${UNIX})
set(CMAKE_CXX_STANDARD 11) # for std::unordered_set, std::unique_ptr
set(CMAKE_CXX_STANDARD_REQUIRED ON)
find_package(Threads REQUIRED)
+ find_package(BLAS REQUIRED)
+ find_package(LAPACK REQUIRED)
endif()
add_subdirectory("${PROJECT_SOURCE_DIR}/lib")
diff --git a/README-jp.md b/README-jp.md
index 4c986f2..d691348 100644
--- a/README-jp.md
+++ b/README-jp.md
@@ -10,6 +10,7 @@ Neighborhood Graph and Tree for Indexing High-dimensional Data
ニュース
-------
+- 2022/08/10 QBG(Quantized Blob Graph)およびQG(NGTQGの改良版)が利用可能となりました。コマンドラインインタフェースのngtqおよびngtqgは[qbg](bin/qbg/README.md)で置き換えられました。(v2.0.0)
- 2022/02/04 FP16(半精度浮動小数点)が利用可能になりました。(v1.14.0)
- 2021/03/12 READMEに量子化グラフの結果を追加しました。
- 2021/01/15 [量子化グラフ (NGTQG)](bin/ngtqg/README.md)を実装した NGT v1.13.0 をリリースしました。
@@ -20,21 +21,12 @@ Neighborhood Graph and Tree for Indexing High-dimensional Data
- 2018/12/14 [NGTQ](bin/ngtq/README-jp.md) (NGT with Quantization) が利用可能になりました。(v1.5.0)
- 2018/08/08 [ONNG](README-jp.md#onng)が利用可能になりました。(v1.4.0)
-特徴
-----
-- OS:Linux、macOS
-- データの追加削除が可能
-- [共有メモリ(マップドメモリ)](README-jp.md#共有メモリの利用)のオプションによるNGTではメモリサイズを超えるデータが利用可能
-- データ型:1バイト整数、4バイト単精度浮動小数点
-- 距離関数:L1、L2、コサイン類似度、角度、ハミング、ジャッカード、ポアンカレ、ローレンツ
-- 対応言語:[Python](/python/README-jp.md)、[Ruby](https://github.com/ankane/ngt)、[Go](https://github.com/yahoojapan/gongt)、C、C++
-- 分散サーバ:[ngtd](https://github.com/yahoojapan/ngtd), [vald](https://github.com/vdaas/vald)
-- 量子化版NGT([NGTQ](bin/ngtq/README-jp.md))は10億ものデータの検索が可能
-
-ドキュメント
------------
-
-- [NGT チュートリアル](https://github.com/yahoojapan/NGT/wiki)
+手法
+---
+このリポジトリは次の手法を提供します。
+- NGT: Graph and tree-based method
+- QG: Quantized graph-based method
+- QBG: Quantized blob graph-based method
インストール
-----------
@@ -43,16 +35,34 @@ Neighborhood Graph and Tree for Indexing High-dimensional Data
- [Releases](https://github.com/yahoojapan/NGT/releases)
-### ビルド済み
+### ビルド
-#### macOS
+#### Linux
- $ brew install ngt
+ $ unzip NGT-x.x.x.zip
+ $ cd NGT-x.x.x
+ $ mkdir build
+ $ cd build
+ $ cmake ..
+ $ make
+ $ make install
+ $ ldconfig /usr/local/lib
-### ビルド
+#### CentOS
-#### Linux
+ $ yum install blas-devel lapack-devel
+ $ unzip NGT-x.x.x.zip
+ $ cd NGT-x.x.x
+ $ mkdir build
+ $ cd build
+ $ cmake ..
+ $ make
+ $ make install
+ $ ldconfig /usr/local/lib
+#### Ubuntu
+
+ $ apt install libblas-dev liblapack-dev
$ unzip NGT-x.x.x.zip
$ cd NGT-x.x.x
$ mkdir build
@@ -66,9 +76,7 @@ Neighborhood Graph and Tree for Indexing High-dimensional Data
$ /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"
$ brew install cmake
- $ brew install gcc@9
- $ export CXX=/usr/local/bin/g++-9
- $ export CC=/usr/local/bin/gcc-9
+ $ brew install openblas
$ unzip NGT-x.x.x.zip
$ cd NGT-x.x.x
$ mkdir build
@@ -87,14 +95,45 @@ Neighborhood Graph and Tree for Indexing High-dimensional Data
#### 大規模データの利用
-約500万以上のオブジェクトを登録する場合には、検索速度向上のために以下のパラメータを追加してください。
+約500万以上のオブジェクトをNGTに登録する場合には、検索速度向上のために以下のパラメータを追加してください。
$ cmake -DNGT_LARGE_DATASET=ON ..
+#### QGおよびQBGの無効化
+
+QGおよびQBGはBLASおよびLAPACKライブラリを必要とします。もし、これらのライブラリをインストールしたくなく、かつ、QGやQBGを利用しない場合には、QGおよびQBGを無効化できます。
+
+ $ cmake -DNGT_QBG_DISABLED=ON ..
+
+### ビルド済み
+
+#### macOS
+
+ $ brew install ngt
+
+NGT (Graph and tree-based method)
+=================================
+
+特徴
+----
+- OS:Linux、macOS
+- データの追加削除が可能
+- [共有メモリ(マップドメモリ)](README-jp.md#共有メモリの利用)のオプションによるNGTではメモリサイズを超えるデータが利用可能
+- データ型:1バイト整数、4バイト単精度浮動小数点
+- 距離関数:L1、L2、コサイン類似度、角度、ハミング、ジャッカード、ポアンカレ、ローレンツ
+- 対応言語:[Python](/python/README-jp.md)、[Ruby](https://github.com/ankane/ngt)、[Go](https://github.com/yahoojapan/gongt)、C、C++
+- 分散サーバ:[ngtd](https://github.com/yahoojapan/ngtd), [vald](https://github.com/vdaas/vald)
+
+ドキュメント
+-----------
+
+- [NGT チュートリアル](https://github.com/yahoojapan/NGT/wiki)
+
+
ユーティリティ
-------------
-- コマンド : [ngt](/bin/ngt/README-jp.md#command), [ngtq](bin/ngtq/README-jp.md)
+- コマンド : [ngt](/bin/ngt/README-jp.md#command)
- サーバ : [ngtd](https://github.com/yahoojapan/ngtd), [vald](https://github.com/vdaas/vald)
対応言語
@@ -108,6 +147,48 @@ Neighborhood Graph and Tree for Indexing High-dimensional Data
- C
- C++([sample code](samples))
+QG (Quantized graph-based method)
+=================================
+
+特徴
+----
+- NGTよりも高性能
+- OS:Linux、macOS
+- 距離関数:L2、コサイン類似度
+- 対応言語:C++, C, Python
+
+ユーティリティ
+-------------
+- コマンド : [qbg](bin/qbg/README.md)
+
+対応言語
+--------
+
+- C++
+- C
+- Python (検索のみ対応)
+
+QBG (Quantized blob graph-based method)
+=======================================
+
+特徴
+----
+- 10億ものオブジェクトの検索が可能
+- OS:Linux、macOS
+- 距離関数:L2
+- 対応言語:C++, C, Python
+
+ユーティリティ
+-------------
+- コマンド : [qbg](bin/qbg/README.md)
+
+対応言語
+--------
+
+- C++
+- C
+- Python (検索のみ対応)
+
ベンチマーク結果
---------------
diff --git a/README.md b/README.md
index b0bca1d..8524041 100644
--- a/README.md
+++ b/README.md
@@ -6,10 +6,11 @@ Neighborhood Graph and Tree for Indexing High-dimensional Data
[Home](/README.md) / [Installation](/README.md#Installation) / [Command](/bin/ngt/README.md#command) / [License](/README.md#license) / [Publications](/README.md#publications) / [About Us](http://research-lab.yahoo.co.jp/en/) / [日本語](/README-jp.md)
-**NGT** provides commands and a library for performing high-speed approximate nearest neighbor searches against a large volume of data (several million to several 10 million items of data) in high dimensional vector data space (several ten to several thousand dimensions).
+**NGT** provides commands and a library for performing high-speed approximate nearest neighbor searches against a large volume of data in high dimensional vector data space (several ten to several thousand dimensions).
News
----
+- 08/10/2022 QBG (Quantized Blob Graph) and QG (renewed NGTQG) are now available. The command-line interfaces ngtq and ngtqg are now obsolete and have been replaced by [qbg](bin/qbg/README.md). (v2.0.0)
- 02/04/2022 FP16 (half-precision floating point) is now available. (v1.14.0)
- 03/12/2021 The results for the quantized graph are added to this README.
- 01/15/2021 NGT v1.13.0 to provide the [quantized graph (NGTQG)](bin/ngtqg/README.md) is released.
@@ -20,39 +21,48 @@ News
- 12/14/2018 [NGTQ](bin/ngtq/README.md) (NGT with Quantization) is now available. (v1.5.0)
- 08/08/2018 [ONNG](README.md#onng) is now available. (v1.4.0)
-Key Features
-------------
-- Supported operating systems: Linux and macOS
-- Object additional registration and removal are available.
-- Objects beyond the memory size can be handled using [the shared memory (memory mapped file) option](README.md#shared-memory-use).
-- Supported distance functions: L1, L2, Cosine similarity, Angular, Hamming, Jaccard, Poincare, and Lorentz
-- Data Types: 4 byte floating point number and 1 byte unsigned integer
-- Supported languages: [Python](/python/README.md), [Ruby](https://github.com/ankane/ngt), [Rust](https://crates.io/crates/ngt), [Go](https://github.com/yahoojapan/gongt), C, and C++
-- Distributed servers: [ngtd](https://github.com/yahoojapan/ngtd) and [vald](https://github.com/vdaas/vald)
-- [NGTQ](bin/ngtq/README.md) can handle billions of objects.
-
-Documents
----------
-
-- [NGT tutorial](https://github.com/yahoojapan/NGT/wiki)
+Methods
+-------
+This repository provides the following methods.
+- NGT: Graph and tree-based method
+- QG: Quantized graph-based method
+- QBG: Quantized blob graph-based method
Installation
------------
-### Downloads
+### Build
-- [Releases](https://github.com/yahoojapan/NGT/releases)
+#### Downloads
-### Pre-Built
+- [Releases](https://github.com/yahoojapan/NGT/releases)
-#### On macOS
+#### On Linux
- $ brew install ngt
+ $ unzip NGT-x.x.x.zip
+ $ cd NGT-x.x.x
+ $ mkdir build
+ $ cd build
+ $ cmake ..
+ $ make
+ $ make install
+ $ ldconfig /usr/local/lib
-### Build
+#### On CentOS
-#### On Linux
+ $ yum install blas-devel lapack-devel
+ $ unzip NGT-x.x.x.zip
+ $ cd NGT-x.x.x
+ $ mkdir build
+ $ cd build
+ $ cmake ..
+ $ make
+ $ make install
+ $ ldconfig /usr/local/lib
+
+#### On Ubuntu
+
+ $ apt install libblas-dev liblapack-dev
$ unzip NGT-x.x.x.zip
$ cd NGT-x.x.x
$ mkdir build
@@ -66,9 +76,7 @@ Installation
$ /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"
$ brew install cmake
- $ brew install gcc@9
- $ export CXX=/usr/local/bin/g++-9
- $ export CC=/usr/local/bin/gcc-9
+ $ brew install openblas
$ unzip NGT-x.x.x.zip
$ cd NGT-x.x.x
$ mkdir build
@@ -87,14 +95,63 @@ Note: Since there is no lock function, the index should be used only for referen
#### Large-scale data use
-When you insert more than about 5 million objects, please add the following parameter to improve the search time.
+When you insert more than about 5 million objects for the graph-based method, please add the following parameter to improve the search time.
$ cmake -DNGT_LARGE_DATASET=ON ..
+#### Disable QG and QBG
+
+QG and QBG require the BLAS and LAPACK libraries. If you do not want to install these libraries and do not need QG or QBG, you can disable them.
+
+ $ cmake -DNGT_QBG_DISABLED=ON ..
+
+### Pre-Built
+
+#### On macOS
+
+ $ brew install ngt
+
+
+License
+-------
+
+Copyright (C) 2015 Yahoo Japan Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this software except in compliance with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and limitations under the License.
+
+Contributor License Agreement
+-----------------------------
+
+This project requires contributors to accept the terms in the [Contributor License Agreement (CLA)](https://gist.github.com/yahoojapanoss/9bf8afd6ea67f32d29b4082abf220340).
+
+Please note that contributors to the NGT repository on GitHub (https://github.com/yahoojapan/NGT) shall be deemed to have accepted the CLA without individual written agreements.
+
+NGT (Graph and tree-based method)
+=================================
+
+Key Features
+------------
+- Supported operating systems: Linux and macOS
+- Object additional registration and removal are available.
+- Objects beyond the memory size can be handled using [the shared memory (memory mapped file) option](README.md#shared-memory-use).
+- Supported distance functions: L1, L2, Cosine similarity, Angular, Hamming, Jaccard, Poincare, and Lorentz
+- Data Types: 4 byte floating point number and 1 byte unsigned integer
+- Supported languages: [Python](/python/README.md), [Ruby](https://github.com/ankane/ngt), [Rust](https://crates.io/crates/ngt), [Go](https://github.com/yahoojapan/gongt), C, and C++
+- Distributed servers: [ngtd](https://github.com/yahoojapan/ngtd) and [vald](https://github.com/vdaas/vald)
+
+Documents
+---------
+
+- [NGT tutorial](https://github.com/yahoojapan/NGT/wiki)
+
Utilities
---------
-- Command : [ngt](/bin/ngt/README.md#command), [ngtq](bin/ngtq/README.md), [ngtqg](bin/ngtqg/README.md)
+- Command : [ngt](/bin/ngt/README.md#command), [qbg](bin/qbg/README.md)
- Server : [ngtd](https://github.com/yahoojapan/ngtd), [vald](https://github.com/vdaas/vald)
Supported Programming Languages
@@ -108,9 +165,50 @@ Supported Programming Languages
- C
- C++([sample code](samples))
+QG (Quantized graph-based method)
+=================================
+
+Key Features
+------------
+- Higher performance than the graph and tree-based method
+- Supported operating systems: Linux and macOS
+- Supported distance functions: L2 and Cosine similarity
+
+Utilities
+---------
+- Command : [qbg](bin/qbg/README.md)
+
+Supported Programming Languages
+-------------------------------
+
+- C++
+- C
+- Python (search only)
+
+
+QBG (Quantized blob graph-based method)
+=======================================
+
+Key Features
+------------
+- [QBG](bin/qbg/README.md) can handle billions of objects.
+- Supported operating systems: Linux and macOS
+- Supported distance functions: L2
+
+Utilities
+---------
+- Command : [qbg](bin/qbg/README.md)
+
+Supported Programming Languages
+-------------------------------
+
+- C++
+- C
+- Python (search only)
+
Benchmark Results
-----------------
-The followings are the results of [ann benchmarks](https://github.com/erikbern/ann-benchmarks) for NGT v1.13.5 where the timeout is 5 hours on an AWS c5.4xlarge instance.
+The following are the results of [ann benchmarks](https://github.com/erikbern/ann-benchmarks) for NGT v2.0.0 where the timeout is 5 hours on an AWS c5.4xlarge instance.
#### glove-100-angular
@@ -127,25 +225,6 @@ The followings are the results of [ann benchmarks](https://github.com/erikbern/a
#### sift-128-euclidean
-License
--------
-
-Copyright (C) 2015 Yahoo Japan Corporation
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this software except in compliance with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and limitations under the License.
-
-Contributor License Agreement
------------------------------
-
-This project requires contributors to accept the terms in the [Contributor License Agreement (CLA)](https://gist.github.com/yahoojapanoss/9bf8afd6ea67f32d29b4082abf220340).
-
-Please note that contributors to the NGT repository on GitHub (https://github.com/yahoojapan/NGT) shall be deemed to have accepted the CLA without individual written agreements.
-
Contact Person
--------------
[masajiro](https://github.com/masajiro)
diff --git a/VERSION b/VERSION
index 9be7846..227cea2 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-1.14.8
+2.0.0
diff --git a/bin/CMakeLists.txt b/bin/CMakeLists.txt
index d56d78a..222d38c 100644
--- a/bin/CMakeLists.txt
+++ b/bin/CMakeLists.txt
@@ -1,7 +1,8 @@
-if( ${UNIX} )
+if(${UNIX})
include_directories("${PROJECT_SOURCE_DIR}/lib" "${PROJECT_BINARY_DIR}/lib/")
link_directories("${PROJECT_BINARY_DIR}/lib/NGT")
add_subdirectory("${PROJECT_SOURCE_DIR}/bin/ngt")
- add_subdirectory("${PROJECT_SOURCE_DIR}/bin/ngtq")
- add_subdirectory("${PROJECT_SOURCE_DIR}/bin/ngtqg")
+ if(NOT ${NGT_SHARED_MEMORY_ALLOCATOR})
+ add_subdirectory("${PROJECT_SOURCE_DIR}/bin/qbg")
+ endif()
endif()
diff --git a/bin/ngt/ngt.cpp b/bin/ngt/ngt.cpp
index b50ed5e..1403c8b 100644
--- a/bin/ngt/ngt.cpp
+++ b/bin/ngt/ngt.cpp
@@ -102,6 +102,8 @@ main(int argc, char **argv)
ngt.optimizeNumberOfEdgesForANNG(args);
} else if (command == "export-graph") {
ngt.exportGraph(args);
+ } else if (command == "export-objects") {
+ ngt.exportObjects(args);
#ifndef NGT_SHARED_MEMORY_ALLOCATOR
} else if (command == "extract-query") {
NGT::Optimizer::extractQueries(args);
diff --git a/bin/ngtq/CMakeLists.txt b/bin/ngtq/CMakeLists.txt
deleted file mode 100644
index 0e2ce3e..0000000
--- a/bin/ngtq/CMakeLists.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-if( ${UNIX} )
- include_directories("${PROJECT_BINARY_DIR}/lib")
- include_directories("${PROJECT_SOURCE_DIR}/lib")
- link_directories("${PROJECT_SOURCE_DIR}/lib/NGT")
-
- add_executable(ngtq_exe ngtq.cpp)
- add_dependencies(ngtq_exe ngt)
- set_target_properties(ngtq_exe PROPERTIES OUTPUT_NAME ngtq)
- target_link_libraries(ngtq_exe ngt pthread)
-
- install(TARGETS ngtq_exe RUNTIME DESTINATION bin)
-
-endif()
diff --git a/bin/ngtq/README-jp.md b/bin/ngtq/README-jp.md
deleted file mode 100644
index f1f6c92..0000000
--- a/bin/ngtq/README-jp.md
+++ /dev/null
@@ -1,122 +0,0 @@
-NGTQ
-===
-
-Neighborhood Graph and Tree for Indexing High-dimensional Data with Quantization
-
-Command
-=======
-
-
-
-**ngtq** - 大規模高次元ベクトルデータの近傍検索
-
-
-
- $ ngtq command [option] index [data]
-
-**注:**
-CygWin といった POSIXLY_CORERECT が設定されている環境では、コマンドの前にオプションを指定しなければなりません。
-
- $ ngtq [option] command index [data]
-
-
-
-十億件以上もの高次元ベクトルデータ(数十~数千次元)に対して高速な近傍検索を提供します。
-
-**command** is one of:
-
-- *[create](#create)*
-- *[append](#append)*
-- *[search](#search)*
-
-### CREATE
-
-指定されたインデックスを生成した上で指定されたデータをインデックスに登録します。
-
- $ ngtq create -d no_of_dimensions [-p no_of_threads] [-o object_type] [-n no_of_registration_data]
- [-C global_codebook_size] [-c local_codebook_size] [-N no_of_divisions]
- [-L local_centroid_creation_mode]
- index registration_data
-
-*index*
-生成するインデックス名を指定します。データを登録後、本インデックス名のディレクトリが生成されてその中に複数のファイルからなるインデックスが生成されます。
-
-*registration\_data*
-登録するベクトルデータを指定します。1行が1オブジェクト(データ)で構成され、各次元要素のデータはスペースまたはタブで区切られていなければなりません。距離関数はL2のみが利用可能です。
-
-**-d** *no\_of\_dimensions*
-登録データの次元数を指定します。
-
-**-p** *no\_of\_threads* (default = 24, recomended value = number of cores)
-生成時の並列処理時に利用するスレッド数を指定します。
-
-**-o** *object\_type*
-データオブジェクトの型を指定する。
-- __c__: 1バイト整数 (一部未実装)
-- __f__: 4バイト浮動小数点(デフォルト、推奨)
-
-**-n** *no\_of\_registration\_data*
-Specifies the number of data items to be registered. If not specified, all data in the specified file will be registered.
-
-**-C** *global\_codebook\_size*
-グローバルコード(セントロイド)の数を指定します。
-
-**-c** *local\_codebook\_size*
-ローカルコード(セントロイド)の数を指定します。
-
-**-N** *no\_of\_divisions*
-ローカルベクトルデータ(残差データ)を生成するためのベクトルデータの分割数を指定します。
-
-**-L** *local\_centroid\_creation\_mode*
-ローカルセントロイドを生成するモードを指定します。
-- __d__: 指定された登録データの先頭をローカルセントロイドとして使用します。
-- __k__: kmeans を使用してローカルセントロイドを生成します。
-
-
-### APPEND
-
-指定された登録データを指定されたインデックスに追加登録します。
-
- $ ngtq append [-n no_of_registration_data] index registration_data
-
-
-*index*
-既存のインデックスを指定します。
-
-*registration\_data*
-登録するベクトルデータを指定します。1行が1オブジェクト(データ)で、各次元のデータはスペース又はタブで区切られていなければなりません。
-
-**-n** *no\_of\_registration\_data*
-登録するデータ数を指定します。指定しない場合には指定されたファイル中のすべてのデータを登録します。
-
-### SEARCH
-
-指定されたクエリデータを用いてインデックスを検索します。
-
- $ ngtq search [-n no_of_search_results] [-e search_range_coefficient] [-m mode]
- [-r search_radius] [-E approximate-expansion] index query_data
-
-
-*index*
-既存のインデックス名を指定します。
-
-*query\_data*
-クエリデータのファイル名を指定します。1行が1クエリデータであり、登録データと同様に各次元のデータはスペース又はタブで区切られていなければなりません。複数クエリを与えた場合には順次検索します。
-
-**-n** *no\_of\_search\_results* (default: 20)
-検索結果数を指定します。
-
-**-e** *search\_range\_coefficient* (default = recomended value = 0.1)
-グローバルコードブックを検索する時の探索範囲の拡大係数です。大きければ精度が高くなりますが遅くなり、小さければ精度は下がりますが速くなります。0~0.3の範囲内で調整することが望ましいですが、負の値も指定可能です。
-
-**-m** *search\_mode* (__r__|__e__|__l__|__c__|__a__)
-検索モードを
-- __a__: 近似距離を用いて検索します。
-- __c__: 近似距離を用いて検索します。計算済みローカル距離がキャッシュされることで検索時間が削減されます。(推奨)
-- __l__: ローカル距離のルックアップテーブルによる近似距離を用いて検索します。
-- __e__: 正確な距離を用いて検索します。ローカルコードブックを利用しません。
-- __r__: 近似距離を用いて絞り込んだ後に正確な距離を用いて検索します。(正確な距離が必要な場合には推奨)
-
-**-E** *approximate\_expansion*
-検索結果に対する近似検索結果の割合を指定します。例えば、割合が10で検索結果数が20の場合近似検索結果数は200となります。
-
diff --git a/bin/ngtq/README.md b/bin/ngtq/README.md
deleted file mode 100644
index cc8db55..0000000
--- a/bin/ngtq/README.md
+++ /dev/null
@@ -1,124 +0,0 @@
-NGTQ
-===
-
-Neighborhood Graph and Tree for Indexing High-dimensional Data with Quantization
-
-Command
-=======
-
-
-
-**ngtq** - proximity search for billions of high dimensional data
-
-
-
- $ ngtq command [option] index [data]
-
-**Note:**
-
-When the environment variable POSIXLY_CORERECT is set on some platforms such as Cygwin, you should specifiy options
-before the command as follows.
-
- $ ngtq [option] command index [data]
-
-
-
-**ngtq** provides high-speed nearest neighbor searches against billions of data in high dimensional vector data space (several ten to several thousand dimensions).
-
-**command** is one of:
-
-- *[create](#create)*
-- *[append](#append)*
-- *[search](#search)*
-
-### CREATE
-
-Constructs the specified index with the specified data.
-
- $ ngtq create -d no_of_dimensions [-p no_of_threads] [-o object_type] [-n no_of_registration_data]
- [-C global_codebook_size] [-c local_codebook_size] [-N no_of_divisions]
- [-L local_centroid_creation_mode]
- index registration_data
-
-*index*
-Specifies the name of the directory for the index to be generated. After data registration, the directory consists of multiple files for the index.
-
-*registration\_data*
-Specifies the vector data to be registered. These data shall consist of one object (data item) per line and each dimensional element shall be delimited by a space or tab. Note that L2 is only avilable as the distance function.
-
-**-d** *no\_of\_dimensions*
-Specifies the number of dimensions of registration data.
-
-**-p** *no\_of\_threads* (default = 24, recomended value = number of cores)
-Specifies the number of threads to be used for parallel processing at generation time.
-
-**-o** *object\_type*
-Specifies the data object type.
-- __c__: 1 byte unsigned integer (not fully implemented)
-- __f__: 4 byte floating point number (default/recommended)
-
-**-n** *no\_of\_registration\_data*
-Specifies the number of data items to be registered. If not specified, all data in the specified file will be registered.
-
-**-C** *global\_codebook\_size*
-Specifies the number of the global codes (centroids).
-
-**-c** *local\_codebook\_size*
-Specifies the number of the local codes (centroids).
-
-**-N** *no\_of\_divisions*
-Specifies the number of division of the vector data for the local vector data (residual data).
-
-**-L** *local\_centroid\_creation\_mode*
-Specifies the creation mode for the local centroids.
-- __d__: The heads of the specified registration data are used as the local centroids.
-- __k__: The local centoroids are generated by using kmeans.
-
-
-### APPEND
-
-Adds the specified data to the specified index.
-
- $ ngtq append [-n no_of_registration_data] index registration_data
-
-
-*index*
-Specifies the name of the existing index.
-
-*registration\_data*
-Specifies the vector data to be registered. These data shall consist of one object (data item) per line and each dimensional element shall be delimited by a space or tab.
-
-**-n** *no\_of\_registration\_data*
-Specifies the number of data items to be registered. If not specified, all data in the specified file will be registered.
-
-### SEARCH
-
-Searches the index using the specified query data.
-
- $ ngtq search [-n no_of_search_results] [-e search_range_coefficient] [-m mode]
- [-r search_radius] [-E approximate-expansion] index query_data
-
-
-*index*
-Specifies the name of the existing index.
-
-*query\_data*
-Specifies the name of the file containing query data. This file shall consist of one item of query data per line and each dimensional element of that data item shall be delimited by a space or tab the same as registration data. Each search shall be sequentially performed when providing multiple queries.
-
-**-n** *no\_of\_search\_results* (default: 20)
-Specifies the number of search results.
-
-**-e** *search\_range\_coefficient* (default = recomended value = 0.1)
-Specifies the magnification coefficient of the search range to search for the global codebook. A larger value means greater accuracy but slower searching, while a smaller value means a drop in accuracy but faster searching. While it is desirable to adjust this value within the range of 0 - 0.3, a negative value may also be specified.
-
-**-m** *search\_mode* (__r__|__e__|__l__|__c__|__a__)
-Specifies the search mode.
-- __a__: searches using approximate distances.
-- __c__: searches using approximate distances. Caching computed local distances reduces the query time. (recommended)
-- __l__: searches using approximate distances with local distance lookup tables.
-- __e__: searches using exact distances. The local codebooks are not used.
-- __r__: searches using exact distances after screening by approximate distances. (recommended if you need exact distances)
-
-**-E** *approximate\_expansion*
-Specifies the expansion ratio of the number of approximate search results to the number of search results. For example, when the ratio is 10 and the number of search results is 20, the number of the approximate search results is set to 200.
-
diff --git a/bin/ngtq/ngtq.cpp b/bin/ngtq/ngtq.cpp
deleted file mode 100644
index 27ce4a0..0000000
--- a/bin/ngtq/ngtq.cpp
+++ /dev/null
@@ -1,30 +0,0 @@
-//
-// Copyright (C) 2016 Yahoo Japan Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-#include "NGT/NGTQ/NGTQCommand.h"
-
-int
-main(int argc, char **argv)
-{
- NGT::Args args(argc, argv);
-
- NGTQ::Command ngtq;
-
- ngtq.execute(args);
-}
-
-
-
diff --git a/bin/ngtqg/CMakeLists.txt b/bin/ngtqg/CMakeLists.txt
deleted file mode 100644
index 6c61cfa..0000000
--- a/bin/ngtqg/CMakeLists.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-if( ${UNIX} )
- include_directories("${PROJECT_BINARY_DIR}/lib")
- include_directories("${PROJECT_SOURCE_DIR}/lib")
- link_directories("${PROJECT_SOURCE_DIR}/lib/NGT")
-##-# link_directories("${PROJECT_SOURCE_DIR}/lib/NGTQG")
-
- add_executable(ngtqg_exe ngtqg.cpp)
- add_dependencies(ngtqg_exe ngt)
- set_target_properties(ngtqg_exe PROPERTIES OUTPUT_NAME ngtqg) ## 名前をngtqgに
-##-# target_link_libraries(ngtqg_exe ngtqg ngt pthread)
- target_link_libraries(ngtqg_exe ngt pthread)
-
- install(TARGETS ngtqg_exe RUNTIME DESTINATION bin)
-
-endif()
diff --git a/bin/ngtqg/README.md b/bin/ngtqg/README.md
deleted file mode 100644
index 163c47d..0000000
--- a/bin/ngtqg/README.md
+++ /dev/null
@@ -1,138 +0,0 @@
-NGTQG
-===
-
-Neighborhood Graph and Tree for Indexing High-dimensional Data with Quantized Graph
-
-Command
-=======
-
-## Name
-
-**ngtqg** - proximity search for high dimensional data with quantized graph
-
-## Synopsis
-
- $ ngtqg command [option] index [data]
-
-**Note:**
-
-When the environment variable POSIXLY_CORERECT is set on some platforms such as Cygwin, you should specifiy options
-before the command as follows.
-
- $ ngtqg [option] command index [data]
-
-## Description
-
-**ngtqg** provides high-speed nearest neighbor searches against high dimensional data.
-
-**command** is one of:
-
-- *[quantize](#quantize)*
-- *[search](#search)*
-
-### QUANTIZE
-
-Quantize the objects of the specified index and build a quantized graph into the index.
-
- $ ngtqg quantize [-E max_no_of_edges] [-Q dimension_of_subvector] index
-
-*index*
-Specify the name of the directory for the existing index such as ANNG or ONNG to be quantized. The index only with L2 distance and normalized cosine similarity distance can be quantized. You should build the ANNG or ONNG with normalized cosine similarity in order to use cosine similarity for the quantized graph.
-
-**-E** *max_no_of_edges*
-Specify the maximum number of edges to build a qunatized graph. Since every 16 objects that are associated with edges of each node are processed, the number should be a multiple of 16.
-
-**-Q** *dimension_of_subvector*
-Specify dimension of a suvbector for quantized objects. The dimension should be a divisor of the dimension of the inserted objects.
-
-### SEARCH
-
-Search the index using the specified query data.
-
- $ ngtqg search [-n no_of_search_objects] [-e search_range_coefficient] [-p result_expansion]
- [-r search_radius] index query_data
-
-
-*index*
-Specify the path of the existing quantized index.
-
-*query_data*
-Specify the path of the file containing query data. This file shall consist of one item of query data per line and each dimensional element of that data item shall be delimited by a space or tab. Each search shall be sequentially performed when providing multiple queries.
-
-**-n** *no_of_search_objects* (default: 20)
-Specify the number of search objects.
-
-**-e** *search_range_coefficient* (default = 0.02)
-Specify the magnification coefficient (epsilon) of the search range. A larger value means higher accuracy but slower searching, while a smaller value means a drop in accuracy but faster searching. While it is desirable to adjust this value within the range of 0 - 0.1, a negative value (> -1.0) may also be specified.
-
-**-p** *result_expansion* (default = 3.0)
-Specify the expansion ratio of the number of approximate inner search objects to the number of search objects. For example, when the ratio is 10 and the number of search objects is 20, the number of the approximate search objects is set to 200 inside the search processing. A larger value brings higher accuracy but slower searching.
-
-**-r** *search_radius* (default = infinite circle)
-Specify the search range in terms of the radius of a circle.
-
-Examples of using the quantized graph
--------------------------------------
-
-### Setup data
-
- $ curl -L -O https://github.com/yahoojapan/NGT/raw/master/tests/datasets/ann-benchmarks/sift-128-euclidean.tsv
- $ curl -L -O https://github.com/yahoojapan/NGT/raw/master/tests/datasets/ann-benchmarks/sift-128-euclidean_query.tsv
- $ head -1 sift-128-euclidean_query.tsv > query.tsv
-
-### Build the quantized graph
-
-Build an ANNG for 128-dimensional, floating point data:
-
- $ ngt create -d 128 -o f -D 2 anng sift-128-euclidean.tsv
- Data loading time=15.4804 (sec) 15480.4 (msec)
- # of objects=1000000
- Processed 100000 objects. time= 4.26452 (sec)
- ...
- Processed 1000000 objects. time= 7.06745 (sec)
- Index creation time=63.3504 (sec) 63350.4 (msec)
-
-Quantize the objects in the ANNG and build the quantized graph:
-
- $ ngtqg quantize anng
- Clustering of the subvector is complete. anng/qg/local-0:17
- ...
- Clustering of the subvector is complete. anng/qg/local-127:17
- Processed 100000 objects.
- ...
- Processed 1000000 objects.
-
-### Search with the quantized graph
-
-Search k nearest neighbors with the quantized graph:
-
- $ ngtqg search -n 20 -e 0.02 anng query.tsv
- Query No.1
- Rank ID Distance
- 1 932086 232.871
- 2 934877 234.715
- 3 561814 243.99
- ...
- 20 2177 276.781
- Query Time= 0.0005034 (sec), 0.5034 (msec)
- Average Query Time= 0.0005034 (sec), 0.5034 (msec), (0.0005034/1)
-
-Examples of building the quantized graph for higher performance
-------------------------------------------------------------
-
-Build an ANNG having more edges for higher performance:
-
- $ ngt create -d 128 -o f -D 2 -E 40 anng-40 sift-128-euclidean.tsv
-
-Build an ONNG:
-
- $ ngt reconstruct-graph -m S -E 64 -o 64 -i 120 anng-40 onng-40
-
-Quantize the objects and build a quantized graph from the ONNG:
-
- $ ngtqg quantize onng-40
-
-Search k nearest neighbors with the quantized graph:
-
- $ ngtqg search -n 20 -e 0.02 onng-40 query.tsv
-
diff --git a/bin/qbg/CMakeLists.txt b/bin/qbg/CMakeLists.txt
new file mode 100644
index 0000000..fe8b7d2
--- /dev/null
+++ b/bin/qbg/CMakeLists.txt
@@ -0,0 +1,13 @@
+if( ${UNIX} )
+ include_directories("${PROJECT_BINARY_DIR}/lib")
+ include_directories("${PROJECT_SOURCE_DIR}/lib")
+ link_directories("${PROJECT_SOURCE_DIR}/lib/NGT")
+
+ add_executable(qbg_exe qbg.cpp)
+ add_dependencies(qbg_exe ngt)
+  set_target_properties(qbg_exe PROPERTIES OUTPUT_NAME qbg) ## set the output name to qbg
+ target_link_libraries(qbg_exe ngt pthread)
+
+ install(TARGETS qbg_exe RUNTIME DESTINATION bin)
+
+endif()
diff --git a/bin/qbg/README.md b/bin/qbg/README.md
new file mode 100644
index 0000000..99a102f
--- /dev/null
+++ b/bin/qbg/README.md
@@ -0,0 +1,287 @@
+QBG
+===
+
+Command-line interface for NGT with Quantization for indexing high-dimensional data
+
+Command
+=======
+
+
+
+**qbg** - proximity search for high dimensional data with quantization
+
+
+
+ $ qbg command [option] index [data]
+
+**Note:**
+
+When the environment variable POSIXLY_CORRECT is set on some platforms such as Cygwin, you should specify options
+before the command as follows.
+
+ $ qbg [option] command index [data]
+
+
+
+**qbg** handles two types of graphs with quantization: Quantized Graph (QG) and Quantized Blob Graph (QBG).
+
+**command** for the quantized graph is one of:
+
+- *[create-qg](#create-qg)*
+- *[build-qg](#build-qg)*
+- *[search-qg](#search-qg)*
+
+**command** for the quantized blob graph is one of:
+
+- *[create](#create)*
+- *[append](#append)*
+- *[build](#build)*
+- *[search](#search)*
+
+### CREATE-QG
+Make and initialize a QG directory for the quantized graph in the specified NGT index directory, and insert the data in the NGT index into the QG index.
+
+ $ qbg create-qg [-P number_of_extended_dimensions] [-Q number_of_subvector_dimensions] index
+
+*index*
+Specify the name of the directory of the existing index, such as an ANNG or ONNG, to be quantized. Only an index built with the L2 or normalized cosine similarity distance can be quantized. You should build the ANNG or ONNG with normalized cosine similarity in order to use cosine similarity with the quantized graph.
+
+**-P** *number_of_extended_dimensions*
+Specify the number of the extended dimensions. The number should be greater than or equal to the number of the genuine dimensions, and also should be a multiple of 4. When this option is not specified, the smallest multiple of 4 that is greater than the dimension is set to the number of the extended dimensions.
+
+**-Q** *number_of_subvector_dimension*
+Specify the number of the subvector dimensions. The number should be less than or equal to the number of the extended dimensions, and also should be a divisor of the number of the extended dimensions. When this option is not specified, the number of the subvector dimensions is set to one.
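+
+As an illustrative sketch (the option values are assumptions, not recommendations), a 128-dimensional ANNG named `anng`, as in the examples below, could be quantized while keeping the genuine 128 dimensions (already a multiple of 4) and using two-dimensional subvectors:
+
+    $ qbg create-qg -P 128 -Q 2 anng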
+
+
+### BUILD-QG
+
+Quantize the objects of the specified index and build a quantized graph into the index.
+
+ $ qbg build-qg [-o number_of_objects_for_quantization] [-E max_number_of_edges] [-M number_of_trials] index
+
+*index*
+Specify the name of the directory of the existing index, such as an ANNG or ONNG, to be quantized. Only an index built with the L2 or normalized cosine similarity distance can be quantized. You should build the ANNG or ONNG with normalized cosine similarity in order to use cosine similarity with the quantized graph.
+
+**-o** *number_of_objects_for_quantization*
+Specify the number of objects for quantization and optimization. The number should be less than or equal to the number of the registered objects.
+
+**-E** *max_number_of_edges*
+Specify the maximum number of edges to build a quantized graph. Since every 16 objects that are associated with edges of each node are processed, the number should be a multiple of 16.
+
+**-M** *number_of_trials*
+Specify the number of trials to optimize the subvector quantization.
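+
+As an illustrative sketch (the values are assumptions, not tuned settings), the quantization could be restricted to the first 100,000 registered objects with 96 edges per node, a multiple of 16:
+
+    $ qbg build-qg -o 100000 -E 96 anng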
+
+### SEARCH-QG
+
+Search the index using the specified query data.
+
+ $ qbg search-qg [-n number_of_search_objects] [-e search_range_coefficient] [-p result_expansion]
+ [-r search_radius] index query_data
+
+
+*index*
+Specify the path of the existing quantized index.
+
+*query_data*
+Specify the path of the file containing query data. This file shall consist of one item of query data per line and each dimensional element of that data item shall be delimited by a space or tab. Each search shall be sequentially performed when providing multiple queries.
+
+**-n** *number_of_search_objects* (default: 20)
+Specify the number of search objects.
+
+**-e** *search_range_coefficient* (default = 0.02)
+Specify the magnification coefficient (epsilon) of the search range. A larger value means higher accuracy but slower searching, while a smaller value means a drop in accuracy but faster searching. While it is desirable to adjust this value within the range of 0 - 0.1, a negative value (> -1.0) may also be specified.
+
+**-p** *result_expansion* (default = 3.0)
+Specify the expansion ratio of the number of approximate inner search objects to the number of search objects. For example, when the ratio is 10 and the number of search objects is 20, the number of the approximate search objects is set to 200 inside the search processing. A larger value brings higher accuracy but slower searching.
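+
+For instance, assuming the quantized index `anng` from the examples below, 10 neighbors per query could be requested with a slightly wider epsilon and a larger inner expansion than the defaults (illustrative values only):
+
+    $ qbg search-qg -n 10 -e 0.04 -p 5.0 anng query.tsv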
+
+### CREATE
+Make and initialize a QBG directory for the quantized blob graph.
+
+    $ qbg create [-d number_of_dimensions] [-P number_of_extended_dimensions] [-O object_type] [-D distance_function] [-C number_of_blobs] [-N number_of_subvectors] index
+
+*index*
+Specify the name of the directory for QBG.
+
+**-d** *number_of_dimensions*
+Specify the number of dimensions of registration data.
+
+**-P** *number_of_extended_dimensions*
+Specify the number of the extended dimensions. The number should be greater than or equal to the number of the genuine dimensions, and also should be a multiple of 4. When this option is not specified, the smallest multiple of 4 that is greater than the dimension is set to the number of the extended dimensions.
+
+**-O** *object_type*
+Specify the data object type.
+- __c__: 1 byte unsigned integer
+- __f__: 4 byte floating point number (default)
+
+**-D** *distance_function*
+Specify the distance function as follows.
+- __2__: L2 distance (default)
+- __c__: Cosine similarity
+
+**-C** *number_of_blobs*
+Specify the number of blobs that should be less than or equal to the number of quantization clusters.
+
+**-N** *number_of_subvectors*
+Specify the number of subvectors that should be a divisor of the number of the extended dimensions.
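+
+A minimal sketch for 128-dimensional floating point data with the L2 distance, using the index name `qbg-index` from the examples below and an assumed 64 subvectors (a divisor of the extended dimensions):
+
+    $ qbg create -d 128 -O f -D 2 -N 64 qbg-index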
+
+### APPEND
+
+Append the specified data to the specified index.
+
+ $ qbg append index registration_data
+
+### BUILD
+
+Quantize the objects of the specified index and build a quantized blob graph into the index.
+
+ $ qbg build [-o number_of_objects_for_quantization] [-E max_number_of_edges] [-M number_of_trials] [-P rotation] index
+
+*index*
+Specify the name of the directory for the existing QBG index created by the create command.
+
+**-o** *number_of_objects_for_quantization*
+Specify the number of objects for quantization and optimization. The number should be less than or equal to the number of the registered objects.
+
+**-P** *rotation*
+Specify the transform matrix type for the inserted and query objects to optimize the subvector quantization.
+- __r__: Rotation matrix.
+- __R__: Rotation and repositioning matrix.
+- __p__: Repositioning matrix.
+- __n__: No matrix.
+
+**-M** *number_of_trials*
+Specify the number of trials to optimize the subvector quantization.
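+
+As a sketch (the option values are assumptions for illustration), the quantization could be optimized over 100,000 objects using a rotation matrix:
+
+    $ qbg build -o 100000 -P r qbg-index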
+
+### SEARCH
+
+Search the index using the specified query data.
+
+ $ qbg search [-n number_of_search_objects] [-e search_range_coefficient] [-p result_expansion]
+ index query_data
+
+
+*index*
+Specify the path of the existing quantized index.
+
+*query_data*
+Specify the path of the file containing query data. This file shall consist of one item of query data per line and each dimensional element of that data item shall be delimited by a space or tab. Each search shall be sequentially performed when providing multiple queries.
+
+**-n** *number_of_search_objects* (default: 20)
+Specify the number of search objects.
+
+**-e** *search_range_coefficient* (default = 0.02)
+Specify the magnification coefficient (epsilon) of the search range. A larger value means higher accuracy but slower searching, while a smaller value means a drop in accuracy but faster searching. While it is desirable to adjust this value within the range of 0 - 0.1, a negative value (> -1.0) may also be specified.
+
+**-B** *blob_search_range_coefficient* (default = 0.0)
+Specify the magnification coefficient (epsilon) of the search range for the quantized blob graph.
+
+**-N** *number_of_explored_nodes* (default = 256)
+Specify the number of the explored nodes in the graph. When the number of the explored nodes reaches the specified number, the search is terminated.
+
+**-p** *result_expansion* (default = 0.0)
+Specify the expansion ratio of the number of approximate inner search objects to the number of search objects. For example, when the ratio is 10 and the number of search objects is 20, the number of the approximate search objects is set to 200 inside the search processing. A larger value brings higher accuracy but slower searching.
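+
+For instance, assuming the `qbg-index` built in the examples below, 10 results per query could be requested while exploring up to 512 graph nodes (illustrative values only):
+
+    $ qbg search -n 10 -N 512 qbg-index query.tsv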
+
+
+Examples of using the quantized graph
+-------------------------------------
+
+### Setup data
+
+ $ curl -L -O https://github.com/yahoojapan/NGT/raw/master/tests/datasets/ann-benchmarks/sift-128-euclidean.tsv
+ $ curl -L -O https://github.com/yahoojapan/NGT/raw/master/tests/datasets/ann-benchmarks/sift-128-euclidean_query.tsv
+ $ head -1 sift-128-euclidean_query.tsv > query.tsv
+
+### Build the quantized graph
+
+Build an ANNG for 128-dimensional, floating point data:
+
+ $ ngt create -d 128 -o f -D 2 anng sift-128-euclidean.tsv
+ Data loading time=15.4804 (sec) 15480.4 (msec)
+ # of objects=1000000
+ Processed 100000 objects. time= 4.26452 (sec)
+ ...
+ Processed 1000000 objects. time= 7.06745 (sec)
+ Index creation time=63.3504 (sec) 63350.4 (msec)
+
+Create and initialize the quantized graph:
+
+ $ qbg create-qg anng
+ creating...
+ appending...
+
+Build the quantized graph:
+
+ $ qbg build-qg anng
+ optimizing...
+ building the inverted index...
+ building the quantized graph...
+
+### Search with the quantized graph
+
+Search k nearest neighbors with the quantized graph:
+
+ $ qbg search-qg -n 20 -e 0.02 anng query.tsv
+ Query No.1
+ Rank ID Distance
+ 1 932086 232.871
+ 2 934877 234.715
+ 3 561814 243.99
+ ...
+ 20 2177 276.781
+ Query Time= 0.0005034 (sec), 0.5034 (msec)
+ Average Query Time= 0.0005034 (sec), 0.5034 (msec), (0.0005034/1)
+
+Examples of building the quantized graph for higher performance
+------------------------------------------------------------
+
+Build an ANNG having more edges for higher performance:
+
+ $ ngt create -d 128 -o f -D 2 -E 40 anng-40 sift-128-euclidean.tsv
+
+Build an ONNG:
+
+ $ ngt reconstruct-graph -m S -E 64 -o 64 -i 120 anng-40 onng-40
+
+Create and initialize the quantized graph:
+
+ $ qbg create-qg onng-40
+
+Build the quantized graph:
+
+ $ qbg build-qg onng-40
+
+Search k nearest neighbors with the quantized graph:
+
+    $ qbg search-qg -n 20 -e 0.02 onng-40 query.tsv
+
+
+Examples of using the quantized blob graph
+-------------------------------------
+
+### Setup data
+
+ $ curl -L -O https://github.com/yahoojapan/NGT/raw/master/tests/datasets/ann-benchmarks/sift-128-euclidean.tsv
+ $ curl -L -O https://github.com/yahoojapan/NGT/raw/master/tests/datasets/ann-benchmarks/sift-128-euclidean_query.tsv
+ $ head -1 sift-128-euclidean_query.tsv > query.tsv
+
+### Build the quantized blob graph
+
+Create and initialize the quantized blob graph:
+
+ $ qbg create -d 128 -D 2 -N 128 qbg-index
+
+Append objects:
+
+ $ qbg append qbg-index sift-128-euclidean.tsv
+
+Build the quantized blob graph:
+
+ $ qbg build qbg-index
+
+### Search with the quantized blob graph
+
+Search k nearest neighbors with the quantized blob graph:
+
+ $ qbg search -n 20 -e 0.02 qbg-index query.tsv
+
+
diff --git a/bin/ngtqg/ngtqg.cpp b/bin/qbg/qbg.cpp
similarity index 92%
rename from bin/ngtqg/ngtqg.cpp
rename to bin/qbg/qbg.cpp
index 0248c45..92281eb 100644
--- a/bin/ngtqg/ngtqg.cpp
+++ b/bin/qbg/qbg.cpp
@@ -14,14 +14,14 @@
// limitations under the License.
//
-#include "NGT/NGTQ/NGTQGCommand.h"
+#include "NGT/NGTQ/QbgCli.h"
int
main(int argc, char **argv)
{
NGT::Args args(argc, argv);
- NGTQG::Command ngt;
+ QBG::CLI ngt;
ngt.execute(args);
}
diff --git a/lib/NGT/ArrayFile.h b/lib/NGT/ArrayFile.h
index d48c610..5a1b68a 100644
--- a/lib/NGT/ArrayFile.h
+++ b/lib/NGT/ArrayFile.h
@@ -31,7 +31,7 @@ namespace NGT {
template <class TYPE>
class ArrayFile {
- private:
+ protected:
struct FileHeadStruct {
size_t recordSize;
uint64_t extraData; // reserve
diff --git a/lib/NGT/CMakeLists.txt b/lib/NGT/CMakeLists.txt
index db4c436..fd93bde 100644
--- a/lib/NGT/CMakeLists.txt
+++ b/lib/NGT/CMakeLists.txt
@@ -21,12 +21,13 @@ if( ${UNIX} )
add_dependencies(ngt ngtstatic)
if(${APPLE})
if(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang")
- target_link_libraries(ngt OpenMP::OpenMP_CXX)
+ target_link_libraries(ngt lapack blas OpenMP::OpenMP_CXX)
else()
- target_link_libraries(ngt gomp)
+ target_link_libraries(ngt lapack blas gomp)
endif()
else(${APPLE})
- target_link_libraries(ngt gomp rt)
+ #target_link_libraries(ngt gomp rt)
+ target_link_libraries(ngt gomp rt lapack blas)
endif(${APPLE})
add_custom_command(OUTPUT command DEPENDS ${NGT_SOURCES} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} COMMAND sh ${PROJECT_SOURCE_DIR}/utils/mk_version_defs_h.sh ${PROJECT_SOURCE_DIR} version_defs.h)
diff --git a/lib/NGT/Clustering.h b/lib/NGT/Clustering.h
index 77cfd21..8fe00d0 100644
--- a/lib/NGT/Clustering.h
+++ b/lib/NGT/Clustering.h
@@ -46,9 +46,12 @@ namespace NGT {
public:
enum InitializationMode {
- InitializationModeHead = 0,
- InitializationModeRandom = 1,
- InitializationModeKmeansPlusPlus = 2
+ InitializationModeHead = 0,
+ InitializationModeRandom = 1,
+ InitializationModeKmeansPlusPlus = 2,
+ InitializationModeRandomFixedSeed = 3,
+ InitializationModeKmeansPlusPlusFixedSeed = 4,
+ InitializationModeBest = 5
};
enum ClusteringType {
@@ -92,14 +95,16 @@ namespace NGT {
double radius;
};
- Clustering(InitializationMode im = InitializationModeHead, ClusteringType ct = ClusteringTypeKmeansWithNGT, size_t mi = 100):
- clusteringType(ct), initializationMode(im), maximumIteration(mi) { initialize(); }
+ Clustering(InitializationMode im = InitializationModeHead, ClusteringType ct = ClusteringTypeKmeansWithNGT, size_t mi = 10000, size_t nc = 0):
+ clusteringType(ct), initializationMode(im), numberOfClusters(nc), maximumIteration(mi) { initialize(); }
void initialize() {
epsilonFrom = 0.12;
epsilonTo = epsilonFrom;
epsilonStep = 0.04;
resultSizeCoefficient = 5;
+ clusterSizeConstraint = false;
+ clusterSizeConstraintCoefficient = 0.0;
}
static void
@@ -162,7 +167,23 @@ namespace NGT {
}
static void
-    saveVector(const std::string &file, std::vector<size_t> &vectors)
+    loadVector(const std::string &file, std::vector<size_t> &vectors)
+ {
+ std::ifstream is(file);
+ if (!is) {
+ throw std::runtime_error("loadVector::Cannot open " + file );
+ }
+ std::string line;
+ while (true) {
+ size_t v;
+ is >> v;
+ if (is.eof()) break;
+ vectors.push_back(v);
+ }
+ }
+
+    template <typename T> static void
+      saveVector(const std::string &file, std::vector<T> &vectors)
{
std::ofstream os(file);
for (auto vit = vectors.begin(); vit != vectors.end(); ++vit) {
@@ -249,7 +270,11 @@ namespace NGT {
static void
subtract(std::vector &a, std::vector &b) {
- assert(a.size() == b.size());
+ if (a.size() != b.size()) {
+ std::stringstream msg;
+	  msg << "Clustering::subtract: Mismatched dimensions. " << a.size() << "x" << b.size();
+ NGTThrowException(msg);
+ }
auto bit = b.begin();
for (auto ait = a.begin(); ait != a.end(); ++ait, ++bit) {
*ait = *ait - *bit;
@@ -267,34 +292,36 @@ namespace NGT {
}
static void
-    getInitialCentroidsRandomly(std::vector<std::vector<float> > &vectors, std::vector<Cluster> &clusters, size_t size, size_t seed)
+    getInitialCentroidsRandomly(std::vector<std::vector<float> > &vectors, std::vector<Cluster> &clusters, size_t size, size_t seed = 0)
{
+ size = size > vectors.size() ? vectors.size() : size;
clusters.clear();
- std::random_device rnd;
if (seed == 0) {
+ std::random_device rnd;
seed = rnd();
}
std::mt19937 mt(seed);
+ std::uniform_int_distribution<> dist(0, vectors.size() - 1);
for (size_t i = 0; i < size; i++) {
- size_t idx = mt() * vectors.size() / mt.max();
- if (idx >= size) {
- i--;
- continue;
- }
+ size_t idx = dist(mt);
clusters.push_back(Cluster(vectors[idx]));
}
assert(clusters.size() == size);
}
static void
-    getInitialCentroidsKmeansPlusPlus(std::vector<std::vector<float> > &vectors, std::vector<Cluster> &clusters, size_t size)
+    getInitialCentroidsKmeansPlusPlus(std::vector<std::vector<float> > &vectors, std::vector<Cluster> &clusters, size_t size, size_t seed = 0)
{
size = size > vectors.size() ? vectors.size() : size;
clusters.clear();
- std::random_device rnd;
- std::mt19937 mt(rnd());
- size_t idx = (long long)mt() * (long long)vectors.size() / (long long)mt.max();
+ if (seed == 0) {
+ std::random_device rnd;
+ seed = rnd();
+ }
+ std::mt19937 mt(seed);
+ std::uniform_int_distribution<> dist(0, vectors.size() - 1);
+ size_t idx = dist(mt);
clusters.push_back(Cluster(vectors[idx]));
NGT::Timer timer;
@@ -334,19 +361,26 @@ namespace NGT {
static void
-      assign(std::vector<std::vector<float> > &vectors, std::vector<Cluster> &clusters,
-	     size_t clusterSize = std::numeric_limits<size_t>::max()) {
+      assign(std::vector<std::vector<float>> &vectors, std::vector<Cluster> &clusters,
+	     size_t clusterSize = std::numeric_limits<size_t>::max(), bool clear = true) {
// compute distances to the nearest clusters, and construct heap by the distances.
NGT::Timer timer;
timer.start();
+ size_t nOfVectors = 0;
+ if (!clear) {
+ for (auto &cluster : clusters) {
+ nOfVectors += cluster.members.size();
+ }
+ }
+
std::vector<Entry> sortedObjects(vectors.size());
-#pragma omp parallel for
+#pragma omp parallel for
for (size_t vi = 0; vi < vectors.size(); vi++) {
auto vit = vectors.begin() + vi;
{
double mind = DBL_MAX;
- size_t mincidx = -1;
+ int mincidx = -1;
for (auto cit = clusters.begin(); cit != clusters.end(); ++cit) {
double d = distanceL2(*vit, (*cit).centroid);
if (d < mind) {
@@ -354,22 +388,27 @@ namespace NGT {
mincidx = distance(clusters.begin(), cit);
}
}
- sortedObjects[vi] = Entry(vi, mincidx, mind);
+ if (mincidx == -1) {
+ std::cerr << "Clustering: Fatal error " << clusters.size() << std::endl;
+ std::cerr << vi << "/" << vectors.size() << std::endl;
+ abort();
+ }
+ sortedObjects[vi] = Entry(vi + nOfVectors, mincidx, mind);
}
}
std::sort(sortedObjects.begin(), sortedObjects.end());
-
+
// clear
- for (auto cit = clusters.begin(); cit != clusters.end(); ++cit) {
- (*cit).members.clear();
+ if (clear) {
+ for (auto cit = clusters.begin(); cit != clusters.end(); ++cit) {
+ (*cit).members.clear();
+ }
}
-
-
// distribute objects to the nearest clusters in the same size constraint.
for (auto soi = sortedObjects.rbegin(); soi != sortedObjects.rend();) {
Entry &entry = *soi;
if (entry.centroidID >= clusters.size()) {
- std::cerr << "Something wrong. " << entry.centroidID << ":" << clusters.size() << std::endl;
+ std::cerr << "Something wrong. (2) " << entry.centroidID << ":" << clusters.size() << std::endl;
soi++;
continue;
}
@@ -377,6 +416,7 @@ namespace NGT {
clusters[entry.centroidID].members.push_back(entry);
soi++;
} else {
+#if 0
double mind = DBL_MAX;
size_t mincidx = -1;
for (auto cit = clusters.begin(); cit != clusters.end(); ++cit) {
@@ -389,6 +429,25 @@ namespace NGT {
mincidx = distance(clusters.begin(), cit);
}
}
+#else
+	  std::vector<float> ds(clusters.size());
+#pragma omp parallel for
+ for (size_t idx = 0; idx < clusters.size(); idx++) {
+ if (clusters[idx].members.size() >= clusterSize) {
+	    ds[idx] = std::numeric_limits<float>::max();
+ continue;
+ }
+ ds[idx] = distanceL2(vectors[entry.vectorID], clusters[idx].centroid);
+ }
+	  float mind = std::numeric_limits<float>::max();
+ size_t mincidx = -1;
+ for (size_t idx = 0; idx < clusters.size(); idx++) {
+ if (ds[idx] < mind) {
+ mind = ds[idx];
+ mincidx = idx;
+ }
+ }
+#endif
entry = Entry(entry.vectorID, mincidx, mind);
int pt = distance(sortedObjects.rbegin(), soi);
std::sort(sortedObjects.begin(), soi.base());
@@ -493,7 +552,6 @@ namespace NGT {
assignedObjectCount++;
}
}
-
//size_t notAssignedObjectCount = 0;
vector notAssignedObjectIDs;
notAssignedObjectIDs.reserve(dataSize - assignedObjectCount);
@@ -503,7 +561,6 @@ namespace NGT {
}
}
-
if (clusterSize < std::numeric_limits<size_t>::max()) {
do {
vector> notAssignedObjects(notAssignedObjectIDs.size());
@@ -572,7 +629,6 @@ namespace NGT {
moveFartherObjectsToEmptyClusters(clusters);
}
-
}
@@ -621,25 +677,58 @@ namespace NGT {
}
-
-    double kmeansWithoutNGT(std::vector<std::vector<float> > &vectors, size_t numberOfClusters,
-			    std::vector<Cluster> &clusters)
+    double kmeansWithoutNGT(std::vector<std::vector<float> > &vectors, std::vector<Cluster> &clusters,
+			    size_t clusterSize)
{
-      size_t clusterSize = std::numeric_limits<size_t>::max();
- if (clusterSizeConstraint) {
- clusterSize = ceil((double)vectors.size() / (double)numberOfClusters);
- }
-
- double diff = 0;
- for (size_t i = 0; i < maximumIteration; i++) {
+ NGT::Timer timer;
+ timer.start();
+      double diff = std::numeric_limits<double>::max();
+ size_t stabilityLimit = 10;
+ size_t stabilityCount = 0;
+ size_t i;
+ for (i = 0; i < maximumIteration; i++) {
assign(vectors, clusters, clusterSize);
// centroid is recomputed.
// diff is distance between the current centroids and the previous centroids.
- diff = calculateCentroid(vectors, clusters);
+ auto d = calculateCentroid(vectors, clusters);
+ if (d == diff) {
+ stabilityCount++;
+ if (stabilityCount >= stabilityLimit) {
+ break;
+ }
+ }
+ if (d < diff) {
+ diff = d;
+ }
if (diff == 0) {
break;
}
}
+ return diff;
+ }
+    double kmeansWithoutNGT(std::vector<std::vector<float> > &vectors, size_t numberOfClusters,
+			    std::vector<Cluster> &clusters)
+ {
+      size_t clusterSize = std::numeric_limits<size_t>::max();
+
+ double diff = kmeansWithoutNGT(vectors, clusters, clusterSize);
+
+ if (clusterSizeConstraint || clusterSizeConstraintCoefficient != 0.0) {
+ if (clusterSizeConstraintCoefficient >= 1.0) {
+ clusterSize = ceil((double)vectors.size() / (double)numberOfClusters) * clusterSizeConstraintCoefficient;
+ } else if (clusterSizeConstraintCoefficient != 0.0) {
+ std::stringstream msg;
+ msg << "kmeansWithoutNGT: clusterSizeConstraintCoefficient is invalid. " << clusterSizeConstraintCoefficient << " ";
+ throw std::runtime_error(msg.str());
+ } else {
+ clusterSize = ceil((double)vectors.size() / (double)numberOfClusters);
+ }
+ } else {
+ return diff == 0;
+ }
+
+ diff = kmeansWithoutNGT(vectors, clusters, clusterSize);
+
return diff == 0;
}
@@ -662,14 +751,18 @@ namespace NGT {
double diff = 0.0;
size_t resultSize;
resultSize = resultSizeCoefficient * vectors.size() / clusters.size();
- for (size_t i = 0; i < maximumIteration; i++) {
+ size_t i;
+ for (i = 0; i < maximumIteration; i++) {
assignWithNGT(index, vectors, clusters, resultSize, epsilon, clusterSize);
// centroid is recomputed.
// diff is distance between the current centroids and the previous centroids.
+ double prevDiff = diff;
std::vector prevClusters = clusters;
diff = calculateCentroid(vectors, clusters);
- timer.stop();
- timer.start();
+ if (prevDiff == diff) {
+ std::cerr << "epsilon=" << epsilon << "->" << epsilon * 1.1 << std::endl;
+ epsilon *= 1.1;
+ }
diffHistory.push_back(diff);
if (diff == 0) {
@@ -783,7 +876,7 @@ namespace NGT {
{
double mse = 0.0;
size_t count = 0;
- for (auto cit = clusters.begin(); cit != clusters.end(); ++cit) {
+ for (auto cit = clusters.begin(); cit != clusters.end(); ++cit) {
count += (*cit).members.size();
for (auto mit = (*cit).members.begin(); mit != (*cit).members.end(); ++mit) {
mse += meanSumOfSquares((*cit).centroid, vectors[(*mit).vectorID]);
@@ -843,7 +936,12 @@ namespace NGT {
}
case InitializationModeRandom:
{
- getInitialCentroidsRandomly(vectors, clusters, numberOfClusters, 0);
+ getInitialCentroidsRandomly(vectors, clusters, numberOfClusters);
+ break;
+ }
+ case InitializationModeRandomFixedSeed:
+ {
+ getInitialCentroidsRandomly(vectors, clusters, numberOfClusters, 1);
break;
}
case InitializationModeKmeansPlusPlus:
@@ -851,6 +949,11 @@ namespace NGT {
getInitialCentroidsKmeansPlusPlus(vectors, clusters, numberOfClusters);
break;
}
+ case InitializationModeKmeansPlusPlusFixedSeed:
+ {
+ getInitialCentroidsKmeansPlusPlus(vectors, clusters, numberOfClusters, 1);
+ break;
+ }
default:
std::cerr << "proper initMode is not specified." << std::endl;
exit(1);
@@ -858,12 +961,26 @@ namespace NGT {
}
}
+ bool
+      kmeans(std::vector<std::vector<float> > &vectors, std::vector<Cluster> &clusters) {
+ return kmeans(vectors, numberOfClusters, clusters);
+ }
+
bool
kmeans(std::vector<std::vector<float> > &vectors, size_t numberOfClusters, std::vector<Cluster> &clusters)
{
+ if (vectors.size() == 0) {
+ std::stringstream msg;
+ msg << "Clustering::kmeans: No vector.";
+ NGTThrowException(msg);
+ }
+ if (vectors[0].size() == 0) {
+ std::stringstream msg;
+ msg << "Clustering::kmeans: No dimension.";
+ NGTThrowException(msg);
+ }
setupInitialClusters(vectors, numberOfClusters, clusters);
-
switch (clusteringType) {
case ClusteringTypeKmeansWithoutNGT:
return kmeansWithoutNGT(vectors, numberOfClusters, clusters);
@@ -912,10 +1029,13 @@ namespace NGT {
}
}
+ void setClusterSizeConstraintCoefficient(float v) { clusterSizeConstraintCoefficient = v; }
+
ClusteringType clusteringType;
InitializationMode initializationMode;
size_t numberOfClusters;
bool clusterSizeConstraint;
+ float clusterSizeConstraintCoefficient;
size_t maximumIteration;
float epsilonFrom;
float epsilonTo;
diff --git a/lib/NGT/Command.cpp b/lib/NGT/Command.cpp
index 495c806..1e8442d 100644
--- a/lib/NGT/Command.cpp
+++ b/lib/NGT/Command.cpp
@@ -1151,4 +1151,36 @@ using namespace std;
#endif
}
+ void NGT::Command::exportObjects(Args &args) {
+#if defined(NGT_SHARED_MEMORY_ALLOCATOR)
+ std::cerr << "ngt: Error: exportObjects is not implemented." << std::endl;
+ abort();
+#else
+ std::string usage = "ngt export-objects index";
+ string indexPath;
+ try {
+ indexPath = args.get("#1");
+ } catch (...) {
+ cerr << "ngt::exportGraph: Index is not specified." << endl;
+ cerr << usage << endl;
+ return;
+ }
+
+ NGT::Index index(indexPath);
+ auto &objectSpace = index.getObjectSpace();
+ size_t size = objectSpace.getRepository().size();
+
+ for (size_t id = 1; id < size; ++id) {
+ std::vector<float> object;
+ objectSpace.getObject(id, object);
+ for (auto v = object.begin(); v != object.end(); ++v) {
+ std::cout << *v;
+ if (v + 1 != object.end()) {
+ std::cout << "\t";
+ }
+ }
+ std::cout << std::endl;
+ }
+#endif
+ }
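The loop above fixes the output format of the new `export-objects` subcommand: one object per line on standard output, with the vector components separated by tabs. As a quick illustration only (not part of NGT), a minimal self-contained C++ sketch that reads this format back into memory could look as follows; reading from standard input is an assumption made for the example.

    // read_exported_objects.cpp: parse the TSV stream written by `ngt export-objects index`.
    #include <iostream>
    #include <sstream>
    #include <string>
    #include <vector>

    int main() {
      std::vector<std::vector<float>> objects;
      std::string line;
      while (std::getline(std::cin, line)) {
        std::vector<float> object;
        std::istringstream fields(line);
        std::string field;
        while (std::getline(fields, field, '\t')) {
          object.push_back(std::stof(field));  // one vector component per tab-separated field
        }
        objects.push_back(object);
      }
      std::cerr << "read " << objects.size() << " objects" << std::endl;
      return 0;
    }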
diff --git a/lib/NGT/Command.h b/lib/NGT/Command.h
index 4377a74..316d58a 100644
--- a/lib/NGT/Command.h
+++ b/lib/NGT/Command.h
@@ -127,7 +127,8 @@ class Command {
void refineANNG(Args &args);
void repair(Args &args);
void exportGraph(Args &args);
-
+ void exportObjects(Args &args);
+
void info(Args &args);
void setDebugLevel(int level) { debugLevel = level; }
int getDebugLevel() { return debugLevel; }
diff --git a/lib/NGT/Common.h b/lib/NGT/Common.h
index 99b9d4b..db4f1c8 100644
--- a/lib/NGT/Common.h
+++ b/lib/NGT/Common.h
@@ -53,17 +53,17 @@ namespace NGT {
typedef half_float::half float16;
#endif
-#define NGTThrowException(MESSAGE) throw NGT::Exception(__FILE__, (size_t)__LINE__, MESSAGE)
-#define NGTThrowSpecificException(MESSAGE, TYPE) throw NGT::TYPE(__FILE__, (size_t)__LINE__, MESSAGE)
+#define NGTThrowException(MESSAGE) throw NGT::Exception(__FILE__, __FUNCTION__, (size_t)__LINE__, MESSAGE)
+#define NGTThrowSpecificException(MESSAGE, TYPE) throw NGT::TYPE(__FILE__, __FUNCTION__, (size_t)__LINE__, MESSAGE)
class Exception : public std::exception {
public:
Exception():message("No message") {}
- Exception(const std::string &file, size_t line, std::stringstream &m) { set(file, line, m.str()); }
- Exception(const std::string &file, size_t line, const std::string &m) { set(file, line, m); }
- void set(const std::string &file, size_t line, const std::string &m) {
+ Exception(const std::string &file, const std::string &function, size_t line, std::stringstream &m) { set(file, function, line, m.str()); }
+ Exception(const std::string &file, const std::string &function, size_t line, const std::string &m) { set(file, function, line, m); }
+ void set(const std::string &file, const std::string &function, size_t line, const std::string &m) {
std::stringstream ss;
- ss << file << ":" << line << ": " << m;
+ ss << file << ":" << function << ":" << line << ": " << m;
message = ss.str();
}
~Exception() throw() {}
@@ -106,16 +106,17 @@ namespace NGT {
} else if (opt.size() > 1 && opt[0] == '-') {
if (opt.size() == 2) {
key = opt[1];
- if (key == "h") {
- value = "";
- } else {
- ++i;
- if (i != opts.end()) {
- value = *i;
- } else {
+ ++i;
+ if (i != opts.end()) {
+ if (((*i)[0] == '-') && ((*i)[1] != 0) ) {
value = "";
--i;
+ } else {
+ value = *i;
}
+ } else {
+ value = "";
+ --i;
}
} else {
key = opt[1];
@@ -178,6 +179,14 @@ namespace NGT {
usedOptions.insert(ai->first);
return ai->second;
}
+ bool getBool(const char *s) {
+ try {
+ get(s);
+ } catch(...) {
+ return false;
+ }
+ return true;
+ }
long getl(const char *s, long v) {
char *e;
long val;
@@ -260,6 +269,28 @@ namespace NGT {
}
+ template <typename T>
+ static void extractVector(const std::string &textLine, const std::string &sep, T &object) {
+ std::vector<std::string> tokens;
+ NGT::Common::tokenize(textLine, tokens, sep);
+ size_t idx;
+ for (idx = 0; idx < tokens.size(); idx++) {
+ if (tokens[idx].size() == 0) {
+ std::stringstream msg;
+ msg << "Common::extractVecotFromText: No data. " << textLine;
+ NGTThrowException(msg);
+ }
+ char *e;
+ double v = ::strtod(tokens[idx].c_str(), &e);
+ if (*e != 0) {
+ std::cerr << "ObjectSpace::readText: Warning! Not numerical value. [" << e << "]" << std::endl;
+ break;
+ }
+ object.push_back(v);
+ }
+ }
+
+
static std::string getProcessStatus(const std::string &stat) {
pid_t pid = getpid();
std::stringstream str;
@@ -287,6 +318,24 @@ namespace NGT {
static int getProcessVmSize() { return strtol(getProcessStatus("VmSize")); }
static int getProcessVmPeak() { return strtol(getProcessStatus("VmPeak")); }
static int getProcessVmRSS() { return strtol(getProcessStatus("VmRSS")); }
+ static std::string sizeToString(float size) {
+ char unit = 'K';
+ if (size > 1024) {
+ size /= 1024;
+ unit = 'M';
+ }
+ if (size > 1024) {
+ size /= 1024;
+ unit = 'G';
+ }
+ size = round(size * 100) / 100;
+ std::stringstream str;
+ str << size << unit;
+ return str.str();
+ }
+ static std::string getProcessVmSizeStr() { return sizeToString(getProcessVmSize()); }
+ static std::string getProcessVmPeakStr() { return sizeToString(getProcessVmPeak()); }
+ static std::string getProcessVmRSSStr() { return sizeToString(getProcessVmRSS()); }
};
class StdOstreamRedirector {
@@ -302,6 +351,11 @@ namespace NGT {
void enable() { enabled = true; }
void disable() { enabled = false; }
+ void set(bool e) { enabled = e; }
+ void bgin(bool e) {
+ set(e);
+ begin();
+ }
void begin() {
if (!enabled) {
return;
@@ -1381,7 +1435,7 @@ namespace NGT {
template
class PersistentRepository {
public:
- typedef Vector ARRAY;
+ typedef Vector ARRAY;
typedef TYPE ** iterator;
PersistentRepository():array(0) {}
@@ -1925,6 +1979,7 @@ namespace NGT {
}
}
this->clear();
+ this->shrink_to_fit();
#ifdef ADVANCED_USE_REMOVED_LIST
while(!removedList.empty()){ removedList.pop(); }
#endif
@@ -2159,7 +2214,23 @@ namespace NGT {
}
friend std::ostream &operator<<(std::ostream &os, Timer &t) {
- os << std::setprecision(6) << t.time << " (sec)";
+ auto time = t.time;
+ if (time < 1.0) {
+ time *= 1000.0;
+ os << std::setprecision(6) << time << " (ms)";
+ return os;
+ }
+ if (time < 60.0) {
+ os << std::setprecision(6) << time << " (s)";
+ return os;
+ }
+ time /= 60.0;
+ if (time < 60.0) {
+ os << std::setprecision(6) << time << " (m)";
+ return os;
+ }
+ time /= 60.0;
+ os << std::setprecision(6) << time << " (h)";
return os;
}
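The stream operator above now picks a unit for the elapsed time instead of always printing seconds. As a standalone illustration of the same threshold logic (a sketch mirroring the change, not the NGT::Timer class itself):

    // Unit selection mirroring the Timer output change: <1 s -> ms, <60 s -> s, <1 h -> min, otherwise hours.
    #include <iomanip>
    #include <iostream>

    void printTime(double time) {
      std::cout << std::setprecision(6);
      if (time < 1.0) { std::cout << time * 1000.0 << " (ms)" << std::endl; return; }
      if (time < 60.0) { std::cout << time << " (s)" << std::endl; return; }
      time /= 60.0;
      if (time < 60.0) { std::cout << time << " (m)" << std::endl; return; }
      std::cout << time / 60.0 << " (h)" << std::endl;
    }

    int main() {
      printTime(0.25);   // prints "250 (ms)"
      printTime(90.0);   // prints "1.5 (m)"
      printTime(7200.0); // prints "2 (h)"
      return 0;
    }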
diff --git a/lib/NGT/Index.cpp b/lib/NGT/Index.cpp
index 487a439..ed247c9 100644
--- a/lib/NGT/Index.cpp
+++ b/lib/NGT/Index.cpp
@@ -40,7 +40,7 @@ Index::getVersion()
}
#ifdef NGT_SHARED_MEMORY_ALLOCATOR
-NGT::Index::Index(NGT::Property &prop, const string &database) {
+NGT::Index::Index(NGT::Property &prop, const string &database):redirect(false) {
if (prop.dimension == 0) {
NGTThrowException("Index::Index. Dimension is not specified.");
}
@@ -62,7 +62,7 @@ NGT::Index::Index(NGT::Property &prop, const string &database) {
path = "";
}
#else
-NGT::Index::Index(NGT::Property &prop) {
+NGT::Index::Index(NGT::Property &prop):redirect(false) {
if (prop.dimension == 0) {
NGTThrowException("Index::Index. Dimension is not specified.");
}
@@ -512,6 +512,12 @@ NGT::GraphIndex::constructObjectSpace(NGT::Property &prop) {
}
}
+void
+NGT::GraphIndex::loadGraph(const string &ifile, NGT::GraphRepository &graph) {
+ ifstream isg(ifile + "/grp");
+ graph.deserialize(isg);
+}
+
void
NGT::GraphIndex::loadIndex(const string &ifile, bool readOnly, bool graphDisabled) {
objectSpace->deserialize(ifile + "/obj");
@@ -522,12 +528,10 @@ NGT::GraphIndex::loadIndex(const string &ifile, bool readOnly, bool graphDisable
if (readOnly && property.indexType == NGT::Index::Property::IndexType::Graph) {
GraphIndex::NeighborhoodGraph::loadSearchGraph(ifile);
} else {
- ifstream isg(ifile + "/grp");
- repository.deserialize(isg);
+ loadGraph(ifile, repository);
}
#else
- ifstream isg(ifile + "/grp");
- repository.deserialize(isg);
+ loadGraph(ifile, repository);
#endif
}
diff --git a/lib/NGT/Index.h b/lib/NGT/Index.h
index 106f1cf..a702399 100644
--- a/lib/NGT/Index.h
+++ b/lib/NGT/Index.h
@@ -74,9 +74,12 @@ namespace NGT {
pathAdjustmentInterval = 0;
#ifdef NGT_SHARED_MEMORY_ALLOCATOR
databaseType = DatabaseType::MemoryMappedFile;
- graphSharedMemorySize = 512; // MB
- treeSharedMemorySize = 512; // MB
- objectSharedMemorySize = 512; // MB 512 is up to 50M objects.
+ //graphSharedMemorySize = 512; // MB
+ //treeSharedMemorySize = 512; // MB
+ //objectSharedMemorySize = 512; // MB 512 is up to 50M objects.
+ graphSharedMemorySize = 10240; // MB
+ treeSharedMemorySize = 10240; // MB
+ objectSharedMemorySize = 10240; // MB 512 is up to 50M objects.
#else
databaseType = DatabaseType::Memory;
#endif
@@ -368,13 +371,13 @@ namespace NGT {
Index():index(0) {}
#ifdef NGT_SHARED_MEMORY_ALLOCATOR
- Index(NGT::Property &prop, const std::string &database);
+ Index(NGT::Property &prop, const std::string &database);
#else
- Index(NGT::Property &prop);
+ Index(NGT::Property &prop);
#endif
- Index(const std::string &database, bool rdOnly = false):index(0) { open(database, rdOnly); }
- Index(const std::string &database, bool rdOnly, bool graphDisabled):index(0) { open(database, rdOnly, graphDisabled); }
- Index(const std::string &database, NGT::Property &prop):index(0) { open(database, prop); }
+ Index(const std::string &database, bool rdOnly = false):index(0), redirect(false) { open(database, rdOnly); }
+ Index(const std::string &database, bool rdOnly, bool graphDisabled):index(0), redirect(false) { open(database, rdOnly, graphDisabled); }
+ Index(const std::string &database, NGT::Property &prop):index(0), redirect(false) { open(database, prop); }
virtual ~Index() { close(); }
void open(const std::string &database, NGT::Property &prop) {
@@ -423,6 +426,7 @@ namespace NGT {
virtual void load(const std::string &ifile, size_t dataSize) { getIndex().load(ifile, dataSize); }
virtual void append(const std::string &ifile, size_t dataSize) { getIndex().append(ifile, dataSize); }
virtual void append(const float *data, size_t dataSize) {
+ StdOstreamRedirector redirector(redirect);
redirector.begin();
try {
getIndex().append(data, dataSize);
@@ -433,6 +437,7 @@ namespace NGT {
redirector.end();
}
virtual void append(const double *data, size_t dataSize) {
+ StdOstreamRedirector redirector(redirect);
redirector.begin();
try {
getIndex().append(data, dataSize);
@@ -447,6 +452,7 @@ namespace NGT {
virtual size_t getObjectRepositorySize() { return getIndex().getObjectRepositorySize(); }
virtual size_t getGraphRepositorySize() { return getIndex().getGraphRepositorySize(); }
virtual void createIndex(size_t threadNumber, size_t sizeOfRepository = 0) {
+ StdOstreamRedirector redirector(redirect);
redirector.begin();
try {
getIndex().createIndex(threadNumber, sizeOfRepository);
@@ -475,6 +481,7 @@ namespace NGT {
virtual void search(NGT::SearchContainer &sc) { getIndex().search(sc); }
virtual void search(NGT::SearchQuery &sc) { getIndex().search(sc); }
virtual void search(NGT::SearchContainer &sc, ObjectDistances &seeds) { getIndex().search(sc, seeds); }
+ virtual void getSeeds(NGT::SearchContainer &sc, ObjectDistances &seeds, size_t n) { getIndex().getSeeds(sc, seeds, n); }
virtual void remove(ObjectID id, bool force = false) { getIndex().remove(id, force); }
virtual void exportIndex(const std::string &file) { getIndex().exportIndex(file); }
virtual void importIndex(const std::string &file) { getIndex().importIndex(file); }
@@ -505,8 +512,8 @@ namespace NGT {
}
return *index;
}
- void enableLog() { redirector.disable(); }
- void disableLog() { redirector.enable(); }
+ void enableLog() { redirect = true; }
+ void disableLog() { redirect = false; }
static void destroy(const std::string &path) {
#ifdef NGT_SHARED_MEMORY_ALLOCATOR
@@ -562,7 +569,7 @@ namespace NGT {
Index *index;
std::string path;
- StdOstreamRedirector redirector;
+ bool redirect;
};
class GraphIndex : public Index,
@@ -722,9 +729,9 @@ namespace NGT {
}
void saveProperty(const std::string &file);
-
void exportProperty(const std::string &file);
+ static void loadGraph(const std::string &ifile, NGT::GraphRepository &graph);
virtual void loadIndex(const std::string &ifile, bool readOnly, bool graphDisabled);
virtual void exportIndex(const std::string &ofile) {
@@ -797,7 +804,14 @@ namespace NGT {
}
deleteObject(query);
}
-
+ void getSeeds(NGT::SearchContainer &sc, ObjectDistances &seeds, size_t n) {
+ getRandomSeeds(repository, seeds, n);
+ setupDistances(sc, seeds);
+ std::sort(seeds.begin(), seeds.end());
+ if (seeds.size() > n) {
+ seeds.resize(n);
+ }
+ }
// get randomly nodes as seeds.
template <class REPOSITORY> void getRandomSeeds(REPOSITORY &repo, ObjectDistances &seeds, size_t seedSize) {
// clear all distances to find the same object as a randomized object.
@@ -1566,6 +1580,39 @@ namespace NGT {
void createTreeIndex();
+ void getSeeds(NGT::SearchContainer &sc, ObjectDistances &seeds, size_t n) {
+ DVPTree::SearchContainer tso(sc.object);
+ tso.mode = DVPTree::SearchContainer::SearchLeaf;
+ tso.radius = 0.0;
+ tso.size = 1;
+ tso.distanceComputationCount = 0;
+ tso.visitCount = 0;
+ try {
+ DVPTree::search(tso);
+ } catch (Exception &err) {
+ std::stringstream msg;
+ msg << "GraphAndTreeIndex::getSeeds: Cannot search for tree.:" << err.what();
+ NGTThrowException(msg);
+ }
+ try {
+ DVPTree::getObjectIDsFromLeaf(tso.nodeID, seeds);
+ } catch (Exception &err) {
+ std::stringstream msg;
+ msg << "GraphAndTreeIndex::getSeeds: Cannot get a leaf.:" << err.what();
+ NGTThrowException(msg);
+ }
+ sc.distanceComputationCount += tso.distanceComputationCount;
+ sc.visitCount += tso.visitCount;
+ if (seeds.size() < n) {
+ GraphIndex::getRandomSeeds(repository, seeds, n);
+ }
+ GraphIndex::setupDistances(sc, seeds);
+ std::sort(seeds.begin(), seeds.end());
+ if (seeds.size() > n) {
+ seeds.resize(n);
+ }
+ }
+
// GraphAndTreeIndex
void getSeedsFromTree(NGT::SearchContainer &sc, ObjectDistances &seeds) {
DVPTree::SearchContainer tso(sc.object);
diff --git a/lib/NGT/NGTQ/Capi.cpp b/lib/NGT/NGTQ/Capi.cpp
index 508a640..40416f4 100644
--- a/lib/NGT/NGTQ/Capi.cpp
+++ b/lib/NGT/NGTQ/Capi.cpp
@@ -19,8 +19,14 @@
#include
#include "NGT/Capi.h"
+#include "NGT/NGTQ/Quantizer.h"
#include "NGT/NGTQ/Capi.h"
#include "NGT/NGTQ/QuantizedGraph.h"
+#include "NGT/NGTQ/QuantizedBlobGraph.h"
+#include "NGT/NGTQ/Optimizer.h"
+#include "NGT/NGTQ/HierarchicalKmeans.h"
+
+#ifdef NGTQ_QBG
static bool operate_error_string_(const std::stringstream &ss, NGTError error){
if(error != NULL){
@@ -119,7 +125,11 @@ void ngtqg_initialize_quantization_parameters(NGTQGQuantizationParameters *param
bool ngtqg_quantize(const char *indexPath, NGTQGQuantizationParameters parameters, NGTError error) {
try{
+#ifdef NGTQ_QBG
+ NGTQG::Index::quantize(indexPath, parameters.max_number_of_edges);
+#else
NGTQG::Index::quantize(indexPath, parameters.dimension_of_subvector, parameters.max_number_of_edges);
+#endif
return true;
}catch(std::exception &err){
std::stringstream ss;
@@ -129,3 +139,285 @@ bool ngtqg_quantize(const char *indexPath, NGTQGQuantizationParameters parameter
}
}
+
+
+uint32_t qbg_get_result_size(QBGObjectDistances results, NGTError error) {
+ return ngt_get_result_size(results, error);
+}
+
+NGTObjectDistance qbg_get_result(const QBGObjectDistances results, const uint32_t idx, NGTError error) {
+ return ngt_get_result(results, idx, error);
+}
+
+void qbg_destroy_results(QBGObjectDistances results) {
+ ngt_destroy_results(results);
+}
+
+
+void qbg_initialize_construction_parameters(QBGConstructionParameters *parameters)
+{
+ parameters->extended_dimension = 0;
+ parameters->dimension = 0;
+ parameters->number_of_subvectors = 1;
+ parameters->number_of_blobs = 0;
+ parameters->internal_data_type = NGTQ::DataTypeFloat;
+ parameters->data_type = NGTQ::DataTypeFloat;
+ parameters->distance_type = NGTQ::DistanceType::DistanceTypeL2;
+}
+
+bool qbg_create(const char *indexPath, QBGConstructionParameters *parameters, NGTQGError error)
+{
+
+ try {
+ cerr << "qbgcapi: Create" << endl;
+ std::vector r;
+ NGTQ::Property property;
+ NGT::Property globalProperty;
+ NGT::Property localProperty;
+ property.dimension = parameters->extended_dimension;
+ if (property.dimension == 0) {
+ property.dimension = parameters->dimension;
+ }
+ property.genuineDimension = parameters->dimension;
+ property.globalRange = 0;
+ property.localRange = 0;
+ property.globalCentroidLimit = parameters->number_of_blobs;
+ property.localCentroidLimit = 16;
+ property.localDivisionNo = parameters->number_of_subvectors;
+ property.singleLocalCodebook = false;
+ property.centroidCreationMode = NGTQ::CentroidCreationModeStaticLayer;
+ property.localCentroidCreationMode = NGTQ::CentroidCreationModeStatic;
+ property.localIDByteSize = 1;
+ property.dataType = static_cast<NGTQ::DataType>(parameters->internal_data_type);
+ property.genuineDataType = static_cast(parameters->data_type);
+ property.distanceType = static_cast<NGTQ::DistanceType>(parameters->distance_type);
+
+ globalProperty.edgeSizeForCreation = 10;
+ globalProperty.edgeSizeForSearch = 40;
+ globalProperty.indexType = NGT::Property::GraphAndTree;
+ globalProperty.insertionRadiusCoefficient = 1.1;
+
+ localProperty.indexType = globalProperty.indexType;
+ localProperty.insertionRadiusCoefficient = globalProperty.insertionRadiusCoefficient;
+
+ std::vector<float> *rotation = 0;
+ const std::string objectPath;
+ QBG::Index::create(indexPath, property, globalProperty, localProperty, rotation, objectPath);
+ } catch(NGT::Exception &err) {
+ std::stringstream ss;
+ ss << "Capi : " << __FUNCTION__ << "() : Error: " << err.what();
+ operate_error_string_(ss, error);
+ return false;
+ }
+
+ return true;
+}
+
+QBGIndex qbg_open_index(const char *index_path, QBGError error) {
+ try {
+ std::string index_path_str(index_path);
+ auto *index = new QBG::Index(index_path_str, true);
+ return static_cast<QBGIndex>(index);
+ } catch(std::exception &err){
+ std::stringstream ss;
+ ss << "Capi : " << __FUNCTION__ << "() : Error: " << err.what();
+ operate_error_string_(ss, error);
+ return NULL;
+ }
+}
+
+void qbg_close_index(QBGIndex index) {
+ if (index == NULL) return;
+ (static_cast<QBG::Index*>(index))->close();
+ delete static_cast<QBG::Index*>(index);
+ index = 0;
+}
+
+bool qbg_save_index(QBGIndex index, QBGError error) {
+ if (index == NULL) return false;
+ try {
+ (static_cast<QBG::Index*>(index))->save();
+ } catch(std::exception &err) {
+ std::stringstream ss;
+ ss << "Capi : " << __FUNCTION__ << "() : Error: " << err.what();
+ operate_error_string_(ss, error);
+ return false;
+ }
+ return true;
+}
+
+ObjectID qbg_append_object(QBGIndex index, float *obj, uint32_t obj_dim, QBGError error) {
+ if (index == NULL || obj == NULL || obj_dim == 0){
+ std::stringstream ss;
+ ss << "Capi : " << __FUNCTION__ << "() : parametor error: index = " << index << " obj = " << obj << " obj_dim = " << obj_dim;
+ operate_error_string_(ss, error);
+ return 0;
+ }
+
+ try {
+ auto *pindex = static_cast<QBG::Index*>(index);
+ std::vector<float> vobj(&obj[0], &obj[obj_dim]);
+ return pindex->append(vobj);
+ } catch(std::exception &err) {
+ std::stringstream ss;
+ ss << "Capi : " << __FUNCTION__ << "() : Error: " << err.what();
+ operate_error_string_(ss, error);
+ return 0;
+ }
+}
+
+void qbg_initialize_build_parameters(QBGBuildParameters *parameters) {
+ parameters->hierarchical_clustering_init_mode = static_cast<int>(NGT::Clustering::InitializationModeKmeansPlusPlus);
+ parameters->number_of_first_objects = 0;
+ parameters->number_of_first_clusters = 0;
+ parameters->number_of_second_objects = 0;
+ parameters->number_of_second_clusters = 0;
+ parameters->number_of_third_clusters = 0;
+
+ parameters->number_of_objects = 0;
+ parameters->number_of_subvectors = 1;
+ parameters->optimization_clustering_init_mode = static_cast<int>(NGT::Clustering::InitializationModeKmeansPlusPlus);
+ parameters->rotation_iteration = 2000;
+ parameters->subvector_iteration = 400;
+ parameters->number_of_matrices = 3;
+ parameters->rotation = true;
+ parameters->repositioning = false;
+}
+
+bool qbg_build_index(const char *index_path, QBGBuildParameters *parameters, QBGError error) {
+
+ QBG::HierarchicalKmeans hierarchicalKmeans;
+
+ hierarchicalKmeans.maxSize = 1000;
+ hierarchicalKmeans.numOfClusters = 2;
+ hierarchicalKmeans.numOfTotalClusters = 0;
+ hierarchicalKmeans.numOfTotalBlobs = 0;
+ hierarchicalKmeans.clusterID = -1;
+ hierarchicalKmeans.initMode = static_cast<NGT::Clustering::InitializationMode>(parameters->hierarchical_clustering_init_mode);
+ hierarchicalKmeans.numOfRandomObjects = 0;
+ hierarchicalKmeans.extractCentroid = false;
+ hierarchicalKmeans.numOfFirstObjects = parameters->number_of_first_objects;
+ hierarchicalKmeans.numOfFirstClusters = parameters->number_of_first_clusters;
+ hierarchicalKmeans.numOfSecondObjects = parameters->number_of_second_objects;
+ hierarchicalKmeans.numOfSecondClusters = parameters->number_of_second_clusters;
+ hierarchicalKmeans.numOfThirdClusters = parameters->number_of_third_clusters;
+ hierarchicalKmeans.numOfObjects = 0;
+ hierarchicalKmeans.threeLayerClustering = true;
+ hierarchicalKmeans.silence = true;
+
+ try {
+ hierarchicalKmeans.clustering(index_path);
+ } catch (NGT::Exception &err) {
+ std::stringstream ss;
+ ss << "Capi : " << __FUNCTION__ << "() : Error: " << err.what();
+ operate_error_string_(ss, error);
+ return false;
+ }
+
+ NGTQ::Optimizer optimizer;
+
+ optimizer.numberOfObjects = parameters->number_of_objects;
+ optimizer.numberOfClusters = 16;
+ optimizer.numberOfSubvectors = 0;
+ optimizer.clusteringType = NGT::Clustering::ClusteringTypeKmeansWithNGT;
+ optimizer.initMode = static_cast<NGT::Clustering::InitializationMode>(parameters->optimization_clustering_init_mode);
+ optimizer.convergenceLimitTimes = 5;
+ optimizer.iteration = parameters->rotation_iteration;
+ optimizer.clusterIteration = parameters->subvector_iteration;
+ optimizer.clusterSizeConstraint = false;
+ optimizer.nOfMatrices = parameters->number_of_matrices;
+ optimizer.seedStartObjectSizeRate = 0.1;
+ optimizer.seedStep = 2;
+ optimizer.reject = 0.9;
+ optimizer.timelimit = 24 * 2;
+ optimizer.timelimit *= 60.0 * 60.0;
+ optimizer.rotation = parameters->rotation;
+ optimizer.repositioning = parameters->repositioning;
+ optimizer.globalType = NGTQ::Optimizer::GlobalTypeNone;
+ optimizer.silence = true;
+
+ try {
+ bool random = true;
+ optimizer.optimize(index_path, random);
+ } catch (NGT::Exception &err) {
+ std::stringstream ss;
+ ss << "Capi : " << __FUNCTION__ << "() : Error: " << err.what();
+ operate_error_string_(ss, error);
+ return false;
+ }
+
+ try {
+ auto silence = true;
+ QBG::Index::build(index_path, silence);
+ } catch (NGT::Exception &err) {
+ std::stringstream ss;
+ ss << "Capi : " << __FUNCTION__ << "() : Error: " << err.what();
+ operate_error_string_(ss, error);
+ return false;
+ }
+ return true;
+}
+
+void qbg_initialize_query(QBGQuery *parameters) {
+ parameters->query = 0;
+ parameters->number_of_results = 20;
+ parameters->epsilon = 0.1;
+ parameters->blob_epsilon = 0.0;
+ parameters->result_expansion = 3.0;
+ parameters->number_of_explored_blobs = 256;
+ parameters->number_of_edges = 0;
+ parameters->radius = 0;
+}
+
+static bool qbg_search_index_(QBG::Index* pindex, std::vector<float> &query, QBGQuery &param, NGTObjectDistances results) {
+ // set search parameters.
+ if (param.radius < 0.0){
+ param.radius = FLT_MAX;
+ }
+
+ QBG::SearchContainer sc;
+ sc.setObjectVector(query);
+ sc.setResults(static_cast<NGT::ObjectDistances*>(results));
+ if (param.result_expansion >= 1.0) {
+ sc.setSize(static_cast<float>(param.number_of_results) * param.result_expansion);
+ sc.setExactResultSize(param.number_of_results);
+ } else {
+ sc.setSize(param.number_of_results);
+ sc.setExactResultSize(0);
+ }
+ sc.setEpsilon(param.epsilon);
+ sc.setBlobEpsilon(param.blob_epsilon);
+ sc.setEdgeSize(param.number_of_edges);
+ sc.setGraphExplorationSize(param.number_of_explored_blobs);
+
+ pindex->searchBlobGraph(sc);
+
+ return true;
+}
+
+bool qbg_search_index(QBGIndex index, QBGQuery query, NGTObjectDistances results, QBGError error) {
+ if (index == NULL || query.query == NULL || results == NULL) {
+ std::stringstream ss;
+ ss << "Capi : " << __FUNCTION__ << "() : parametor error: index = " << index << " query = " << query.query << " results = " << results;
+ operate_error_string_(ss, error);
+ return false;
+ }
+
+ auto *pindex = static_cast<QBG::Index*>(index);
+ int32_t dim = pindex->getQuantizer().property.genuineDimension;
+
+ try {
+ std::vector<float> vquery(&query.query[0], &query.query[dim]);
+ qbg_search_index_(pindex, vquery, query, results);
+ } catch(std::exception &err) {
+ std::stringstream ss;
+ ss << "Capi : " << __FUNCTION__ << "() : Error: " << err.what();
+ operate_error_string_(ss, error);
+ return false;
+ }
+
+ return true;
+}
+
+
+#endif
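For orientation, here is a minimal end-to-end sketch of the QBG C API defined above. It is illustrative only: the `example` function, the index path, the data, and the parameter overrides are placeholders, error handling is abbreviated, and the create/append/build/search sequencing is an assumption based on the functions shown in this diff.

    // Illustrative only: create, populate, build, and search a QBG index via the new C API.
    #include <stdio.h>
    #include <stdint.h>
    #include "NGT/NGTQ/Capi.h"

    int example(const char *indexPath, float *vectors, size_t numOfVectors, size_t dimension) {
      NGTError err = ngt_create_error_object();

      QBGConstructionParameters cparams;
      qbg_initialize_construction_parameters(&cparams);
      cparams.dimension = dimension;                /* placeholder setting; other fields keep their defaults */
      if (!qbg_create(indexPath, &cparams, err)) return 1;

      /* append the objects before building */
      QBGIndex index = qbg_open_index(indexPath, err);
      for (size_t i = 0; i < numOfVectors; i++) {
        qbg_append_object(index, vectors + i * dimension, dimension, err);
      }
      qbg_save_index(index, err);
      qbg_close_index(index);

      QBGBuildParameters bparams;
      qbg_initialize_build_parameters(&bparams);
      if (!qbg_build_index(indexPath, &bparams, err)) return 1;

      /* search with the first vector as the query */
      index = qbg_open_index(indexPath, err);
      QBGQuery query;
      qbg_initialize_query(&query);
      query.query = vectors;
      QBGObjectDistances results = ngt_create_empty_results(err);
      if (qbg_search_index(index, query, results, err)) {
        for (uint32_t i = 0; i < qbg_get_result_size(results, err); i++) {
          NGTObjectDistance object = qbg_get_result(results, i, err);
          printf("%u\t%f\n", object.id, object.distance);
        }
      }
      qbg_destroy_results(results);
      qbg_close_index(index);
      return 0;
    }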
diff --git a/lib/NGT/NGTQ/Capi.h b/lib/NGT/NGTQ/Capi.h
index 5a8b4e2..88b1a0c 100644
--- a/lib/NGT/NGTQ/Capi.h
+++ b/lib/NGT/NGTQ/Capi.h
@@ -14,85 +14,6 @@
// limitations under the License.
//
-/***
- {
- // simple quantization and search example
-
- std::string indexPath = "onng_index"; // ONNG
- std::string queryPath = "query.tsv"; // Query file.
- NGTError err = ngt_create_error_object();
-
- // quantize the specified existing index
- // build quantized objects and a quantized graph
- NGTQGQuantizationParameters quantizationParameters;
- ngtqg_initialize_quantization_parameters(&quantizationParameters);
- if (!ngtqg_quantize(indexPath.c_str(), quantizationParameters, err)) {
- std::cerr << ngt_get_error_string(err) << std::endl;
- return false;
- }
-
- // open the index (ANNG or ONNG).
- index = ngtqg_open_index(indexPath.c_str(), err);
- if (index == NULL) {
- std::cerr << ngt_get_error_string(err) << std::endl;
- return false;
- }
-
- std::ifstream is(queryPath); // open a query file.
- if (!is) {
- std::cerr << "Cannot open the specified file. " << queryPath << std::endl;
- return false;
- }
-
- // get the dimension of the index to check the dimension of the query
- NGTProperty property = ngt_create_property(err);
- ngt_get_property(index, property, err);
- size_t dimension = ngt_get_property_dimension(property, err);
- ngt_destroy_property(property);
-
- std::string line;
- float queryVector[dimension];
- if (!getline(is, line)) { // read a query object from the query file.
- std::cerr << "no data" << std::endl;
- }
- std::vector tokens;
- NGT::Common::tokenize(line, tokens, " \t"); // split a string into words by the separators.
- // create a query vector from the tokens.
- if (tokens.size() != dimension) {
- std::cerr << "dimension of the query is invalid. dimesion=" << tokens.size() << ":" << dimension << std::endl;
- return false;
- }
- for (std::vector::iterator ti = tokens.begin(); ti != tokens.end(); ++ti) {
- queryVector[distance(tokens.begin(), ti)] = NGT::Common::strtod(*ti);
- }
- // set search parameters.
- NGTObjectDistances result = ngt_create_empty_results(err);
- NGTQGQuery query;
- ngtqg_initialize_query(&query);
- query.query = queryVector;
- query.size = 20;
- query.epsilon = 0.03;
- query.result_expansion = 2;
-
- // search with the quantized graph
- bool status = ngtqg_search_index(index, query, result, err);
- NGTObjectSpace objectSpace = ngt_get_object_space(index, err);
- auto rsize = ngt_get_result_size(result, err);
- // show resultant objects.
- std::cout << "Rank\tID\tDistance\tObject" << std::endl;
- for (size_t i = 0; i < rsize; i++) {
- NGTObjectDistance object = ngt_get_result(result, i, err);
- std::cout << i + 1 << "\t" << object.id << "\t" << object.distance << "\t";
- float *objectVector = ngt_get_object_as_float(objectSpace, object.id, err);
- for (size_t i = 0; i < dimension; i++) {
- std::cout << objectVector[i] << " ";
- }
- std::cout << std::endl;
- }
- ngt_destroy_results(result);
- ngtqg_close_index(index);
- }
-***/
#pragma once
@@ -106,34 +27,106 @@ extern "C" {
#include "NGT/Capi.h"
-typedef void* NGTQGIndex;
-typedef NGTObjectDistance NGTObjectDistance;
-typedef NGTError NGTQGError;
+ typedef void* NGTQGIndex;
+ typedef NGTObjectDistance NGTObjectDistance;
+ typedef NGTError NGTQGError;
-typedef struct {
- float *query;
- size_t size; // # of returned objects
- float epsilon;
- float result_expansion;
- float radius;
-} NGTQGQuery;
+ typedef struct {
+ float *query;
+ size_t size; // # of returned objects
+ float epsilon;
+ float result_expansion;
+ float radius;
+ } NGTQGQuery;
-typedef struct {
- float dimension_of_subvector;
- size_t max_number_of_edges;
-} NGTQGQuantizationParameters;
+ typedef struct {
+ float dimension_of_subvector;
+ size_t max_number_of_edges;
+ } NGTQGQuantizationParameters;
-NGTQGIndex ngtqg_open_index(const char *, NGTError);
+ NGTQGIndex ngtqg_open_index(const char *, NGTQGError);
-void ngtqg_close_index(NGTQGIndex);
+ void ngtqg_close_index(NGTQGIndex);
-void ngtqg_initialize_quantization_parameters(NGTQGQuantizationParameters *);
+ void ngtqg_initialize_quantization_parameters(NGTQGQuantizationParameters *);
-bool ngtqg_quantize(const char *, NGTQGQuantizationParameters, NGTError);
+ bool ngtqg_quantize(const char *, NGTQGQuantizationParameters, NGTQGError);
-void ngtqg_initialize_query(NGTQGQuery *);
+ void ngtqg_initialize_query(NGTQGQuery *);
-bool ngtqg_search_index(NGTQGIndex, NGTQGQuery, NGTObjectDistances, NGTError);
+ bool ngtqg_search_index(NGTQGIndex, NGTQGQuery, NGTObjectDistances, NGTQGError);
+
+ // QBG CAPI
+
+ typedef void* QBGIndex;
+ typedef NGTError QBGError;
+ typedef NGTObjectDistances QBGObjectDistances;
+
+ uint32_t qbg_get_result_size(QBGObjectDistances results, NGTError error);
+
+ NGTObjectDistance qbg_get_result(const QBGObjectDistances results, const uint32_t idx, NGTError error);
+
+ void qbg_destroy_results(QBGObjectDistances results);
+
+ typedef struct {
+ size_t extended_dimension;
+ size_t dimension;
+ size_t number_of_subvectors;
+ size_t number_of_blobs;
+ int internal_data_type;
+ int data_type;
+ int distance_type;
+ } QBGConstructionParameters;
+
+ typedef struct {
+ // hierarchical kmeans
+ int hierarchical_clustering_init_mode;
+ size_t number_of_first_objects;
+ size_t number_of_first_clusters;
+ size_t number_of_second_objects;
+ size_t number_of_second_clusters;
+ size_t number_of_third_clusters;
+ // optimization
+ size_t number_of_objects;
+ size_t number_of_subvectors;
+ int optimization_clustering_init_mode;
+ size_t rotation_iteration;
+ size_t subvector_iteration;
+ size_t number_of_matrices;
+ bool rotation;
+ bool repositioning;
+ } QBGBuildParameters;
+
+ typedef struct {
+ float *query;
+ size_t number_of_results;
+ float epsilon;
+ float blob_epsilon;
+ float result_expansion;
+ size_t number_of_explored_blobs;
+ size_t number_of_edges;
+ float radius;
+ } QBGQuery;
+
+ void qbg_initialize_construction_parameters(QBGConstructionParameters *parameters);
+
+ bool qbg_create(const char *indexPath, QBGConstructionParameters *parameters, QBGError error);
+
+ QBGIndex qbg_open_index(const char *index_path, QBGError error);
+
+ void qbg_close_index(QBGIndex index);
+
+ bool qbg_save_index(QBGIndex index, QBGError error);
+
+ ObjectID qbg_append_object(QBGIndex index, float *obj, uint32_t obj_dim, QBGError error);
+
+ void qbg_initialize_build_parameters(QBGBuildParameters *parameters);
+
+ bool qbg_build_index(const char *index_path, QBGBuildParameters *parameters, QBGError error);
+
+ void qbg_initialize_query(QBGQuery *parameters);
+
+ bool qbg_search_index(QBGIndex index, QBGQuery query, NGTObjectDistances results, QBGError error);
#ifdef __cplusplus
}
diff --git a/lib/NGT/NGTQ/HierarchicalKmeans.h b/lib/NGT/NGTQ/HierarchicalKmeans.h
new file mode 100644
index 0000000..91b17f9
--- /dev/null
+++ b/lib/NGT/NGTQ/HierarchicalKmeans.h
@@ -0,0 +1,1243 @@
+//
+// Copyright (C) 2021 Yahoo Japan Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#pragma once
+
+#include "Quantizer.h"
+
+namespace QBG {
+ class HierarchicalKmeans {
+ public:
+ typedef NGTQ::Quantizer::ObjectList QBGObjectList;
+
+ class HKNode {
+ public:
+ bool leaf;
+ };
+
+ class HKLeafNode : public HKNode {
+ public:
+ HKLeafNode():id(0){ leaf = true; }
+ std::vector<uint32_t> members;
+ uint32_t id;
+ };
+
+ class HKInternalNode : public HKNode {
+ public:
+ HKInternalNode() { leaf = false; }
+ std::vector<std::pair<uint32_t, std::vector<float>>> children;
+ };
+
+ HierarchicalKmeans() {
+ silence = false;
+ }
+
+ static int32_t searchLeaf(std::vector<HKNode*> &nodes, int32_t rootID, float *object) {
+ auto nodeID = rootID;
+ while (true) {
+ auto *node = nodes[nodeID];
+ if (node->leaf) {
+ return nodeID;
+ } else {
+ HKInternalNode &internalNode = static_cast<HKInternalNode&>(*node);
+ float min = std::numeric_limits<float>::max();
+ int32_t minid = 0;
+ for (auto &c : internalNode.children) {
+ auto d = NGT::PrimitiveComparator::compareL2(reinterpret_cast<float*>(&object[0]),
+ c.second.data(), c.second.size());
+ if (d < min) {
+ min = d;
+ minid = c.first;
+ }
+ }
+ nodeID = minid;
+ }
+ }
+ return -1;
+ }
+
+ static void aggregateObjects(HKLeafNode &leafNode, std::vector<std::vector<float>> &vectors,
+ NGT::ObjectSpace &objectSpace, QBGObjectList &objectList)
+ {
+ vectors.reserve(leafNode.members.size() + 1);
+ std::vector<float> obj;
+ for (auto &m : leafNode.members) {
+ objectList.get(m, obj, &objectSpace);
+ vectors.push_back(obj);
+ }
+ }
+
+ static void aggregateObjects(HKLeafNode &leafNode, std::vector<std::vector<float>> &vectors,
+ NGT::ObjectSpace &objectSpace, QBGObjectList &objectList,
+ std::vector<float> &object)
+ {
+ aggregateObjects(leafNode, vectors, objectSpace, objectList);
+ vectors.push_back(std::move(object));
+ }
+
+ static void split(uint32_t id, std::vector<std::vector<float>> &vectors,
+ std::vector<HKNode*> &nodes, int32_t leafNodeID, NGT::Clustering &clustering)
+ {
+ HKLeafNode &leafNode = static_cast<HKLeafNode&>(*nodes[leafNodeID]);
+ std::vector<NGT::Clustering::Cluster> clusters;
+ clustering.kmeans(vectors, clusters);
+ auto *newNode = new HKInternalNode;
+ for (auto &cluster : clusters) {
+ auto centroid = std::move(cluster.centroid);
+ centroid.resize(vectors[0].size());
+ newNode->children.push_back(std::make_pair(nodes.size(), std::move(centroid)));
+ auto *cnode = new HKLeafNode;
+ nodes.push_back(cnode);
+ for (auto &member : cluster.members) {
+ if (member.vectorID > leafNode.members.size()) {
+ std::cerr << "Fatal error. member:" << member.vectorID << ":" << leafNode.members.size() << std::endl;
+ abort();
+ }
+ if (member.vectorID == leafNode.members.size()) {
+ cnode->members.push_back(id);
+ } else {
+ cnode->members.push_back(leafNode.members[member.vectorID]);
+ }
+ }
+ }
+ delete nodes[leafNodeID];
+ nodes[leafNodeID] = newNode;
+ }
+
+ static double computeError(std::vector<HKNode*> &nodes, NGT::ObjectSpace &objectSpace, QBGObjectList &objectList) {
+ std::cerr << "node size=" << nodes.size() << std::endl;
+ double distance = 0.0;
+ size_t dcount = 0;
+ for (auto *node : nodes) {
+ if (node->leaf) {
+ } else {
+ HKInternalNode &internalNode = static_cast<HKInternalNode&>(*node);
+ std::vector<float> obj;
+ for (auto &child : internalNode.children) {
+ if (nodes[child.first]->leaf) {
+ if (dcount % 100000 == 0) {
+ std::cerr << "Processed leaves=" << dcount << std::endl;
+ }
+ auto centroid = child.second;
+ HKLeafNode &leafNode = static_cast<HKLeafNode&>(*nodes[child.first]);
+ for (auto &m : leafNode.members) {
+ objectList.get(m, obj, &objectSpace);
+ distance += NGT::Clustering::distanceL2(centroid, obj);
+ dcount++;
+ }
+ }
+ }
+ }
+ }
+ distance /= dcount;
+ std::cout << "# of vectors=" << dcount << std::endl;
+ std::cout << "Quantization error=" << distance << std::endl;
+ return distance;
+ }
+
+ static size_t extractCentroids(std::ostream &oStream, std::vector<HKNode*> &nodes) {
+ std::cerr << "node size=" << nodes.size() << std::endl;
+ size_t clusterCount = 0;
+ size_t objectCount = 0;
+ size_t leafID = 0;
+ for (auto *node : nodes) {
+ if (node->leaf) {
+ HKLeafNode &leafNode = static_cast<HKLeafNode&>(*node);
+ objectCount += leafNode.members.size();
+ } else {
+ HKInternalNode &internalNode = static_cast<HKInternalNode&>(*node);
+ for (auto &child : internalNode.children) {
+ if (nodes[child.first]->leaf) {
+ if (static_cast<HKLeafNode*>(nodes[child.first])->id == 0) {
+ static_cast<HKLeafNode*>(nodes[child.first])->id = leafID;
+ } else if (static_cast<HKLeafNode*>(nodes[child.first])->id != leafID) {
+ std::cerr << "leaf ID is invalid?" << std::endl;
+ }
+ leafID++;
+ size_t count = 0;
+ clusterCount++;
+ for (auto &v : child.second) {
+ oStream << v;
+ if (++count == child.second.size()) {
+ oStream << std::endl;
+ } else {
+ oStream << "\t";
+ }
+ }
+ }
+ }
+ }
+ }
+ std::cerr << "# of clusters=" << clusterCount << std::endl;
+ return objectCount;
+ }
+
+ static size_t extractIndex(std::ostream &oStream, std::vector<HKNode*> &nodes, size_t numOfObjects) {
+ std::vector<int32_t> clusterID(numOfObjects, -1);
+ std::cerr << "numOfObjects=" << numOfObjects << std::endl;
+ std::cerr << "node size=" << nodes.size() << std::endl;
+ for (auto *node : nodes) {
+ if (node->leaf) {
+ HKLeafNode &leafNode = static_cast<HKLeafNode&>(*node);
+ for (auto &member : leafNode.members) {
+ if (member > numOfObjects) {
+ std::cerr << "output index: Internal fatal error. " << member << ":" << numOfObjects - 1 << std::endl;
+ abort();
+ }
+ if (member == 0) {
+ std::cerr << "output index: Internal fatal error. Invalid ID" << std::endl;
+ abort();
+ }
+ clusterID[member - 1] = leafNode.id;
+ }
+ }
+ }
+ std::cerr << "clusterID.size=" << clusterID.size() << std::endl;
+ size_t count = 0;
+ for (auto cid : clusterID) {
+ count++;
+ oStream << cid << std::endl;
+ }
+ std::cerr << "# of id=" << count << std::endl;
+ return count;
+ }
+
+ static void extractBtoQAndQCentroid(std::ostream &btoqStream, std::ostream &qStream,
+ std::vector<HKNode*> &nodes, size_t numOfThirdClusters) {
+ std::cerr << "extractBtoQ" << std::endl;
+ std::vector btoq(numOfThirdClusters);
+ std::cerr << "numOfThirdClusters=" << numOfThirdClusters << std::endl;
+ std::cerr << "node size=" << nodes.size() << std::endl;
+ size_t rootID = 0;
+ HKInternalNode &root = static_cast<HKInternalNode&>(*nodes[rootID]);
+ std::cerr << "first=" << root.children.size() << std::endl;
+ size_t secondCount = 0;
+ size_t thirdCount = 0;
+ size_t objectCount = 0;
+ size_t leafID = 0;
+ size_t qID = 0;
+ for (auto &c1 : root.children) {
+ HKInternalNode &node1 = static_cast<HKInternalNode&>(*nodes[c1.first]);
+ std::cerr << "second=" << node1.children.size() << std::endl;
+ secondCount += node1.children.size();
+ for (auto &c2 : node1.children) {
+ HKInternalNode &node2 = static_cast<HKInternalNode&>(*nodes[c2.first]);
+ std::cerr << "third=" << node2.children.size() << std::endl;
+ thirdCount += node2.children.size();
+ size_t count = 0;
+ for (auto &v : c2.second) {
+ qStream << v;
+ if (++count == c2.second.size()) {
+ qStream << std::endl;
+ } else {
+ qStream << "\t";
+ }
+ }
+ for (auto &c3 : node2.children) {
+ btoqStream << qID << std::endl;
+ HKLeafNode &leaf = static_cast<HKLeafNode&>(*nodes[c3.first]);
+ objectCount += leaf.members.size();
+ if (leaf.id != leafID++) {
+ std::cerr << "leaf is invalid" << leaf.id << ":" << leafID << std::endl;
+ abort();
+ }
+ }
+ qID++;
+ }
+ }
+ std::cerr << "second=" << secondCount << std::endl;
+ std::cerr << "third=" << thirdCount << std::endl;
+ std::cerr << "object=" << objectCount << std::endl;
+ }
+
+ static void extractRandomObjectsFromEachBlob(std::ostream &oStream, std::vector<HKNode*> &nodes, size_t numOfObjects,
+ size_t numOfRandomObjects, NGTQ::QuantizerInstance& quantizer, bool extractCentroid) {
+ std::cerr << "node size=" << nodes.size() << std::endl;
+ std::vector<std::vector<std::vector<float>>> randomObjects(numOfObjects);
+ std::vector<std::vector<float>> centroids(numOfObjects);
+ for (auto *node : nodes) {
+ if (node->leaf) {
+ HKLeafNode &leafNode = static_cast<HKLeafNode&>(*node);
+ std::vector<uint32_t> randomObjectIDXs;
+ if (numOfRandomObjects >= leafNode.members.size()) {
+ randomObjectIDXs = leafNode.members;
+ while (randomObjectIDXs.size() < numOfRandomObjects) {
+ double random = ((double)rand() + 1.0) / ((double)RAND_MAX + 2.0);
+ uint32_t idx = floor(leafNode.members.size() * random);
+ if (idx >= leafNode.members.size()) {
+ std::cerr << "Internal error. " << idx << ":" << leafNode.members.size() << std::endl;
+ abort();
+ }
+ randomObjectIDXs.push_back(leafNode.members[idx]);
+ }
+ } else {
+ srand(leafNode.id);
+ while (randomObjectIDXs.size() < numOfRandomObjects) {
+ uint32_t idx = 0;
+ do {
+ double random = ((double)rand() + 1.0) / ((double)RAND_MAX + 2.0);
+ idx = floor(leafNode.members.size() * random);
+ if (idx >= leafNode.members.size()) {
+ std::cerr << "Internal error. " << idx << ":" << leafNode.members.size() << std::endl;
+ abort();
+ }
+ } while (std::find(randomObjectIDXs.begin(), randomObjectIDXs.end(), leafNode.members[idx]) != randomObjectIDXs.end());
+ std::cerr << "IDX=" << idx << "/" << leafNode.members.size() << std::endl;
+ randomObjectIDXs.push_back(leafNode.members[idx]);
+ }
+ }
+ std::cerr << "randomObjectIDXs=" << randomObjectIDXs.size() << std::endl;
+ for (auto member : randomObjectIDXs) {
+ if (member == 0) {
+ std::cerr << "output index: Internal fatal error. Invalid ID. " << member << std::endl;
+ abort();
+ }
+ std::vector<float> object;
+ quantizer.objectList.get(member, object, &quantizer.globalCodebookIndex.getObjectSpace());
+ if (leafNode.id >= numOfObjects) {
+ std::cerr << "Internal error! Wrong leaf ID. " << leafNode.id << ":" << numOfObjects << std::endl;
+ abort();
+ }
+ randomObjects[leafNode.id].push_back(object);
+ }
+ } else {
+ if (extractCentroid) {
+ HKInternalNode &internalNode = static_cast<HKInternalNode&>(*node);
+ for (auto &child : internalNode.children) {
+ if (nodes[child.first]->leaf) {
+ HKLeafNode &leafNode = static_cast<HKLeafNode&>(*nodes[child.first]);
+ centroids[leafNode.id] = child.second;
+ }
+ }
+ }
+ }
+ }
+ for (size_t idx = 0; idx < centroids.size(); idx++) {
+ auto &c = centroids[idx];
+ if (extractCentroid && c.empty()) {
+ std::cerr << "qbg: Fatal error! The centroid is empty." << std::endl;
+ abort();
+ }
+ for (size_t i = 0; i < c.size(); i++) {
+ oStream << c[i];
+ if (i + 1 != c.size()) {
+ oStream << "\t";
+ } else {
+ oStream << std::endl;
+ }
+ }
+ auto &ros = randomObjects[idx];
+ for (auto &ro : ros) {
+ if (ro.empty()) {
+ std::cerr << "qbg: Fatal error! The random object vector is empty." << std::endl;
+ abort();
+ }
+ for (size_t i = 0; i < ro.size(); i++) {
+ oStream << ro[i];
+ if (i + 1 != ro.size()) {
+ oStream << "\t";
+ } else {
+ oStream << std::endl;
+ }
+ }
+ }
+ }
+ }
+
+ static void extractBtoQIndex(std::ofstream &of, std::vector<HKNode*> &nodes, std::vector<uint32_t> &qNodeIDs) {
+ size_t leafID = 0;
+ for (size_t qnidx = 0; qnidx < qNodeIDs.size(); qnidx++) {
+ if (nodes[qNodeIDs[qnidx]]->leaf) {
+ std::cerr << "Fatal error. this should be an internal node." << std::endl;
+ abort();
+ }
+ HKInternalNode &inode = static_cast<HKInternalNode&>(*nodes[qNodeIDs[qnidx]]);
+ for (auto &c : inode.children) {
+ if (!nodes[c.first]->leaf) {
+ std::cerr << "Fatal error. this should be a leaf." << std::endl;
+ abort();
+ }
+ HKLeafNode &leaf = static_cast<HKLeafNode&>(*nodes[c.first]);
+ if (leaf.id == 0) {
+ leaf.id = leafID;
+ }
+ of << qnidx << std::endl;
+ leafID++;
+ }
+ }
+}
+
+
+ static void hierarchicalKmeans(uint32_t id, int32_t rootID, std::vector<float> &object,
+ QBGObjectList &objectList, NGT::ObjectSpace &objectSpace,
+ std::vector<HKNode*> &nodes, NGT::Clustering &clustering, size_t maxSize) {
+ NGT::Timer timer;
+ objectList.get(id, object, &objectSpace);
+ int32_t nodeID = searchLeaf(nodes, rootID, reinterpret_cast<float*>(&object[0]));
+ if (nodeID < 0) {
+ std::cerr << "Fatal inner error! node ID=" << nodeID << std::endl;
+ exit(1);
+ }
+ auto *node = nodes[nodeID];
+ HKLeafNode &leafNode = static_cast<HKLeafNode&>(*node);
+ if (leafNode.members.size() >= maxSize) {
+ NGT::Timer subtimer;
+ subtimer.start();
+ std::vector<std::vector<float>> vectors;
+ aggregateObjects(leafNode, vectors, objectSpace, objectList, object);
+ subtimer.stop();
+ std::cerr << "aggregate time=" << subtimer << std::endl;
+ subtimer.start();
+ split(id, vectors, nodes, nodeID, clustering);
+ subtimer.stop();
+ std::cerr << "split time=" << subtimer << std::endl;
+ } else {
+ leafNode.members.push_back(id);
+ }
+ }
+
+ static void hierarchicalKmeansBatch(std::vector<uint32_t> &batch, std::vector<std::pair<uint32_t, uint32_t>> &exceededLeaves,
+ int32_t rootID, std::vector<float> &object,
+ QBGObjectList &objectList, NGT::ObjectSpace &objectSpace,
+ std::vector<HKNode*> &nodes, NGT::Clustering &clustering, size_t maxSize, size_t &nleaves,
+ size_t maxExceededLeaves) {
+
+ if (batch.size() == 0) {
+ return;
+ }
+
+ int32_t nodeIDs[batch.size()];
+
+#pragma omp parallel for
+ for (size_t idx = 0; idx < batch.size(); idx++) {
+ auto id = batch[idx];
+#pragma omp critical
+ objectList.get(id, object, &objectSpace);
+ int32_t nodeID = searchLeaf(nodes, rootID, reinterpret_cast<float*>(&object[0]));
+ if (nodeID < 0) {
+ std::cerr << "Fatal inner error! node ID=" << nodeID << std::endl;
+ exit(1);
+ }
+ nodeIDs[idx] = nodeID;
+ }
+
+
+ for (size_t idx = 0; idx < batch.size(); idx++) {
+ auto id = batch[idx];
+ HKLeafNode &leafNode = static_cast<HKLeafNode&>(*nodes[nodeIDs[idx]]);
+ leafNode.members.push_back(id);
+ if (leafNode.members.size() > maxSize) {
+ auto i = exceededLeaves.begin();
+ for (; i != exceededLeaves.end(); i++) {
+ if (static_cast((*i).second) == nodeIDs[idx]) break;
+ }
+ if (i == exceededLeaves.end()) {
+ exceededLeaves.push_back(std::make_pair(batch[idx], nodeIDs[idx]));
+ }
+ }
+ }
+
+ batch.clear();
+
+ if (exceededLeaves.size() < maxExceededLeaves) {
+ return;
+ }
+
+ std::vector<std::vector<NGT::Clustering::Cluster>> clusters(exceededLeaves.size());
+#pragma omp parallel for
+ for (size_t idx = 0; idx < exceededLeaves.size(); idx++) {
+ HKLeafNode &leafNode = static_cast<HKLeafNode&>(*nodes[exceededLeaves[idx].second]);
+ std::vector<std::vector<float>> vectors;
+#pragma omp critical
+ aggregateObjects(leafNode, vectors, objectSpace, objectList);
+ clustering.kmeans(vectors, clusters[idx]);
+ }
+
+ std::cerr << "exceeded leaves=" << exceededLeaves.size() << std::endl;
+ for (size_t idx = 0; idx < exceededLeaves.size(); idx++) {
+ auto leafNodeID = exceededLeaves[idx].second;
+ HKLeafNode &leafNode = static_cast<HKLeafNode&>(*nodes[leafNodeID]);
+ auto *newNode = new HKInternalNode;
+ for (auto &cluster : clusters[idx]) {
+ newNode->children.push_back(std::make_pair(nodes.size(), std::move(cluster.centroid)));
+ auto *cnode = new HKLeafNode;
+ nodes.push_back(cnode);
+ for (auto &member : cluster.members) {
+ cnode->members.push_back(leafNode.members[member.vectorID]);
+ }
+ }
+ nleaves += clusters[idx].size() - 1;
+ delete nodes[leafNodeID];
+ nodes[leafNodeID] = newNode;
+ }
+ exceededLeaves.clear();
+
+ }
+
+ static void hierarchicalKmeansWithNumberOfClusters(size_t numOfTotalClusters, size_t numOfObjects, size_t numOfLeaves,
+ QBGObjectList &objectList, NGT::ObjectSpace &objectSpace,
+ std::vector<HKNode*> &nodes, NGT::Clustering::InitializationMode initMode){
+ std::cerr << "numOfTotalClusters=" << numOfTotalClusters << std::endl;
+ std::cerr << "numOfLeaves=" << numOfLeaves << std::endl;
+ if (numOfLeaves > numOfTotalClusters) {
+ std::cerr << "# of clusters is invalid. " << numOfLeaves << ":" << numOfTotalClusters << std::endl;
+ abort();
+ }
+ auto numOfRemainingClusters = numOfTotalClusters;
+ auto numOfRemainingVectors = numOfObjects;
+ size_t leafCount = 0;
+ size_t nodeSize = nodes.size();
+ for (size_t nidx = 0; nidx < nodeSize; nidx++) {
+ if (nodes[nidx]->leaf) {
+ leafCount++;
+ if (numOfLeaves >= 100 && leafCount % (numOfLeaves / 100) == 0) {
+ std::cerr << "Processed leaves: " << leafCount << " " << leafCount * 100 / numOfLeaves << "%" << std::endl;
+ }
+ HKLeafNode &leafNode = static_cast<HKLeafNode&>(*nodes[nidx]);
+ std::vector<std::vector<float>> vectors;
+ aggregateObjects(leafNode, vectors, objectSpace, objectList);
+ size_t nClusters = round(static_cast<float>(leafNode.members.size()) / numOfRemainingVectors * numOfRemainingClusters);
+ nClusters = nClusters == 0 ? 1 : nClusters;
+ numOfRemainingVectors -= leafNode.members.size();
+ numOfRemainingClusters -= nClusters;
+ NGT::Clustering clustering(initMode, NGT::Clustering::ClusteringTypeKmeansWithoutNGT, 1000, nClusters);
+ NGT::Timer timer;
+ timer.start();
+ split(0, vectors, nodes, nidx, clustering);
+ timer.stop();
+ if (nodes[nidx]->leaf) {
+ std::cerr << "At this moment, the second node should be an internal" << std::endl;
+ abort();
+ }
+ }
+ }
+ }
+
+ static void hierarchicalKmeansWithNumberOfClustersInParallel(size_t numOfTotalClusters, size_t numOfObjects, size_t numOfLeaves,
+ QBGObjectList &objectList, NGT::ObjectSpace &objectSpace,
+ std::vector<HKNode*> &nodes, NGT::Clustering::InitializationMode initMode){
+ NGT::Timer timer;
+ timer.start();
+ auto numOfRemainingClusters = numOfTotalClusters;
+ auto numOfRemainingVectors = numOfObjects;
+ size_t leafCount = 0;
+
+ std::vector<std::pair<uint32_t, size_t>> leafNodes;
+ leafNodes.reserve(numOfLeaves);
+ for (size_t nidx = 0; nidx < nodes.size(); nidx++) {
+ if (nodes[nidx]->leaf) {
+ leafCount++;
+ {
+ size_t step = 10;
+ if (numOfLeaves >= step && leafCount % (numOfLeaves / step) == 0) {
+ std::cerr << "Processed leaves: " << leafCount << " " << leafCount * step / numOfLeaves << "%" << std::endl;
+ }
+ }
+ HKLeafNode &leafNode = static_cast<HKLeafNode&>(*nodes[nidx]);
+ size_t nClusters = round(static_cast<float>(leafNode.members.size()) / numOfRemainingVectors * numOfRemainingClusters);
+ nClusters = nClusters == 0 ? 1 : nClusters;
+ numOfRemainingVectors -= leafNode.members.size();
+ numOfRemainingClusters -= nClusters;
+ leafNodes.push_back(std::make_pair(nidx, nClusters));
+ }
+ }
+ timer.stop();
+ std::cerr << "hierarchicalKmeansWithNumberOfClustersInParallel: extract leaves. Time=" << timer << std::endl;
+ timer.start();
+
+ std::cerr << "start kmeans..." << std::endl;
+ std::vector<std::vector<NGT::Clustering::Cluster>> clusters(leafNodes.size());
+#pragma omp parallel for
+ for (size_t nidx = 0; nidx < leafNodes.size(); nidx++) {
+ HKLeafNode &leafNode = static_cast<HKLeafNode&>(*nodes[leafNodes[nidx].first]);
+ std::vector<std::vector<float>> vectors;
+#pragma omp critical
+ aggregateObjects(leafNode, vectors, objectSpace, objectList);
+ NGT::Clustering clustering(initMode, NGT::Clustering::ClusteringTypeKmeansWithoutNGT, 1000, leafNodes[nidx].second);
+ clustering.kmeans(vectors, clusters[nidx]);
+ }
+
+ timer.stop();
+ std::cerr << "hierarchicalKmeansWithNumberOfClustersInParallel: kmeans. Time=" << timer << std::endl;
+ timer.start();
+
+ std::cerr << "add nodes..." << std::endl;
+ for (size_t idx = 0; idx < leafNodes.size(); idx++) {
+ auto leafNodeID = leafNodes[idx].first;
+ HKLeafNode &leafNode = static_cast<HKLeafNode&>(*nodes[leafNodeID]);
+ auto *newNode = new HKInternalNode;
+ for (auto &cluster : clusters[idx]) {
+ newNode->children.push_back(std::make_pair(nodes.size(), std::move(cluster.centroid)));
+ auto *cnode = new HKLeafNode;
+ nodes.push_back(cnode);
+ for (auto &member : cluster.members) {
+ cnode->members.push_back(leafNode.members[member.vectorID]);
+ }
+ }
+ delete nodes[leafNodeID];
+ nodes[leafNodeID] = newNode;
+ }
+ timer.stop();
+ std::cerr << "hierarchicalKmeansWithNumberOfClustersInParallel: add nodes. Time=" << timer << std::endl;
+
+ }
+
+ static void flattenClusters(std::vector<NGT::Clustering::Cluster> &upperClusters,
+ std::vector<std::vector<NGT::Clustering::Cluster>> &lowerClusters,
+ size_t numOfLowerClusters,
+ std::vector<NGT::Clustering::Cluster> &flatClusters) {
+
+
+ flatClusters.clear();
+ flatClusters.reserve(numOfLowerClusters);
+
+ for (size_t idx1 = 0; idx1 < lowerClusters.size(); idx1++) {
+ for (size_t idx2 = 0; idx2 < lowerClusters[idx1].size(); idx2++) {
+ for (auto &m : lowerClusters[idx1][idx2].members) {
+ m.vectorID = upperClusters[idx1].members[m.vectorID].vectorID;
+ }
+ flatClusters.push_back(lowerClusters[idx1][idx2]);
+ }
+ }
+
+ }
+
+#ifndef MULTIPLE_OBJECT_LISTS
+ void subclustering(std::vector<NGT::Clustering::Cluster> &upperClusters, size_t numOfLowerClusters, size_t numOfObjects,
+ NGT::ObjectSpace &objectSpace, QBGObjectList &objectList,
+ NGT::Clustering::InitializationMode initMode, std::vector<std::vector<NGT::Clustering::Cluster>> &lowerClusters) {
+ std::vector<size_t> nPartialClusters(upperClusters.size());
+ auto numOfRemainingClusters = numOfLowerClusters;
+ auto numOfRemainingVectors = numOfObjects;
+ size_t ts = 0;
+ for (size_t idx = 0; idx < upperClusters.size(); idx++) {
+ size_t ncs = round(static_cast<float>(upperClusters[idx].members.size()) / numOfRemainingVectors *
+ numOfRemainingClusters);
+ ncs = ncs == 0 ? 1 : ncs;
+ numOfRemainingVectors -= upperClusters[idx].members.size();
+ if (numOfRemainingClusters >= ncs) {
+ numOfRemainingClusters -= ncs;
+ }
+ nPartialClusters[idx] = ncs;
+ ts += ncs;
+ }
+ std::cerr << "numOfRemainingClusters=" << numOfRemainingClusters << std::endl;
+ std::cerr << "numOfRemainingVectors=" << numOfRemainingVectors << std::endl;
+ std::cerr << "upperClusters=" << upperClusters.size() << std::endl;
+ std::cerr << "total=" << ts << ":" << numOfLowerClusters << std::endl;
+ if (ts < numOfLowerClusters || numOfRemainingClusters != 0) {
+ std::cerr << "subclustering: Internal error! " << std::endl;
+ exit(1);
+ }
+
+ lowerClusters.resize(upperClusters.size());
+#pragma omp parallel for schedule(dynamic)
+ for (size_t idx = 0; idx < upperClusters.size(); idx++) {
+ std::vector<std::vector<float>> partialVectors;
+ partialVectors.reserve(upperClusters[idx].members.size());
+ std::vector<float> obj;
+#pragma omp critical
+ {
+ for (auto &m : upperClusters[idx].members) {
+ objectList.get(m.vectorID + 1, obj, &objectSpace);
+ partialVectors.push_back(obj);
+ }
+ }
+ if (upperClusters[idx].members.size() != partialVectors.size()) {
+ std::cerr << "the sizes of members are not consistent" << std::endl;
+ abort();
+ }
+ NGT::Clustering lowerClustering(initMode, NGT::Clustering::ClusteringTypeKmeansWithoutNGT, 1000);
+ lowerClustering.kmeans(partialVectors, nPartialClusters[idx], lowerClusters[idx]);
+ if (nPartialClusters[idx] != lowerClusters[idx].size()) {
+ std::cerr << "the sizes of cluster members are not consistent" << std::endl;
+ abort();
+ }
+ }
+ size_t nc = 0;
+ size_t mc = 0;
+ for (auto &cs : lowerClusters) {
+ nc += cs.size();
+ for (auto &c : cs) {
+ mc += c.members.size();
+ }
+ }
+ std::cerr << "# of clusters=" << nc << " # of members=" << mc << std::endl;
+ }
+ void subclustering(std::vector<NGT::Clustering::Cluster> &upperClusters, size_t numOfLowerClusters, size_t numOfObjects,
+ NGT::ObjectSpace &objectSpace, QBGObjectList &objectList,
+ NGT::Clustering::InitializationMode initMode, std::vector<NGT::Clustering::Cluster> &flatLowerClusters) {
+
+ std::vector<std::vector<NGT::Clustering::Cluster>> lowerClusters;
+ subclustering(upperClusters, numOfLowerClusters, numOfObjects, objectSpace, objectList, initMode, lowerClusters);
+
+ flattenClusters(upperClusters, lowerClusters, numOfLowerClusters, flatLowerClusters);
+
+ }
+
+#else
+ static void subclustering(std::vector<NGT::Clustering::Cluster> &upperClusters, size_t numOfLowerClusters, size_t numOfObjects,
+ NGT::ObjectSpace &objectSpace, QBGObjectList &objectList,
+ NGT::Clustering::InitializationMode initMode, std::vector<std::vector<NGT::Clustering::Cluster>> &lowerClusters) {
+ std::vector<size_t> nPartialClusters(upperClusters.size());
+ auto numOfRemainingClusters = numOfLowerClusters;
+ auto numOfRemainingVectors = numOfObjects;
+ size_t ts = 0;
+ for (size_t idx = 0; idx < upperClusters.size(); idx++) {
+ size_t ncs = round(static_cast<float>(upperClusters[idx].members.size()) / numOfRemainingVectors *
+ numOfRemainingClusters);
+ ncs = ncs == 0 ? 1 : ncs;
+ numOfRemainingVectors -= upperClusters[idx].members.size();
+ if (numOfRemainingClusters >= ncs) {
+ numOfRemainingClusters -= ncs;
+ }
+ nPartialClusters[idx] = ncs;
+ ts += ncs;
+ }
+
+ std::cerr << "numOfRemainingClusters=" << numOfRemainingClusters << std::endl;
+ std::cerr << "numOfRemainingVectors=" << numOfRemainingVectors << std::endl;
+ std::cerr << "upperClusters=" << upperClusters.size() << std::endl;
+ std::cerr << "total=" << ts << ":" << numOfLowerClusters << std::endl;
+ if (ts < numOfLowerClusters || numOfRemainingClusters != 0) {
+ std::cerr << "subclustering: Internal error! " << std::endl;
+ exit(1);
+ }
+
+ auto nthreads = omp_get_max_threads();
+ if (!objectList.openMultipleStreams(nthreads)) {
+ std::cerr << "Cannot open multiple streams." << std::endl;
+ abort();
+ }
+
+ lowerClusters.resize(upperClusters.size());
+#pragma omp parallel for schedule(dynamic)
+ for (size_t idx = 0; idx < upperClusters.size(); idx++) {
+ std::vector<std::vector<float>> partialVectors;
+ partialVectors.reserve(upperClusters[idx].members.size());
+ std::vector<float> obj;
+ auto threadid = omp_get_thread_num();
+ //#pragma omp critical
+ {
+ for (auto &m : upperClusters[idx].members) {
+ if (threadid >= nthreads) {
+ std::cerr << "inner fatal error. # of threads=" << nthreads << ":" << threadid << std::endl;
+ exit(1);
+ }
+ if (!objectList.get(threadid, m.vectorID + 1, obj, &objectSpace)) {
+ std::cerr << "subclustering: Fatal error! cannot get!!!! " << m.vectorID + 1 << std::endl;
+ abort();
+ }
+ partialVectors.push_back(obj);
+ }
+ }
+ if (upperClusters[idx].members.size() != partialVectors.size()) {
+ std::cerr << "the sizes of members are not consistent" << std::endl;
+ abort();
+ }
+ NGT::Clustering lowerClustering(initMode, NGT::Clustering::ClusteringTypeKmeansWithoutNGT, 1000);
+ lowerClustering.kmeans(partialVectors, nPartialClusters[idx], lowerClusters[idx]);
+ if (nPartialClusters[idx] != lowerClusters[idx].size()) {
+ std::cerr << "the sizes of cluster members are not consistent" << std::endl;
+ abort();
+ }
+ }
+ size_t nc = 0;
+ size_t mc = 0;
+ for (auto &cs : lowerClusters) {
+ nc += cs.size();
+ for (auto &c : cs) {
+ mc += c.members.size();
+ }
+ }
+ std::cerr << "# of clusters=" << nc << " # of members=" << mc << std::endl;
+ }
+
+ static void subclustering(std::vector<NGT::Clustering::Cluster> &upperClusters, size_t numOfLowerClusters, size_t numOfObjects,
+ NGT::ObjectSpace &objectSpace, QBGObjectList &objectList,
+ NGT::Clustering::InitializationMode initMode, std::vector<NGT::Clustering::Cluster> &flatLowerClusters) {
+
+ std::vector<std::vector<NGT::Clustering::Cluster>> lowerClusters;
+ subclustering(upperClusters, numOfLowerClusters, numOfObjects, objectSpace, objectList, initMode, lowerClusters);
+
+ flattenClusters(upperClusters, lowerClusters, numOfLowerClusters, flatLowerClusters);
+
+ }
+
+#endif
+
+
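+ // assign: exhaustively assigns each object in [beginID, endID] to its nearest cluster
+ // centroid (L2 distance against every centroid), parallelized over objects with OpenMP.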
+ static void assign(std::vector<NGT::Clustering::Cluster> &clusters, size_t beginID, size_t endID,
+ NGT::ObjectSpace &objectSpace, QBGObjectList &objectList) {
+
+#ifdef MULTIPLE_OBJECT_LISTS
+ if (!objectList.openMultipleStreams(omp_get_max_threads())) {
+ std::cerr << "Cannot open multiple streams." << std::endl;
+ abort();
+ }
+#endif
+
+ size_t count = 0;
+#pragma omp parallel for
+ for (size_t id = beginID; id <= endID; id++) {
+ std::vector<float> obj;
+ //#pragma omp critical
+#ifdef MULTIPLE_OBJECT_LISTS
+ objectList.get(omp_get_thread_num(), id, obj, &objectSpace);
+#else
+ objectList.get(id, obj, &objectSpace);
+#endif
+ float min = std::numeric_limits<float>::max();
+ int minidx = -1;
+ for (size_t cidx = 0; cidx != clusters.size(); cidx++) {
+ auto d = NGT::PrimitiveComparator::compareL2(reinterpret_cast<float*>(obj.data()),
+ clusters[cidx].centroid.data(), obj.size());
+ if (d < min) {
+ min = d;
+ minidx = cidx;
+ }
+ }
+ if (minidx < 0) {
+ std::cerr << "assign: Fatal error!" << std::endl;
+ abort();
+ }
+#pragma omp critical
+ {
+ clusters[minidx].members.push_back(NGT::Clustering::Entry(id - 1, minidx, min));
+ count++;
+ if (count % 1000000 == 0) {
+ std::cerr << "# of assigned objects=" << count << std::endl;
+ }
+ }
+ }
+
+ }
+
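+ // assignWithNGT: builds a temporary NGT graph index over the cluster centroids and assigns
+ // each object in [beginID, endID] to a centroid via approximate nearest-neighbor search,
+ // avoiding the exhaustive scan performed by assign().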
+ static void assignWithNGT(std::vector<NGT::Clustering::Cluster> &clusters, size_t beginID, size_t endID,
+ NGT::ObjectSpace &objectSpace, QBGObjectList &objectList) {
+ if (beginID > endID) {
+ std::cerr << "assignWithNGT::Warning. beginID:" << beginID << " > endID:" << endID << std::endl;
+ return;
+ }
+
+ NGT::Property prop;
+ prop.dimension = objectSpace.getDimension();
+ prop.objectType = NGT::Index::Property::ObjectType::Float;
+ prop.distanceType = NGT::Property::DistanceType::DistanceTypeL2;
+ prop.edgeSizeForCreation = 10;
+ prop.edgeSizeForSearch = 40;
+
+#ifdef NGT_SHARED_MEMORY_ALLOCATOR
+ NGT::Index index(prop, "dummy");
+#else
+ NGT::Index index(prop);
+#endif
+ for (size_t cidx = 0; cidx < clusters.size(); cidx++) {
+ if (cidx % 100000 == 0) {
+ std::cerr << "# of appended cluster objects=" << cidx << std::endl;
+ }
+ index.append(clusters[cidx].centroid);
+ }
+ std::cerr << "createIndex..." << std::endl;
+ index.createIndex(500);
+
+ std::cerr << "assign with NGT..." << std::endl;
+ endID++;
+#ifdef MULTIPLE_OBJECT_LISTS
+ if (!objectList.openMultipleStreams(omp_get_max_threads())) {
+ std::cerr << "Cannot open multiple streams." << std::endl;
+ abort();
+ }
+#endif
+ std::vector<std::pair<uint32_t, float>> clusterIDs(endID - beginID);
+#pragma omp parallel for
+ for (size_t id = beginID; id < endID; id++) {
+ std::vector<float> obj;
+#ifdef MULTIPLE_OBJECT_LISTS
+ objectList.get(omp_get_thread_num(), id, obj, &objectSpace);
+#else
+ objectList.get(id, obj, &objectSpace);
+#endif
+ NGT::SearchQuery sc(obj);
+ NGT::ObjectDistances objects;
+ sc.setResults(&objects);
+ sc.setSize(10);
+ sc.setEpsilon(0.12);
+ index.search(sc);
+ //index.linearSearch(sc);
+ clusterIDs[id - beginID] = make_pair(objects[0].id - 1, objects[0].distance);
+ }
+ std::cerr << "pushing..." << std::endl;
+ for (size_t id = beginID; id < endID; id++) {
+ auto cid = clusterIDs[id - beginID].first;
+ auto cdistance = clusterIDs[id - beginID].second;
+ clusters[cid].members.push_back(NGT::Clustering::Entry(id - 1, cid, cdistance));
+ }
+ }
+
+#ifdef NGTQ_QBG
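+ // treeBasedTopdownClustering: feeds objects in batches of 100,000 into hierarchicalKmeansBatch
+ // to grow a hierarchical k-means tree, then, when numOfTotalClusters/numOfTotalBlobs are set,
+ // reclusters the resulting leaves to produce the quantization centroids and blobs.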
+ void treeBasedTopdownClustering(std::string prefix, QBG::Index &index, uint32_t rootID, std::vector<float> &object, std::vector<HKNode*> &nodes, NGT::Clustering &clustering) {
+ auto &quantizer = static_cast<NGTQ::QuantizerInstance<uint8_t>&>(index.getQuantizer());
+ auto &objectSpace = quantizer.globalCodebookIndex.getObjectSpace();
+ QBGObjectList &objectList = quantizer.objectList;
+ NGT::Timer timer;
+ timer.start();
+ std::vector<uint32_t> batch;
+ std::vector<std::pair<uint32_t, size_t>> exceededLeaves;
+ size_t nleaves = 1;
+ size_t nOfThreads = 32;
+ for (size_t id = 1; id <= numOfObjects; id++) {
+ if (id % (numOfObjects / 100) == 0) {
+ timer.stop();
+ std::cerr << "# of processed objects=" << id << " " << id * 100 / numOfObjects << "% " << timer << " # of leaves=" << nleaves << std::endl;
+ timer.start();
+ }
+ batch.push_back(id);
+ if (batch.size() > 100000) {
+ size_t kmeansBatchSize = nleaves < nOfThreads ? nleaves : nOfThreads;
+ hierarchicalKmeansBatch(batch, exceededLeaves, rootID, object, objectList, objectSpace, nodes,
+ clustering, maxSize, nleaves, kmeansBatchSize);
+
+ }
+ }
+ hierarchicalKmeansBatch(batch, exceededLeaves, rootID, object, objectList, objectSpace, nodes,
+ clustering, maxSize, nleaves, 0);
+
+ if (numOfTotalClusters != 0) {
+ NGT::Timer timer;
+ timer.start();
+ size_t numOfLeaves = 0;
+ for (auto node : nodes) {
+ if (node->leaf) {
+ numOfLeaves++;
+ }
+ }
+ std::cerr << "# of nodes=" << nodes.size() << std::endl;
+ std::cerr << "# of leaves=" << numOfLeaves << std::endl;
+ std::cerr << "clustering for quantization." << std::endl;
+ hierarchicalKmeansWithNumberOfClustersInParallel(numOfTotalClusters, numOfObjects, numOfLeaves,
+ objectList, objectSpace, nodes, initMode);
+ if (numOfTotalBlobs != 0) {
+ NGT::Timer timer;
+ timer.start();
+ size_t numOfLeaves = 0;
+ for (auto node : nodes) {
+ if (node->leaf) {
+ numOfLeaves++;
+ }
+ }
+ std::cerr << "# of leaves=" << numOfLeaves << ":" << numOfTotalClusters << std::endl;
+ if (numOfLeaves != numOfTotalClusters) {
+ std::cerr << "# of leaves is invalid " << numOfLeaves << ":" << numOfTotalClusters << std::endl;
+ abort();
+ }
+ {
+ std::ofstream of(prefix + "_qcentroid.tsv");
+ extractCentroids(of, nodes);
+ }
+ std::vector<uint32_t> qNodeIDs;
+ for (uint32_t nid = 0; nid < nodes.size(); nid++) {
+ if (nodes[nid]->leaf) {
+ qNodeIDs.push_back(nid);
+ }
+ }
+ std::cerr << "clustering to make blobs." << std::endl;
+ hierarchicalKmeansWithNumberOfClustersInParallel(numOfTotalBlobs, numOfObjects, numOfTotalClusters,
+ objectList, objectSpace, nodes, initMode);
+ {
+ std::ofstream of(prefix + "_btoq_index.tsv");
+ extractBtoQIndex(of, nodes, qNodeIDs);
+ }
+ }
+ }
+
+ }
+
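+ // multilayerClustering: three-layer pipeline. The first numOfFirstObjects objects are clustered
+ // into numOfFirstClusters by k-means; objects up to numOfSecondObjects are assigned to those
+ // clusters and subclustered into numOfSecondClusters; the remaining objects are assigned with an
+ // NGT index and subclustered into numOfThirdClusters. Results are written to *_qcentroid.tsv,
+ // *_bqindex.tsv, *_centroid.tsv, and *_index.tsv under the given prefix.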
+ void multilayerClustering(std::string prefix, QBG::Index &index) {
+ auto &quantizer = static_cast<NGTQ::QuantizerInstance<uint8_t>&>(index.getQuantizer());
+ auto &objectSpace = quantizer.globalCodebookIndex.getObjectSpace();
+ {
+ std::cerr << "Three layer clustering..." << std::endl;
+ std::cerr << "HiearchicalKmeans::clustering: # of clusters=" << numOfThirdClusters << ":" << index.getQuantizer().property.globalCentroidLimit << std::endl;
+ if (index.getQuantizer().objectList.size() <= 1) {
+ NGTThrowException("optimize: No objects");
+ }
+ if (numOfThirdClusters == 0) {
+ if (index.getQuantizer().property.globalCentroidLimit == 0) {
+ numOfThirdClusters = index.getQuantizer().objectList.size() / 1000;
+ numOfThirdClusters = numOfThirdClusters == 0 ? 1 : numOfThirdClusters;
+ numOfThirdClusters = numOfThirdClusters > 1000000 ? 1000000 : numOfThirdClusters;
+ } else {
+ numOfThirdClusters = index.getQuantizer().property.globalCentroidLimit;
+ }
+ }
+ if (numOfThirdClusters != 0 && index.getQuantizer().property.globalCentroidLimit != 0 &&
+ numOfThirdClusters != index.getQuantizer().property.globalCentroidLimit) {
+ }
+ auto &quantizer = static_cast<NGTQ::QuantizerInstance<uint8_t>&>(index.getQuantizer());
+ QBGObjectList &objectList = quantizer.objectList;
+ if (numOfObjects == 0) {
+ numOfObjects = objectList.size() - 1;
+ }
+
+ std::cerr << "The first layer. " << numOfFirstClusters << ":" << numOfFirstObjects << std::endl;
+ if (numOfThirdClusters == 0 || numOfObjects == 0) {
+ NGTThrowException("numOfThirdClusters or numOfObjects are zero");
+ }
+ numOfSecondClusters = numOfSecondClusters == 0 ? numOfThirdClusters : numOfSecondClusters;
+ numOfFirstClusters = numOfFirstClusters == 0 ? static_cast<size_t>(sqrt(numOfSecondClusters)) : numOfFirstClusters;
+ numOfSecondObjects = numOfSecondClusters * 100;
+ numOfSecondObjects = numOfSecondObjects > numOfObjects ? numOfObjects : numOfSecondObjects;
+ numOfFirstObjects = numOfFirstClusters * 2000;
+ numOfFirstObjects = numOfFirstObjects > numOfSecondObjects ? numOfSecondObjects : numOfFirstObjects;
+ if (numOfFirstObjects < numOfFirstClusters) {
+ std::stringstream msg;
+ msg << "# of objects for the first should be larger than # of the first clusters. " << numOfFirstObjects << ":" << numOfFirstClusters;
+ NGTThrowException(msg);
+ }
+ if (numOfFirstClusters > numOfSecondClusters) {
+ std::stringstream msg;
+ msg << "# of the first clusters should be larger than or equal to # of the second clusters. " << numOfFirstClusters << ":" << numOfSecondClusters;
+ NGTThrowException(msg);
+ }
+ if (numOfSecondClusters > numOfThirdClusters) {
+ std::stringstream msg;
+ msg << "# of the second clusters should be larger than or equal to # of the second clusters. " << numOfSecondClusters << ":" << numOfThirdClusters;
+ NGTThrowException(msg);
+ }
+ if (numOfFirstClusters > numOfSecondClusters) {
+ std::stringstream msg;
+ msg << "# of the second clusters should be larger than # of the first clusters. " << numOfFirstClusters << ":" << numOfSecondClusters;
+ NGTThrowException(msg);
+ }
+
+ std::cerr << "Three layer clustering:" << numOfFirstClusters << ":" << numOfFirstObjects << "," << numOfSecondClusters << ":" << numOfSecondObjects << "," << numOfThirdClusters << ":" << numOfObjects << std::endl;
+
+ NGT::Clustering firstClustering(initMode, NGT::Clustering::ClusteringTypeKmeansWithoutNGT, 300);
+ float clusterSizeConstraint = 5.0;
+ firstClustering.setClusterSizeConstraintCoefficient(clusterSizeConstraint);
+ std::cerr << "size constraint=" << clusterSizeConstraint << std::endl;
+ std::vector<std::vector<float>> vectors;
+ vectors.reserve(numOfFirstObjects);
+ std::vector<float> obj;
+ for (size_t id = 1; id <= numOfFirstObjects; id++) {
+ if (id % 1000000 == 0) {
+ std::cerr << "# of prcessed objects is " << id << std::endl;
+ }
+ if (!objectList.get(id, obj, &objectSpace)) {
+ std::stringstream msg;
+ msg << "qbg: Cannot get object. ID=" << id;
+ NGTThrowException(msg);
+ }
+ vectors.push_back(obj);
+ }
+ std::cerr << "Kmeans... " << vectors.size() << " to " << numOfFirstClusters << std::endl;
+ std::vector<NGT::Clustering::Cluster> firstClusters;
+ NGT::Timer timer;
+
+ timer.start();
+ firstClustering.kmeans(vectors, numOfFirstClusters, firstClusters);
+ timer.stop();
+ std::cerr << "# of clusters=" << firstClusters.size() << " time=" << timer << std::endl;
+
+ std::vector<std::vector<float>> otherVectors;
+ timer.start();
+ std::cerr << "Assign for the second. (" << numOfFirstObjects << "-" << numOfSecondObjects << ")..." << std::endl;
+ assign(firstClusters, numOfFirstObjects + 1, numOfSecondObjects, objectSpace, objectList);
+ timer.stop();
+ std::cerr << "Assign(1) time=" << timer << std::endl;
+
+ std::cerr << "subclustering for the second." << std::endl;
+ std::vector<NGT::Clustering::Cluster> secondClusters;
+ timer.start();
+ subclustering(firstClusters, numOfSecondClusters, numOfSecondObjects, objectSpace, objectList, initMode, secondClusters);
+ timer.stop();
+ std::cerr << "subclustering(1) time=" << timer << std::endl;
+ std::cerr << "save quantization centroid" << std::endl;
+ NGT::Clustering::saveClusters(prefix + "_qcentroid.tsv", secondClusters);
+ timer.start();
+ std::cerr << "Assign for the third. (" << numOfSecondObjects << "-" << numOfObjects << ")..." << std::endl;
+ assignWithNGT(secondClusters, numOfSecondObjects + 1, numOfObjects, objectSpace, objectList);
+ timer.stop();
+ std::cerr << "Assign(2) time=" << timer << std::endl;
+ std::cerr << "subclustering for the third." << std::endl;
+ std::vector<std::vector<NGT::Clustering::Cluster>> thirdClusters;
+ timer.start();
+ subclustering(secondClusters, numOfThirdClusters, numOfObjects, objectSpace, objectList, initMode, thirdClusters);
+ timer.stop();
+ std::cerr << "subclustering(2) time=" << timer << std::endl;
+ {
+ std::vector<size_t> bqindex;
+ for (size_t idx1 = 0; idx1 < thirdClusters.size(); idx1++) {
+ for (size_t idx2 = 0; idx2 < thirdClusters[idx1].size(); idx2++) {
+ bqindex.push_back(idx1);
+ }
+ }
+ std::cerr << "save bqindex..." << std::endl;
+ NGT::Clustering::saveVector(prefix + "_bqindex.tsv", bqindex);
+ }
+
+ std::vector<NGT::Clustering::Cluster> thirdFlatClusters;
+ flattenClusters(secondClusters, thirdClusters, numOfThirdClusters, thirdFlatClusters);
+
+ std::cerr << "save centroid..." << std::endl;
+ NGT::Clustering::saveClusters(prefix + "_centroid.tsv", thirdFlatClusters);
+
+ {
+ std::vector<size_t> cindex(numOfObjects);
+ for (size_t cidx = 0; cidx < thirdFlatClusters.size(); cidx++) {
+ for (auto mit = thirdFlatClusters[cidx].members.begin(); mit != thirdFlatClusters[cidx].members.end(); ++mit) {
+ size_t vid = (*mit).vectorID;
+ cindex[vid] = cidx;
+ }
+ }
+ std::cerr << "save index... " << cindex.size() << std::endl;
+ NGT::Clustering::saveVector(prefix + "_index.tsv", cindex);
+ }
+ std::cerr << "end of clustering" << std::endl;
+ return;
+ }
+
+ }
+
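+ // clustering: entry point. With threeLayerClustering it runs multilayerClustering; otherwise it
+ // performs tree-based top-down hierarchical k-means (optionally restricted to the objects listed
+ // in objectIDsFile) and writes the centroid/index TSV files under the given prefix.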
+ void clustering(std::string indexPath, std::string prefix = "", std::string objectIDsFile = "") {
+ NGT::StdOstreamRedirector redirector(silence);
+ redirector.begin();
+
+ QBG::Index index(indexPath, true);
+ if (threeLayerClustering) {
+
+ if (prefix.empty()) {
+ std::cerr << "Prefix is not specified." << std::endl;
+ prefix = indexPath + "/" + QBG::Index::getWorkspaceName();
+ try {
+ NGT::Index::mkdir(prefix);
+ } catch(...) {}
+ prefix +="/kmeans-cluster";
+ std::cerr << prefix << " is used" << std::endl;
+ }
+ auto &quantizer = static_cast<NGTQ::QuantizerInstance<uint8_t>&>(index.getQuantizer());
+ auto &objectSpace = quantizer.globalCodebookIndex.getObjectSpace();
+ size_t paddedDimension = objectSpace.getPaddedDimension();
+ size_t dimension = objectSpace.getDimension();
+ if (paddedDimension != dimension) {
+ std::cerr << "HierarachicalKmeans: Warning! Dimensions are inconsistent. Dimension=" << paddedDimension << ":" << dimension << std::endl;
+ }
+ multilayerClustering(prefix, index);
+ redirector.end();
+ return;
+ }
+
+ NGT::Clustering::ClusteringType clusteringType = NGT::Clustering::ClusteringTypeKmeansWithoutNGT;
+
+ uint32_t rootID = 0;
+ std::vector<HKNode*> nodes;
+ nodes.push_back(new HKLeafNode);
+
+ std::vector<float> object;
+ size_t iteration = 1000;
+ NGT::Clustering clustering(initMode, clusteringType, iteration, numOfClusters);
+ auto &quantizer = static_cast<NGTQ::QuantizerInstance<uint8_t>&>(index.getQuantizer());
+ QBGObjectList &objectList = quantizer.objectList;
+ if (objectIDsFile.empty()) {
+ treeBasedTopdownClustering(prefix, index, rootID, object, nodes, clustering);
+ } else {
+ std::cerr << "Cluster ID=" << clusterID << std::endl;
+ if (clusterID < 0) {
+ std::stringstream msg;
+ msg << "Any target cluster ID is not specified.";
+ NGTThrowException(msg);
+ }
+ std::ifstream objectIDs(objectIDsFile);
+ if (!objectIDs) {
+ std::stringstream msg;
+ msg << "Cannot open the object id file. " << objectIDsFile;
+ NGTThrowException(msg);
+ }
+ auto &objectSpace = quantizer.globalCodebookIndex.getObjectSpace();
+ uint32_t id = 1;
+ int32_t cid;
+ size_t ccount = 0;
+ while (objectIDs >> cid) {
+ std::cerr << cid << std::endl;
+ if (id % 100000 == 0) {
+ std::cerr << "# of processed objects=" << id << std::endl;
+ }
+ if (cid == -1) {
+ continue;
+ }
+ if (cid == clusterID) {
+ ccount++;
+ hierarchicalKmeans(id, rootID, object, objectList, objectSpace, nodes, clustering, maxSize);
+ }
+ id++;
+ }
+ }
+ size_t objectCount = 0;
+ if (prefix.empty()) {
+ objectCount = extractCentroids(std::cout, nodes);
+ } else {
+ {
+ std::ofstream of(prefix + "_centroid.tsv");
+ objectCount = extractCentroids(of, nodes);
+ }
+ {
+ std::ofstream of(prefix + "_index.tsv");
+ extractIndex(of, nodes, numOfObjects);
+ }
+ if (numOfFirstObjects > 0) {
+ std::ofstream btoqof(prefix + "_btoq.tsv");
+ std::ofstream qcof(prefix + "_qcentroid.tsv");
+ extractBtoQAndQCentroid(btoqof, qcof, nodes, numOfThirdClusters);
+ }
+ if (numOfRandomObjects > 0) {
+ std::ofstream of(prefix + "_random_object.tsv");
+ if (extractCentroid) {
+ extractRandomObjectsFromEachBlob(of, nodes, numOfObjects, numOfRandomObjects - 1, quantizer, extractCentroid);
+ } else {
+ extractRandomObjectsFromEachBlob(of, nodes, numOfObjects, numOfRandomObjects, quantizer, extractCentroid);
+ }
+ }
+ }
+ if (objectCount != numOfObjects) {
+ std::cerr << "# of objects is invalid. " << objectCount << ":" << numOfObjects << std::endl;
+ }
+ redirector.end();
+ }
+#endif
+
+ size_t maxSize;
+ size_t numOfObjects;
+ size_t numOfClusters;
+ size_t numOfTotalClusters;
+ size_t numOfTotalBlobs;
+ int32_t clusterID;
+
+ NGT::Clustering::InitializationMode initMode;
+
+ size_t numOfRandomObjects;
+
+ size_t numOfFirstObjects;
+ size_t numOfFirstClusters;
+ size_t numOfSecondObjects;
+ size_t numOfSecondClusters;
+ size_t numOfThirdClusters;
+ bool extractCentroid;
+
+ bool threeLayerClustering;
+ bool silence;
+ };
+}
diff --git a/lib/NGT/NGTQ/Matrix.h b/lib/NGT/NGTQ/Matrix.h
new file mode 100644
index 0000000..042968a
--- /dev/null
+++ b/lib/NGT/NGTQ/Matrix.h
@@ -0,0 +1,687 @@
+//
+// Copyright (C) 2021 Yahoo Japan Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <random>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+extern "C" {
+ // svd
+ void dgesvd_(char* jobu, char* jobvt, int* m, int* n, double* a,
+ int* lda, double* s, double* u, int* ldu, double* vt, int* ldvt,
+ double* work, int* lwork, int* info);
+ void sgesvd_(char* jobu, char* jobvt, int* m, int* n, float* a,
+ int* lda, float* s, float* u, int* ldu, float* vt, int* ldvt,
+ float* work, int* lwork, int* info);
+ // multiplication
+ void dgemm_(char *transa, char *transb, int *m, int *n, int *k,
+ double *alpha, double *a, int *lda, double *b, int *ldb,
+ double *beta , double *c, int *ldc);
+ void sgemm_(char *transa, char *transb, int *m, int *n, int *k,
+ float *alpha, float *a, int *lda, float *b, int *ldb,
+ float *beta , float *c, int *ldc);
+ // {D|S}GEQRF computes a QR factorization of a M-by-N matrix A: A = Q * R.
+ // {D|S}ORGQR return Q = H(1) H(2) . . . H(k) as returned by {D|S}GEQRF.
+ void dgeqrf_(int *m, int *n, double *a, int *lda, double *tau, double *work, int *lwork, int *info);
+ void dorgqr_(int *m, int *n, int *k, double *a, int *lda, double *tau, double *work, int *lwork, int *info);
+
+ void sgeqrf_(int *m, int *n, float *a, int *lda, float *tau, float *work, int *lwork, int *info);
+ void sorgqr_(int *m, int *n, int *k, float *a, int *lda, float *tau, float *work, int *lwork, int *info);
+}
+
+
+
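+// Matrix stores its elements column-major when BLAS/LAPACK is enabled (NGT_DISABLE_BLAS not
+// defined), matching the Fortran routines declared above, and row-major otherwise.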
+template <class T> class Matrix {
+public:
+ Matrix(size_t r = 0, size_t c = 0, const float *v = 0): row(r), col(c), matrix(0) { construct(r, c, v); }
+ Matrix(const Matrix &m): row(0), col(0), matrix(0) { *this = m; }
+ Matrix(const std::vector<std::vector<T>> &v): row(0), col(0), matrix(0) { construct(v); }
+
+ ~Matrix() { delete[] matrix; }
+
+ Matrix &operator=(const Matrix &m) {
+ allocate(m.row, m.col);
+ std::memcpy(matrix, m.matrix, row * col * sizeof(T));
+ return *this;
+ }
+
+ void construct(size_t r, size_t c, const double *v) {
+ allocate(r, c);
+ set(v);
+ }
+
+ void construct(size_t r, size_t c, const float *v) {
+ allocate(r, c);
+ set(v);
+ }
+
+ void construct(const std::vector<std::vector<T>> &v) {
+#if !defined(NGT_DISABLE_BLAS)
+ allocate(v[0].size(), v.size());
+#else
+ allocate(v.size(), v[0].size());
+#endif
+ set(v);
+ }
+
+ void allocate(size_t r, size_t c) {
+ if (matrix != 0) {
+ delete[] matrix;
+ }
+ row = r;
+ col = c;
+ if (r == 0 && c == 0) {
+ matrix = 0;
+ } else {
+ matrix = new T[r * c];
+ }
+ }
+
+ bool isEmpty() { return (col == 0) && (row == 0); }
+
+ static void
+ tokenize(const std::string &str, std::vector<std::string> &token, const std::string seps) {
+ std::string::size_type current = 0;
+ std::string::size_type next;
+ while ((next = str.find_first_of(seps, current)) != std::string::npos) {
+ token.push_back(str.substr(current, next - current));
+ current = next + 1;
+ }
+ std::string t = str.substr(current);
+ token.push_back(t);
+ }
+
+ void set(const double *v) {
+ if (v == 0) {
+ return;
+ }
+ size_t l = row * col;
+ for (size_t p = 0; p < l; p++) {
+ matrix[p] = *v++;
+ }
+ }
+
+ void set(const float *v) {
+ if (v == 0) {
+ return;
+ }
+ size_t l = row * col;
+ for (size_t p = 0; p < l; p++) {
+ matrix[p] = *v++;
+ }
+ }
+
+ void set(const std::vector<std::vector<T>> &v) {
+ T *m = matrix;
+#if !defined(NGT_DISABLE_BLAS)
+ assert(row == v[0].size());
+ assert(col == v.size());
+ for (size_t c = 0; c < col; c++) {
+ for (size_t r = 0; r < row; r++) {
+ *m++ = v[c][r];
+ }
+ }
+#else
+ assert(row == v.size());
+ assert(col == v[0].size());
+ for (size_t r = 0; r < row; r++) {
+ for (size_t c = 0; c < col; c++) {
+ *m++ = v[r][c];
+ }
+ }
+#endif
+ }
+
+ void set(size_t pr, size_t pc, T v) {
+#if !defined(NGT_DISABLE_BLAS)
+ matrix[pc * row + pr] = v;
+#else
+ matrix[pr * col + pc] = v;
+#endif
+ }
+
+ void put(size_t pr, size_t pc, const Matrix &m) {
+ for (size_t r = 0; r < m.row; r++) {
+ if (pr + r < row) {
+ for (size_t c = 0; c < m.col; c++) {
+ if (pc + c < col) {
+ matrix[(pr + r) * col + (pc + c)] = m.matrix[r * m.col + c];
+ }
+ }
+ }
+ }
+ }
+
+ void horzcat(const Matrix &m){
+ assert(row == m.row);
+ size_t nc = col + m.col;
+ T *mtx = new T[row * nc];
+ for (size_t r = 0; r < row; r++) {
+ for (size_t c = 0; c < col; c++) {
+ mtx[r * nc + c] = matrix[r * col + c];
+ }
+ }
+ delete[] matrix;
+ matrix = mtx;
+ size_t pc = col;
+ col = nc;
+ put(0, pc, m); // copy m into the newly appended columns
+ }
+
+ void vert(const Matrix &m) {
+ if (row == 0 && col == 0) {
+ construct(m.row, m.col, m.matrix);
+ return;
+ }
+ assert(col == m.col);
+ size_t nr = row + m.row;
+ T *mtx = new T[nr * col];
+ for (size_t r = 0; r < row; r++) {
+ for (size_t c = 0; c < col; c++) {
+ mtx[r * col + c] = matrix[r * col + c];
+ }
+ }
+ delete[] matrix;
+ matrix = mtx;
+ size_t pr = row;
+ row = nr;
+ put(pr, 0, m); // copy m into the newly appended rows
+ }
+
+ void zero(size_t r, size_t c = 0) {
+ allocate(r, c);
+ zero();
+ }
+
+ void zero() {
+ size_t l = row * col;
+ for (size_t p = 0; p < l; p++) {
+ matrix[p] = 0.0;
+ }
+ }
+
+ void random(size_t r, size_t c = 0) {
+ allocate(r, c);
+ random();
+ }
+
+ void random() {
+ std::random_device seed_gen;
+ std::mt19937 engine(seed_gen());
+
+ auto maxNum = engine.max();
+ size_t l = row * col;
+ for (size_t p = 0; p < l; p++) {
+ auto randomNum = engine();
+ matrix[p] = static_cast<T>(randomNum) / maxNum;
+ }
+ }
+
+ static void random(std::vector &a) {
+ std::random_device seed_gen;
+ std::mt19937 engine(seed_gen());
+ auto maxNum = engine.max();
+ for (auto &i : a) {
+ auto randomNum = engine();
+ i = static_cast<T>(randomNum) / maxNum;
+ }
+ }
+
+ void transpose() {
+ T *m = new T[col * row];
+ T *msrc = matrix;
+ size_t nr = col;
+ size_t nc = row;
+#if !defined(NGT_DISABLE_BLAS)
+ for (size_t r = 0; r < nr; r++) {
+ for (size_t c = 0; c < nc; c++) {
+ m[c * nr + r] = *msrc++;
+ //std::cerr << r * nc + c << std::endl;
+ }
+ }
+#else
+ for (size_t c = 0; c < nc; c++) {
+ for (size_t r = 0; r < nr; r++) {
+ m[r * nc + c] = *msrc++;
+ //std::cerr << r * nc + c << std::endl;
+ }
+ }
+#endif
+ row = nr;
+ col = nc;
+ delete[] matrix;
+ matrix = m;
+ }
+
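+ // With BLAS enabled, mul()/mulBlas() delegate the multiplication to {s|d}gemm; when the
+ // transpose flag is set the right-hand operand is applied as its transpose (this = this * mtx^T).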
+#if !defined(NGT_DISABLE_BLAS)
+ void mul(const std::vector<std::vector<T>> &v) {
+ Matrix m(v);
+ mulBlas(m, true);
+ }
+
+
+ void mul(const Matrix &mtx) {
+ mulBlas(mtx);
+ }
+
+
+ void mulBlas(const Matrix &mtx, bool transpose = false) {
+ char transa = 'N';
+ char transb = 'N';
+ int m = row;
+ int n = mtx.col;
+ int k = col;
+ if (transpose) {
+ transb = 'T';
+ if (row != mtx.row) {
+ std::cerr << "mul:" << row << "x" << mtx.row << std::endl;
+ }
+ assert(row == mtx.row);
+ n = mtx.row;
+ row = m;
+ col = mtx.row;
+ } else {
+ if (col != mtx.row) {
+ std::cerr << "mul:" << col << "x" << mtx.row << std::endl;
+ }
+ assert(col == mtx.row);
+ row = m;
+ col = n;
+ }
+ float alpha = 1.0;
+ float beta = 0.0;
+ T *tmpmtx = new T[m * n];
+ if (transpose) {
+ int ldb = mtx.row;
+ gemm(&transa, &transb, &m, &n, &k, &alpha, matrix, &m, mtx.matrix, &ldb, &beta, tmpmtx, &m);
+ } else {
+ gemm(&transa, &transb, &m, &n, &k, &alpha, matrix, &m, mtx.matrix, &k, &beta, tmpmtx, &m);
+ }
+ delete[] matrix;
+ matrix = tmpmtx;
+ }
+#else
+ void mul(const Matrix &mtx) {
+ mulNaive(mtx);
+ }
+#endif
+
+
+ void mulNaive(const Matrix &mtx) {
+#ifdef MATRIX_TRACE
+ cerr << row << "x" << col << " mtx=" << mtx.row << "x" << mtx.col << std::endl;
+ std::cerr << mtx << std::endl;
+#endif
+ if (col != mtx.row) {
+ std::cerr << "mul:" << col << "x" << mtx.row << std::endl;
+ }
+ assert(col == mtx.row);
+ size_t nr = row;
+ size_t nc = mtx.col;
+ T *tmpmtx = new T[nr * nc];
+ for (size_t r = 0; r < nr; r++) {
+ for (size_t c = 0; c < nc; c++) {
+#if !defined(NGT_DISABLE_BLAS)
+ T &sum = tmpmtx[c * nr + r];
+ sum = 0;
+ for (size_t p = 0; p < col; p++) {
+ sum += matrix[p * row + r] * mtx.matrix[c * mtx.row + p];
+ }
+#else
+ T &sum = tmpmtx[r * nc + c];
+ sum = 0;
+ for (size_t p = 0; p < col; p++) {
+ sum += matrix[r * col + p] * mtx.matrix[p * mtx.col + c];
+ }
+#endif
+ }
+ }
+ row = nr;
+ col = nc;
+ delete[] matrix;
+ matrix = tmpmtx;
+ }
+
+ void diag(const Matrix &m) {
+ if (m.row != 1 && m.col != 1) {
+ std::cerr << "Error : not vector. " << m.row << "x" << m.col << std::endl;
+ return;
+ }
+ size_t length = m.row > m.col ? m.row : m.col;
+ zero(length, length);
+ for (size_t i = 0; i < length; i++) {
+#if !defined(NGT_DISABLE_BLAS)
+ matrix[i * row + i] = m.matrix[i];
+#else
+ matrix[i * col + i] = m.matrix[i];
+#endif
+ }
+ }
+
+ void reshape(size_t r, size_t c) {
+ if (r == row && c == col) {
+ return;
+ }
+ size_t l = r * c;
+ T *m = new T[l];
+ for (size_t i = 0; i < l; i++) {
+ m[i] = 0.0;
+ }
+#if !defined(NGT_DISABLE_BLAS)
+ for (size_t sc = 0; sc < col; sc++) {
+ for (size_t sr = 0; sr < row; sr++) {
+ m[sc * r + sr] = matrix[sc * row + sr];
+ }
+ }
+#else
+ for (size_t sr = 0; sr < row; sr++) {
+ for (size_t sc = 0; sc < col; sc++) {
+ m[sr * c + sc] = matrix[sr * col + sc];
+ }
+ }
+#endif
+ row = r;
+ col = c;
+ delete[] matrix;
+ matrix = m;
+ }
+
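+ // mulSquare: multiplies the vector a in place by the square matrix b (a <- a * b), honoring the
+ // row- or column-major layout selected at build time.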
+ static void mulSquare(std::vector<T> &a, Matrix &b) {
+ if (b.col != b.row) {
+ std::stringstream msg;
+ msg << "mulSquare : Invalid # of cols and rows. " << b.col << ":" << b.row << std::endl;
+ throw std::runtime_error(msg.str().c_str());
+ }
+ if (a.size() != b.row) {
+ std::stringstream msg;
+ msg << "mulSquare : Invalid # of rows and size. " << a.size() << ":" << b.row << std::endl;
+ throw std::runtime_error(msg.str().c_str());
+ }
+
+ std::vector<T> vec;
+#if !defined(NGT_DISABLE_BLAS)
+ for (size_t c = 0; c < a.size(); c++) {
+ T sum = 0;
+ for (size_t p = 0; p < b.col; p++) {
+ sum += a[p] * b.matrix[c * b.row + p];
+ }
+ vec.push_back(sum);
+ }
+#else
+ for (size_t c = 0; c < a.size(); c++) {
+ T sum = 0;
+ for (size_t p = 0; p < b.col; p++) {
+ sum += a[p] * b.matrix[p * b.col + c];
+ }
+ vec.push_back(sum);
+ }
+#endif
+ a = vec;
+
+ }
+
+ static void mulSquare(std::vector<std::vector<T>> &a, Matrix &b) {
+ assert(b.col == b.row);
+ for (size_t r = 0; r < a.size(); r++) {
+ mulSquare(a[r], b);
+ }
+ }
+
+ static void gesvd(char* jobu, char* jobvt, int* m, int* n, double* a,
+ int* lda, double* s, double* u, int* ldu, double* vt, int* ldvt,
+ double* work, int* lwork, int* info) {
+ dgesvd_(jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, work, lwork, info);
+ }
+
+ static void gesvd(char* jobu, char* jobvt, int* m, int* n, float* a,
+ int* lda, float* s, float* u, int* ldu, float* vt, int* ldvt,
+ float* work, int* lwork, int* info) {
+ sgesvd_(jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, work, lwork, info);
+ }
+
+ static void gemm(char *transa, char *transb, int *m, int *n, int *k,
+ double *alpha, double *a, int *lda, double *b, int *ldb,
+ double *beta , double *c, int *ldc) {
+ dgemm_(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta , c, ldc);
+ }
+
+ static void gemm(char *transa, char *transb, int *m, int *n, int *k,
+ float *alpha, float *a, int *lda, float *b, int *ldb,
+ float *beta , float *c, int *ldc) {
+ sgemm_(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta , c, ldc);
+ }
+
+ static void geqrf(int *m, int *n, double *a, int *lda, double *tau, double *work, int *lwork, int *info) {
+ dgeqrf_(m, n, a, lda, tau, work, lwork, info);
+ }
+
+ static void geqrf(int *m, int *n, float *a, int *lda, float *tau, float *work, int *lwork, int *info) {
+ sgeqrf_(m, n, a, lda, tau, work, lwork, info);
+ }
+
+ static void orgqr(int *m, int *n, int *k, double *a, int *lda, double *tau, double *work, int *lwork, int *info) {
+ dorgqr_(m, n, k, a, lda, tau, work, lwork, info);
+ }
+
+ static void orgqr(int *m, int *n, int *k, float *a, int *lda, float *tau, float *work, int *lwork, int *info) {
+ sorgqr_(m, n, k, a, lda, tau, work, lwork, info);
+ }
+
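+ // svd: computes the full singular value decomposition a = u * s * v^T with LAPACK {s|d}gesvd
+ // (jobu = jobvt = 'A'); the singular values are returned as the diagonal matrix s reshaped to m x n.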
+ static void svd(Matrix &a, Matrix &u, Matrix &s, Matrix &v) {
+ Matrix svda(a);
+#if !defined(NGT_DISABLE_BLAS)
+ int m = svda.row;
+ int n = svda.col;
+#else
+ svda.transpose();
+ int n = svda.row;
+ int m = svda.col;
+#endif
+ char jobu = 'A';
+ char jobvt = 'A';
+ int lda = m, ldu = m, ldvt = n, info;
+ int max, min;
+ if (m > n) {
+ max = m;
+ min = n;
+ } else {
+ max = n;
+ min = m;
+ }
+ int v1 = 3 * min + max;
+ int v2 = 5 * min;
+ int lwork = v1 > v2 ? v1 : v2;
+ T work[lwork];
+
+ Matrix sd;
+ sd.allocate(m, 1);
+ u.allocate(m, m);
+ v.allocate(n, n);
+ // S U VT
+ gesvd(&jobu, &jobvt, &m, &n, svda.matrix, &lda, sd.matrix, u.matrix, &ldu, v.matrix, &ldvt, work, &lwork, &info);
+ s.diag(sd);
+ s.reshape(m, n);
+#if !defined(NGT_DISABLE_BLAS)
+ v.transpose();
+#else
+ u.transpose();
+#endif
+ }
+
+ void eye(size_t d) {
+ zero(d, d);
+ for (size_t p = 0; p < row; p++) {
+ matrix[p * col + p] = 1.0;
+ }
+ }
+
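+ // randomRotation: fills the matrix with uniform random values and orthogonalizes it with a QR
+ // factorization (geqrf/orgqr), producing a random d x d rotation matrix.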
+ void randomRotation(size_t d) {
+ random(d, d);
+ qr(d);
+ }
+
+ void qr(size_t d) {
+ auto di = static_cast<int>(d);
+ T tau[di];
+ std::vector<T> work(1);
+ int lwork = -1;
+ int info;
+ // workspace query, then allocate the optimal work size before the actual factorization
+ geqrf(&di, &di, matrix, &di, &tau[0], work.data(), &lwork, &info);
+ lwork = static_cast<int>(work[0]);
+ work.resize(lwork);
+
+ geqrf(&di, &di, matrix, &di, tau, work.data(), &lwork, &info);
+ orgqr(&di, &di, &di, matrix, &di, tau, work.data(), &lwork, &info);
+ }
+
+ void printmat() {
+ T mtmp;
+ printf("[ ");
+ for (size_t i = 0; i < row; i++) {
+ printf("[ ");
+ for (size_t j = 0; j < col; j++) {
+ mtmp = matrix[i + j * col];
+ printf("%5.2e", mtmp);
+ if (j < col - 1) printf(", ");
+ }
+ if (i < row - 1) printf("]; ");
+ else printf("] ");
+ }
+ printf("]");
+ std::cout << std::endl;
+ }
+
+ static void save(const std::string &file, const Matrix &m) {
+ std::ofstream os(file);
+ for (size_t r = 0; r < m.row; r++) {
+ for (size_t c = 0; c < m.col; c++) {
+#if !defined(NGT_DISABLE_BLAS)
+ os << m.matrix[c * m.row + r];
+#else
+ os << m.matrix[r * m.col + c];
+#endif
+ if (c + 1 != m.col) {
+ os << "\t";
+ }
+ }
+ os << std::endl;
+ }
+ }
+
+ static void
+ convert(std::vector<std::string> &strings, std::vector<T> &vector) {
+ vector.clear();
+ for (auto it = strings.begin(); it != strings.end(); ++it) {
+ try {
+ vector.push_back(stod(*it));
+ } catch(...) {
+ break;
+ }
+ }
+ }
+
+ static void
+ extractVector(const std::string &str, std::vector<T> &vec)
+ {
+ std::vector<std::string> tokens;
+ tokenize(str, tokens, " \t");
+ convert(tokens, vec);
+ }
+
+#if !defined(NGT_DISABLE_BLAS)
+ static
+ void load(const std::string &file, Matrix &m)
+ {
+ loadVectors(file, m);
+ m.transpose();
+ }
+
+ static
+ void loadVectors(const std::string &file, Matrix &m)
+#else
+ static
+ void load(const std::string &file, Matrix &m)
+#endif
+ {
+ std::ifstream is(file);
+ if (!is) {
+ std::stringstream msg;
+ msg << "Matrix::load: Cannot load. " << file;
+ throw std::runtime_error(msg.str().c_str());
+ }
+ std::string line;
+ size_t row = 0, col = 0;
+ std::vector<T> tmpv;
+ while (getline(is, line)) {
+ std::vector<T> v;
+ extractVector(line, v);
+#if !defined(NGT_DISABLE_BLAS)
+ if (row == 0) {
+ row = v.size();
+ } else if (row != v.size()) {
+ std::cerr << "somthing wrong." << std::endl;
+ abort();
+ }
+ col++;
+#else
+ if (col == 0) {
+ col = v.size();
+ } else if (col != v.size()) {
+ std::cerr << "somthing wrong." << std::endl;
+ abort();
+ }
+ row++;
+#endif
+ for (size_t i = 0; i < v.size(); i++) {
+ tmpv.push_back(v[i]);
+ }
+ }
+ m.construct(row, col, &tmpv[0]);
+ }
+
+ friend std::ostream& operator<<(std::ostream &os, const Matrix &m) {
+ os << m.row << " x " << m.col << "=" << std::endl;
+ os << "[";
+ for (size_t r = 0; r < m.row; r++) {
+ os << r << ":[";
+ for (size_t c = 0; c < m.col; c++) {
+#if !defined(NGT_DISABLE_BLAS)
+ os << m.matrix[c * m.row + r];
+#else
+ os << m.matrix[r * m.col + c];
+#endif
+ if (c + 1 != m.col) {
+ os << "\t";
+ }
+ }
+ os << "]";
+ if (r + 1 != m.row) {
+ os << std::endl;
+ }
+ }
+ os << "]";
+ return os;
+ }
+
+ size_t row;
+ size_t col;
+ T *matrix;
+};
+
+
diff --git a/lib/NGT/NGTQ/NGTQCommand.h b/lib/NGT/NGTQ/NGTQCommand.h
deleted file mode 100644
index b54649b..0000000
--- a/lib/NGT/NGTQ/NGTQCommand.h
+++ /dev/null
@@ -1,612 +0,0 @@
-//
-// Copyright (C) 2016 Yahoo Japan Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-#include "NGT/NGTQ/Quantizer.h"
-
-#define NGTQ_SEARCH_CODEBOOK_SIZE_FLUCTUATION
-
-namespace NGTQ {
-
-class Command {
-public:
- class CreateParameters {
- public:
- CreateParameters() {}
- CreateParameters(NGT::Args &args, char centroidCreationMode = 'd', char localCentroidCreationMode = 'd') {
- try {
- index = args.get("#1");
- } catch (...) {
- std::stringstream msg;
- msg << "Command::CreateParameters: Error: An index is not specified.";
- NGTThrowException(msg);
- }
- try {
- objectPath = args.get("#2");
- } catch (...) {}
-
- char objectType = args.getChar("o", 'f');
- char distanceType = args.getChar("D", '2');
- numOfObjects = args.getl("n", 0);
-
- property.threadSize = args.getl("p", 24);
- property.dimension = args.getl("d", 0);
- property.globalRange = args.getf("R", 0);
- property.localRange = args.getf("r", 0);
- property.globalCentroidLimit = args.getl("C", 1000000);
- property.localCentroidLimit = args.getl("c", 65000);
- property.localDivisionNo = args.getl("N", 8);
- property.batchSize = args.getl("b", 1000);
- property.localClusteringSampleCoefficient = args.getl("s", 10);
- {
- char localCentroidType = args.getChar("T", 'f');
- property.singleLocalCodebook = localCentroidType == 't' ? true : false;
- }
- {
- centroidCreationMode = args.getChar("M", centroidCreationMode);
- switch(centroidCreationMode) {
- case 'd': property.centroidCreationMode = NGTQ::CentroidCreationModeDynamic; break;
- case 's': property.centroidCreationMode = NGTQ::CentroidCreationModeStatic; break;
- default:
- std::stringstream msg;
- msg << "Command::CreateParameters: Error: Invalid centroid creation mode. " << centroidCreationMode;
- NGTThrowException(msg);
- }
- }
- {
- localCentroidCreationMode = args.getChar("L", localCentroidCreationMode);
- switch(localCentroidCreationMode) {
- case 'd': property.localCentroidCreationMode = NGTQ::CentroidCreationModeDynamic; break;
- case 's': property.localCentroidCreationMode = NGTQ::CentroidCreationModeStatic; break;
- case 'k': property.localCentroidCreationMode = NGTQ::CentroidCreationModeDynamicKmeans; break;
- default:
- std::stringstream msg;
- msg << "Command::CreateParameters: Error: Invalid centroid creation mode. " << localCentroidCreationMode;
- NGTThrowException(msg);
- }
- }
-
- globalProperty.edgeSizeForCreation = args.getl("E", 10);
- globalProperty.edgeSizeForSearch = args.getl("S", 40);
- {
- char indexType = args.getChar("i", 't');
- globalProperty.indexType = indexType == 't' ? NGT::Property::GraphAndTree : NGT::Property::Graph;
- localProperty.indexType = globalProperty.indexType;
- }
- globalProperty.insertionRadiusCoefficient = args.getf("e", 0.1) + 1.0;
- localProperty.insertionRadiusCoefficient = globalProperty.insertionRadiusCoefficient;
-
-
- switch (objectType) {
- case 'f': property.dataType = NGTQ::DataTypeFloat; break;
- case 'c': property.dataType = NGTQ::DataTypeUint8; break;
- default:
- std::stringstream msg;
- msg << "Command::CreateParameters: Error: Invalid object type. " << objectType;
- NGTThrowException(msg);
- }
-
- switch (distanceType) {
- case '2': property.distanceType = NGTQ::DistanceTypeL2; break;
- case '1': property.distanceType = NGTQ::DistanceTypeL1; break;
- case 'a': property.distanceType = NGTQ::DistanceTypeAngle; break;
- case 'C': property.distanceType = NGTQ::DistanceTypeNormalizedCosine; break;
- case 'E': property.distanceType = NGTQ::DistanceTypeL2; break;
- default:
- std::stringstream msg;
- msg << "Command::CreateParameters: Error: Invalid distance type. " << distanceType;
- NGTThrowException(msg);
- }
- }
-
- std::string index;
- std::string objectPath;
- size_t numOfObjects;
- NGTQ::Property property;
- NGT::Property globalProperty;
- NGT::Property localProperty;
- };
-
- Command():debugLevel(0) {}
-
- void
- create(NGT::Args &args)
- {
- const string usage = "Usage: ngtq create "
- " -d dimension [-o object-type (f:float|c:unsigned char)] [-D distance-function] [-n data-size] "
- "[-p #-of-thread] [-R global-codebook-range] [-r local-codebook-range] "
- "[-C global-codebook-size-limit] [-c local-codebook-size-limit] [-N local-division-no] "
- "[-T single-local-centroid (t|f)] [-e epsilon] [-i index-type (t:Tree|g:Graph)] "
- "[-M global-centroid-creation-mode (d|s)] [-L local-centroid-creation-mode (d|k|s)] "
- "[-s local-sample-coefficient] "
- "index(output) data.tsv(input)";
-
- try {
- NGTQ::Command::CreateParameters createParameters(args);
-
- if (debugLevel >= 1) {
- cerr << "epsilon=" << createParameters.globalProperty.insertionRadiusCoefficient << endl;
- cerr << "data size=" << createParameters.numOfObjects << endl;
- cerr << "dimension=" << createParameters.property.dimension << endl;
- cerr << "thread size=" << createParameters.property.threadSize << endl;
- cerr << "batch size=" << createParameters.localProperty.batchSizeForCreation << endl;;
- cerr << "index type=" << createParameters.globalProperty.indexType << endl;
- }
-
- cerr << "ngtq: Create" << endl;
- NGTQ::Index::create(createParameters.index, createParameters.property, createParameters.globalProperty, createParameters.localProperty);
-
- cerr << "ngtq: Append" << endl;
- NGTQ::Index::append(createParameters.index, createParameters.objectPath, createParameters.numOfObjects);
- } catch(NGT::Exception &err) {
- std::cerr << err.what() << std::endl;
- cerr << usage << endl;
- }
- }
-
- void
- rebuild(NGT::Args &args)
- {
- const string usage = "Usage: ngtq rebuild "
- "[-o object-type (f:float|c:unsigned char)] [-D distance-function] [-n data-size] "
- "[-p #-of-thread] [-d dimension] [-R global-codebook-range] [-r local-codebook-range] "
- "[-C global-codebook-size-limit] [-c local-codebook-size-limit] [-N local-division-no] "
- "[-T single-local-centroid (t|f)] [-e epsilon] [-i index-type (t:Tree|g:Graph)] "
- "[-M centroid-creation_mode (d|s)] "
- "index(output) data.tsv(input)";
- string srcIndex;
- try {
- srcIndex = args.get("#1");
- } catch (...) {
- cerr << "DB is not specified." << endl;
- cerr << usage << endl;
- return;
- }
- string rebuiltIndex = srcIndex + ".tmp";
-
-
- NGTQ::Property property;
- NGT::Property globalProperty;
- NGT::Property localProperty;
-
- {
- NGTQ::Index index(srcIndex);
- property = index.getQuantizer().property;
- index.getQuantizer().globalCodebook.getProperty(globalProperty);
- index.getQuantizer().getLocalCodebook(0).getProperty(localProperty);
- }
-
- property.globalRange = args.getf("R", property.globalRange);
- property.localRange = args.getf("r", property.localRange);
- property.globalCentroidLimit = args.getl("C", property.globalCentroidLimit);
- property.localCentroidLimit = args.getl("c", property.localCentroidLimit);
- property.localDivisionNo = args.getl("N", property.localDivisionNo);
- {
- char localCentroidType = args.getChar("T", '-');
- if (localCentroidType != '-') {
- property.singleLocalCodebook = localCentroidType == 't' ? true : false;
- }
- }
- {
- char centroidCreationMode = args.getChar("M", '-');
- if (centroidCreationMode != '-') {
- property.centroidCreationMode = centroidCreationMode == 'd' ?
- NGTQ::CentroidCreationModeDynamic : NGTQ::CentroidCreationModeStatic;
- }
- }
-
- cerr << "global range=" << property.globalRange << endl;
- cerr << "local range=" << property.localRange << endl;
- cerr << "global centroid limit=" << property.globalCentroidLimit << endl;
- cerr << "local centroid limit=" << property.localCentroidLimit << endl;
- cerr << "local division no=" << property.localDivisionNo << endl;
-
- NGTQ::Index::create(rebuiltIndex, property, globalProperty, localProperty);
- cerr << "created a new db" << endl;
- cerr << "start rebuilding..." << endl;
- NGTQ::Index::rebuild(srcIndex, rebuiltIndex);
- {
- string src = srcIndex;
- string dst = srcIndex + ".org";
- if (std::rename(src.c_str(), dst.c_str()) != 0) {
- stringstream msg;
- msg << "ngtq::rebuild: Cannot rename. " << src << "=>" << dst ;
- NGTThrowException(msg);
- }
- }
- {
- string src = rebuiltIndex;
- string dst = srcIndex;
- if (std::rename(src.c_str(), dst.c_str()) != 0) {
- stringstream msg;
- msg << "ngtq::rebuild: Cannot rename. " << src << "=>" << dst ;
- NGTThrowException(msg);
- }
- }
- }
-
-
- void
- append(NGT::Args &args)
- {
- const string usage = "Usage: ngtq append [-n data-size] "
- "index(output) data.tsv(input)";
- string index;
- try {
- index = args.get("#1");
- } catch (...) {
- cerr << "DB is not specified." << endl;
- cerr << usage << endl;
- return;
- }
- string data;
- try {
- data = args.get("#2");
- } catch (...) {
- cerr << "Data is not specified." << endl;
- }
-
- size_t dataSize = args.getl("n", 0);
-
- if (debugLevel >= 1) {
- cerr << "data size=" << dataSize << endl;
- }
-
- NGTQ::Index::append(index, data, dataSize);
- }
-
- void
- search(NGT::Args &args)
- {
- const string usage = "Usage: ngtq search [-i g|t|s] [-n result-size] [-e epsilon] [-m mode(r|l|c|a)] "
- "[-E edge-size] [-o output-mode] [-b result expansion(begin:end:[x]step)] "
- "index(input) query.tsv(input)";
- string database;
- try {
- database = args.get("#1");
- } catch (...) {
- cerr << "DB is not specified" << endl;
- cerr << usage << endl;
- return;
- }
-
- string query;
- try {
- query = args.get("#2");
- } catch (...) {
- cerr << "Query is not specified" << endl;
- cerr << usage << endl;
- return;
- }
-
- int size = args.getl("n", 20);
- char outputMode = args.getChar("o", '-');
- float epsilon = 0.1;
-
- char mode = args.getChar("m", '-');
- NGTQ::AggregationMode aggregationMode;
- switch (mode) {
- case 'r': aggregationMode = NGTQ::AggregationModeExactDistanceThroughApproximateDistance; break; // refine
- case 'e': aggregationMode = NGTQ::AggregationModeExactDistance; break; // refine
- case 'l': aggregationMode = NGTQ::AggregationModeApproximateDistanceWithLookupTable; break; // lookup
- case 'c': aggregationMode = NGTQ::AggregationModeApproximateDistanceWithCache; break; // cache
- case '-':
- case 'a': aggregationMode = NGTQ::AggregationModeApproximateDistance; break; // cache
- default:
- cerr << "Invalid aggregation mode. " << mode << endl;
- cerr << usage << endl;
- return;
- }
-
- if (args.getString("e", "none") == "-") {
- // linear search
- epsilon = FLT_MAX;
- } else {
- epsilon = args.getf("e", 0.1);
- }
-
- size_t beginOfResultExpansion, endOfResultExpansion, stepOfResultExpansion;
- bool mulStep = false;
- {
- beginOfResultExpansion = stepOfResultExpansion = 1;
- endOfResultExpansion = 0;
- string str = args.getString("b", "16");
- vector<string> tokens;
- NGT::Common::tokenize(str, tokens, ":");
- if (tokens.size() >= 1) { beginOfResultExpansion = NGT::Common::strtod(tokens[0]); }
- if (tokens.size() >= 2) { endOfResultExpansion = NGT::Common::strtod(tokens[1]); }
- if (tokens.size() >= 3) {
- if (tokens[2][0] == 'x') {
- mulStep = true;
- stepOfResultExpansion = NGT::Common::strtod(tokens[2].substr(1));
- } else {
- stepOfResultExpansion = NGT::Common::strtod(tokens[2]);
- }
- }
- }
- if (debugLevel >= 1) {
- cerr << "size=" << size << endl;
- cerr << "result expansion=" << beginOfResultExpansion << "->" << endOfResultExpansion << "," << stepOfResultExpansion << endl;
- }
-
- NGTQ::Index index(database);
- try {
- ifstream is(query);
- if (!is) {
- cerr << "Cannot open the specified file. " << query << endl;
- return;
- }
- if (outputMode == 's') { cout << "# Beginning of Evaluation" << endl; }
- string line;
- double totalTime = 0;
- int queryCount = 0;
- while(getline(is, line)) {
- NGT::Object *query = index.allocateObject(line, " \t", 0);
- queryCount++;
- size_t resultExpansion = 0;
- for (size_t base = beginOfResultExpansion;
- resultExpansion <= endOfResultExpansion;
- base = mulStep ? base * stepOfResultExpansion : base + stepOfResultExpansion) {
- resultExpansion = base;
- NGT::ObjectDistances objects;
- NGT::Timer timer;
- timer.start();
- // size : # of final resultant objects
- // resultExpansion : # of resultant objects by using codebook search
- index.search(query, objects, size, resultExpansion, aggregationMode, epsilon);
- timer.stop();
-
- totalTime += timer.time;
- if (outputMode == 'e') {
- cout << "# Query No.=" << queryCount << endl;
- cout << "# Query=" << line.substr(0, 20) + " ..." << endl;
- cout << "# Index Type=" << "----" << endl;
- cout << "# Size=" << size << endl;
- cout << "# Epsilon=" << epsilon << endl;
- cout << "# Result expansion=" << resultExpansion << endl;
- cout << "# Distance Computation=" << index.getQuantizer().distanceComputationCount << endl;
- cout << "# Query Time (msec)=" << timer.time * 1000.0 << endl;
- } else {
- cout << "Query No." << queryCount << endl;
- cout << "Rank\tIN-ID\tID\tDistance" << endl;
- }
-
- for (size_t i = 0; i < objects.size(); i++) {
- cout << i + 1 << "\t" << objects[i].id << "\t";
- cout << objects[i].distance << endl;
- }
-
- if (outputMode == 'e') {
- cout << "# End of Search" << endl;
- } else {
- cout << "Query Time= " << timer.time << " (sec), " << timer.time * 1000.0 << " (msec)" << endl;
- }
- }
- if (outputMode == 'e') {
- cout << "# End of Query" << endl;
- }
- index.deleteObject(query);
- }
- if (outputMode == 'e') {
- cout << "# Average Query Time (msec)=" << totalTime * 1000.0 / (double)queryCount << endl;
- cout << "# Number of queries=" << queryCount << endl;
- cout << "# End of Evaluation" << endl;
- } else {
- cout << "Average Query Time= " << totalTime / (double)queryCount << " (sec), "
- << totalTime * 1000.0 / (double)queryCount << " (msec), ("
- << totalTime << "/" << queryCount << ")" << endl;
- }
- } catch (NGT::Exception &err) {
- cerr << "Error " << err.what() << endl;
- cerr << usage << endl;
- } catch (...) {
- cerr << "Error" << endl;
- cerr << usage << endl;
- }
- index.close();
- }
-
- void
- remove(NGT::Args &args)
- {
- const string usage = "Usage: ngtq remove [-d object-ID-type(f|d)] index(input) object-ID(input)";
- string database;
- try {
- database = args.get("#1");
- } catch (...) {
- cerr << "DB is not specified" << endl;
- cerr << usage << endl;
- return;
- }
- try {
- args.get("#2");
- } catch (...) {
- cerr << "ID is not specified" << endl;
- cerr << usage << endl;
- return;
- }
- char dataType = args.getChar("d", 'f');
- if (debugLevel >= 1) {
- cerr << "dataType=" << dataType << endl;
- }
-
- try {
- vector<NGT::ObjectID> objects;
- if (dataType == 'f') {
- string ids;
- try {
- ids = args.get("#2");
- } catch (...) {
- cerr << "Data file is not specified" << endl;
- cerr << usage << endl;
- return;
- }
- ifstream is(ids);
- if (!is) {
- cerr << "Cannot open the specified file. " << ids << endl;
- return;
- }
- string line;
- int count = 0;
- while(getline(is, line)) {
- count++;
- vector<string> tokens;
- NGT::Common::tokenize(line, tokens, "\t ");
- if (tokens.size() == 0 || tokens[0].size() == 0) {
- continue;
- }
- char *e;
- size_t id;
- try {
- id = strtol(tokens[0].c_str(), &e, 10);
- objects.push_back(id);
- } catch (...) {
- cerr << "Illegal data. " << tokens[0] << endl;
- }
- if (*e != 0) {
- cerr << "Illegal data. " << e << endl;
- }
- cerr << "removed ID=" << id << endl;
- }
- } else {
- size_t id = args.getl("#2", 0);
- cerr << "removed ID=" << id << endl;
- objects.push_back(id);
- }
- NGT::Index::remove(database, objects);
- } catch (NGT::Exception &err) {
- cerr << "Error " << err.what() << endl;
- cerr << usage << endl;
- } catch (...) {
- cerr << "Error" << endl;
- cerr << usage << endl;
- }
- }
-
-
- void
- info(NGT::Args &args)
- {
- const string usage = "Usage: ngtq info index";
- string database;
- try {
- database = args.get("#1");
- } catch (...) {
- cerr << "DB is not specified" << endl;
- cerr << usage << endl;
- return;
- }
- NGTQ::Index index(database);
- index.info(cout, args.getChar("m", '-'));
-
- }
-
- void
- validate(NGT::Args &args)
- {
- const string usage = "parameter";
- string database;
- try {
- database = args.get("#1");
- } catch (...) {
- cerr << "DB is not specified" << endl;
- cerr << usage << endl;
- return;
- }
- NGTQ::Index index(database);
-
- index.getQuantizer().validate();
-
- }
-
-
-
-#ifdef NGTQ_SHARED_INVERTED_INDEX
- void
- compress(NGT::Args &args)
- {
- const string usage = "Usage: ngtq compress index)";
- string database;
- try {
- database = args.get("#1");
- } catch (...) {
- cerr << "DB is not specified" << endl;
- cerr << usage << endl;
- return;
- }
- try {
- NGTQ::Index::compress(database);
- } catch (NGT::Exception &err) {
- cerr << "Error " << err.what() << endl;
- cerr << usage << endl;
- } catch (...) {
- cerr << "Error" << endl;
- cerr << usage << endl;
- }
- }
-#endif
-
- void help() {
- cerr << "Usage : ngtq command database data" << endl;
- cerr << " command : create search remove append export import" << endl;
- }
-
- void execute(NGT::Args args) {
- string command;
- try {
- command = args.get("#0");
- } catch(...) {
- help();
- return;
- }
-
- debugLevel = args.getl("X", 0);
-
- try {
- if (debugLevel >= 1) {
- cerr << "ngt::command=" << command << endl;
- }
- if (command == "search") {
- search(args);
- } else if (command == "create") {
- create(args);
- } else if (command == "append") {
- append(args);
- } else if (command == "remove") {
- remove(args);
- } else if (command == "info") {
- info(args);
- } else if (command == "validate") {
- validate(args);
- } else if (command == "rebuild") {
- rebuild(args);
-#ifdef NGTQ_SHARED_INVERTED_INDEX
- } else if (command == "compress") {
- compress(args);
-#endif
- } else {
- cerr << "Illegal command. " << command << endl;
- }
- } catch(NGT::Exception &err) {
- cerr << "ngt: Fatal error: " << err.what() << endl;
- }
- }
-
- int debugLevel;
-
-};
-
-};
-
diff --git a/lib/NGT/NGTQ/NGTQGCommand.cpp b/lib/NGT/NGTQ/NGTQGCommand.cpp
deleted file mode 100644
index 0306f8e..0000000
--- a/lib/NGT/NGTQ/NGTQGCommand.cpp
+++ /dev/null
@@ -1,296 +0,0 @@
-//
-// Copyright (C) 2020 Yahoo Japan Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-
-#include "NGT/NGTQ/NGTQGCommand.h"
-
-#if !defined(NGT_SHARED_MEMORY_ALLOCATOR) && !defined(NGTQ_SHARED_INVERTED_INDEX)
-
-using namespace std;
-
-
-void
-NGTQG::Command::create(NGT::Args &args)
-{
-
- const string usage = "Usage: ngtqg create "
- "[-D distance-function] "
- "[-p #-of-thread] [-d dimension] [-R global-codebook-range] [-r local-codebook-range] "
- "[-C global-codebook-size-limit] [-c local-codebook-size-limit] "
- "[-Q dimension-of-subvector] [-i index-type (t:Tree|g:Graph)] "
- "[-M global-centroid-creation-mode (d|s)] [-l global-centroid-creation-mode (d|k|s)] "
- "[-s local-sample-coefficient] "
- "index(output)";
-
- try {
- NGT::Command::CreateParameters createParameters(args);
-
- switch (createParameters.indexType) {
- case 't':
- NGT::Index::createGraphAndTree(createParameters.index, createParameters.property, createParameters.objectPath, createParameters.numOfObjects);
- break;
- case 'g':
- NGT::Index::createGraph(createParameters.index, createParameters.property, createParameters.objectPath, createParameters.numOfObjects);
- break;
- }
- } catch(NGT::Exception &err) {
- std::cerr << err.what() << std::endl;
- cerr << usage << endl;
- }
-
- NGTQG::Command::CreateParameters createParameters(args);
-
- try {
- char localCentroidCreationMode = args.getChar("l", 'd');
- switch(localCentroidCreationMode) {
- case 'd': createParameters.property.localCentroidCreationMode = NGTQ::CentroidCreationModeDynamic; break;
- case 's': createParameters.property.localCentroidCreationMode = NGTQ::CentroidCreationModeStatic; break;
- case 'k': createParameters.property.localCentroidCreationMode = NGTQ::CentroidCreationModeDynamicKmeans; break;
- default:
- cerr << "ngt: Invalid centroid creation mode. " << localCentroidCreationMode << endl;
- cerr << usage << endl;
- return;
- }
-
- createParameters.index += "/qg";
-
- cerr << "ngtqg: Create" << endl;
- NGTQ::Index::create(createParameters.index, createParameters.property, createParameters.globalProperty, createParameters.localProperty);
- } catch(NGT::Exception &err) {
- std::cerr << err.what() << std::endl;
- cerr << usage << endl;
- }
-
-
-}
-
-void
-NGTQG::Command::build(NGT::Args &args)
-{
- NGT::Command::append(args);
-}
-
-void
-NGTQG::Command::quantize(NGT::Args &args)
-{
- const std::string usage = "Usage: ngtqg quantize [-Q dimension-of-subvector] [-E max-number-of-edges] index";
- string indexPath;
- try {
- indexPath = args.get("#1");
- } catch (...) {
- cerr << "An index is not specified." << endl;
- cerr << usage << endl;
- return;
- }
- size_t maxNumOfEdges = args.getl("E", 128);
- size_t dimensionOfSubvector = args.getl("Q", 0);
- NGTQG::Index::quantize(indexPath, dimensionOfSubvector, maxNumOfEdges);
-}
-
-void
-search(NGTQG::Index &index, NGTQG::Command::SearchParameters &searchParameters, ostream &stream)
-{
-
- std::ifstream is(searchParameters.query);
- if (!is) {
- std::cerr << "Cannot open the specified file. " << searchParameters.query << std::endl;
- return;
- }
-
- if (searchParameters.outputMode[0] == 'e') {
- stream << "# Beginning of Evaluation" << endl;
- }
-
- string line;
- double totalTime = 0;
- size_t queryCount = 0;
-
- while(getline(is, line)) {
- if (searchParameters.querySize > 0 && queryCount >= searchParameters.querySize) {
- break;
- }
- vector