diff --git a/.clang-format b/.clang-format
index 52bafe8..f140324 100644
--- a/.clang-format
+++ b/.clang-format
@@ -1,5 +1,15 @@
 ---
-Language: Cpp
-BasedOnStyle: Google
+Language: Cpp
+BasedOnStyle: Google
 DerivePointerAlignment: false
 PointerAlignment: Left
+ColumnLimit: 100
+IncludeBlocks: Preserve
+AllowShortIfStatementsOnASingleLine: Never
+AllowShortLoopsOnASingleLine: False
+AllowShortBlocksOnASingleLine: Empty
+BreakBeforeBraces: Stroustrup
+AllowShortEnumsOnASingleLine: False
+AllowShortFunctionsOnASingleLine: Empty
+AllowShortCaseLabelsOnASingleLine: False
+
diff --git a/.clangd b/.clangd
new file mode 100644
index 0000000..02a5d82
--- /dev/null
+++ b/.clangd
@@ -0,0 +1,30 @@
+CompileFlags:
+  Add:
+    - -Wall
+    - -Wextra
+    # do not complain about features incompatible with sm_35
+    - --cuda-gpu-arch=sm_52
+  Remove:
+    - --expt-relaxed-constexpr
+    - --default-stream
+    - --generate-code=*
+    - --options-file
+    - -arch=*
+    - -forward-unknown-to-host-compiler
+    - -Xptxas=*
+    - -Xcompiler=*
+
+Diagnostics:
+  ClangTidy:
+    Add:
+      - bugprone-*
+      - performance-*
+    Remove:
+      - bugprone-macro-parentheses
+  UnusedIncludes: Strict
+  MissingIncludes: Strict
+  Includes:
+    IgnoreHeader:
+      - cub/.*
+      - cuda/.*
+      - glog/.*
diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 0000000..0737109
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,11 @@
+# EditorConfig is awesome: https://EditorConfig.org
+
+# top-most EditorConfig file
+root = true
+
+[*]
+end_of_line = lf
+trim_trailing_whitespace = true
+insert_final_newline = true
+indent_style = space
+indent_size = 2
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
new file mode 100644
index 0000000..19047ac
--- /dev/null
+++ b/.github/workflows/ci.yaml
@@ -0,0 +1,55 @@
+name: Compile
+
+on:
+  workflow_dispatch:
+  pull_request:
+  push:
+    branches:
+      - master
+      - release*
+
+concurrency:
+  group: compile-${{ github.ref }}
+  cancel-in-progress: false
+
+jobs:
+  # Test the build with the latest CUDA toolkit and several Python versions
+  nvcc-ubuntu:
+    strategy:
+      fail-fast: false
+      matrix:
+        #python: ['3.8', '3.9', '3.10', '3.11', '3.12']
+        python: ['3.12']
+
+    name: "Python ${{ matrix.python }} / NVCC (CUDA 12.6.3) / ubuntu-latest"
+    runs-on: ubuntu-latest
+    # see https://hub.docker.com/r/nvidia/cuda
+    container: nvidia/cuda:12.6.3-devel-ubuntu24.04
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: apt-get update && DEBIAN_FRONTEND="noninteractive" apt-get install -y lsb-release unzip git && apt-get clean all
+
+      - name: Setup Python ${{ matrix.python }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python }}
+
+      - name: Install the latest CMake
+        uses: lukka/get-cmake@latest
+
+      - name: Install Nanobind
+        run: |
+          python -m pip install nanobind typing_extensions
+
+      - name: Configure
+        run: >
+          cmake -S . -B build
+
+      - name: Build C++
+        run: cmake --build build -j 2
+
+      - name: Build Wheel
+        run: python -m pip wheel .
diff --git a/.gitignore b/.gitignore
index 8c503ea..aad44a7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,12 +1,13 @@
-3rdparty/
-build/
-build_11/
-build_docker/
-build_local/
+build*/
 .vscode
 data/**
+evaluation/
+.idea/
 
-include/ggnn/alternatives
+run_Project.txt
+
+.cache
+.py-build-cmake_cache
 
 ### CMake ###
 CMakeLists.txt.user
@@ -21,43 +22,3 @@
 compile_commands.json
 CTestTestfile.cmake
 _deps
 
-### CMake Patch ###
-# External projects
-*-prefix/
-
-### C++ ###
-# Prerequisites
-*.d
-
-# Compiled Object files
-*.slo
-*.lo
-*.o
-*.obj
-
-# Precompiled Headers
-*.gch
-*.pch
-
-# Compiled Dynamic libraries
-*.so
-*.dylib
-*.dll
-
-# Fortran module files
-*.mod
-*.smod
-
-# Compiled Static libraries
-*.lai
-*.la
-*.a
-*.lib
-
-# Executables
-*.exe
-*.out
-*.app
-
-
-cmake-build-debug
diff --git a/.gitmodules b/.gitmodules
deleted file mode 100644
index 1821a8f..0000000
--- a/.gitmodules
+++ /dev/null
@@ -1,9 +0,0 @@
-[submodule "3rdparty/pybind11"]
-	path = 3rdparty/pybind11
-	url = https://github.com/pybind/pybind11.git
-[submodule "3rdparty/cub"]
-	path = 3rdparty/cub
-	url = https://github.com/NVlabs/cub.git
-[submodule "3rdparty/glog"]
-	path = 3rdparty/glog
-	url = https://github.com/google/glog.git
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
new file mode 100644
index 0000000..5bd329a
--- /dev/null
+++ b/.readthedocs.yaml
@@ -0,0 +1,22 @@
+version: "2"
+
+build:
+  os: "ubuntu-22.04"
+  tools:
+    python: "3.10"
+
+python:
+  install:
+    - requirements: docs/requirements.txt
+    # Install our python package before building the docs
+    - method: pip
+      path: .
+
+sphinx:
+  configuration: docs/source/conf.py
+  fail_on_warning: true
+
+
+formats:
+  - pdf
+  - epub
diff --git a/3rdparty/cub b/3rdparty/cub
deleted file mode 160000
index c3cceac..0000000
--- a/3rdparty/cub
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit c3cceac115c072fb63df1836ff46d8c60d9eb304
diff --git a/3rdparty/glog b/3rdparty/glog
deleted file mode 160000
index 0a2e593..0000000
--- a/3rdparty/glog
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 0a2e5931bd5ff22fd3bf8999eb8ce776f159cda6
diff --git a/3rdparty/pybind11 b/3rdparty/pybind11
deleted file mode 160000
index b886369..0000000
--- a/3rdparty/pybind11
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit b8863698d6f53ea86dd26c681eeaa837888c66d6
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5eae10c..734312c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,191 +1,245 @@
-CMAKE_MINIMUM_REQUIRED( VERSION 3.3.0 )
-
-project (GGNN)
-set(CMAKE_CXX_STANDARD 14)
-
-
-find_package(CUDA REQUIRED)
-
-set(CUDA_ARCH_LIST Auto)
-# set(CUDA_ARCH_LIST "7.0")
-# set(CUDA_ARCH_LIST "7.5 6.1 8.6")
-cuda_select_nvcc_arch_flags(CUDA_ARCH_FLAGS ${CUDA_ARCH_LIST})
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_ARCH_FLAGS})
-
-set(CUB_INCLUDE_PATH "3rdparty/cub")
-set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++14 -Xptxas=-v --expt-relaxed-constexpr --default-stream per-thread" )
-# debug
-# set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -G -g")
-# profile
-set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -lineinfo")
-
-add_subdirectory("3rdparty/pybind11")
-set(GFLAGS_NAMESPACE "google")
-add_subdirectory("3rdparty/glog")
-
-message("CUDA_ARCH_LIST: ${CUDA_ARCH_LIST}")
-message("CUDA_NVCC_FLAGS: ${CUDA_NVCC_FLAGS}")
-
-# configurations for specific datasets
-# TODO: generate from template using CMake?
-cuda_add_executable(sift1m src/sift1m.cu) -target_include_directories(sift1m - PRIVATE - ${CUDA_NVCC_FLAGS} - ${CUB_INCLUDE_PATH} - ${CMAKE_CURRENT_SOURCE_DIR}/include -) -target_link_libraries(sift1m ${CUDA_curand_LIBRARY} glog::glog) - -cuda_add_executable(sift1m_multi src/sift1m_multi_gpu.cu) -target_include_directories(sift1m_multi - PRIVATE - ${CUDA_NVCC_FLAGS} - ${CUB_INCLUDE_PATH} - ${CMAKE_CURRENT_SOURCE_DIR}/include -) -target_link_libraries(sift1m_multi ${CUDA_curand_LIBRARY} glog::glog) - -cuda_add_executable(sift1b_subsets src/sift1b_subsets.cu) -target_include_directories(sift1b_subsets - PRIVATE - ${CUDA_NVCC_FLAGS} - ${CUB_INCLUDE_PATH} - ${CMAKE_CURRENT_SOURCE_DIR}/include -) -target_link_libraries(sift1b_subsets ${CUDA_curand_LIBRARY} glog::glog) - -cuda_add_executable(sift1b_multi src/sift1b_multi_gpu.cu) -target_include_directories(sift1b_multi - PRIVATE - ${CUDA_NVCC_FLAGS} - ${CUB_INCLUDE_PATH} - ${CMAKE_CURRENT_SOURCE_DIR}/include -) -target_link_libraries(sift1b_multi ${CUDA_curand_LIBRARY} glog::glog) - -cuda_add_executable(sift1b_multi_top10 src/sift1b_multi_gpu_top10.cu) -target_include_directories(sift1b_multi_top10 - PRIVATE - ${CUDA_NVCC_FLAGS} - ${CUB_INCLUDE_PATH} - ${CMAKE_CURRENT_SOURCE_DIR}/include -) -target_link_libraries(sift1b_multi_top10 ${CUDA_curand_LIBRARY} glog::glog) - -cuda_add_executable(deep1b_multi src/deep1b_multi_gpu.cu) -target_include_directories(deep1b_multi - PRIVATE - ${CUDA_NVCC_FLAGS} - ${CUB_INCLUDE_PATH} - ${CMAKE_CURRENT_SOURCE_DIR}/include -) -target_link_libraries(deep1b_multi ${CUDA_curand_LIBRARY} glog::glog) - -cuda_add_executable(sift1m_stats src/sift1m_stats.cu) -target_include_directories(sift1m_stats - PRIVATE - ${CUDA_NVCC_FLAGS} - ${CUB_INCLUDE_PATH} - ${CMAKE_CURRENT_SOURCE_DIR}/include -) -target_link_libraries(sift1m_stats ${CUDA_curand_LIBRARY} glog::glog) - -cuda_add_executable(sift1m_no_slack_query src/sift1m_no_slack_query.cu) -target_include_directories(sift1m_no_slack_query - PRIVATE - ${CUDA_NVCC_FLAGS} - ${CUB_INCLUDE_PATH} - ${CMAKE_CURRENT_SOURCE_DIR}/include -) -target_link_libraries(sift1m_no_slack_query ${CUDA_curand_LIBRARY} glog::glog) - -cuda_add_executable(sift1m_subsets src/sift1m_subsets.cu) -target_include_directories(sift1m_subsets - PRIVATE - ${CUDA_NVCC_FLAGS} - ${CUB_INCLUDE_PATH} - ${CMAKE_CURRENT_SOURCE_DIR}/include -) -target_link_libraries(sift1m_subsets ${CUDA_curand_LIBRARY} glog::glog) - -cuda_add_executable(sift1m_batchsize src/sift1m_batchsize.cu) -target_include_directories(sift1m_batchsize - PRIVATE - ${CUDA_NVCC_FLAGS} - ${CUB_INCLUDE_PATH} - ${CMAKE_CURRENT_SOURCE_DIR}/include -) -target_link_libraries(sift1m_batchsize ${CUDA_curand_LIBRARY} glog::glog) - -cuda_add_executable(sift1m_base_vs_base src/sift1m_base_vs_base.cu) -target_include_directories(sift1m_base_vs_base - PRIVATE - ${CUDA_NVCC_FLAGS} - ${CUB_INCLUDE_PATH} - ${CMAKE_CURRENT_SOURCE_DIR}/include -) -target_link_libraries(sift1m_base_vs_base ${CUDA_curand_LIBRARY} glog::glog) - -cuda_add_executable(sift1m_load_hnsw src/sift1m_load_hnsw.cu) -target_include_directories(sift1m_load_hnsw - PRIVATE - ${CUDA_NVCC_FLAGS} - ${CUB_INCLUDE_PATH} - ${CMAKE_CURRENT_SOURCE_DIR}/include -) -target_link_libraries(sift1m_load_hnsw ${CUDA_curand_LIBRARY} glog::glog) - -cuda_add_executable(sift1m_top10 src/sift1m_top10.cu) -target_include_directories(sift1m_top10 - PRIVATE - ${CUDA_NVCC_FLAGS} - ${CUB_INCLUDE_PATH} - ${CMAKE_CURRENT_SOURCE_DIR}/include -) -target_link_libraries(sift1m_top10 ${CUDA_curand_LIBRARY} glog::glog) - 
-cuda_add_executable(sift1m_top100 src/sift1m_top100.cu)
-target_include_directories(sift1m_top100
-    PRIVATE
-    ${CUDA_NVCC_FLAGS}
-    ${CUB_INCLUDE_PATH}
-    ${CMAKE_CURRENT_SOURCE_DIR}/include
-)
-target_link_libraries(sift1m_top100 ${CUDA_curand_LIBRARY} glog::glog)
-
-cuda_add_executable(nytimes src/nytimes.cu)
-target_include_directories(nytimes
-    PRIVATE
-    ${CUDA_NVCC_FLAGS}
-    ${CUB_INCLUDE_PATH}
-    ${CMAKE_CURRENT_SOURCE_DIR}/include
-)
-target_link_libraries(nytimes ${CUDA_curand_LIBRARY} glog::glog)
-
-cuda_add_executable(nytimes_top10 src/nytimes_top10.cu)
-target_include_directories(nytimes_top10
-    PRIVATE
-    ${CUDA_NVCC_FLAGS}
-    ${CUB_INCLUDE_PATH}
-    ${CMAKE_CURRENT_SOURCE_DIR}/include
-)
-target_link_libraries(nytimes_top10 ${CUDA_curand_LIBRARY} glog::glog)
-
-cuda_add_executable(glove200 src/glove200.cu)
-target_include_directories(glove200
-    PRIVATE
-    ${CUDA_NVCC_FLAGS}
-    ${CUB_INCLUDE_PATH}
-    ${CMAKE_CURRENT_SOURCE_DIR}/include
-)
-target_link_libraries(glove200 ${CUDA_curand_LIBRARY} glog::glog)
-
-cuda_add_executable(gist src/gist.cu)
-target_include_directories(gist
-    PRIVATE
-    ${CUDA_NVCC_FLAGS}
-    ${CUB_INCLUDE_PATH}
-    ${CMAKE_CURRENT_SOURCE_DIR}/include
-)
-target_link_libraries(gist ${CUDA_curand_LIBRARY} glog::glog)
+CMAKE_MINIMUM_REQUIRED(VERSION 3.23 FATAL_ERROR)
+
+project (GGNN LANGUAGES CXX CUDA)
+set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CUDA_STANDARD 20)
+set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
+set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
+
+# CUDA toolkit version 12 required for C++20 support
+find_package(CUDAToolkit 12 REQUIRED)
+
+# CMake does not reject g++9 which only has incomplete experimental support for C++20
+if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+  if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 10)
+    message(FATAL_ERROR "GCC or Clang version 10 or higher required for C++20 support!")
+  endif()
+else()
+  message(WARNING "Compilation has only been tested with GCC and Clang.")
+endif()
+
+# glog requires libunwind but does not look in the libunwind folder containing LLVM's version.
+# This is a workaround for using e.g. libunwind-18-dev on Ubuntu 24.04.
+find_path (Unwind_INCLUDE_DIR NAMES unwind.h libunwind.h PATH_SUFFIXES libunwind DOC "unwind include directory")
+
+include(FetchContent)
+# if not installed, fetch glog sources from github
+FetchContent_Declare(
+  glog
+  GIT_REPOSITORY https://github.com/google/glog.git
+  GIT_TAG 7b134a5c82c0c0b5698bb6bf7a835b230c5638e4 # release 0.7.1
+  FIND_PACKAGE_ARGS
+)
+FetchContent_Declare(
+  gflags
+  GIT_REPOSITORY https://github.com/gflags/gflags.git
+  GIT_TAG e171aa2d15ed9eb17054558e0b3a6a413bb01067 # release 2.2.2
+  FIND_PACKAGE_ARGS
+)
+FetchContent_MakeAvailable(glog gflags)
+
+find_package(glog REQUIRED)
+find_package(gflags)
+
+# optional (required for nanobind bindings)
+find_package(Python 3.8 COMPONENTS Interpreter Development.Module)
+
+if (Python_FOUND)
+  execute_process(
+    COMMAND "${Python_EXECUTABLE}" -m nanobind --cmake_dir
+    OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE NB_DIR)
+  list(APPEND CMAKE_PREFIX_PATH "${NB_DIR}")
+
+  find_package(nanobind CONFIG)
+endif()
+
+
+# Set a default configuration if none was specified
+if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
+  message(STATUS "No release type specified. Setting to 'Release'.")
+  set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE)
+  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "RelWithDebInfo")
+endif()
+
+message(STATUS "CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}")
+
+
+# Configure CUDA architectures to compile for
+if (NOT DEFINED GGNN_CUDA_ARCHITECTURES)
+  if (DEFINED ENV{TORCH_CUDA_ARCH_LIST})
+    message(STATUS "Parsing TORCH_CUDA_ARCH_LIST: $ENV{TORCH_CUDA_ARCH_LIST}")
+    string(REPLACE "." "" TORCH_CUDA_ARCHITECTURES $ENV{TORCH_CUDA_ARCH_LIST})
+    string(REPLACE ";" " " TORCH_CUDA_ARCHITECTURES ${TORCH_CUDA_ARCHITECTURES})
+    string(REPLACE " " "-real " TORCH_CUDA_ARCHITECTURES "${TORCH_CUDA_ARCHITECTURES} ")
+    string(REPLACE "+PTX-real" " " TORCH_CUDA_ARCHITECTURES ${TORCH_CUDA_ARCHITECTURES})
+    string(REPLACE " " ";" TORCH_CUDA_ARCHITECTURES ${TORCH_CUDA_ARCHITECTURES})
+    set(GGNN_CUDA_ARCHITECTURES ${TORCH_CUDA_ARCHITECTURES} CACHE STRING "CUDA architecture(s) to compile for.")
+  else()
+    # adjust this based on your available GPUs
+    # see https://developer.nvidia.com/cuda-gpus
+    # 6.1 - 1080Ti
+    # 7.0 - V100
+    # 7.5 - 2080Ti
+    # 8.6 - 3090
+    # 8.9 - 4090
+    # 9.0 - H100
+    # the following are also allowed:
+    # all-major
+    # native
+    set(GGNN_CUDA_ARCHITECTURES all-major CACHE STRING "CUDA architecture(s) to compile for.")
+    mark_as_advanced(CMAKE_CUDA_ARCHITECTURES)
+    # set_property(CACHE GGNN_CUDA_ARCHITECTURES PROPERTY STRINGS "all-major" "native" "61" "70" "75" "86" "89" "90")
+  endif()
+endif()
+
+set(CMAKE_CUDA_ARCHITECTURES ${GGNN_CUDA_ARCHITECTURES} CACHE STRING "CUDA architecture(s) to compile for." FORCE)
+message(STATUS "CMAKE_CUDA_ARCHITECTURES: ${CMAKE_CUDA_ARCHITECTURES}")
+
+
+# Set CUDA flags based on build type
+set(CMAKE_CUDA_FLAGS "--expt-relaxed-constexpr")
+if (CMAKE_BUILD_TYPE MATCHES "Debug")
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G -g -Xptxas=-v")
+elseif(CMAKE_BUILD_TYPE MATCHES "RelWithDebInfo")
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo -Xptxas=-v")
+endif()
+
+message(STATUS "CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}")
+
+
+# make compile commands available to clangd
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+# don't hide the include paths in a separate file
+set(CMAKE_CUDA_USE_RESPONSE_FILE_FOR_INCLUDES OFF)
+
+add_library(GGNNBase SHARED)
+target_sources(GGNNBase PRIVATE
+  "src/ggnn/base/graph_config.cpp"
+  "src/ggnn/base/graph.cpp"
+
+  "src/ggnn/base/data.cu"
+  "src/ggnn/base/dataset.cu"
+  "src/ggnn/base/gpu_instance.cu"
+  "src/ggnn/base/ggnn.cu"
+  "src/ggnn/base/result_merger.cpp"
+  "src/ggnn/base/eval.cpp"
+
+  "include/ggnn/base/lib.h"
+  "include/ggnn/base/result_merger.h"
+
+  "include/ggnn/base/graph_config.h"
+  "include/ggnn/base/graph.h"
+  "include/ggnn/base/gpu_instance.cuh"
+  )
+target_sources(GGNNBase PUBLIC
+  "include/ggnn/base/def.h"
+  "include/ggnn/base/fwd.h"
+
+  "include/ggnn/base/data.cuh"
+  "include/ggnn/base/dataset.cuh"
+  "include/ggnn/base/ggnn.cuh"
+  "include/ggnn/base/eval.h"
+  )
+
+add_library(GGNNConstruction SHARED)
+target_sources(GGNNConstruction PRIVATE
+  "src/ggnn/construction/graph_construction.cu"
+
+  "src/ggnn/construction/graph_buffer.cu"
+
+  "src/ggnn/construction/top_merge_layer.cu"
+  "src/ggnn/construction/merge_layer.cu"
+  "src/ggnn/construction/wrs_select_layer.cu"
+  "src/ggnn/construction/sym_buffer_merge_layer.cu"
+  "src/ggnn/construction/sym_query_layer.cu"
+
+  "include/ggnn/construction/graph_buffer.cuh"
+
+  "include/ggnn/construction/top_merge_layer.cuh"
+  "include/ggnn/construction/merge_layer.cuh"
+  "include/ggnn/construction/wrs_select_layer.cuh"
+  "include/ggnn/construction/sym_query_layer.cuh"
"include/ggnn/construction/sym_buffer_merge_layer.cuh" + + "include/ggnn/cuda_utils/distance.cuh" + "include/ggnn/cuda_utils/k_best_list.cuh" + "include/ggnn/cuda_utils/simple_knn_cache.cuh" + "include/ggnn/cuda_utils/simple_knn_sym_cache.cuh" + "include/ggnn/cuda_utils/check.cuh" + ) +target_sources(GGNNConstruction PUBLIC + "include/ggnn/construction/graph_construction.cuh" + ) +add_library(GGNNQuery SHARED) +target_sources(GGNNQuery PRIVATE + "src/ggnn/query/query_kernels.cu" + + "src/ggnn/query/bf_query_layer.cu" + "src/ggnn/query/query_layer.cu" + + "include/ggnn/query/bf_query_layer.cuh" + "include/ggnn/query/query_layer.cuh" + + "include/ggnn/cuda_utils/distance.cuh" + "include/ggnn/cuda_utils/k_best_list.cuh" + "include/ggnn/cuda_utils/simple_knn_cache.cuh" + "include/ggnn/cuda_utils/check.cuh" + ) +target_sources(GGNNQuery PUBLIC + "include/ggnn/query/query_kernels.cuh" + ) + +target_link_libraries(GGNNBase PRIVATE CUDA::curand glog::glog) +target_link_libraries(GGNNConstruction PRIVATE CUDA::curand glog::glog) +target_link_libraries(GGNNQuery PRIVATE glog::glog) +target_include_directories(GGNNBase PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include") +target_include_directories(GGNNConstruction PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include") +target_include_directories(GGNNQuery PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include") + +if (nanobind_FOUND) + message(STATUS "building nanobind module") + nanobind_add_module(GGNN NB_STATIC NOMINSIZE "src/ggnn/python/nanobind.cu") + target_link_libraries(GGNN PRIVATE GGNNBase GGNNConstruction GGNNQuery glog::glog) + nanobind_add_stub(GGNN_stub MODULE GGNN OUTPUT GGNN.pyi MARKER_FILE py.typed DEPENDS GGNN) + + if (PY_BUILD_CMAKE_MODULE_NAME) + message(STATUS "configuring python module installation") + set_property(TARGET GGNN APPEND PROPERTY INSTALL_RPATH "$ORIGIN") + install(TARGETS GGNNBase GGNNConstruction GGNNQuery GGNN + EXCLUDE_FROM_ALL + COMPONENT python_modules + DESTINATION ${PY_BUILD_CMAKE_MODULE_NAME}) + # install glog if we had to compile it ourselves + if (TARGET glog) + install(TARGETS glog + LIBRARY + EXCLUDE_FROM_ALL + COMPONENT python_modules + DESTINATION ${PY_BUILD_CMAKE_MODULE_NAME}) + endif() + install(FILES ${CMAKE_BINARY_DIR}/py.typed ${CMAKE_BINARY_DIR}/GGNN.pyi + EXCLUDE_FROM_ALL + COMPONENT python_modules + DESTINATION ${PY_BUILD_CMAKE_MODULE_NAME}) + endif() +endif() + +if (NOT PY_BUILD_CMAKE_MODULE_NAME) + if (gflags_FOUND) + set(files + "${CMAKE_CURRENT_SOURCE_DIR}/examples/cpp-and-cuda/ggnn_main.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/examples/cpp-and-cuda/ggnn_main_gpu_data.cu" + "${CMAKE_CURRENT_SOURCE_DIR}/examples/cpp-and-cuda/ggnn_main_multi_gpu.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/examples/cpp-and-cuda/ggnn_benchmark.cpp" + ) + foreach(filename ${files}) + get_filename_component(EXECUTABLE_NAME "${filename}" NAME_WLE) + add_executable(${EXECUTABLE_NAME} ${filename}) + target_link_libraries(${EXECUTABLE_NAME} PRIVATE GGNNBase GGNNConstruction GGNNQuery glog::glog gflags_nothreads_static) + endforeach() + + target_link_libraries(ggnn_main_gpu_data PRIVATE CUDA::curand) + else() + message(STATUS "gflags not found. 
Skipping example projects.") + endif() +endif() diff --git a/LICENSE b/LICENSE index d739a04..32d64ec 100644 --- a/LICENSE +++ b/LICENSE @@ -1,21 +1,201 @@ -MIT License - -Copyright (c) 2019 Computergraphics (University of Tübingen) - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2025 ComputerGraphics Tuebingen + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md index af6f431..71a6b8a 100644 --- a/README.md +++ b/README.md @@ -1,86 +1,181 @@ # GGNN: Graph-based GPU Nearest Neighbor Search -*Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch -Approximate nearest neighbor (ANN) search in high dimensions is an integral part of several computer vision systems and gains importance in deep learning with explicit memory representations. Since PQT and FAISS started to leverage the massive parallelism offered by GPUs, GPU-based implementations are a crucial resource for today’s state-of-the-art ANN methods. While most of these methods allow for faster queries, less emphasis is devoted to accelerate the construction of the underlying index structures. In this paper, we propose a novel search structure based on nearest neighbor graphs and information propagation on graphs. 
Our method is designed to take advantage of GPU architectures to accelerate the hierarchical building of the index structure and for performing the query. Empirical evaluation shows that GGNN significantly surpasses the state-of-the-art GPU- and CPU-based systems in terms of build-time, accuracy and search speed.
+GGNN performs nearest-neighbor computations on CUDA-capable GPUs.
+It supports billion-scale, high-dimensional datasets
+and can execute on multiple GPUs through sharding.
+When using just a single GPU, data can be exchanged directly with other code (e.g., torch tensors)
+without copying through CPU memory.
+GGNN is implemented in C++ and CUDA.
+It can also be used from Python (>=3.8) via its [nanobind](https://github.com/wjakob/nanobind) bindings.
 
----
+GGNN is based on the method proposed in the paper [GGNN: Graph-based GPU Nearest Neighbor Search](#citing-this-project)
+by Fabian Groh, Lukas Ruppert, Patrick Wieschollek, and Hendrik P.A. Lensch.
+The original/official code corresponding to the published paper can be found in the [release_0.5](https://github.com/cgtuebingen/ggnn/tree/release_0.5) branch.
+
+
+
+## Installing the Python Module
+
+### Prerequisites
+
+GGNN is implemented in C++20/CUDA.
+To compile and install the GGNN library, you need the [CUDA Toolkit](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/) version 12 or newer
+and a GCC or Clang C++ compiler, version 10 or newer.
 
-# Update: 03/14/21
-This work is currently under review.
-We've updated the code. Some new results are shown below.
-![GGNN Plots](ggnn_plots.png)
+To run GGNN, a CUDA-capable GPU is required.
 
+### Installing from PyPI
 
+We're currently setting this up and will update the readme once this is ready.
 
-## How to run the code?
+### Manual Installation
+
+Simply clone or download the repository and use pip to install the GGNN python module:
 
 ```bash
-# Get the repository and dependencies
-git clone --recursive https://github.com/cgtuebingen/ggnn.git
+git clone https://github.com/cgtuebingen/ggnn.git
 cd ggnn
+python -m pip install .
+```
+
+## Compiling the C++/CUDA Code
 
-# get the SIFT1M data
-cd data
-./get_sift1m
-cd ..
+To build the example programs for running benchmarks,
+or to use GGNN with your own C++ or CUDA code, compile it using CMake:
 
-# Build the demo
-mkdir build_local
-cd build_local
+```bash
+git clone https://github.com/cgtuebingen/ggnn.git
+cd ggnn
+mkdir build
+cd build
 cmake ..
-make
-
-# Example for SIFT1M on GPU 0:
-./sift1m_multi --base_filename ../data/sift/sift_base.fvecs
-               --query_filename ../data/sift/sift_query.fvecs
-               --groundtruth_filename ../data/sift/sift_groundtruth.ivecs
-               --gpu_ids="0"
-
-# Example usage with 2 GPUs and 4 shards in total (4x250k = 1M):
-./sift1m_multi --base_filename ../data/sift/sift_base.fvecs
-               --query_filename ../data/sift/sift_query.fvecs
-               --groundtruth_filename ../data/sift/sift_groundtruth.ivecs
-               --gpu_ids="0 1"
-               --factor 10000
-               --base 100
-               --shard 25
+make -j4
 ```
 
+### Prerequisites
 
-### Native build
+- NVCC 12 or newer (CUDA Toolkit 12 or newer)
+- either GCC (>=10) or Clang (>=10)
+  (e.g., `g++-10` `libstdc++-10-dev` or `clang-10` `libc++-10-dev` `libc++abi-10-dev` on Ubuntu)
+- `cmake` (>= 3.23)
+- [nanobind](https://github.com/wjakob/nanobind)
+  (`python -m pip install nanobind`)
+- [glog](https://github.com/google/glog)
+  (`libgoogle-glog-dev` on Ubuntu)
+- [gflags](https://github.com/gflags/gflags)
+  (`libgflags-dev` on Ubuntu)
 
-Requirements:
-* CUDA (>10.2)
-* libgflags-dev (`sudo apt install libgflags-dev`)
+The glog and gflags development libraries will be fetched automatically by CMake if they are not installed.
 
+### Troubleshooting
 
-### Docker build
-An alternative to the native build is to use nvidia-docker. Follow instruction on https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#installing-docker-ce
-
-Prepare the docker image by
+If your default C/C++ compilers are too old,
+you may need to manually specify a newer version before running `cmake`:
 
 ```bash
-cd docker
-make
-cd ../
+export CC=gcc-10
+export CXX=g++-10
+export CUDAHOSTCXX=g++-10
 ```
 
-Make sure you can run
+
+## Example Usage
+
+The GGNN python module can be used to perform GPU-accelerated approximate nearest-neighbor (ANN) queries using a search graph, or brute-force queries to determine the ground truth results.
+
+* First, you need to set up a GGNN instance.
+* Then, set the base dataset.
+  * Datasets can be given as CPU/CUDA torch tensors or as numpy arrays.
+* Given the base, you can build a search graph.
+* Using the search graph, you can run queries.
+* You can also run brute-force queries (no search graph required).
+* The brute-force results can be used to evaluate the accuracy of the ANN queries.
+
+```python
+#!/usr/bin/python3
+
+import ggnn
+import torch
+
+# get detailed logs
+ggnn.set_log_level(4)
+
+
+# create data
+base = torch.rand((10_000, 128), dtype=torch.float32, device='cpu')
+query = torch.rand((10_000, 128), dtype=torch.float32, device='cpu')
+
+
+# initialize ggnn
+my_ggnn = ggnn.GGNN()
+my_ggnn.set_base(base)
+
+# choose a distance measure
+measure = ggnn.DistanceMeasure.Euclidean
+
+# build the graph
+my_ggnn.build(k_build=24, tau_build=0.5, refinement_iterations=2, measure=measure)
+
+
+# run query
+k_query: int = 10
+tau_query: float = 0.64
+max_iterations: int = 400
+
+indices, dists = my_ggnn.query(query, k_query, tau_query, max_iterations, measure)
+
+
+# run brute-force query to get a ground truth and evaluate the results of the query
+gt_indices, gt_dists = my_ggnn.bf_query(query, k_gt=k_query, measure=measure)
+evaluator = ggnn.Evaluator(base, query, gt_indices, k_query=k_query)
+print(evaluator.evaluate_results(indices))
+
+# print the indices of the 10 NN of the first five queries and their squared Euclidean distances
+print('indices:', indices[:5], '\n squared dists:', dists[:5], '\n')
+
 ```
 
-```bash
-sudo docker run --gpus all cgtuebingen/ggnn:v1 nvidia-smi
-```
-
-Now build the code via
-
-```bash
-user@host $ sudo docker run --rm -it --user "$(id -u):$(id -g)" -v ${PWD}:/ggnn:rw --gpus all cgtuebingen/ggnn:v1 bash
-user@container $ ./build.sh
-cd build_docker
-make
-```
 
-## More Resources
+## Capabilities and Limitations
+
+The GGNN library supports...
+
+- Billion-scale datasets with up to 2^31-1 vectors.
+- Data with up to 4096 dimensions.
+- Building search graphs with up to 512 edges per node.
+- Searching for up to 6000 nearest neighbors.
+- Two distance measures: cosine and euclidean (L2) distance.
+
+## Citing this Project
+
+You can use the following BibTeX entry to cite GGNN:
+
+```bibtex
+@ARTICLE{groh2022ggnn,
+  author={Groh, Fabian and Ruppert, Lukas and Wieschollek, Patrick and Lensch, Hendrik P. A.},
+  journal={IEEE Transactions on Big Data},
+  title={GGNN: Graph-Based GPU Nearest Neighbor Search},
+  year={2023},
+  volume={9},
+  number={1},
+  pages={267-279},
+  doi={10.1109/TBDATA.2022.3161156}
+}
+```
 
+The official article can be found on IEEE, following this DOI: [10.1109/TBDATA.2022.3161156](https://doi.org/10.1109/TBDATA.2022.3161156).
+Alternatively, see the [ArXiV preprint](https://arxiv.org/abs/1912.01059).
+
+---
+We hope this library makes your life easier and helps you solve your problems!
+
+Happy programming,
+[Lukas Ruppert](https://github.com/LukasRuppert) and [Deborah Kornwolf](https://github.com/XDeboratti)
 
-- [Arxiv Pre-Print](https://arxiv.org/abs/1912.01059)
+PS: If you have a question or run into a problem, please feel free to open an issue; we will be happy to help you.
diff --git a/build.sh b/build.sh
deleted file mode 100755
index 4ffe357..0000000
--- a/build.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-cd /code
-rm -rf build_docker
-mkdir -p build_docker
-cd build_docker
-cmake .. -DCMAKE_BUILD_TYPE=Release
diff --git a/build_local/detect_cuda_compute_capabilities.cpp b/build_local/detect_cuda_compute_capabilities.cpp
deleted file mode 100644
index eb1bc19..0000000
--- a/build_local/detect_cuda_compute_capabilities.cpp
+++ /dev/null
@@ -1,15 +0,0 @@
-#include <cuda_runtime.h>
-#include <cstdio>
-int main()
-{
-  int count = 0;
-  if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;
-  if (count == 0) return -1;
-  for (int device = 0; device < count; ++device)
-  {
-    cudaDeviceProp prop;
-    if (cudaSuccess == cudaGetDeviceProperties(&prop, device))
-      std::printf("%d.%d ", prop.major, prop.minor);
-  }
-  return 0;
-}
diff --git a/data/get_sift1m.sh b/data/get_sift1m.sh
deleted file mode 100755
index f3c5c0a..0000000
--- a/data/get_sift1m.sh
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/bash
-
-wget ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz
-tar -xzf sift.tar.gz
diff --git a/docker/Dockerfile b/docker/Dockerfile
deleted file mode 100644
index d0c7a07..0000000
--- a/docker/Dockerfile
+++ /dev/null
@@ -1,18 +0,0 @@
-FROM nvidia/cuda:10.2-cudnn7-devel
-
-RUN APT_INSTALL="apt-get install -y --no-install-recommends" && \
-    rm -rf /var/lib/apt/lists/* \
-           /etc/apt/sources.list.d/cuda.list \
-           /etc/apt/sources.list.d/nvidia-ml.list && \
-    apt-get update && \
-    DEBIAN_FRONTEND=noninteractive $APT_INSTALL \
-    build-essential \
-    ca-certificates \
-    cmake \
-    python3.5 \
-    python-dev \
-    python3-dev \
-    libgflags-dev \
-    git
-
-WORKDIR /ggnn
diff --git a/docker/makefile b/docker/makefile
deleted file mode 100644
index 786c5bf..0000000
--- a/docker/makefile
+++ /dev/null
@@ -1,4 +0,0 @@
-.PHONY: run
-
-build:
-	sudo docker build . -t cgtuebingen/ggnn:v1
diff --git a/docs/requirements.txt b/docs/requirements.txt
new file mode 100644
index 0000000..53fc1f3
--- /dev/null
+++ b/docs/requirements.txt
@@ -0,0 +1,2 @@
+sphinx==7.1.2
+sphinx-rtd-theme==1.3.0rc1
diff --git a/docs/source/FAQ.rst b/docs/source/FAQ.rst
new file mode 100644
index 0000000..2bdfc94
--- /dev/null
+++ b/docs/source/FAQ.rst
@@ -0,0 +1,41 @@
+FAQ
+===
+
+- Installing the ``ggnn`` Python module fails.
+
+  Make sure you have the necessary :ref:`dependencies` installed
+  and see the :ref:`troubleshooting` section.
+
+- How do I benchmark datasets in HDF5 format, e.g. `ANN-Benchmarks`_?
+
+  See the :ref:`ann-benchmarks-hdf5` section in the :doc:`benchmarking` page
+  and the example file :file:`examples/python/sift1m_hdf5.py`
+  for an example of how to load and process an HDF5 file with GGNN using Python.
+
+.. _SIFT1B: http://corpus-texmex.irisa.fr/
+.. _ANN-Benchmarks: https://github.com/erikbern/ann-benchmarks/
+
+.. TODO: we should have an example Python script
+
+Known Issues
+------------
+
+- Calling a GGNN function with invalid parameters crashes my program
+  rather than returning an error / throwing an exception.
+
+  GGNN was initially designed just to run benchmarks and has not yet been fully transformed into a library.
+  We are working on providing better user feedback in case of errors,
+  but sometimes the hard sanity checks will trigger and ``abort()`` the program.
+
+  If there is any particular error case which should be handled better, please let us know.
+
+- GGNN copies data through the CPU even though my multi-GPU setup supports peer-to-peer copies.
+
+  Copying data from one GPU to another is not yet implemented.
+  Typically, this only affects the query, which is quite small.
+  When using a multi-GPU configuration, prefer to provide data on pinned CPU memory whenever possible.
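+
+  As a rough sketch (assuming the data already lives in a torch tensor,
+  as in the README example), pinned CPU memory can be obtained like this:
+
+  .. code:: python
+
+     import torch
+
+     # allocate the queries in regular CPU memory, then pin them so that
+     # the GPUs can fetch the data directly via asynchronous DMA transfers
+     query = torch.rand((10_000, 128), dtype=torch.float32)
+     query = query.pin_memory()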
+
+I can't find the answer to my question here
+-------------------------------------------
+
+Please feel free to open a new issue; we are happy to help!
diff --git a/docs/source/ann.rst b/docs/source/ann.rst
new file mode 100644
index 0000000..9cab6d4
--- /dev/null
+++ b/docs/source/ann.rst
@@ -0,0 +1,67 @@
+Approximate Nearest Neighbor Search
+===================================
+
+Approximate nearest neighbor (ANN) search is of great importance in various fields including databases, data mining, and machine learning.
+ANN search is derived from *k*-nearest-neighbor (kNN) search.
+In contrast to kNN methods, ANN methods deliver approximate results,
+but typically allow for much faster queries.
+
+.. _search graph parameters:
+
+Search Graph Parameters
+-----------------------
+
+In order to build a search graph for ANN search,
+GGNN requires two parameters:
+
+``k_build: int``
+  The number of outgoing edges per point in the dataset, typically :math:`k_{build} \in [20,96]`.
+  The maximum number of edges per point is 512 and the minimum is 2.
+  Higher values increase construction time and memory consumption
+  but typically improve query performance.
+``tau_build: float``
+  A cost factor, typically :math:`\tau_{build} \in [0.3,1.0]`.
+  Higher values increase construction time
+  but typically improve query performance.
+
+
+.. _query parameters:
+
+Query Parameters
+----------------
+
+``k_query: int``
+  The number of nearest neighbors to search for. Typically, :math:`10-100`.
+  Technically, up to :math:`6000` neighbors can be searched for before reaching the shared memory limit.
+
+.. note::
+  Due to the design of the stopping criterion,
+  it is advisable to always search for at least 10 nearest neighbors,
+  even when fewer results are required.
+
+``tau_query: float``
+  A cost factor, typically :math:`\tau_{query} \in [0.7,2.0]`.
+  Higher values increase query time but produce more accurate results.
+  There are diminishing returns in query accuracy when increasing this value.
+
+``max_iterations: int``
+  A hard limit on the number of search iterations to perform per query.
+  Each iteration visits one point in the search graph.
+  Typically, :math:`200-2000` iterations are appropriate.
+
+.. note::
+  If increasing ``tau_query`` and ``max_iterations`` does not yield sufficient accuracy,
+  try increasing ``k_build`` and ``tau_build`` during search graph construction.
+
+.. caution::
+  Increasing ``k_query`` and ``max_iterations`` increases the shared memory consumption
+  and may limit on-GPU parallelism (SM occupancy).
+  Increasing the query parameters too much can slow down the query significantly.
+
+
+.. _distance measures:
+
+Distance Measures
+-----------------
+
+GGNN supports ``Euclidean`` (L2) and ``Cosine`` distance measures.
+The ``measure`` can be specified during search graph construction and query and will default to ``Euclidean``.
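+
+
+Putting It All Together
+-----------------------
+
+The following sketch, using the Python module and random data and mirroring the
+example from the README, shows where each of the parameters described above enters
+a typical build-and-query cycle:
+
+.. code:: python
+
+   import ggnn
+   import torch
+
+   base = torch.rand((10_000, 128), dtype=torch.float32)
+   query = torch.rand((1_000, 128), dtype=torch.float32)
+
+   my_ggnn = ggnn.GGNN()
+   my_ggnn.set_base(base)
+
+   # search graph parameters: k_build and tau_build trade
+   # construction time for query performance
+   my_ggnn.build(k_build=24, tau_build=0.5,
+                 measure=ggnn.DistanceMeasure.Euclidean)
+
+   # query parameters: search for the k_query nearest neighbors;
+   # tau_query and max_iterations trade query time for accuracy
+   indices, dists = my_ggnn.query(query, 10, 0.7, 400,
+                                  ggnn.DistanceMeasure.Euclidean)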
diff --git a/docs/source/benchmarking.rst b/docs/source/benchmarking.rst
new file mode 100644
index 0000000..acc0e23
--- /dev/null
+++ b/docs/source/benchmarking.rst
@@ -0,0 +1,162 @@
+Benchmarking
+============
+
+Running Standardized Benchmarks
+-------------------------------
+
+In order to run standardized ANN benchmarks, you can use the Python module
+to run your own benchmark scripts,
+or you can run the example program :program:`ggnn_benchmark`,
+which is compiled alongside the C++ library
+and can be applied to arbitrary datasets.
+
+Everything dataset-specific can be configured via the following command line parameters:
+
+``base``
+  Path to the base dataset ``.fvecs`` or ``.bvecs`` file.
+
+``subset`` (optional)
+  In case you want to only load a subset of the base dataset,
+  you can specify the size of that subset here.
+  Only the first ``subset`` many points will be loaded.
+  By default, or if set to ``0``, the entire base dataset file will be loaded.
+
+``query``
+  Path to the query dataset ``.fvecs`` or ``.bvecs`` file.
+
+``gt`` (optional)
+  Path to the ground truth indices ``.ivecs`` file.
+
+  .. note::
+
+    If not given, the ground truth will be brute-forced, if possible.
+
+    If a file name is given, but the file does not exist, the brute-forced result will be stored.
+
+``graph_dir`` (optional)
+  Directory for loading/storing the GGNN graph or graph shards.
+
+
+  .. note::
+
+    If the directory already contains a GGNN graph, it will be loaded and construction will be skipped.
+    Otherwise, the constructed graph will be stored in this directory.
+
+  .. note::
+    If left empty, the graph will be discarded when the program ends.
+
+    If necessary (i.e., if GPU memory is insufficient to keep all shards loaded),
+    GGNN will swap out shards from GPUs to RAM and disk automatically in multi-shard settings.
+
+    In that case, GGNN graph shards will be stored in the current working directory.
+
+``k_build`` (optional, default ``24``)
+  Number of neighbors per point in the search graph (see :ref:`search graph parameters`).
+
+``tau_build`` (optional, default ``0.5``)
+  Slack factor for search graph construction (see :ref:`search graph parameters`).
+
+``refinement_iterations`` (optional, default ``2``)
+  Number of iterations for search graph refinement.
+
+``k_query`` (optional, default ``10``)
+  Number of neighbors to search for (see :ref:`query parameters`).
+
+``measure`` (optional, default ``euclidean``)
+  Distance measure (``euclidean`` or ``cosine``) (see :ref:`distance measures`).
+
+``shard_size`` (optional)
+  Number of points per shard.
+  With sharding, the base dataset is split into equally-sized shards.
+  This parameter defines the size of one shard.
+
+  .. caution::
+
+    The base dataset needs to be evenly divisible by the shard size.
+    The resulting number of shards needs to be evenly divisible by the number of GPUs.
+
+``gpu_ids`` (optional)
+  CUDA device indices of the GPUs to be used by GGNN, separated by spaces.
+  E.g., ``'0 1 2 3'``.
+
+  .. note::
+
+    Using multiple GPUs requires sharding (see ``shard_size``).
+
+  .. tip::
+
+    CUDA device indices can be influenced by the `CUDA Environment Variables`_
+    ``CUDA_VISIBLE_DEVICES`` and ``CUDA_DEVICE_ORDER``.
+
+``grid_search`` (optional)
+  If set, run a larger sweep of queries with :math:`\tau_{query} \in [0.7, 2.0]`
+  rather than just a small set of queries.
+
+``v`` (optional)
+  Verbosity level between ``0`` and ``4`` (maximum verbosity).
+
+
+.. _CUDA Environment Variables: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars
+
+
+.. code:: bash
+
+  ./build/ggnn_benchmark \
+      --base /path/to/sift_base.fvecs \
+      --query /path/to/sift_query.fvecs \
+      --gt /path/to/sift_groundtruth.ivecs \
+      --graph_dir ./ \
+      --tau_build 0.5 \
+      --refinement_iterations 2 \
+      --k_build 24 \
+      --k_query 10 \
+      --measure euclidean \
+      --shard_size 0 \
+      --subset 0 \
+      --gpu_ids 0 \
+      --grid_search false
+
+.. _ann-benchmarks-hdf5:
+
+ANN-Benchmarks / HDF5
+---------------------
+
+In order to run a benchmark from `ANN-Benchmarks`_, you might want to load a dataset from an HDF5 file.
+You can do so with a simple Python script:
+
+.. code:: python
+
+   import h5py
+   import numpy as np
+
+   # load ANN-benchmark-style HDF5 dataset
+   with h5py.File(path_to_dataset, 'r') as f:
+       base = np.array(f['train'])
+       query = np.array(f['test'])
+       gt = np.array(f['neighbors'])
+
+
+See also the example file :file:`examples/python/sift1m_hdf5.py`.
+
+.. _ANN-Benchmarks: https://github.com/erikbern/ann-benchmarks/
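+
+Once loaded, the arrays can be passed to GGNN like any other dataset.
+A minimal sketch continuing the snippet above (the parameter values are illustrative):
+
+.. code:: python
+
+   import ggnn
+
+   my_ggnn = ggnn.GGNN()
+   my_ggnn.set_base(base)
+   my_ggnn.build(k_build=24, tau_build=0.5)
+
+   indices, dists = my_ggnn.query(query, 10, 0.7, 400)
+
+   # the 'neighbors' array from the HDF5 file serves as the ground truth
+   evaluator = ggnn.Evaluator(base, query, gt, k_query=10)
+   print(evaluator.evaluate_results(indices))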
+
+
+Reference Configurations
+------------------------
+
+The default values in the :program:`ggnn_benchmark` program are chosen for the `SIFT1M`_ dataset.
+For other datasets, set the parameters as documented in the GGNN paper.
+
+.. TODO: parameters per dataset and some expected query results.
+
+.. note::
+  We will update this documentation shortly to reference all necessary configurations.
+
+  For now, check the ``.cu`` files per dataset under ``src`` in the `release_0.5`_ branch
+  and the official paper :ref:`GGNN: Graph-based GPU Nearest Neighbor Search <citing-this-project>`.
+
+
+.. _SIFT1M: http://corpus-texmex.irisa.fr/
+.. _release_0.5: https://github.com/cgtuebingen/ggnn/tree/release_0.5
diff --git a/docs/source/conf.py b/docs/source/conf.py
new file mode 100644
index 0000000..030679c
--- /dev/null
+++ b/docs/source/conf.py
@@ -0,0 +1,35 @@
+# Configuration file for the Sphinx documentation builder.
+
+# -- Project information
+
+project = 'GGNN'
+copyright = '2025, Computer Graphics, University of Tübingen'
+author = 'Lukas Ruppert, Deborah Kornwolf'
+
+version = '0.9'
+release = '0.9.0'
+
+# -- General configuration
+
+extensions = [
+    'sphinx.ext.duration',
+    'sphinx.ext.doctest',
+    'sphinx.ext.autodoc',
+    'sphinx.ext.autosummary',
+    'sphinx.ext.intersphinx',
+]
+
+intersphinx_mapping = {
+    'python': ('https://docs.python.org/3/', None),
+    'sphinx': ('https://www.sphinx-doc.org/en/master/', None),
+}
+intersphinx_disabled_domains = ['std']
+
+templates_path = ['_templates']
+
+# -- Options for HTML output
+
+html_theme = 'sphinx_rtd_theme'
+
+# -- Options for EPUB output
+epub_show_urls = 'footnote'
diff --git a/docs/source/index.rst b/docs/source/index.rst
new file mode 100644
index 0000000..962828a
--- /dev/null
+++ b/docs/source/index.rst
@@ -0,0 +1,71 @@
+GGNN Documentation
+==================
+
+`GGNN`_ performs nearest-neighbor computations on CUDA-capable GPUs.
+It supports billion-scale, high-dimensional datasets
+and can execute on multiple GPUs through sharding.
+When using just a single GPU, data can be exchanged directly with other code (e.g., torch tensors)
+without copying through CPU memory.
+GGNN is implemented in C++ and CUDA.
+It can also be used from Python (>=3.8) via its `nanobind`_ bindings.
+
+GGNN is based on the method proposed in the paper :ref:`GGNN: Graph-based GPU Nearest Neighbor Search <citing-this-project>`
+by Fabian Groh, Lukas Ruppert, Patrick Wieschollek, and Hendrik P.A. Lensch.
+The original/official code corresponding to the published paper can be found in the `release_0.5`_ branch.
+
+The :doc:`install` section explains how to install the library, and the :doc:`usage_python` and :doc:`usage_cpp` sections provide short tutorials and code examples.
+
+.. _GGNN: https://github.com/cgtuebingen/ggnn
+.. _release_0.5: https://github.com/cgtuebingen/ggnn/tree/release_0.5
+.. _nanobind: https://github.com/wjakob/nanobind
+
+Contents
+--------
+
+.. toctree::
+
+   Home <self>
+   install
+   ann
+   usage_python
+   usage_cpp
+   benchmarking
+   FAQ
+
+Capabilities and Limitations
+----------------------------
+
+The GGNN library supports...
+
+- Billion-scale datasets with up to :math:`2^{31}-1` vectors.
+- Data with up to 4096 dimensions.
+- Building search graphs with up to 512 edges per node.
+- Searching for up to 6000 nearest neighbors.
+- Two distance measures: cosine and euclidean (L2) distance.
+
+.. _citing-this-project:
+
+Citing this Project
+-------------------
+
+You can use the following BibTeX entry to cite GGNN:
+
+.. code-block:: bibtex
+
+   @ARTICLE{groh2022ggnn,
+     author={Groh, Fabian and Ruppert, Lukas and Wieschollek, Patrick and Lensch, Hendrik P. A.},
+     journal={IEEE Transactions on Big Data},
+     title={GGNN: Graph-Based GPU Nearest Neighbor Search},
+     year={2023},
+     volume={9},
+     number={1},
+     pages={267-279},
+     doi={10.1109/TBDATA.2022.3161156}
+   }
+
+
+The official article can be found on IEEE, following this DOI: `10.1109/TBDATA.2022.3161156`_.
+Alternatively, see the `ArXiV preprint`_.
+
+.. _10.1109/TBDATA.2022.3161156: https://doi.org/10.1109/TBDATA.2022.3161156
+.. _ArXiV preprint: https://arxiv.org/abs/1912.01059
diff --git a/docs/source/install.rst b/docs/source/install.rst
new file mode 100644
index 0000000..36044bb
--- /dev/null
+++ b/docs/source/install.rst
@@ -0,0 +1,134 @@
+Installation
+============
+
+GGNN can be installed as a Python module or compiled as a library for C++/CUDA code.
+
+.. _dependencies:
+
+Dependencies
+------------
+
+The following dependencies are required to install the library:
+
+- A C++20 compiler and standard library (GCC or Clang version 10 or higher)
+- `CUDA Toolkit`_ version 12 or higher
+
+  - This includes the Nvidia CUDA compiler ``nvcc``
+
+The existence and version of these dependencies can be checked with::
+
+    nvcc --version
+
+and::
+
+    c++ --version
+
+Installing the GGNN Python Module
+---------------------------------
+
+To install GGNN, first the repository has to be cloned::
+
+    git clone https://github.com/cgtuebingen/ggnn.git
+
+The easiest way to install GGNN is from within the cloned repository::
+
+    cd ggnn
+
+The ``ggnn`` module can then be installed using the package manager pip::
+
+    python3 -m pip install .
+
+
+.. note::
+  Automatic installation via ``pip install ggnn`` is under development.
+
+
+Installing the GGNN C++ Library
+-------------------------------
+
+To install GGNN, first the repository has to be cloned::
+
+    git clone https://github.com/cgtuebingen/ggnn.git
+
+The easiest way to install GGNN is from within the cloned repository::
+
+    cd ggnn
+
+The GGNN library can then be built::
+
+    mkdir build
+    cd build
+    cmake ..
+    make -j4
+
+
+.. _troubleshooting:
+
+Troubleshooting
+---------------
+
+In case GGNN does not compile, check your CUDA and C++ compilers:
+
+CUDA
+  In case ``nvcc`` cannot be found by ``cmake``, you may get one of the following errors:
+
+  - ``Failed to find nvcc.``
+  - ``Compiler requires the CUDA toolkit.``
+  - ``-- The CUDA compiler identification is unknown``
+  - ``Failed to detect a default CUDA architecture.``
+
+
+  Set the ``PATH`` and ``LD_LIBRARY_PATH`` to your installed `CUDA Toolkit`_, e.g.:
+
+
+
+Installing the GGNN C++ Library
+-------------------------------
+
+To install GGNN, the repository first has to be cloned::
+
+    git clone https://github.com/cgtuebingen/ggnn.git
+
+The easiest way to install GGNN is from the folder containing the repository::
+
+    cd ggnn
+
+The GGNN library can then be built::
+
+    mkdir build
+    cd build
+    cmake ..
+    make -j4
+
+
+.. _troubleshooting:
+
+Troubleshooting
+---------------
+
+In case GGNN does not compile, check your CUDA and C++ compilers:
+
+CUDA
+    In case ``nvcc`` cannot be found by ``cmake``, you may get one of the following errors:
+
+    - ``Failed to find nvcc.``
+    - ``Compiler requires the CUDA toolkit.``
+    - ``-- The CUDA compiler identification is unknown``
+    - ``Failed to detect a default CUDA architecture.``
+
+
+    Set the ``PATH`` and ``LD_LIBRARY_PATH`` to your installed `CUDA Toolkit`_, e.g.:
+
+    .. code-block:: bash
+
+        export PATH="/usr/local/cuda-12.4/bin/:${PATH}"
+        export LD_LIBRARY_PATH="/usr/local/cuda-12.4/lib64/:${LD_LIBRARY_PATH}"
+
+    Now, ``nvcc --version`` should print something like this::
+
+        nvcc: NVIDIA (R) Cuda compiler driver
+        Copyright (c) 2005-2024 NVIDIA Corporation
+        Built on Thu_Mar_28_02:18:24_PDT_2024
+        Cuda compilation tools, release 12.4, V12.4.131
+        Build cuda_12.4.r12.4/compiler.34097967_0
+
+C++
+    For compilation, we support GCC's ``g++`` >= 10 and LLVM's ``clang++`` >= 10.
+
+    If you use a different compiler or an outdated version, you may get the following message:
+
+    - ``GCC or Clang version 10 or higher required for C++20 support!``
+
+    You can define which C/C++ compilers should be used with the following environment variables:
+
+    .. code-block:: bash
+
+        # E.g., to use GCC 10, set the following:
+        export CC=gcc-10
+        export CXX=g++-10
+        export CUDAHOSTCXX=g++-10
+
+    Also, make sure to have the matching C++ standard library version installed.
+
+    For GCC 10, install the following on Ubuntu:
+
+    ``g++-10`` and ``libstdc++-10-dev``
+
+    For Clang 10, install the following on Ubuntu:
+
+    ``clang-10``, ``libc++-10-dev``, and ``libc++abi-10-dev``
+
+    Proceed similarly for newer versions.
+
+    This has been tested on Ubuntu 20.04.
+    Newer Ubuntu versions ship newer compiler versions by default,
+    e.g., GCC 13 and Clang 18 on Ubuntu 24.04,
+    which should work out-of-the-box.
+
+CMake
+    Make sure to re-run ``cmake`` in a fresh ``build`` folder after exporting these environment variables.
+    Otherwise, ``cmake`` may use settings from a cached configuration.
+
+
+.. _CUDA Toolkit: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/
diff --git a/docs/source/usage_cpp.rst b/docs/source/usage_cpp.rst
new file mode 100644
index 0000000..236d8bb
--- /dev/null
+++ b/docs/source/usage_cpp.rst
@@ -0,0 +1,294 @@
+Using the GGNN C++ Library
+==========================
+
+This section explains how to use the GGNN C++ library.
+
+You can find all the code from this tutorial and additional example files in the :file:`examples/cpp-and-cuda/` folder of the GGNN repository.
+
+Including GGNN
+--------------
+
+Before using GGNN, the ``ggnn/base/ggnn.cuh`` header has to be included from the GGNN library.
+For convenience, we include some parts of the standard library
+and use the ``ggnn`` namespace to avoid prefixing all GGNN classes with ``ggnn::``.
+
+.. code:: c++
+
+    #include <ggnn/base/ggnn.cuh>
+    #include <array>
+    #include <cstdint>
+    #include <iostream>
+    #include <random>
+
+    using namespace ggnn;
+
+The header files from the standard library are only for demonstration purposes and are not required for using the library.
+
+Using CPU Data
+--------------
+
+Then, some data to search in and some data to search the *k* nearest neighbors for is needed.
+Instead of a ``std::array``, you can also use a ``std::vector``
+or any other standard C++ container which can be mapped to a ``std::span``
+(see the sketch after the following code block):
+
+.. code:: c++
+
+    int main() {
+
+      const size_t N_base = 10'000;
+      const size_t N_query = 10'000;
+      const uint32_t dim = 128;
+
+      // the data to query on
+      std::array<float, N_base * dim> base_data;
+      // the data to query for
+      std::array<float, N_query * dim> query_data;
+
+      // setup a random number generator
+      std::default_random_engine prng {};
+      std::uniform_real_distribution uniform{0.0f, 1.0f};
+
+      // generate the random data
+      for (float& x : base_data)
+        x = uniform(prng);
+      for (float& x : query_data)
+        x = uniform(prng);
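+
+For example, the same data could be held in a ``std::vector`` instead
+(a sketch based on :file:`examples/cpp-and-cuda/ggnn_main.cpp`):
+
+.. code:: c++
+
+      // heap-allocated alternative to the std::array above
+      std::vector<float> base_data(N_base * dim);
+      std::vector<float> query_data(N_query * dim);
+
+      // fill with random data as before
+      for (float& x : base_data)
+        x = uniform(prng);
+      for (float& x : query_data)
+        x = uniform(prng);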
+
+Then, a GGNN instance and the datasets can be initialized:
+
+.. code:: c++
+
+      /// data type for addressing points
+      using KeyT = int32_t;
+      /// data type of computed distances
+      using ValueT = float;
+      using GGNN = GGNN<KeyT, ValueT>;
+
+      // Initialize GGNN
+      GGNN ggnn{};
+
+      // Initialize the datasets containing the base data and query data
+      Dataset<float> base = Dataset<float>::copy(base_data, dim, true);
+      Dataset<float> query = Dataset<float>::copy(query_data, dim, true);
+
+Instead of copying the data, data on the host can also be referenced with ``referenceCPUData()`` and data on the GPU can be referenced with ``referenceGPUData()``
+(see the sketch after the following note).
+
+.. caution::
+
+   When referencing data, make sure its lifetime exceeds the lifetime of the GGNN instance.
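+
+As an alternative to the ``copy()`` call above, the host data could be referenced like this
+(a sketch; ``referenceCPUData()`` takes a pointer, the number of vectors, and their dimension):
+
+.. code:: c++
+
+      // reference the host data without copying it
+      Dataset<float> base = Dataset<float>::referenceCPUData(base_data.data(), N_base, dim);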
+
+If the data is a dataset in fvecs or bvecs format, it can be loaded with ``Dataset::load(path_to_file)``.
+
+The base has to be passed to GGNN:
+
+.. code:: c++
+
+      ggnn.setBaseReference(base);
+
+Now, GGNN is ready to be used and a graph can be built:
+
+.. code:: c++
+
+      // build the search graph
+      ggnn.build(/*k_build*/ 24, /*tau_build*/ 0.5f);
+
+The parameters are the same as when :doc:`using the Python module <usage_python>` and are also further explained in the :ref:`search graph parameters` section.
+In addition to ``k_build`` and ``tau_build``, you can also specify the number of ``refinement_iterations`` and the ``measure``.
+The measure can either be ``DistanceMeasure::Euclidean`` or ``DistanceMeasure::Cosine``.
+
+.. code:: c++
+
+      // run the query and store indices & squared distances
+      const uint32_t KQuery = 10;
+      const auto [indices, dists] = ggnn.query(query, KQuery, /*tau_query*/ 0.5f);
+
+The parameters of the query are again the same as when :doc:`using the Python module <usage_python>` and are further explained in the :ref:`query parameters` section.
+You can specify the ``query``, ``KQuery``, ``tau_query``, ``max_iterations``, and the ``measure``.
+
+Finally, the example program prints the indices and squared euclidean distances of the 10 nearest neighbors of the first query:
+
+.. code:: c++
+
+      // print the results for the first query
+      std::cout << "Result for the first query vector: \n";
+      for (uint32_t i = 0; i < KQuery; i++) {
+        std::cout << "Distance to vector at base[";
+        std::cout.width(5);
+        std::cout << indices[i];
+        std::cout << "]: " << dists[i] << "\n";
+      }
+      return 0;
+    }
+
+
+Using GPU Data
+--------------
+
+In the following, the data is assumed to already be located on the GPU.
+For demonstration purposes, we generate some random data using `cuRAND`_:
+
+.. code:: c++
+
+    #include <cuda_runtime.h>
+    #include <curand.h>
+
+    #include <cstdint>
+    #include <iostream>
+
+    #include <ggnn/base/ggnn.cuh>
+    #include <ggnn/base/dataset.cuh>
+
+    using namespace ggnn;
+
+    int main() {
+
+      /// data type for addressing points
+      using KeyT = int32_t;
+      /// data type of computed distances
+      using ValueT = float;
+      using GGNN = GGNN<KeyT, ValueT>;
+
+      // create data on the GPU
+      size_t N_base {10'000};
+      size_t N_query {10'000};
+      uint32_t D {128};
+
+      float* base;
+      float* query;
+
+      // allocate GPU data
+      cudaMalloc(&base, N_base*D*sizeof(float));
+      cudaMalloc(&query, N_query*D*sizeof(float));
+
+      // setup the random number generator
+      curandGenerator_t generator;
+      curandCreateGenerator(&generator, CURAND_RNG_PSEUDO_DEFAULT);
+
+      // generate some random data
+      curandGenerateUniform(generator, base, N_base*D);
+      curandGenerateUniform(generator, query, N_query*D);
+
+Next, GGNN has to be initialized and, to avoid a copy, the data can be referenced:
+
+.. code:: c++
+
+      // Initialize GGNN
+      GGNN ggnn{};
+
+      // Set the data on the GPU as the base dataset on which the graph should be built.
+      // To reference existing data, specify its pointer, the number of base vectors N_base,
+      // the dimensionality of base vectors D, and the gpu_id of the GPU containing the data.
+      int32_t gpu_id = 0;
+      ggnn.setBase(ggnn::Dataset<float>::referenceGPUData(base, N_base, D, gpu_id));
+
+      // Also reference the query data already existing on the GPU
+      auto d_query = ggnn::Dataset<float>::referenceGPUData(query, N_query, D, gpu_id);
+
+Now, build a search graph using GGNN and run a query:
+
+.. code:: c++
+
+      // build the search graph
+      const uint32_t KBuild = 24;
+      const float tau_build = 0.5f;
+      ggnn.build(KBuild, tau_build);
+
+      // run the query and store indices & distances
+      const int32_t KQuery = 10;
+      const auto [indices, dists] = ggnn.query(d_query, KQuery, 0.5f);
+
+      // print the results for the first query
+      std::cout << "Result for the first query vector: \n";
+      for (uint32_t i = 0; i < KQuery; i++) {
+        std::cout << "Distance to vector at base[";
+        std::cout.width(5);
+        std::cout << indices[i];
+        std::cout << "]: " << dists[i] << "\n";
+      }
+
+.. note::
+
+   While the query data is given on the GPU, results are still returned to the CPU by default.
+
+Finally, some cleanup:
+
+.. code:: c++
+
+      // cleanup
+      curandDestroyGenerator(generator);
+      cudaFree(base);
+      cudaFree(query);
+
+      return 0;
+    }
+
+.. _cuRAND: https://docs.nvidia.com/cuda/curand/index.html
+
+Using multiple GPUs
+-------------------
+
+To work on multiple GPUs, GGNN uses sharding.
+
+A shard is a portion of the base dataset for which an individual search graph, a "graph shard", is built.
+To make sure no base vector is left out, the base dataset needs to be evenly divisible by ``shard_size``.
+During query, all graph shards are searched and the results of all shards are then merged on the CPU.
+Shards are distributed equally across all GPUs.
+Therefore, the number of shards has to be evenly divisible by the number of GPUs used.
+
+To tell GGNN which GPUs to use, use the ``setGPUs`` method.
+To set the shard size, use ``setShardSize``:
+
+.. code:: c++
+
+      // initialize GGNN
+      GGNN ggnn;
+
+      ggnn.setBaseReference(base);
+
+      // configure which GPUs to use
+      ggnn.setGPUs({0,1});
+
+      // split dataset into shards of this size
+      ggnn.setShardSize(25'000);
+
+In case the GPU memory is insufficient to keep all assigned graph and base shards in memory,
+shards will automatically be swapped out to CPU memory and to disk.
+You can specify a CPU memory limit and the directory in which the swapped-out shards will be stored:
+
+.. code:: c++
+
+      // use 64 GB of CPU memory for swapping out shards
+      const size_t available_memory = 64UL * 1024 * 1024 * 1024;
+      ggnn.setCPUMemoryLimit(available_memory);
+
+      ggnn.setWorkingDirectory("/some/path/for/swapping/out/shards");
+
+Once everything is set up, build and query the search graph as usual:
+
+.. code:: c++
+
+      // build a search graph for all shards
+      ggnn.build(/*KBuild*/ 24, /*tau_build*/ 0.5f);
+
+      // query all shards and return the merged result
+      const auto [indices, dists] = ggnn.query(query, KQuery, /*tau_query*/ 0.5f);
+
+
+Loading Datasets (e.g. SIFT1M)
+------------------------------
+
+GGNN can load datasets in ``.fvecs``, ``.bvecs``, and ``.ivecs`` format
+for benchmark datasets such as `SIFT1M`_ and `SIFT1B`_.
+
+.. code:: c++
+
+      Dataset<float> sift1m_base = Dataset<float>::load("/path/to/sift_base.fvecs");
+      Dataset<uint8_t> sift1b_base = Dataset<uint8_t>::load("/path/to/bigann_base.bvecs");
+      Dataset<int32_t> sift1m_gt = Dataset<int32_t>::load("/path/to/sift_groundtruth.ivecs");
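+
+A loaded ground truth can then drive the C++ ``Evaluator``, just as in :file:`examples/cpp-and-cuda/ggnn_benchmark.cpp`.
+The following is a sketch; we assume the ``Evaluator`` takes the same key and value types as the ``GGNN`` instance:
+
+.. code:: c++
+
+      Dataset<float> sift1m_query = Dataset<float>::load("/path/to/sift_query.fvecs");
+
+      ggnn.setBaseReference(sift1m_base);
+      ggnn.build(/*k_build*/ 24, /*tau_build*/ 0.5f);
+
+      // compare the query results against the loaded ground truth
+      Evaluator<KeyT, ValueT> eval{sift1m_base, sift1m_query, sift1m_gt,
+                                   /*KQuery*/ 10, DistanceMeasure::Euclidean};
+      const auto [indices, dists] = ggnn.query(sift1m_query, 10, /*tau_query*/ 0.64f, 400);
+      std::cout << eval.evaluateResults(indices);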
+
+
+.. _SIFT1M: http://corpus-texmex.irisa.fr/
+.. _SIFT1B: http://corpus-texmex.irisa.fr/
+
diff --git a/docs/source/usage_python.rst b/docs/source/usage_python.rst
new file mode 100644
index 0000000..f1859d1
--- /dev/null
+++ b/docs/source/usage_python.rst
@@ -0,0 +1,249 @@
+Using the GGNN Python Module
+============================
+
+This section explains how to use the ``ggnn`` Python module.
+
+While written in C++/CUDA, GGNN can be used from Python via its `nanobind`_ bindings.
+
+.. _nanobind: https://github.com/wjakob/nanobind
+
+
+The code from this tutorial and additional examples can be found in the :file:`ggnn/examples/python/ggnn_pytorch.py` file of the GGNN repository.
+
+Importing GGNN
+--------------
+
+After installing the ggnn module, it needs to be imported.
+``ggnn.set_log_level(4)`` enables verbose logging of information to the console during the execution of the algorithm.
+The higher the log level (``0`` to ``4``), the more information is printed.
+By default, the log level is set to ``0``:
+
+.. code:: python
+
+    #! /usr/bin/python3
+
+    import ggnn
+
+    # get detailed logs
+    ggnn.set_log_level(4)
+
+Using CPU Data
+--------------
+
+For demonstration purposes, we will create some random example data using torch:
+
+.. code:: python
+
+    import torch
+
+    # create data
+    base = torch.rand((10_000, 128), dtype=torch.float32, device='cpu')
+    query = torch.rand((10_000, 128), dtype=torch.float32, device='cpu')
+
+.. note::
+
+   You can also use numpy arrays instead of torch tensors.
+
+
+The next step is to create an instance of the GGNN class from the ggnn module.
+The GGNN class needs the base data (``my_ggnn.set_base(base)``) and can then build the graph:
+
+.. code:: python
+
+    # initialize ggnn
+    my_ggnn = ggnn.GGNN()
+    my_ggnn.set_base(base)
+
+    # choose a distance measure
+    measure = ggnn.DistanceMeasure.Euclidean
+
+    # build the graph
+    my_ggnn.build(k_build=24, tau_build=0.5, refinement_iterations=2, measure=measure)
+
+
+The parameters of the ``build(k_build, tau_build, refinement_iterations, measure)`` function need some explanation.
+``k_build >= 2`` describes the number of outgoing edges per node in the graph.
+The larger ``k_build``, the longer the build and the query take.
+``tau_build`` influences the stopping criterion during the creation of the graph.
+The larger ``tau_build``, the longer the build takes.
+Typically, :math:`0.3 < \tau_{build} < 1` is enough to get good results during search.
+It is recommended to experiment with these parameters to get the best possible trade-off between build time and accuracy out of the search
+(see the sketch following the caution below).
+See the paper :ref:`GGNN: Graph-based GPU Nearest Neighbor Search <citing-this-project>` and the :ref:`search graph parameters` section for more information on the parameters and some examples.
+``measure`` is the distance measure used to compare vectors.
+The ggnn module supports cosine and euclidean (L2) distance;
+euclidean distance is the default, so passing this parameter is optional.
+
+.. caution::
+
+   The distance measure for building, querying, and computing the ground truth should be the same.
+   If set explicitly, make sure to provide its value to all functions.
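+
+As a starting point for such experiments, you can sweep the query-time parameters on a fixed graph
+and compare the resulting accuracy (a minimal sketch; it reuses ``my_ggnn``, ``base``, ``query``, and ``measure`` from above
+and the brute-force ground truth explained further below):
+
+.. code:: python
+
+    # ground truth for measuring accuracy
+    gt_indices, gt_dists = my_ggnn.bf_query(query, k_gt=10, measure=measure)
+    evaluator = ggnn.Evaluator(base, query, gt_indices, k_query=10)
+
+    # trade accuracy against query time by varying tau_query
+    for tau_query in [0.34, 0.41, 0.51, 0.64]:
+        indices, dists = my_ggnn.query(query, 10, tau_query, 400, measure)
+        print(tau_query, evaluator.evaluate_results(indices))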
+
+
+Now, the approximate nearest neighbor search can be performed:
+
+.. code:: python
+
+    # run query
+    k_query: int = 10
+    tau_query: float = 0.64
+    max_iterations: int = 400
+
+    indices, dists = my_ggnn.query(query, k_query, tau_query, max_iterations, measure)
+
+
+The parameters of ``query(query, k_query, tau_query, max_iterations, measure)`` are:
+
+- ``query`` contains all the vectors for which to search the *k* nearest neighbors.
+- ``k_query`` tells the search algorithm how many neighbors it should return per query vector.
+  Generally, the higher ``k_query``, the longer the search.
+  The ggnn module supports up to 6000 neighbors, but it is recommended to search only for 10-1000 neighbors.
+- ``tau_query`` and ``max_iterations`` determine the stopping criterion.
+  For both parameters it holds that the larger the parameter, the longer the search.
+  Typically, :math:`0.7 < \tau_{query} < 2` and :math:`200 < max\_iterations < 2000` is enough to get good results during search.
+- ``measure`` is the distance measure that is used to compute the distances between vectors.
+  ``Euclidean`` is the default, so this parameter is optional.
+  To use cosine similarity, you can pass ``measure=ggnn.DistanceMeasure.Cosine`` instead (see the sketch below).
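+
+For example, a cosine-based search only differs in the measure passed to each step
+(a minimal sketch, reusing the data from above and keeping the measure consistent throughout):
+
+.. code:: python
+
+    cosine = ggnn.DistanceMeasure.Cosine
+
+    my_cosine_ggnn = ggnn.GGNN()
+    my_cosine_ggnn.set_base(base)
+
+    # use the same measure for building, querying, and the ground truth
+    my_cosine_ggnn.build(k_build=24, tau_build=0.5, measure=cosine)
+    indices, dists = my_cosine_ggnn.query(query, k_query, tau_query, max_iterations, cosine)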
+
+In this example, a ground truth is computed via a brute-force query and the result of the ANN search is evaluated:
+
+.. code:: python
+
+    # run brute-force query to get a ground truth and evaluate the results of the query
+    gt_indices, gt_dists = my_ggnn.bf_query(query, k_gt=k_query, measure=measure)
+    evaluator = ggnn.Evaluator(base, query, gt_indices, k_query=k_query)
+    print(evaluator.evaluate_results(indices))
+
+For computing a ground truth, we need to pass ``k_gt``, which should be at least as large as ``k_query`` for a proper comparison.
+In case of duplicates in the dataset, a larger set of ground truth indices can be used to accurately determine the accuracy.
+
+.. note::
+
+   The brute-force query can only be run in single-GPU mode.
+
+
+After the evaluation, the example program prints the indices of the *k* nearest neighbors for the first five queries and their squared euclidean distances:
+
+.. code:: python
+
+    # print the indices of the 10 NN of the first five queries and their squared euclidean distances
+    print('indices:', indices[:5], '\n squared dists:', dists[:5], '\n')
+
+
+Using GPU Data
+--------------
+
+This works just like with data on the host, but the device of the torch tensors must be set to ``device='cuda'``
+and, if needed, the respective GPU index must be added, e.g., ``device='cuda:1'``.
+
+GGNN can return the result of the *k* nearest neighbor search on the GPU with ``my_ggnn.set_return_results_on_gpu(True)``.
+If not set, the results will be returned on the host.
+
+.. note::
+
+   Returning the results on the GPU is not possible in a multi-GPU setup.
+   When using sharding, sorted results of all shards are returned (since merging would be performed on the CPU).
+
+
+.. code:: python
+
+    # create data
+    base = torch.rand((10_000, 128), dtype=torch.float32, device='cuda')
+    query = torch.rand((10_000, 128), dtype=torch.float32, device='cuda')
+
+    # initialize GGNN
+    my_ggnn = ggnn.GGNN()
+    my_ggnn.set_base(base)
+    my_ggnn.set_return_results_on_gpu(True)
+
+
+
+Using Multiple GPUs
+-------------------
+
+To work on multiple GPUs, GGNN uses sharding.
+
+A shard is a portion of the base dataset for which an individual search graph, a "graph shard", is built.
+To make sure no base vector is left out, the base dataset needs to be evenly divisible by ``shard_size``.
+During query, all graph shards are searched and the results of all shards are then merged on the CPU.
+Shards are distributed equally across all GPUs.
+Therefore, the number of shards has to be evenly divisible by the number of GPUs used.
+
+To tell the ggnn instance which GPUs to use, use the ``set_gpus(gpu_ids)`` function, which expects a list of CUDA device ids.
+To set the shard size, use ``set_shard_size(n_shard)``, where ``n_shard`` describes the number of base vectors that should be processed at once.
+
+Otherwise, this works the same way as above.
+
+.. code:: python
+
+    #! /usr/bin/python3
+
+    import ggnn
+    import torch
+
+    # create data
+    base = torch.rand((100_000, 128), dtype=torch.float32, device='cpu')
+    query = torch.rand((10_000, 128), dtype=torch.float32, device='cpu')
+
+    # initialize ggnn and prepare multi-GPU
+    my_ggnn = ggnn.GGNN()
+    my_ggnn.set_base(base)
+    my_ggnn.set_shard_size(n_shard=25_000)
+    my_ggnn.set_gpus(gpu_ids=[0,1])
+
+    # build the graph
+    my_ggnn.build(k_build=24, tau_build=0.9)
+
+    # run query
+    indices, dists = my_ggnn.query(query, k_query=10, tau_query=0.64, max_iterations=400)
+
+    print('indices:', indices[:5], '\n squared dists:', dists[:5], '\n')
+
+.. caution::
+
+   Copying data between different GPUs is not supported.
+   Instead, data is automatically copied from GPU to CPU and then to a different GPU, if necessary.
+   When using multiple GPUs, it should therefore be preferred to provide the data on the CPU.
+
+   Since query results from multiple GPUs are merged on the CPU,
+   returning them on the GPU is also not possible.
+
+.. tip::
+
+   To achieve a multi-GPU, GPU-only setup,
+   set up multiple independent instances of GGNN, one per GPU,
+   and merge the query results yourself.
+
+
+Loading Datasets (e.g. SIFT1M)
+------------------------------
+
+If the data is provided in :file:`.fvecs` or :file:`.bvecs` format, as with the `SIFT1M`_ and `SIFT1B`_ datasets,
+it can be loaded using the ``.load('/path/to/file')`` function.
+Besides a ``FloatDataset`` (``float``), the ggnn module can also load a base and query as ``UCharDataset`` (``unsigned char``).
+If a ground truth is provided as an :file:`.ivecs` file, it can be loaded as an ``IntDataset`` (``int``)
+and passed to the ``Evaluator`` directly.
+
+.. code:: python
+
+    #! /usr/bin/python3
+
+    import ggnn
+
+    path_to_dataset = '/path/to/sift/'
+
+    base = ggnn.FloatDataset.load(path_to_dataset + 'sift_base.fvecs')
+    query = ggnn.FloatDataset.load(path_to_dataset + 'sift_query.fvecs')
+    gt = ggnn.IntDataset.load(path_to_dataset + 'sift_groundtruth.ivecs')
+
+    k_query: int = 10
+
+    evaluator = ggnn.Evaluator(base, query, gt, k_query)
+
+    my_ggnn = ggnn.GGNN()
+    my_ggnn.set_base(base)
+    my_ggnn.build(k_build=24, tau_build=0.5)
+
+    indices, dists = my_ggnn.query(query, k_query, tau_query=0.64, max_iterations=400)
+    print(evaluator.evaluate_results(indices))
+
+
+.. _SIFT1M: http://corpus-texmex.irisa.fr/
+.. _SIFT1B: http://corpus-texmex.irisa.fr/
+
diff --git a/examples/cpp-and-cuda/ggnn_benchmark.cpp b/examples/cpp-and-cuda/ggnn_benchmark.cpp
new file mode 100644
index 0000000..e139785
--- /dev/null
+++ b/examples/cpp-and-cuda/ggnn_benchmark.cpp
@@ -0,0 +1,203 @@
+/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch +// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf + +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// only needed for getTotalSystemMemory() +#include + +using namespace ggnn; + +DEFINE_string(base, "", "Path to file with base vectors (fvecs/bvecs)."); +DEFINE_uint32(subset, 0, "Number of base vectors to use."); +DEFINE_string(query, "", "Path to file with query vectors (fvecs/bvecs)."); +DEFINE_string(gt, "", "Path to file with groundtruth vectors (ivecs)."); +DEFINE_string(graph_dir, "", "Directory to store and load ggnn graph files."); +DEFINE_uint32(k_build, 24, "Number of neighbors for graph construction"); +DEFINE_double(tau_build, 0.5, "Search graph construction slack factor."); +DEFINE_uint32(refinement_iterations, 2, "Number of refinement iterations"); +DEFINE_uint32(k_query, 10, "Number of neighbors to query for"); +DEFINE_string(measure, "euclidean", "distance measure (euclidean or cosine)"); +DEFINE_uint32(shard_size, 0, "Number of vectors per shard"); +DEFINE_string(gpu_ids, "0", "GPU id"); +DEFINE_bool(grid_search, false, "Perform queries for a wide range of parameters."); + +size_t getTotalSystemMemory() +{ + size_t pages = sysconf(_SC_PHYS_PAGES); + size_t page_size = sysconf(_SC_PAGE_SIZE); + return pages * page_size; +} + +int main(int argc, char* argv[]) +{ + google::InitGoogleLogging(argv[0]); + google::LogToStderr(); + google::InstallFailureSignalHandler(); + + gflags::SetUsageMessage( + "GGNN: Graph-based GPU Nearest Neighbor Search\n" + "by Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
" + "Lensch\n" + "(c) 2025 Computer Graphics University of Tuebingen"); + gflags::SetVersionString("1.0.0"); + gflags::ParseCommandLineFlags(&argc, &argv, true); + + LOG(INFO) << "Reading files"; + CHECK(std::filesystem::exists(FLAGS_base)) + << "File for base vectors has to exist: " << FLAGS_base; + CHECK(std::filesystem::exists(FLAGS_query)) + << "File for query vectors has to exist: " << FLAGS_query; + CHECK(std::filesystem::exists(FLAGS_gt)) + << "File for groundtruth vectors has to exist: " << FLAGS_gt; + + CHECK_GE(FLAGS_tau_build, 0) << "tau_build has to be bigger or equal 0."; + CHECK_GE(FLAGS_refinement_iterations, 0) + << "The number of refinement iterations has to be non-negative."; + + /// data type for addressing points (needs to be able to represent N) + using KeyT = int32_t; + /// data type of computed distances + using ValueT = float; + + using GGNN = ggnn::GGNN; + using Results = ggnn::Results; + using Evaluator = ggnn::Evaluator; + + /// distance measure (Euclidean or Cosine) + const DistanceMeasure measure = []() { + if (FLAGS_measure == "euclidean") { + return DistanceMeasure::Euclidean; + } + else if (FLAGS_measure == "cosine") { + return DistanceMeasure::Cosine; + } + LOG(FATAL) << "invalid measure: " << FLAGS_measure; + }(); + + // vector of GPU ids + std::istringstream iss(FLAGS_gpu_ids); + std::vector results(std::istream_iterator{iss}, + std::istream_iterator()); + + std::vector gpus; + for (auto&& r : results) { + int gpu_id = std::atoi(r.c_str()); + gpus.push_back(gpu_id); + } + + // base & query datasets + GenericDataset base = GenericDataset::load( + FLAGS_base, 0, FLAGS_subset ? FLAGS_subset : std::numeric_limits::max(), true); + GenericDataset query = + GenericDataset::load(FLAGS_query, 0, std::numeric_limits::max(), true); + + // initialize GGNN + GGNN ggnn; + + const size_t total_memory = getTotalSystemMemory(); + // guess the available memory (assume 1/8 used elsewhere, subtract dataset) + const size_t available_memory = total_memory - total_memory / 8 - base.required_size_bytes(); + ggnn.setCPUMemoryLimit(available_memory); + + ggnn.setWorkingDirectory(FLAGS_graph_dir); + // reference the dataset to avoid a copy + ggnn.setBaseReference(base); + + // only necessary in multi-GPU mode + ggnn.setGPUs(gpus); + ggnn.setShardSize(FLAGS_shard_size); + + // build the graph + if (!FLAGS_graph_dir.empty() && + std::filesystem::is_regular_file(std::filesystem::path{FLAGS_graph_dir} / "part_0.ggnn")) { + ggnn.load(FLAGS_k_build); + } + else { + ggnn.build(FLAGS_k_build, static_cast(FLAGS_tau_build), FLAGS_refinement_iterations, + measure); + + if (!FLAGS_graph_dir.empty()) { + ggnn.store(); + } + } + + // load or compute ground truth + const bool loadGT = std::filesystem::is_regular_file(FLAGS_gt); + Dataset gt = loadGT ? 
Dataset::load(FLAGS_gt) : Dataset{}; + + if (!gt.data()) { + gt = ggnn.bfQuery(query).ids; + if (!FLAGS_gt.empty()) { + LOG(INFO) << "exporting brute-forced ground truth data."; + gt.store(FLAGS_gt); + } + } + + Evaluator eval{base, query, gt, FLAGS_k_query, measure}; + + // query + auto query_function = [&ggnn, &eval, &query, measure](const float tau_query) { + Results results; + LOG(INFO) << "--"; + LOG(INFO) << "Query with tau_query " << tau_query; + // faster for C@1 = 99% + LOG(INFO) << "fast query (good for C@1)"; + results = ggnn.query(query, FLAGS_k_query, tau_query, 200, measure); + LOG(INFO) << eval.evaluateResults(results.ids); + // better for C@10 > 99% + LOG(INFO) << "regular query (good for C@10)"; + results = ggnn.query(query, FLAGS_k_query, tau_query, 400, measure); + LOG(INFO) << eval.evaluateResults(results.ids); + // expensive, can get to 99.99% C@10 + // ggnn.queryLayer(); + }; + + if (FLAGS_grid_search) { + LOG(INFO) << "--"; + LOG(INFO) << "grid-search:"; + for (int i = 0; i < 70; ++i) + query_function(static_cast(i) * 0.01f); + for (int i = 7; i <= 20; ++i) + query_function(static_cast(i) * 0.1f); + } + else { // by default, just execute a few queries + LOG(INFO) << "--"; + LOG(INFO) + << "Querying for 90, 95, 99% R@1, 99% C@10 (if running on SIFT1M with default parameters):"; + query_function(0.34f); + query_function(0.41f); + query_function(0.51f); + query_function(0.64f); + } + + VLOG(1) << "done!"; + gflags::ShutDownCommandLineFlags(); + return 0; +} diff --git a/examples/cpp-and-cuda/ggnn_main.cpp b/examples/cpp-and-cuda/ggnn_main.cpp new file mode 100644 index 0000000..cd3df77 --- /dev/null +++ b/examples/cpp-and-cuda/ggnn_main.cpp @@ -0,0 +1,79 @@ +/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
Lensch
+// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf
+
+#include <ggnn/base/ggnn.cuh>
+
+#include <cstddef>
+#include <cstdint>
+#include <iostream>
+#include <random>
+#include <vector>
+
+using namespace ggnn;
+
+int main()
+{
+  // create some data
+  const size_t N_base = 10'000;
+  const size_t N_query = 10'000;
+  const uint32_t dim = 128;
+
+  std::vector<float> base_data(N_base * dim);
+  std::vector<float> query_data(N_query * dim);
+
+  std::default_random_engine prng{};
+  std::uniform_real_distribution uniform{0.0f, 1.0f};
+
+  for (float& x : base_data) {
+    x = uniform(prng);
+  }
+  for (float& x : query_data) {
+    x = uniform(prng);
+  }
+
+  /// data type for addressing points
+  using KeyT = int32_t;
+  /// data type of computed distances
+  using ValueT = float;
+  using GGNN = GGNN<KeyT, ValueT>;
+
+  // Initialize GGNN
+  GGNN ggnn{};
+
+  // Initialize the datasets containing the base data and query data
+  Dataset<float> base = Dataset<float>::copy(base_data, dim, true);
+  Dataset<float> query = Dataset<float>::copy(query_data, dim, true);
+
+  // pass the base to GGNN as reference
+  ggnn.setBaseReference(base);
+
+  // build the search graph
+  ggnn.build(24, 0.5f);
+
+  // run the query and store indices & squared distances
+  const uint32_t KQuery = 10;
+  const auto [indices, dists] = ggnn.query(query, KQuery, 0.5f);
+
+  // print the results for the first query
+  std::cout << "Result for the first query vector: \n";
+  for (uint32_t i = 0; i < KQuery; i++) {
+    std::cout << "Distance to vector at base[";
+    std::cout.width(5);
+    std::cout << indices[i];
+    std::cout << "]: " << dists[i] << "\n";
+  }
+
+  return 0;
+}
diff --git a/examples/cpp-and-cuda/ggnn_main_gpu_data.cu b/examples/cpp-and-cuda/ggnn_main_gpu_data.cu
new file mode 100644
index 0000000..ffcaafa
--- /dev/null
+++ b/examples/cpp-and-cuda/ggnn_main_gpu_data.cu
@@ -0,0 +1,83 @@
+/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch
+// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf
+
+#include <ggnn/base/ggnn.cuh>
+
+#include <cstddef>
+#include <cstdint>
+#include <iostream>
+
+#include <cuda_runtime.h>
+#include <curand.h>
+
+using namespace ggnn;
+int main()
+{
+  using GGNN = ggnn::GGNN<int32_t, float>;
+
+  // create data on the GPU
+  size_t N_base {10'000};
+  size_t N_query {10'000};
+  uint32_t D {128};
+
+  float* base;
+  float* query;
+
+  cudaMalloc(&base, N_base * D * sizeof(float));
+  cudaMalloc(&query, N_query * D * sizeof(float));
+
+  curandGenerator_t generator;
+  curandCreateGenerator(&generator, CURAND_RNG_PSEUDO_DEFAULT);
+
+  curandGenerateUniform(generator, base, N_base * D);
+  curandGenerateUniform(generator, query, N_query * D);
+
+  // initialize GGNN
+  GGNN ggnn{};
+
+  // Set the data on the GPU as the base dataset on which the graph should be built.
+  // To reference existing data, specify its pointer, the number of base vectors N_base,
+  // the dimensionality of base vectors D, and the gpu_id of the GPU containing the data.
+  int32_t gpu_id = 0;
+  ggnn.setBase(ggnn::Dataset<float>::referenceGPUData(base, N_base, D, gpu_id));
+
+  // reference the query data which already exists on the GPU
+  ggnn::Dataset<float> d_query = ggnn::Dataset<float>::referenceGPUData(query, N_query, D, gpu_id);
+
+  // build the search graph
+  const uint32_t KBuild = 24;
+  const float tau_build = 0.5f;
+  ggnn.build(KBuild, tau_build);
+
+  // run the query and store indices & distances
+  const int32_t KQuery = 10;
+  const auto [indices, dists] = ggnn.query(d_query, KQuery, 0.5f);
+
+  // print the results for the first query
+  std::cout << "Result for the first query vector: \n";
+  for (uint32_t i = 0; i < KQuery; i++) {
+    std::cout << "Distance to vector at base[";
+    std::cout.width(5);
+    std::cout << indices[i];
+    std::cout << "]: " << dists[i] << "\n";
+  }
+
+  // cleanup
+  curandDestroyGenerator(generator);
+  cudaFree(base);
+  cudaFree(query);
+
+  return 0;
+}
diff --git a/examples/cpp-and-cuda/ggnn_main_multi_gpu.cpp b/examples/cpp-and-cuda/ggnn_main_multi_gpu.cpp
new file mode 100644
index 0000000..4593c22
--- /dev/null
+++ b/examples/cpp-and-cuda/ggnn_main_multi_gpu.cpp
@@ -0,0 +1,85 @@
+/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
Lensch +// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf + +#include + +#include +#include +#include +#include +#include + +using namespace ggnn; + +int main() +{ + // create some data + const size_t N_base = 100'000; + const size_t N_query = 10'000; + const uint32_t dim = 128; + + std::vector base_data(N_base * dim); + std::vector query_data(N_query * dim); + + std::default_random_engine prng{}; + std::uniform_real_distribution uniform{0.0f, 1.0f}; + + for (float& x : base_data) { + x = uniform(prng); + } + for (float& x : query_data) { + x = uniform(prng); + } + + /// data type for addressing points + using KeyT = int32_t; + /// data type of computed distances + using ValueT = float; + using GGNN = GGNN; + + // Initialize GGNN + GGNN ggnn{}; + + // Initialize the datasets containing the base data and query data + Dataset base = Dataset::copy(base_data, dim, true); + Dataset query = Dataset::copy(query_data, dim, true); + + // pass the base to ggnn as reference + ggnn.setBaseReference(base); + + // configure which GPUs to use + ggnn.setGPUs({0, 1}); + + // split dataset into shards of this size + ggnn.setShardSize(25'000); + + // build the search graph + ggnn.build(24, 0.5f); + + // run the query and store indices & squared distances + const uint32_t KQuery = 10; + const auto [indices, dists] = ggnn.query(query, KQuery, 0.5f); + + // print the results for the first query + std::cout << "Result for the first query vector: \n"; + for (uint32_t i = 0; i < KQuery; i++) { + // std::cout << "Base Idx: "; + std::cout << "Distance to vector at base["; + std::cout.width(5); + std::cout << indices[i]; + std::cout << "]: " << dists[i] << "\n"; + } + + return 0; +} diff --git a/examples/python/ggnn_pytorch.py b/examples/python/ggnn_pytorch.py new file mode 100644 index 0000000..4e5c6d6 --- /dev/null +++ b/examples/python/ggnn_pytorch.py @@ -0,0 +1,40 @@ +#! /usr/bin/python3 + +import ggnn +import torch + +# get detailed logs +ggnn.set_log_level(4) + + +# create data +base = torch.rand((10_000, 128), dtype=torch.float32, device='cpu') +query = torch.rand((10_000, 128), dtype=torch.float32, device='cpu') + + +# initialize ggnn +my_ggnn = ggnn.GGNN() +my_ggnn.set_base(base) + +# choose a distance measure +measure=ggnn.DistanceMeasure.Euclidean + +# build the graph +my_ggnn.build(k_build=24, tau_build=0.5, refinement_iterations=2, measure=measure) + + +# run query +k_query: int = 10 +tau_query: float = 0.64 +max_iterations: int = 400 + +indices, dists = my_ggnn.query(query, k_query, tau_query, max_iterations, measure) + + +# run brute-force query to get a ground truth and evaluate the results of the query +gt_indices, gt_dists = my_ggnn.bf_query(query, k_gt=k_query, measure=measure) +evaluator = ggnn.Evaluator(base, query, gt_indices, k_query=k_query) +print(evaluator.evaluate_results(indices)) + +# print the indices of the 10 NN of the first five queries and their squared euclidean distances +print('indices:', indices[:5], '\n squared dists:', dists[:5], '\n') diff --git a/examples/python/ggnn_pytorch_gpu_data.py b/examples/python/ggnn_pytorch_gpu_data.py new file mode 100644 index 0000000..1c76f0f --- /dev/null +++ b/examples/python/ggnn_pytorch_gpu_data.py @@ -0,0 +1,33 @@ +#! 
/usr/bin/python3 + +import ggnn +import torch + +# get detailed logs +ggnn.set_log_level(4) + + +# create data +base = torch.rand((10_000, 128), dtype=torch.float32, device='cuda') +query = torch.rand((10_000, 128), dtype=torch.float32, device='cuda') + + +# initialize GGNN +my_ggnn = ggnn.GGNN() +my_ggnn.set_base(base) +my_ggnn.set_return_results_on_gpu(True) + +# choose a distance measure +measure=ggnn.DistanceMeasure.Euclidean + +# build the graph +my_ggnn.build(k_build=24, tau_build=0.5, measure=measure) + +# run query +k_query: int = 10 +tau_query: float = 0.64 +max_iterations: int = 400 + +indices, dists = my_ggnn.query(query, k_query, tau_query, max_iterations, measure) + +print('indices:', indices[:5], '\n squared dists:', dists[:5], '\n') diff --git a/examples/python/ggnn_pytorch_multi_gpu.py b/examples/python/ggnn_pytorch_multi_gpu.py new file mode 100644 index 0000000..dd18389 --- /dev/null +++ b/examples/python/ggnn_pytorch_multi_gpu.py @@ -0,0 +1,24 @@ +#! /usr/bin/python3 + +import ggnn +import torch + +# create data +base = torch.rand((100_000, 128), dtype=torch.float32, device='cpu') +query = torch.rand((10_000, 128), dtype=torch.float32, device='cpu') + +# initialize GGNN and prepare multi-GPU +my_ggnn = ggnn.GGNN() +my_ggnn.set_base(base) +my_ggnn.set_shard_size(n_shard=25_000) +my_ggnn.set_gpus(gpu_ids=[0,1]) + +# build the graph +my_ggnn.build(k_build=24, tau_build=0.5) + +# run query +k_query: int = 10 + +indices, dists = my_ggnn.query(query, k_query=k_query, tau_query=0.64, max_iterations=400) + +print('indices:', indices[:5], '\n squared dists:', dists[:5], '\n') diff --git a/examples/python/sift1m_fvecs.py b/examples/python/sift1m_fvecs.py new file mode 100644 index 0000000..84e6695 --- /dev/null +++ b/examples/python/sift1m_fvecs.py @@ -0,0 +1,30 @@ +#! /usr/bin/python3 + +import ggnn + +path_to_dataset = '/graphics/scratch/datasets/ANN_datasets/SIFT1M/sift/' + +base = ggnn.FloatDataset.load(path_to_dataset + 'sift_base.fvecs') +query = ggnn.FloatDataset.load(path_to_dataset + 'sift_query.fvecs') +gt = ggnn.IntDataset.load(path_to_dataset + 'sift_groundtruth.ivecs') + +k_query: int = 10 + +evaluator = ggnn.Evaluator(base, query, gt=gt, k_query=k_query) + +my_ggnn = ggnn.GGNN() +my_ggnn.set_base(base) +my_ggnn.build(k_build=24, tau_build=0.5) + +# 90% C@1 / R@1 +indices, dists = my_ggnn.query(query, k_query, tau_query=0.34, max_iterations=200) +print(evaluator.evaluate_results(indices)) +# 95% C@1 / R@1 +indices, dists = my_ggnn.query(query, k_query, tau_query=0.41, max_iterations=200) +print(evaluator.evaluate_results(indices)) +# 99% C@1 / R@1 +indices, dists = my_ggnn.query(query, k_query, tau_query=0.51, max_iterations=200) +print(evaluator.evaluate_results(indices)) +# 99% C@10 +indices, dists = my_ggnn.query(query, k_query, tau_query=0.64, max_iterations=400) +print(evaluator.evaluate_results(indices)) diff --git a/examples/python/sift1m_hdf5.py b/examples/python/sift1m_hdf5.py new file mode 100644 index 0000000..c93c518 --- /dev/null +++ b/examples/python/sift1m_hdf5.py @@ -0,0 +1,36 @@ +#! 
/usr/bin/python3 + +import ggnn + +import h5py +import numpy as np + +path_to_dataset = '/graphics/scratch/ruppert/sift-128-euclidean.hdf5' + +# load ANN-benchmark-style HDF5 dataset +with h5py.File(path_to_dataset, 'r') as f: + base = np.array(f['train']) + query = np.array(f['test']) + gt = np.array(f['neighbors']) + # gt_dist = np.array(f['distances']) + +k_query: int = 10 + +evaluator = ggnn.Evaluator(base, query, gt=gt, k_query=k_query) + +my_ggnn = ggnn.GGNN() +my_ggnn.set_base(base) +my_ggnn.build(k_build=24, tau_build=0.5) + +# 90% C@1 / R@1 +indices, dists = my_ggnn.query(query, k_query, tau_query=0.34, max_iterations=200) +print(evaluator.evaluate_results(indices)) +# 95% C@1 / R@1 +indices, dists = my_ggnn.query(query, k_query, tau_query=0.41, max_iterations=200) +print(evaluator.evaluate_results(indices)) +# 99% C@1 / R@1 +indices, dists = my_ggnn.query(query, k_query, tau_query=0.51, max_iterations=200) +print(evaluator.evaluate_results(indices)) +# 99% C@10 +indices, dists = my_ggnn.query(query, k_query, tau_query=0.64, max_iterations=400) +print(evaluator.evaluate_results(indices)) diff --git a/ggnn_plots.png b/ggnn_plots.png deleted file mode 100644 index a912e5a..0000000 Binary files a/ggnn_plots.png and /dev/null differ diff --git a/include/ggnn/base/data.cuh b/include/ggnn/base/data.cuh new file mode 100644 index 0000000..44652ff --- /dev/null +++ b/include/ggnn/base/data.cuh @@ -0,0 +1,164 @@ +/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
Lensch +// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf + +#ifndef INCLUDE_GGNN_DATA_CUH +#define INCLUDE_GGNN_DATA_CUH + +#include +#include +#include + +namespace ggnn { + +enum class DataType : uint16_t { + UNKNOWN, + BYTE, + UINT8, + INT32, + UINT32, + FLOAT +}; + +enum class DataLocation : uint16_t { + UNKNOWN, //< unset + GPU, //< regular GPU memory + MANAGED, //< managed GPU memory + CPU_PINNED, //< pinned CPU memory + CPU_MALLOC, //< regular/pageable CPU memory + FOREIGN_GPU, //< foreign GPU memory - do not free + FOREIGN_CPU, //< foreign CPU memory - do not free +}; + +std::ostream& operator<<(std::ostream& stream, DataType type); +std::ostream& operator<<(std::ostream& stream, DataLocation location); + +namespace detail { + +template +struct DataTypeAssignment; + +template <> +struct DataTypeAssignment { + static constexpr DataType value{DataType::BYTE}; +}; +template <> +struct DataTypeAssignment { + static constexpr DataType value{DataType::UINT8}; +}; +template <> +struct DataTypeAssignment { + static constexpr DataType value{DataType::INT32}; +}; +template <> +struct DataTypeAssignment { + static constexpr DataType value{DataType::UINT32}; +}; +template <> +struct DataTypeAssignment { + static constexpr DataType value{DataType::FLOAT}; +}; + +DataLocation disown(DataLocation location); +size_t dataSize(DataType type); + +}; // namespace detail + +template +constexpr DataType DataType_v = detail::DataTypeAssignment::value; + +struct Allocation { + uint64_t N{}; + uint32_t D{}; + + DataType type{}; + DataLocation location{}; + int32_t gpu_id{}; + + void* mem{nullptr}; + + // transfer ownership away from this object + // when this object is to be deallocated by the GPUContext, deallocation will be skipped + void releaseOwnership() + { + location = detail::disown(location); + } + + bool isCPUAccessible() const + { + switch (location) { + case DataLocation::CPU_MALLOC: + case DataLocation::CPU_PINNED: + case DataLocation::FOREIGN_CPU: + case DataLocation::MANAGED: + return true; + default:; + } + return false; + } + + bool isGPUAccessible() const + { + switch (location) { + case DataLocation::GPU: + case DataLocation::FOREIGN_GPU: + case DataLocation::MANAGED: + return true; + default:; + } + return false; + } + + size_t element_size() const + { + return detail::dataSize(type); + } + + size_t numel() const + { + return static_cast(N) * D; + } + + size_t required_size_bytes() const + { + return element_size() * numel(); + } + + explicit operator void*() + { + return mem; + } + + explicit operator const void*() const + { + return mem; + } +}; + +struct Allocator { + static std::byte* cudaMallocChecked(const size_t size); + static std::byte* cudaMallocManagedChecked(const size_t size); + static std::byte* cudaMallocHostChecked(const size_t size, const unsigned int flags); + static std::byte* mallocChecked(const size_t size); + static void allocateData(Allocation& alloc, uint32_t flags = 0); + static void freeData(Allocation& alloc); +}; + +std::ostream& operator<<(std::ostream& stream, Allocation alloc); + +}; // namespace ggnn + +#endif // INCLUDE_GGNN_DATA_CUH diff --git a/include/ggnn/base/dataset.cuh b/include/ggnn/base/dataset.cuh new file mode 100644 index 0000000..840e8a8 --- /dev/null +++ b/include/ggnn/base/dataset.cuh @@ -0,0 +1,170 @@ +/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch +// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf + +#ifndef INCLUDE_GGNN_DATASET_CUH +#define INCLUDE_GGNN_DATASET_CUH + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct CUstream_st; +typedef struct CUstream_st* cudaStream_t; + +namespace ggnn { + +struct GenericDataset : protected Allocation { + GenericDataset() : Allocation{} {} + GenericDataset(const Allocation& alloc) : Allocation{alloc} {} + GenericDataset(const GenericDataset& other) = delete; + GenericDataset& operator=(const GenericDataset& other) = delete; + explicit GenericDataset(GenericDataset&& other) noexcept; + GenericDataset& operator=(GenericDataset&& other) noexcept; + virtual ~GenericDataset(); + + using Allocation::D; + using Allocation::element_size; + using Allocation::gpu_id; + using Allocation::location; + using Allocation::N; + using Allocation::numel; + using Allocation::type; + + GenericDataset reference() const; + GenericDataset referenceRange(uint64_t from, uint64_t num) const; + + using Allocation::isCPUAccessible; + using Allocation::isGPUAccessible; + using Allocation::releaseOwnership; + + using Allocation::required_size_bytes; + + template + std::span reinterpret() + { + return {reinterpret_cast(mem), numel()}; + } + template + std::span reinterpret() const + { + return {reinterpret_cast(mem), numel()}; + } + + template + std::span access() + { + assert(DataType_v == type); + return reinterpret(); + } + template + std::span access() const + { + assert(DataType_v == type); + return reinterpret(); + } + + static GenericDataset load(const std::filesystem::path& file, uint32_t from = 0, + uint32_t num = std::numeric_limits::max(), + bool pin_memory = false); +}; + +template +struct Dataset : public GenericDataset, public std::span { + Dataset() : GenericDataset{}, std::span(reinterpret_cast(0), 0UL) {} + Dataset(GenericDataset&& data) + : GenericDataset{std::move(data)}, std::span{GenericDataset::access()} + { + } + Dataset(Dataset& other) = delete; + Dataset& operator=(Dataset& other) = delete; + Dataset(Dataset&& other) noexcept = default; + Dataset& operator=(Dataset&& other) noexcept = default; + virtual ~Dataset() = default; + + using GenericDataset::D; + using GenericDataset::element_size; + using GenericDataset::gpu_id; + using GenericDataset::location; + using GenericDataset::N; + using GenericDataset::numel; + using GenericDataset::type; + + using GenericDataset::isCPUAccessible; + using GenericDataset::isGPUAccessible; + using GenericDataset::reference; + using GenericDataset::referenceRange; + using GenericDataset::releaseOwnership; + + operator T*() + { + return std::span::data(); + } + operator const T*() const + { + return std::span::data(); + } + +#if __cpp_lib_span < 202311L + T& at(size_t index) + { + if (index >= this->size()) + throw std::out_of_range("Index " + std::to_string(index) + " is out of bounds (size " + + std::to_string(this->size()) + ")."); + 
return (*this)[index]; + } + const T& at(size_t index) const + { + if (index >= this->size()) + throw std::out_of_range("Index " + std::to_string(index) + " is out of bounds (size " + + std::to_string(this->size()) + ")."); + return (*this)[index]; + } +#endif + + static Dataset empty(const uint64_t N, const uint32_t D, bool pin_memory = false); + static Dataset emptyOnGPU(const uint64_t N, const uint32_t D, int32_t gpu_id); + static Dataset copy(const std::span& data, uint32_t D, bool pin_memory = false); + static Dataset load(const std::filesystem::path& file, uint32_t from = 0, + uint32_t num = std::numeric_limits::max(), bool pin_memory = false); + static Dataset referenceCPUData(T* data, const uint64_t N, const uint32_t D); + static Dataset referenceGPUData(T* data, const uint64_t N, const uint32_t D, int32_t gpu_id); + + void store(const std::filesystem::path& path) const; + void copyTo(Dataset& other, cudaStream_t stream = 0) const; + void copyRangeTo(uint64_t from, uint64_t num, Dataset& other, cudaStream_t stream = 0) const; + + Dataset clone(cudaStream_t stream = 0) const; + Dataset referenceOnGPU(int gpu_id, cudaStream_t stream = 0) const; +}; + +template +struct Results { + Dataset ids{}; + Dataset dists{}; +}; + +}; // namespace ggnn + +#endif // INCLUDE_GGNN_DATASET_CUH diff --git a/include/ggnn/base/def.h b/include/ggnn/base/def.h new file mode 100644 index 0000000..0a8af6b --- /dev/null +++ b/include/ggnn/base/def.h @@ -0,0 +1,71 @@ +/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch +// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf + +#ifndef INCLUDE_GGNN_DEF_H +#define INCLUDE_GGNN_DEF_H + +#include +#include +#include + +namespace ggnn { + +enum class DistanceMeasure : int { + Euclidean = 0, + Cosine = 1 +}; + +inline float sizeInGB(const size_t bytes) +{ + return static_cast(bytes / (1024UL * 1024UL)) / 1024.0f; +} + +#if __cpp_lib_int_pow2 < 202002L +#include + +template , int> = 0> +constexpr uint32_t bit_ceil(T v) noexcept +{ + // from https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 + v--; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v++; + return v; +} +#else +using std::bit_ceil; +#endif + +template , int> = 0> +constexpr T next_multiple(T v) noexcept +{ + return v % base == 0 ? v : base * (v / base + 1); +}; + +// just to make sure that everything is sufficiently aligned +inline size_t align8(size_t size) +{ + return ((size + 7) / 8) * 8; +}; + +}; // namespace ggnn + +#endif // INCLUDE_GGNN_DEF_H diff --git a/include/ggnn/base/eval.h b/include/ggnn/base/eval.h new file mode 100644 index 0000000..5d3bab3 --- /dev/null +++ b/include/ggnn/base/eval.h @@ -0,0 +1,69 @@ +/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch +// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf + +#ifndef INCLUDE_GGNN_EVAL_H +#define INCLUDE_GGNN_EVAL_H + +#include +#include + +#include +#include + +#include + +namespace ggnn { + +struct GTDuplicates { + // indices within the ground truth list per point up to which result ids + // need to be compared. + // without duplicates in the dataset, each entry should just be 1 / KQuery + std::vector top1DuplicateEnd{}; + std::vector topKDuplicateEnd{}; +}; + +struct Evaluation { + uint32_t KQuery{}; + + float c1{0}; + float c1_dup{0}; + float cKQuery{0}; + float cKQuery_dup{0}; + float rKQuery{0}; + float rKQuery_dup{0}; +}; + +std::ostream& operator<<(std::ostream& os, const Evaluation& eval); + +template +struct Evaluator { + uint32_t KQuery{0}; + DistanceMeasure measure{}; + Dataset gt; + GTDuplicates gt_duplicates{}; + + Evaluator() = default; + + Evaluator(const GenericDataset& base, const GenericDataset& query, const Dataset& gt, + const uint32_t KQuery, const DistanceMeasure measure); + + [[nodiscard]] Evaluation evaluateResults(const Dataset& results); +}; + +}; // namespace ggnn + +#endif // INCLUDE_GGNN_EVAL_H diff --git a/include/ggnn/base/fwd.h b/include/ggnn/base/fwd.h new file mode 100644 index 0000000..1151001 --- /dev/null +++ b/include/ggnn/base/fwd.h @@ -0,0 +1,38 @@ +/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch +// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf + +#ifndef INCLUDE_GGNN_FWD_H +#define INCLUDE_GGNN_FWD_H + +// some forward declarations + +namespace ggnn { + +struct GenericDataset; +template +struct Dataset; + +struct GraphConfig; +template +struct Graph; + +template +struct Results; + +}; // namespace ggnn + +#endif // INCLUDE_GGNN_FWD_H diff --git a/include/ggnn/base/ggnn.cuh b/include/ggnn/base/ggnn.cuh new file mode 100644 index 0000000..3b3f3a7 --- /dev/null +++ b/include/ggnn/base/ggnn.cuh @@ -0,0 +1,179 @@ +/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch +// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf + +#ifndef INCLUDE_GGNN_GGNN_CUH +#define INCLUDE_GGNN_GGNN_CUH + +#include // IWYU pragma: export +#include // IWYU pragma: export +#include // IWYU pragma: export + +#include +#include +#include +#include +#include +#include + +namespace ggnn { + +/** + * GGNN multi-GPU wrapper + * + * @param KeyT datatype of dataset indices (needs to be able to represent + * N_base, signed integer required) + * @param ValueT distance value type + */ +template +class GGNN { + public: + using Results = ggnn::Results; + using Graph = ggnn::Graph; + + /// Maximum data dimension supported by GGNN + static constexpr uint32_t MIN_D = 1; + static constexpr uint32_t MAX_D = 4096; + /// Maximum number of neighbors supported by GGNN + static constexpr uint32_t MIN_KBUILD = 2; + static constexpr uint32_t MAX_KBUILD = 512; + + GGNN(); + GGNN(const GGNN& other) = delete; + GGNN(GGNN&& other) noexcept = default; + GGNN& operator=(const GGNN& other) = delete; + GGNN& operator=(GGNN&& other) noexcept = default; + virtual ~GGNN() = default; + + /** + * Set the cache directory for GGNN to work in. + * The graph will be loaded from and stored to files in this directory if requested or when + * required due to insufficient memory. + */ + virtual void setWorkingDirectory(const std::filesystem::path& dir); + /** + * Set the maximum amount of CPU memory that GGNN is allowed to use for caching graph shards. + * + * When insufficient, shards will be swapped out to the working directory. + */ + virtual void setCPUMemoryLimit(const size_t memory_limit); + + /** + * Set the GPUs to use (CUDA device indices). + */ + virtual void setGPUs(const std::span& gpu_ids); + /** + * Set the GPUs to use (CUDA device indices). + * + * (This overload allows to use an initializer list.) + */ + void setGPUs(const std::vector& gpu_ids) + { + setGPUs(std::span{gpu_ids.cbegin(), gpu_ids.cend()}); + } + + /** + * Set the size of shards to work on. + * (Optional, default 0: entire base in one shard). + * The base datasets needs to be evenly divisible by the shard size. + * The resulting number of shards needs to be evenly divisible by the number of GPUs. + */ + virtual void setShardSize(const uint32_t N_shard); + + /** + * Enable or disable returning results on GPU. + * When enabled, results are directly returned on the GPU, no copy to CPU is performed. + * NOTE: Only a single GPU is supported in this mode. + * NOTE: Querying for K results from N shards will return N*K sorted results per query. + */ + virtual void setReturnResultsOnGPU(const bool return_results_on_gpu = true); + + /** + * Set the base dataset. + */ + virtual void setBase(GenericDataset&& base); + + /** + * Set a reference to the base dataset. + * NOTE: The calling code needs to ensure that the base data remains accessible during graph + * construction and query. 
+ */ + void setBaseReference(const GenericDataset& base); + // setting a reference to a temporary would result in undefined behavior + void setBaseReference(GenericDataset&&) = delete; + + /** + * Build the GGNN search graph. + * + * Requires base to be set. + */ + virtual void build(const uint32_t KBuild, const float tau_build, + const uint32_t refinement_iterations = 2, + const DistanceMeasure measure = DistanceMeasure::Euclidean); + /** + * Store the GGNN search graph. + */ + virtual void store(); + /** + * Load a previously built GGNN search graph. + * + * Requires base to be set. + */ + virtual void load(const uint32_t KBuild); + + /** + * Query the GGNN search graph. + * @param query may be given on CPU or GPU + * + * Requires base to be set. + * Requires a graph to be built or loaded first. + * + * NOTE: query data type has to match base data type + */ + [[nodiscard]] virtual Results query(const GenericDataset& query, const uint32_t KQuery, + const float tau_query, const uint32_t max_iterations = 400, + const DistanceMeasure measure = DistanceMeasure::Euclidean); + + /** + * Run a brute-force query on the base dataset. + * @param query may be given on CPU or GPU + * + * Requires base to be set. + * NOTE: This function currently supports only a single GPU + * + * NOTE: query data type has to match base data type + */ + [[nodiscard]] virtual Results bfQuery(const GenericDataset& query, const uint32_t KGT = 100, + const DistanceMeasure measure = DistanceMeasure::Euclidean); + + /** + * Access the GGNN graph. + * @param global_shard_id The graph shard to be accessed. + * + * NOTE: The reference is invalidated when a query is run and shards need to be swapped out. + */ + [[nodiscard]] virtual const Graph& getGraph(const uint32_t global_shard_id = 0); + + protected: + GGNN(int) {} + + private: + std::unique_ptr pimpl; +}; // GGNN + +}; // namespace ggnn + +#endif // INCLUDE_GGNN_GGNN_CUH diff --git a/include/ggnn/base/gpu_instance.cuh b/include/ggnn/base/gpu_instance.cuh new file mode 100644 index 0000000..1d1de72 --- /dev/null +++ b/include/ggnn/base/gpu_instance.cuh @@ -0,0 +1,220 @@ +/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch +// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf + +#ifndef INCLUDE_GGNN_GPU_INSTANCE_CUH +#define INCLUDE_GGNN_GPU_INSTANCE_CUH + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ggnn { + +struct ShardingConfiguration { + /// Number of base data points per shard. + uint32_t N_shard{0}; + /// Sequential index for sharding: GPU i/N - independent of the CUDA device index. + uint32_t device_index{0}; + /// Number of shards to process on this GPU. + uint32_t num_shards{1}; + /// Memory limit for swapping shards to CPU. 
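The divisibility requirements documented for setShardSize() can be checked up front. A standalone helper sketch, not part of the GGNN API:

```cpp
#include <cstdint>
#include <stdexcept>

void check_sharding(uint32_t N_base, uint32_t N_shard, uint32_t num_gpus)
{
  if (N_shard == 0)
    return;  // default: the entire base forms a single shard
  if (N_base % N_shard != 0)
    throw std::invalid_argument{"base size must be evenly divisible by the shard size"};
  if ((N_base / N_shard) % num_gpus != 0)
    throw std::invalid_argument{"shard count must be evenly divisible by the GPU count"};
}
```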
+ size_t cpu_memory_limit{std::numeric_limits::max()}; +}; + +struct CUDAStreamDeleter { + void operator()(cudaStream_t stream); +}; +struct CUDAEventDeleter { + void operator()(cudaEvent_t event); +}; + +using CudaStream = std::unique_ptr, CUDAStreamDeleter>; +using CudaEvent = std::unique_ptr, CUDAEventDeleter>; + +struct GPUContext { + const int gpu_id{getCurrentGPUId()}; + static int getCurrentGPUId(); + + void activate() const; + CudaStream createStream(); + CudaEvent createEvent(); +}; + +/** + * GGNN core operations (shared between single-GPU and multi-GPU version) + * + * @param KeyT datatype of dataset indices (needs to be able to represent + * N_base, signed integer required) + * @param ValueT distance value type + * @param BaseT datatype of dataset vector elements + */ +template +class GPUInstance { + public: + using Graph = ggnn::Graph; + using Results = ggnn::Results; + + GPUInstance(const GPUContext& gpu_ctx, const ShardingConfiguration& shard_config, + const GraphConfig& graph_config) + : gpu_ctx{gpu_ctx}, shard_config{shard_config}, graph_config{graph_config} + { + } + + GPUContext gpu_ctx{}; + ShardingConfiguration shard_config{}; + GraphConfig graph_config{}; + + float build(const Dataset& base, const std::filesystem::path& graph_dir, + const GraphConfig& graph_config, const float tau_build, + const uint32_t refinement_iterations, const DistanceMeasure measure); + void load(const Dataset& base, const std::filesystem::path& graph_dir, + const GraphConfig& graph_config); + void store(const std::filesystem::path& graph_dir); + + [[nodiscard]] Results query(const Dataset& query, const std::filesystem::path& graph_dir, + const uint32_t KQuery, const uint32_t max_iterations, + const float tau_query, const DistanceMeasure measure); + + struct GPUGraphBuffer { + Graph graph; + uint32_t global_shard_id; + CudaStream stream; + }; + + struct GPUBaseBuffer { + Dataset base; + uint32_t global_shard_id; + }; + + struct CPUGraphBuffer { + Graph graph; + uint32_t global_shard_id; + + void load(const std::filesystem::path& part_filename, const uint32_t global_shard_id); + void store(const std::filesystem::path& part_filename) const; + void upload(GPUGraphBuffer& gpu_buffer) const; + void download(const GPUGraphBuffer& gpu_buffer); + }; + + [[nodiscard]] const CPUGraphBuffer& getCPUGraphShard(const std::filesystem::path& graph_dir, + const uint32_t global_shard_id); + [[nodiscard]] const GPUGraphBuffer& getGPUGraphShard(const std::filesystem::path& graph_dir, + const uint32_t global_shard_id, + const bool sync_stream = true); + [[nodiscard]] const GPUBaseBuffer& getGPUBaseShard(const uint32_t global_shard_id, + const bool sync_stream = true); + [[nodiscard]] bool hasPart(const uint32_t global_shard_id) const; + + [[nodiscard]] cudaStream_t getStreamForPart(const uint32_t global_shard_id) const; + + /// Get the GPU buffer responsible for the given \c on_gpu_shard_id (const version). + const GPUGraphBuffer& getGPUGraphBuffer(const uint32_t on_gpu_shard_id) const + { + return d_buffers.at(on_gpu_shard_id % d_buffers.size()); + } + /// Get the GPU base buffer responsible for the given \c on_gpu_shard_id (const version). 
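CudaStream and CudaEvent above wrap the raw CUDA handles in std::unique_ptr with custom deleters; the deleter definitions live outside this diff. A plausible standalone equivalent for streams, assuming the usual cudaStreamCreate/cudaStreamDestroy pairing (names here are illustrative, not the library's):

```cpp
#include <memory>
#include <type_traits>

#include <cuda_runtime.h>

struct StreamDeleter {  // analogous to CUDAStreamDeleter above
  void operator()(cudaStream_t stream)
  {
    if (stream)
      cudaStreamDestroy(stream);
  }
};
// cudaStream_t is a pointer type, so unique_ptr can own what it points to.
using Stream = std::unique_ptr<std::remove_pointer_t<cudaStream_t>, StreamDeleter>;

Stream make_stream()
{
  cudaStream_t stream{nullptr};
  cudaStreamCreate(&stream);  // error handling omitted for brevity
  return Stream{stream};      // cudaStreamDestroy runs automatically on scope exit
}
```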
+ const GPUBaseBuffer& getGPUBaseBuffer(const uint32_t on_gpu_shard_id) const + { + return d_base_buffers.at(on_gpu_shard_id % d_base_buffers.size()); + } + + private: + /// in-memory GPU shards (some shards might be swapped out to CPU) + std::vector d_buffers; + /// in-memory GPU base shards (some shards might be swapped out to CPU) + std::vector d_base_buffers; + /// in-memory CPU shards (some shards might be swapped out to disk) + std::vector h_buffers; + /// threads for performing i/o tasks (number is min of CPU/GPU shards) + std::vector io_threads; + + Dataset h_base_ref{}; + + bool process_shards_back_to_front{false}; + + void allocateGraph(const GraphConfig& graph_config, + const bool reserve_construction_memory = false); + void allocateCPUBuffers(const uint32_t num_cpu_buffers); + + /// Get the GPU buffer responsible for the given \c on_gpu_shard_id. + GPUGraphBuffer& getGPUGraphBuffer(const uint32_t on_gpu_shard_id) + { + return d_buffers.at(on_gpu_shard_id % d_buffers.size()); + } + /// Get the GPU base buffer responsible for the given \c on_gpu_shard_id. + GPUBaseBuffer& getGPUBaseBuffer(const uint32_t on_gpu_shard_id) + { + return d_base_buffers.at(on_gpu_shard_id % d_base_buffers.size()); + } + /// Get the CPU buffer responsible for the given \c on_gpu_shard_id. + CPUGraphBuffer& getCPUGraphBuffer(const uint32_t on_gpu_shard_id) + { + return h_buffers.at(on_gpu_shard_id % h_buffers.size()); + } + + [[nodiscard]] std::thread& getThreadForPart(const uint32_t global_shard_id); + + // NOTE: global_shard_id refers to the global index in the number of shards the dataset has been + // split into. NOTE: on_gpu_shard_id follows the same index but starts at 0 per GPU. + + // io + + /** + * Swap out a newly constructed graph shard from GPU to CPU. + * If necessary, or requested by \c force_store, store into a file. + */ + void swapOutPart(const std::filesystem::path& graph_dir, const uint32_t global_shard_id, + bool force_to_ram = false, bool force_to_file = false); + /** + * Swap in a previously constructed graph shard from CPU to GPU. + * If necessary, or requested by \c force_load, load from a file. + */ + void swapInPart(const std::filesystem::path& graph_dir, const uint32_t global_shard_id, + bool force_load_from_file = false); + /** + * Wait for swap in / swap out of the given part to complete. + */ + void waitForPart(const uint32_t global_shard_id); + /** + * Swap in the base data for the given \c global_shard_id. + */ + void loadBasePart(const uint32_t global_shard_id); + /** + * Load the first N base parts which fit into the \c d_base_buffers. + */ + void prefetchBase(); + + /** + * Sort the results from querying multiple parts. + * Results are expected to be concatenated per query vector. + */ + void sortQueryResults(Results& d_results, cudaStream_t stream); +}; + +}; // namespace ggnn + +#endif // INCLUDE_GGNN_GPU_INSTANCE_CUH diff --git a/include/ggnn/base/graph.h b/include/ggnn/base/graph.h new file mode 100644 index 0000000..8bddef8 --- /dev/null +++ b/include/ggnn/base/graph.h @@ -0,0 +1,76 @@ +/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
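The getGPU*Buffer() accessors above map shard indices onto a smaller pool of resident buffers round-robin: with B buffers and more than B shards per GPU, shard i reuses buffer i % B, so the previous occupant must be swapped out before shard i can be swapped in. Illustrative only:

```cpp
#include <cstddef>
#include <cstdint>

uint32_t buffer_for_shard(uint32_t on_gpu_shard_id, size_t num_buffers)
{
  return static_cast<uint32_t>(on_gpu_shard_id % num_buffers);
}
// e.g. 2 buffers, 6 shards: shards 0/2/4 share buffer 0, shards 1/3/5 share buffer 1.
```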
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch
+// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf
+
+#ifndef INCLUDE_GGNN_GRAPH_H
+#define INCLUDE_GGNN_GRAPH_H
+
+#include
+#include
+#include
+
+#include
+#include
+
+namespace ggnn {
+
+/**
+ * GGNN graph data (on the CPU)
+ *
+ * @param KeyT datatype of dataset indices
+ * @param ValueT distance value type
+ */
+template
+struct Graph {
+  struct PartSizes {
+    PartSizes(const GraphConfig& config)
+        : graph_size{align8(static_cast(config.N_all) * config.KBuild * sizeof(KeyT))},
+          selection_translation_size{align8(static_cast(config.ST_all) * sizeof(KeyT))},
+          // const size_t nn1_dist_buffer_size = N * sizeof(ValueT);
+          nn1_stats_size{align8(2UL * sizeof(ValueT))}
+    {
+    }
+
+    const size_t graph_size;
+    const size_t selection_translation_size;
+    const size_t nn1_stats_size;
+
+    size_t getGraphSize() const
+    {
+      return graph_size + 2 * selection_translation_size + nn1_stats_size;
+    }
+  };
+
+  /// neighborhood vectors
+  std::array, GraphConfig::L> graph{};
+  /// translation of upper layer points into lowest layer
+  std::array, GraphConfig::L> translation{};
+  /// translation of upper layer points into one layer below
+  std::array, GraphConfig::L> selection{};
+
+  /// average and maximum distance to nearest known neighbors
+  Dataset nn1_stats{};
+
+  /// combined memory pool
+  Dataset memory{};
+
+  Graph() = default;
+  Graph(const GraphConfig& graph_config, Dataset&& memory);
+};
+
+};  // namespace ggnn
+
+#endif  // INCLUDE_GGNN_GRAPH_H
diff --git a/include/ggnn/base/graph_config.h b/include/ggnn/base/graph_config.h
new file mode 100644
index 0000000..1b6d0f6
--- /dev/null
+++ b/include/ggnn/base/graph_config.h
@@ -0,0 +1,115 @@
+/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A.
Lensch +// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf + +#ifndef INCLUDE_GGNN_GRAPH_CONFIG_H +#define INCLUDE_GGNN_GRAPH_CONFIG_H + +#include + +#include +#include +#include + +namespace ggnn { + +/** + * User-definable graph parameters + */ +struct GraphParameters { + /// number of base points per shard + uint32_t N{}; + /// number of dimensions in the dataset and query + uint32_t D{}; + + /// number of neighbors per point + uint32_t KBuild{}; + + /// number of layers + static constexpr uint32_t L = + 4; // we empirically found 4 layers to work best across all datasets +}; + +/** + * Automatically derived secondary graph parameters + */ +struct GraphDerivedParameters : public GraphParameters { + GraphDerivedParameters() = default; + GraphDerivedParameters(const GraphParameters& params); + + /// number of inverse (foreign) links per point, part of KBuild + uint32_t KF{KBuild / 2}; + + /// growth factor (number of sub-graphs merged together per layer) + uint32_t G{}; + + /// segment size + uint32_t S{next_multiple(KF + 1)}; + /// segment size in base layer + uint32_t S0{}; + /// number of segments in base layer with one additional element + uint32_t S0_off{}; + + /// number of points per segment selected into upper-level segment + uint32_t SG{}; + /// number of segments per layer contributing an additional point into the upper-level segment + uint32_t SG_off{}; +}; + +/** + * Automatically derived graph dimensions + */ +struct GraphDimensions { + static constexpr uint32_t L = GraphParameters::L; + + GraphDimensions() = default; + GraphDimensions(uint32_t N, uint32_t S, uint32_t G); + + /// total number of neighborhoods in the graph + uint32_t N_all{}; + /// total number of selection/translation entries + uint32_t ST_all{}; + + /// blocks/segments per layer + std::array Bs{}; // [L] + /// neighborhoods per layer + std::array Ns{}; // [L] + /// start of neighborhoods per layer + std::array Ns_offsets{}; // [L] + /// start of selection/translation per layer + std::array STs_offsets{}; // [L] +}; + +/** + * Combined Configuration of the GGNN search graph layout + */ +struct GraphConfig : public GraphDerivedParameters, public GraphDimensions { + using GraphParameters::L; + + GraphConfig() = default; + GraphConfig(const GraphParameters& params); + + size_t getBaseSize(const uint32_t base_t_size) const + { + return align8(static_cast(N) * D * base_t_size); + } + + size_t maxBaseAddr() const; +}; + +}; // namespace ggnn + +#endif // INCLUDE_GGNN_GRAPH_CONFIG_H diff --git a/include/ggnn/base/lib.h b/include/ggnn/base/lib.h new file mode 100644 index 0000000..304a83e --- /dev/null +++ b/include/ggnn/base/lib.h @@ -0,0 +1,77 @@ +/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
Lensch +// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf + +#ifndef INCLUDE_GGNN_LIB_H +#define INCLUDE_GGNN_LIB_H + +// list of types used to instantiate templates - extend as needed + +#define GGNN_VALUES(F, ...) \ + F(__VA_OPT__(__VA_ARGS__, ) float); + +#define GGNN_BASES(F, ...) \ + F(__VA_OPT__(__VA_ARGS__, ) uint8_t); \ + F(__VA_OPT__(__VA_ARGS__, ) float); + +#define GGNN_KEYS(F, ...) \ + F(__VA_OPT__(__VA_ARGS__, ) int32_t); + +#define GGNN_QUERYS(F, ...) \ + F(__VA_OPT__(__VA_ARGS__, ) 32); \ + F(__VA_OPT__(__VA_ARGS__, ) 64); \ + F(__VA_OPT__(__VA_ARGS__, ) 128); \ + F(__VA_OPT__(__VA_ARGS__, ) 256); \ + F(__VA_OPT__(__VA_ARGS__, ) 512); \ + F(__VA_OPT__(__VA_ARGS__, ) 1024); + +#define GGNN_TOPS(F, ...) \ + F(__VA_OPT__(__VA_ARGS__, ) 128, 4); \ + F(__VA_OPT__(__VA_ARGS__, ) 256, 4); \ + F(__VA_OPT__(__VA_ARGS__, ) 256, 8); \ + F(__VA_OPT__(__VA_ARGS__, ) 512, 8); + +#define GGNN_MERGES(F, ...) \ + F(__VA_OPT__(__VA_ARGS__, ) 32, 4); \ + F(__VA_OPT__(__VA_ARGS__, ) 64, 4); \ + F(__VA_OPT__(__VA_ARGS__, ) 128, 4); \ + F(__VA_OPT__(__VA_ARGS__, ) 256, 4); \ + F(__VA_OPT__(__VA_ARGS__, ) 256, 8); \ + F(__VA_OPT__(__VA_ARGS__, ) 512, 8); + +#define GGNN_SYMS(F, ...) \ + F(__VA_OPT__(__VA_ARGS__, ) 64, 4); \ + F(__VA_OPT__(__VA_ARGS__, ) 128, 4); \ + F(__VA_OPT__(__VA_ARGS__, ) 256, 4); \ + F(__VA_OPT__(__VA_ARGS__, ) 256, 8); \ + F(__VA_OPT__(__VA_ARGS__, ) 512, 8); + +#define GGNN_DIST_STATS(F, ...) \ + F(__VA_OPT__(__VA_ARGS__, ) false); + +#define GGNN_WRITE_DISTS(F, ...) \ + F(__VA_OPT__(__VA_ARGS__, ) true); + +#define GGNN_EVAL(F, ...) \ + F(__VA_ARGS__); + +#define GGNN_INSTANTIATE_STRUCT(T, ...) \ + template struct T<__VA_ARGS__>; + +#define GGNN_INSTANTIATE_CLASS(T, ...) \ + template class T<__VA_ARGS__>; + +#endif // INCLUDE_GGNN_LIB_H diff --git a/include/ggnn/base/result_merger.h b/include/ggnn/base/result_merger.h new file mode 100644 index 0000000..754927a --- /dev/null +++ b/include/ggnn/base/result_merger.h @@ -0,0 +1,53 @@ +/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
Lensch +// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf + +#ifndef INCLUDE_GGNN_RESULT_MERGER_H +#define INCLUDE_GGNN_RESULT_MERGER_H + +#include +#include + +#include + +#include + +namespace ggnn { + +template +struct ResultMerger { + using Results = ggnn::Results; + + uint32_t N_query{0}; + uint32_t KQuery{0}; + + uint32_t num_gpus{1}; + uint32_t num_shards_per_gpu{1}; + + ResultMerger() = default; + ResultMerger(const uint32_t N_query, const uint32_t KQuery, const uint32_t num_gpus = 1, + const uint32_t num_shards_per_gpu = 1); + + // intermediate results per GPU, to be merged + std::vector partial_results_per_gpu; + + /// merge together the results from multiple GPUs + [[nodiscard]] Results merge(uint32_t N_shard) &&; +}; + +}; // namespace ggnn + +#endif // INCLUDE_GGNN_RESULT_MERGER_H diff --git a/include/ggnn/cache/cuda_simple_knn_cache.cuh b/include/ggnn/cache/cuda_simple_knn_cache.cuh deleted file mode 100644 index 6ef495a..0000000 --- a/include/ggnn/cache/cuda_simple_knn_cache.cuh +++ /dev/null @@ -1,388 +0,0 @@ -/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
Lensch - -#ifndef INCLUDE_GGNN_CACHE_CUDA_SIMPLE_KNN_CACHE_CUH_ -#define INCLUDE_GGNN_CACHE_CUDA_SIMPLE_KNN_CACHE_CUH_ - -#include -#include - -#include -#include - -#include "ggnn/utils/cuda_knn_distance.cuh" -#include "ggnn/utils/cuda_knn_utils.cuh" - -template -struct SimpleKNNCache { - static constexpr KeyT EMPTY_KEY = (KeyT)-1; - static constexpr ValueT EMPTY_DIST = std::numeric_limits::infinity(); - - private: - static constexpr int CACHE_SIZE = BEST_SIZE + PRIOQ_SIZE + VISITED_SIZE; - static constexpr int SORTED_SIZE = BEST_SIZE + PRIOQ_SIZE; - - static constexpr int DIST_ITEMS_PER_THREAD = (D - 1) / BLOCK_DIM_X + 1; - static constexpr int BEST_ITEMS_PER_THREAD = - (BEST_SIZE - 1) / BLOCK_DIM_X + 1; - static constexpr int PRIOQ_ITEMS_PER_THREAD = - (PRIOQ_SIZE - 1) / BLOCK_DIM_X + 1; - - static constexpr int CACHE_ITEMS_PER_THREAD = - (CACHE_SIZE - 1) / BLOCK_DIM_X + 1; - static constexpr int SORTED_ITEMS_PER_THREAD = - (SORTED_SIZE - 1) / BLOCK_DIM_X + 1; - - static constexpr int BEST_END = BEST_SIZE - 1; - - typedef Distance Distance; - - union SyncTempStorage { - KeyT cache; - bool flag; - }; - - public: - KeyT* s_cache; - ValueT* s_dists; - int& s_prioQ_head; - int& s_visited_head; - int& s_overflow_counter; - - SyncTempStorage& s_sync; - - ValueT xi; - - Distance rs_dist_calc; - - //# threadIdx.x == 0 stats registers only - int dist_calc_counter; - - __device__ __forceinline__ void initSharedStorage() { - __shared__ KeyT s_cache_tmp[CACHE_SIZE]; - __shared__ ValueT s_dists_tmp[SORTED_SIZE]; - - s_cache = reinterpret_cast(s_cache_tmp); - s_dists = reinterpret_cast(s_dists_tmp); - } - - __device__ __forceinline__ SyncTempStorage& SyncPrivateTmpStorage() { - __shared__ SyncTempStorage s_sync_tmp; - return s_sync_tmp; - } - - __device__ __forceinline__ int& PrioQRingPrivateTmpStorage() { - __shared__ int s_prioQ_head_tmp; - return s_prioQ_head_tmp; - } - - __device__ __forceinline__ int& CacheRingPrivateTmpStorage() { - __shared__ int s_visited_head_tmp; - return s_visited_head_tmp; - } - - __device__ __forceinline__ int& OverflowPrivateTmpStorage() { - __shared__ int s_overflow_tmp; - return s_overflow_tmp; - } - - __device__ __forceinline__ void init() { - for (int i = threadIdx.x; i < CACHE_SIZE; i += BLOCK_DIM_X) { - s_cache[i] = EMPTY_KEY; - } - for (int i = threadIdx.x; i < SORTED_SIZE; i += BLOCK_DIM_X) { - s_dists[i] = EMPTY_DIST; - } - if (DIST_STATS && !threadIdx.x) dist_calc_counter = 0; - if (OVERFLOW_STATS && !threadIdx.x) s_overflow_counter = 0; - if (!threadIdx.x) { - s_prioQ_head = 0; - s_visited_head = 0; - } - __syncthreads(); - } - - __device__ __forceinline__ SimpleKNNCache(const BaseT* d_base, const KeyT n, - const ValueT xi_criteria) - : xi(xi_criteria), - s_prioQ_head(PrioQRingPrivateTmpStorage()), - s_visited_head(CacheRingPrivateTmpStorage()), - s_overflow_counter(OverflowPrivateTmpStorage()), - s_sync(SyncPrivateTmpStorage()), - rs_dist_calc(d_base, n) { - initSharedStorage(); - init(); - } - - __device__ __forceinline__ SimpleKNNCache(const BaseT* d_base, - const BaseT* d_query, const KeyT n, - const ValueT xi_criteria) - : xi(xi_criteria), - s_prioQ_head(PrioQRingPrivateTmpStorage()), - s_visited_head(CacheRingPrivateTmpStorage()), - s_overflow_counter(OverflowPrivateTmpStorage()), - s_sync(SyncPrivateTmpStorage()), - rs_dist_calc(d_base, d_query, n){ - initSharedStorage(); - init(); - } - - __device__ __forceinline__ bool criteria(ValueT dist) { - if (dist < s_dists[KQuery - 1] + xi) return true; - return false; - } - - __device__ __forceinline__ bool 
is_end(int tid) { - const int prev_prioQ_ring = - (s_prioQ_head - 1 < 0) ? PRIOQ_SIZE - 1 : s_prioQ_head - 1; - return tid == BEST_END || tid == BEST_SIZE + prev_prioQ_ring; - } - - __device__ __forceinline__ void push(const KeyT key, const ValueT dist) { - __syncthreads(); - // Register for insertion in best and prioq - KeyT r_cache[SORTED_ITEMS_PER_THREAD]; - ValueT r_dists[SORTED_ITEMS_PER_THREAD]; - - int r_write_item_best = -1; - int r_write_item_prioQ = -1; - if (!threadIdx.x) s_sync.flag = true; - __syncthreads(); - - // Load items for insertion. - for (int item = 0; item < SORTED_ITEMS_PER_THREAD && s_sync.flag; ++item) { - const int idx = item * BLOCK_DIM_X + threadIdx.x; - if (idx < SORTED_SIZE) { - r_cache[item] = s_cache[idx]; - r_dists[item] = s_dists[idx]; - if (r_cache[item] == key) s_sync.flag = false; - } - } - __syncthreads(); - // TODO(fabi) return on s_sync.flag = true? - for (int item = 0; item < SORTED_ITEMS_PER_THREAD && s_sync.flag; ++item) { - const int idx = item * BLOCK_DIM_X + threadIdx.x; - if (idx < SORTED_SIZE) { - if (r_dists[item] >= dist) { - // Don't move if no entry or end of best or prioq. - if ((r_cache[item] != EMPTY_KEY) && !is_end(idx)) { - const int idx_next = (idx + 1 == SORTED_SIZE) ? BEST_SIZE : idx + 1; - s_cache[idx_next] = r_cache[item]; - s_dists[idx_next] = r_dists[item]; - } - - // Find insert points. - const int idx_prev = idx - 1; - const ValueT dist_prev = - ((idx_prev == -1) || (idx_prev == BEST_SIZE + s_prioQ_head - 1)) - ? -1.f - : (idx_prev == BEST_END) ? s_dists[SORTED_SIZE - 1] - : s_dists[idx_prev]; - if (dist_prev < dist) { - if (idx < BEST_SIZE) - r_write_item_best = item; - else - r_write_item_prioQ = item; - } - } - } - } - __syncthreads(); - - // Insert into best and prioq. - if (r_write_item_best >= 0) { - const int idx = r_write_item_best * BLOCK_DIM_X + threadIdx.x; - s_cache[idx] = key; - s_dists[idx] = dist; - } - if (r_write_item_prioQ >= 0) { - const int idx = r_write_item_prioQ * BLOCK_DIM_X + threadIdx.x; - s_cache[idx] = key; - s_dists[idx] = dist; - } - } - - __device__ __forceinline__ KeyT pop() { - __syncthreads(); - - if (!threadIdx.x) { - const int head_idx_prioQ = BEST_SIZE + s_prioQ_head; - const ValueT dist = s_dists[head_idx_prioQ]; - if (dist == EMPTY_DIST) { - // Pop on empty prioQ. - s_sync.cache = EMPTY_KEY; - } else { - if (!criteria(dist)) { - s_sync.cache = EMPTY_KEY; - } else { - const KeyT key = s_cache[head_idx_prioQ]; - s_sync.cache = key; - const int head_idx_visited = SORTED_SIZE + s_visited_head; - s_cache[head_idx_visited] = key; - s_visited_head = (s_visited_head + 1) % VISITED_SIZE; - } - s_cache[head_idx_prioQ] = EMPTY_KEY; - s_dists[head_idx_prioQ] = EMPTY_DIST; - // Move ring-buffer head forward. - s_prioQ_head = (s_prioQ_head + 1) % PRIOQ_SIZE; - } - } - __syncthreads(); - return s_sync.cache; - } - - __device__ __forceinline__ void fetch(KeyT* s_keys, const KeyT* d_translation, - int len) { - __syncthreads(); - for (int item = 0; item < CACHE_ITEMS_PER_THREAD; ++item) { - const int i = item * BLOCK_DIM_X + threadIdx.x; - if (i < CACHE_SIZE) { - const KeyT n = s_cache[i]; - for (int k = 0; n != EMPTY_KEY && k < len; k++) { - if (n == s_keys[k]) { - s_keys[k] = EMPTY_KEY; - } - } - } - } - - for (int k = 0; k < len; k++) { - __syncthreads(); - const KeyT other_n = s_keys[k]; - if (other_n == EMPTY_KEY) continue; - const KeyT other_m = - (d_translation == nullptr) ? 
other_n : d_translation[other_n]; - const ValueT dist = rs_dist_calc.distance_synced(other_m); - - if (criteria(dist)) { - push(other_n, dist); - __syncthreads(); - } - - } - __syncthreads(); - } - - __device__ __forceinline__ void transform(const KeyT* transform) { - for (int item = 0; item < CACHE_ITEMS_PER_THREAD; ++item) { - const int i = item * BLOCK_DIM_X + threadIdx.x; - - if (i < BEST_SIZE) { - // transform best - KeyT key = s_cache[i]; - if (key != EMPTY_KEY) - key = transform[key]; - s_cache[i] = key; - - // copy best into prio queue - if (i+BEST_SIZE < SORTED_SIZE) { - s_cache[i+BEST_SIZE] = key; - s_dists[i+BEST_SIZE] = s_dists[i]; - } - } - else if (i < 2*BEST_SIZE && i < SORTED_SIZE) { - // do nothing (handled by previous threads) - } - else if (i < CACHE_SIZE) { - // reset remainder of the prio queue and visited cache - s_cache[i] = EMPTY_KEY; - if (i < SORTED_SIZE) - s_dists[i] = EMPTY_DIST; - } - } - - // reset heads. - if (!threadIdx.x) { - s_prioQ_head = 0; - s_visited_head = 0; - } - } - - __device__ __forceinline__ void write_best_graph(KeyT* d_buffer, const KeyT n, - int K, int offset = 1) { - for (int i = threadIdx.x; i < K; i += BLOCK_DIM_X) { - const KeyT idx = s_cache[i + offset]; - d_buffer[n * K + i] = (idx != EMPTY_KEY) ? idx : n; - } - } - - __device__ __forceinline__ void write_best(KeyT* d_buffer, const KeyT n, - int stride) { - for (int i = threadIdx.x; i < KQuery; i += BLOCK_DIM_X) { - const KeyT idx = s_cache[i]; - d_buffer[n * stride + i] = idx; - } - } - - __device__ __forceinline__ void write_best(KeyT* d_buffer, const KeyT n, - int stride, int idx_offset) { - for (int i = threadIdx.x; i < KQuery; i += BLOCK_DIM_X) { - const KeyT idx = s_cache[i]; - d_buffer[n * stride + i] = idx + idx_offset; - } - } - - template ::type = 0> // euclidean distance version - __device__ __forceinline__ float get_nn1_dist() { - return sqrtf(s_dists[1]); - } - - template ::type = 0> // cosine similarity version - __device__ __forceinline__ float get_nn1_dist() { - return s_dists[1]; - } - - __device__ __forceinline__ int get_dist_stats() { return dist_calc_counter; } - __device__ __forceinline__ int get_overflow_stats() { - return s_overflow_counter; - } - - /** - * Prints first 'len' elements in the Cache. [parallel call]: - * cash.print(8); - * - */ - __device__ __forceinline__ void print(int len = CACHE_SIZE) { - __syncthreads(); - if (!threadIdx.x) printf("print \n"); - if (!threadIdx.x) { - printf("Cache: ring: %d KQuery: %f (+xi -> %f) \n", s_prioQ_head, - s_dists[KQuery - 1], s_dists[KQuery - 1] + xi); - for (int i = 0; i < len; ++i) { - if (i < BEST_SIZE) { - printf("%d -> %d %f \n", i, s_cache[i], s_dists[i]); - } else { - if (i < SORTED_SIZE) { - printf("%d -> %d %f | ", i, s_cache[i], s_dists[i]); - if (i - BEST_SIZE == s_prioQ_head) printf("X"); - printf("\n"); - } else { - printf("%d -> %d | ", i, s_cache[i]); - if (i - SORTED_SIZE == s_visited_head) printf("X"); - printf("\n"); - } - } - } - } - __syncthreads(); - } -}; - -#endif // INCLUDE_GGNN_CACHE_CUDA_SIMPLE_KNN_CACHE_CUH_ diff --git a/include/ggnn/cache/cuda_simple_knn_cache_no_slack.cuh b/include/ggnn/cache/cuda_simple_knn_cache_no_slack.cuh deleted file mode 100644 index 90c027a..0000000 --- a/include/ggnn/cache/cuda_simple_knn_cache_no_slack.cuh +++ /dev/null @@ -1,373 +0,0 @@ -/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch - -#ifndef INCLUDE_GGNN_CACHE_CUDA_SIMPLE_KNN_CACHE_NO_SLACK_CUH_ -#define INCLUDE_GGNN_CACHE_CUDA_SIMPLE_KNN_CACHE_NO_SLACK_CUH_ - -#include -#include - -#include -#include - -#include "ggnn/utils/cuda_knn_distance.cuh" -#include "ggnn/utils/cuda_knn_utils.cuh" - -template -struct SimpleKNNCacheNoSlack { - static constexpr KeyT EMPTY_KEY = (KeyT)-1; - static constexpr ValueT EMPTY_DIST = std::numeric_limits::infinity(); - - private: - static constexpr int CACHE_SIZE = BEST_SIZE + PRIOQ_SIZE + VISITED_SIZE; - static constexpr int SORTED_SIZE = BEST_SIZE + PRIOQ_SIZE; - - static constexpr int DIST_ITEMS_PER_THREAD = (D - 1) / BLOCK_DIM_X + 1; - static constexpr int BEST_ITEMS_PER_THREAD = - (BEST_SIZE - 1) / BLOCK_DIM_X + 1; - static constexpr int PRIOQ_ITEMS_PER_THREAD = - (PRIOQ_SIZE - 1) / BLOCK_DIM_X + 1; - - static constexpr int CACHE_ITEMS_PER_THREAD = - (CACHE_SIZE - 1) / BLOCK_DIM_X + 1; - static constexpr int SORTED_ITEMS_PER_THREAD = - (SORTED_SIZE - 1) / BLOCK_DIM_X + 1; - - static constexpr int BEST_END = BEST_SIZE - 1; - - typedef Distance Distance; - - union SyncTempStorage { - KeyT cache; - bool flag; - }; - - public: - KeyT* s_cache; - ValueT* s_dists; - int& s_prioQ_head; - int& s_visited_head; - int& s_overflow_counter; - - SyncTempStorage& s_sync; - - Distance rs_dist_calc; - - //# threadIdx.x == 0 stats registers only - int dist_calc_counter; - - __device__ __forceinline__ void initSharedStorage() { - __shared__ KeyT s_cache_tmp[CACHE_SIZE]; - __shared__ ValueT s_dists_tmp[SORTED_SIZE]; - - s_cache = reinterpret_cast(s_cache_tmp); - s_dists = reinterpret_cast(s_dists_tmp); - } - - __device__ __forceinline__ SyncTempStorage& SyncPrivateTmpStorage() { - __shared__ SyncTempStorage s_sync_tmp; - return s_sync_tmp; - } - - __device__ __forceinline__ int& PrioQRingPrivateTmpStorage() { - __shared__ int s_prioQ_head_tmp; - return s_prioQ_head_tmp; - } - - __device__ __forceinline__ int& CacheRingPrivateTmpStorage() { - __shared__ int s_visited_head_tmp; - return s_visited_head_tmp; - } - - __device__ __forceinline__ int& OverflowPrivateTmpStorage() { - __shared__ int s_overflow_tmp; - return s_overflow_tmp; - } - - __device__ __forceinline__ void init() { - for (int i = threadIdx.x; i < CACHE_SIZE; i += BLOCK_DIM_X) { - s_cache[i] = EMPTY_KEY; - } - for (int i = threadIdx.x; i < SORTED_SIZE; i += BLOCK_DIM_X) { - s_dists[i] = EMPTY_DIST; - } - if (DIST_STATS && !threadIdx.x) dist_calc_counter = 0; - if (OVERFLOW_STATS && !threadIdx.x) s_overflow_counter = 0; - if (!threadIdx.x) { - s_prioQ_head = 0; - s_visited_head = 0; - } - __syncthreads(); - } - - __device__ __forceinline__ SimpleKNNCacheNoSlack(const BaseT* d_base, const KeyT n) - : s_prioQ_head(PrioQRingPrivateTmpStorage()), - s_visited_head(CacheRingPrivateTmpStorage()), - s_overflow_counter(OverflowPrivateTmpStorage()), - s_sync(SyncPrivateTmpStorage()), - rs_dist_calc(d_base, n) { - initSharedStorage(); - init(); - } - - __device__ __forceinline__ 
SimpleKNNCacheNoSlack(const BaseT* d_base, - const BaseT* d_query, const KeyT n) - : s_prioQ_head(PrioQRingPrivateTmpStorage()), - s_visited_head(CacheRingPrivateTmpStorage()), - s_overflow_counter(OverflowPrivateTmpStorage()), - s_sync(SyncPrivateTmpStorage()), - rs_dist_calc(d_base, d_query, n){ - initSharedStorage(); - init(); - } - - __device__ __forceinline__ bool criteria(ValueT dist) { - return dist < s_dists[BEST_SIZE - 1]; - } - - __device__ __forceinline__ bool is_end(int tid) { - const int prev_prioQ_ring = - (s_prioQ_head - 1 < 0) ? PRIOQ_SIZE - 1 : s_prioQ_head - 1; - return tid == BEST_END || tid == BEST_SIZE + prev_prioQ_ring; - } - - __device__ __forceinline__ void push(const KeyT key, const ValueT dist) { - __syncthreads(); - // Register for insertion in best and prioq - KeyT r_cache[SORTED_ITEMS_PER_THREAD]; - ValueT r_dists[SORTED_ITEMS_PER_THREAD]; - - int r_write_item_best = -1; - int r_write_item_prioQ = -1; - if (!threadIdx.x) s_sync.flag = true; - __syncthreads(); - - // Load items for insertion. - for (int item = 0; item < SORTED_ITEMS_PER_THREAD && s_sync.flag; ++item) { - const int idx = item * BLOCK_DIM_X + threadIdx.x; - if (idx < SORTED_SIZE) { - r_cache[item] = s_cache[idx]; - r_dists[item] = s_dists[idx]; - if (r_cache[item] == key) s_sync.flag = false; - } - } - __syncthreads(); - // TODO(fabi) return on s_sync.flag = true? - for (int item = 0; item < SORTED_ITEMS_PER_THREAD && s_sync.flag; ++item) { - const int idx = item * BLOCK_DIM_X + threadIdx.x; - if (idx < SORTED_SIZE) { - if (r_dists[item] >= dist) { - // Don't move if no entry or end of best or prioq. - if ((r_cache[item] != EMPTY_KEY) && !is_end(idx)) { - const int idx_next = (idx + 1 == SORTED_SIZE) ? BEST_SIZE : idx + 1; - s_cache[idx_next] = r_cache[item]; - s_dists[idx_next] = r_dists[item]; - } - - // Find insert points. - const int idx_prev = idx - 1; - const ValueT dist_prev = - ((idx_prev == -1) || (idx_prev == BEST_SIZE + s_prioQ_head - 1)) - ? -1.f - : (idx_prev == BEST_END) ? s_dists[SORTED_SIZE - 1] - : s_dists[idx_prev]; - if (dist_prev < dist) { - if (idx < BEST_SIZE) - r_write_item_best = item; - else - r_write_item_prioQ = item; - } - } - } - } - __syncthreads(); - - // Insert into best and prioq. - if (r_write_item_best >= 0) { - const int idx = r_write_item_best * BLOCK_DIM_X + threadIdx.x; - s_cache[idx] = key; - s_dists[idx] = dist; - } - if (r_write_item_prioQ >= 0) { - const int idx = r_write_item_prioQ * BLOCK_DIM_X + threadIdx.x; - s_cache[idx] = key; - s_dists[idx] = dist; - } - } - - __device__ __forceinline__ KeyT pop() { - __syncthreads(); - - if (!threadIdx.x) { - const int head_idx_prioQ = BEST_SIZE + s_prioQ_head; - const ValueT dist = s_dists[head_idx_prioQ]; - if (dist == EMPTY_DIST) { - // Pop on empty prioQ. - s_sync.cache = EMPTY_KEY; - } else { - if (!criteria(dist)) { - s_sync.cache = EMPTY_KEY; - } else { - const KeyT key = s_cache[head_idx_prioQ]; - s_sync.cache = key; - const int head_idx_visited = SORTED_SIZE + s_visited_head; - s_cache[head_idx_visited] = key; - s_visited_head = (s_visited_head + 1) % VISITED_SIZE; - } - s_cache[head_idx_prioQ] = EMPTY_KEY; - s_dists[head_idx_prioQ] = EMPTY_DIST; - // Move ring-buffer head forward. 
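The substantive difference between SimpleKNNCache and this "no slack" variant is the expansion test: the former accepts candidates within xi of the KQuery-th best distance, the latter compares directly against the worst entry in the best list. Host-side paraphrase:

```cpp
// On the device, s_dists is kept sorted in shared memory.
bool criteria_slack(const float* s_dists, int KQuery, float xi, float dist)
{
  return dist < s_dists[KQuery - 1] + xi;  // SimpleKNNCache
}

bool criteria_no_slack(const float* s_dists, int BEST_SIZE, float dist)
{
  return dist < s_dists[BEST_SIZE - 1];  // SimpleKNNCacheNoSlack
}
```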
- s_prioQ_head = (s_prioQ_head + 1) % PRIOQ_SIZE; - } - } - __syncthreads(); - return s_sync.cache; - } - - __device__ __forceinline__ void fetch(KeyT* s_keys, const KeyT* d_translation, - int len, bool debug = false) { - __syncthreads(); - for (int item = 0; item < CACHE_ITEMS_PER_THREAD; ++item) { - const int i = item * BLOCK_DIM_X + threadIdx.x; - if (i < CACHE_SIZE) { - const KeyT n = s_cache[i]; - for (int k = 0; n != EMPTY_KEY && k < len; k++) { - if (n == s_keys[k]) { - s_keys[k] = EMPTY_KEY; - } - } - } - } - - for (int k = 0; k < len; k++) { - __syncthreads(); - const KeyT other_n = s_keys[k]; - if (other_n == EMPTY_KEY) continue; - const KeyT other_m = - (d_translation == nullptr) ? other_n : d_translation[other_n]; - const ValueT dist = rs_dist_calc.distance_synced(other_m); - - if (criteria(dist)) { - push(other_n, dist); - __syncthreads(); - } - - } - __syncthreads(); - } - - __device__ __forceinline__ void transform(const KeyT* transform) { - for (int item = 0; item < CACHE_ITEMS_PER_THREAD; ++item) { - const int i = item * BLOCK_DIM_X + threadIdx.x; - - if (i < BEST_SIZE) { - // transform best - KeyT key = s_cache[i]; - if (key != EMPTY_KEY) - key = transform[key]; - s_cache[i] = key; - - // copy best into prio queue - if (i+BEST_SIZE < SORTED_SIZE) { - s_cache[i+BEST_SIZE] = key; - s_dists[i+BEST_SIZE] = s_dists[i]; - } - } - else if (i < 2*BEST_SIZE && i < SORTED_SIZE) { - // do nothing (handled by previous threads) - } - else if (i < CACHE_SIZE) { - // reset remainder of the prio queue and visited cache - s_cache[i] = EMPTY_KEY; - if (i < SORTED_SIZE) - s_dists[i] = EMPTY_DIST; - } - } - - // reset heads. - if (!threadIdx.x) { - s_prioQ_head = 0; - s_visited_head = 0; - } - } - - __device__ __forceinline__ void write_best_graph(KeyT* d_buffer, const KeyT n, - int K, int offset = 1) { - for (int i = threadIdx.x; i < K; i += BLOCK_DIM_X) { - const KeyT idx = s_cache[i + offset]; - d_buffer[n * K + i] = (idx != EMPTY_KEY) ? idx : n; - } - } - - __device__ __forceinline__ void write_best(KeyT* d_buffer, const KeyT n, - int stride) { - for (int i = threadIdx.x; i < KQuery; i += BLOCK_DIM_X) { - const KeyT idx = s_cache[i]; - d_buffer[n * stride + i] = idx; - } - } - - template ::type = 0> // euclidean distance version - __device__ __forceinline__ float get_nn1_dist() { - return sqrtf(s_dists[1]); - } - - template ::type = 0> // cosine similarity version - __device__ __forceinline__ float get_nn1_dist() { - return s_dists[1]; - } - - __device__ __forceinline__ int get_dist_stats() { return dist_calc_counter; } - __device__ __forceinline__ int get_overflow_stats() { - return s_overflow_counter; - } - - /** - * Prints first 'len' elements in the Cache. 
[parallel call]: - * cash.print(8); - * - */ - __device__ __forceinline__ void print(int len = CACHE_SIZE) { - __syncthreads(); - if (!threadIdx.x) printf("print \n"); - if (!threadIdx.x) { - printf("Cache: ring: %d KQuery: %f (BEST_SIZE -> %f) \n", s_prioQ_head, - s_dists[KQuery - 1], s_dists[BEST_SIZE - 1]); - for (int i = 0; i < len; ++i) { - if (i < BEST_SIZE) { - printf("%d -> %d %f \n", i, s_cache[i], s_dists[i]); - } else { - if (i < SORTED_SIZE) { - printf("%d -> %d %f | ", i, s_cache[i], s_dists[i]); - if (i - BEST_SIZE == s_prioQ_head) printf("X"); - printf("\n"); - } else { - printf("%d -> %d | ", i, s_cache[i]); - if (i - SORTED_SIZE == s_visited_head) printf("X"); - printf("\n"); - } - } - } - } - __syncthreads(); - } -}; - -#endif // INCLUDE_GGNN_CACHE_CUDA_SIMPLE_KNN_CACHE_NO_SLACK_CUH_ diff --git a/include/ggnn/cache/cuda_simple_knn_sym_cache.cuh b/include/ggnn/cache/cuda_simple_knn_sym_cache.cuh deleted file mode 100644 index 890420e..0000000 --- a/include/ggnn/cache/cuda_simple_knn_sym_cache.cuh +++ /dev/null @@ -1,528 +0,0 @@ -/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch - -#ifndef INCLUDE_GGNN_CACHE_CUDA_SIMPLE_KNN_SYM_CACHE_CUH_ -#define INCLUDE_GGNN_CACHE_CUDA_SIMPLE_KNN_SYM_CACHE_CUH_ - -#include -#include - -#include -#include - -#include "ggnn/utils/cuda_knn_utils.cuh" - -template -struct SimpleKNNSymCache { - static constexpr KeyT EMPTY_KEY = (KeyT)-1; - static constexpr ValueT EMPTY_DIST = std::numeric_limits::infinity(); - - // TODO(fabi): change to constant? 
- static constexpr float EPS = 0.1f; - - private: - static constexpr int CACHE_SIZE = BEST_SIZE + PRIOQ_SIZE + VISITED_SIZE; - static constexpr int SORTED_SIZE = BEST_SIZE + PRIOQ_SIZE; - - static constexpr int DIST_ITEMS_PER_THREAD = (D - 1) / BLOCK_DIM_X + 1; - static constexpr int BEST_ITEMS_PER_THREAD = - (BEST_SIZE - 1) / BLOCK_DIM_X + 1; - static constexpr int PRIOQ_ITEMS_PER_THREAD = - (PRIOQ_SIZE - 1) / BLOCK_DIM_X + 1; - - static constexpr int CACHE_ITEMS_PER_THREAD = - (CACHE_SIZE - 1) / BLOCK_DIM_X + 1; - static constexpr int SORTED_ITEMS_PER_THREAD = - (SORTED_SIZE - 1) / BLOCK_DIM_X + 1; - - static constexpr int BEST_END = BEST_SIZE - 1; - - struct DistQueryAndHalf { - ValueT dist_query; - ValueT dist_half; - - __device__ __forceinline__ DistQueryAndHalf(const ValueT dist_query, - const ValueT dist_half) - : dist_query(dist_query), dist_half(dist_half) {} - - __device__ __forceinline__ DistQueryAndHalf() {} - }; - - struct DistanceAndNorm { - ValueT r_dist; - ValueT r_norm; - - __device__ __forceinline__ DistanceAndNorm(const ValueT dist, - const ValueT norm) - : r_dist(dist), r_norm(norm) {} - - __device__ __forceinline__ DistanceAndNorm() {} - - struct Sum { - __host__ __device__ __forceinline__ DistanceAndNorm - operator()(const DistanceAndNorm& a, const DistanceAndNorm& b) const { - return DistanceAndNorm(a.r_dist + b.r_dist, a.r_norm + b.r_norm); - } - }; - }; - - typedef cub::BlockReduce DistReduce; - typedef cub::BlockReduce - DistQueryAndHalfReduce; - - union CacheTempStorage { - struct { - typename DistReduce::TempStorage dist_reduce; - typename DistQueryAndHalfReduce::TempStorage dist_query_half_reduce; - }; - }; - - union SyncTempStorage { - KeyT cache; - DistQueryAndHalf dist; - bool flag; - - __device__ __forceinline__ SyncTempStorage() {} - }; - - public: - KeyT* s_cache; - ValueT* s_dists; - int& s_prioQ_head; - int& s_visited_head; - int& s_overflow_counter; - - CacheTempStorage& s_storage; - SyncTempStorage& s_sync; - - ValueT criteria_dist; - ValueT xi; - - const BaseT* d_base; - BaseT r_query[DIST_ITEMS_PER_THREAD]; - ValueT r_half[DIST_ITEMS_PER_THREAD]; - - // only valid in thread 0 - ValueT query_norm; - ValueT half_norm; - - //# threadIdx.x == 0 stats registers only - int dist_calc_counter; - - __device__ __forceinline__ void initSharedStorage() { - __shared__ KeyT s_cache_tmp[CACHE_SIZE]; - __shared__ ValueT s_dists_tmp[SORTED_SIZE]; - - s_cache = reinterpret_cast(s_cache_tmp); - s_dists = reinterpret_cast(s_dists_tmp); - } - - __device__ __forceinline__ CacheTempStorage& CachePrivateTmpStorage() { - __shared__ CacheTempStorage cache_tmp_storage; - return cache_tmp_storage; - } - - __device__ __forceinline__ SyncTempStorage& SyncPrivateTmpStorage() { - __shared__ SyncTempStorage s_sync_tmp; - return s_sync_tmp; - } - - __device__ __forceinline__ int& PrioQRingPrivateTmpStorage() { - __shared__ int s_prioQ_head_tmp; - return s_prioQ_head_tmp; - } - - __device__ __forceinline__ int& CacheRingPrivateTmpStorage() { - __shared__ int s_visited_head_tmp; - return s_visited_head_tmp; - } - - __device__ __forceinline__ int& OverflowPrivateTmpStorage() { - __shared__ int s_overflow_tmp; - return s_overflow_tmp; - } - - __device__ __forceinline__ void init() { - for (int i = threadIdx.x; i < CACHE_SIZE; i += BLOCK_DIM_X) { - s_cache[i] = EMPTY_KEY; - } - for (int i = threadIdx.x; i < SORTED_SIZE; i += BLOCK_DIM_X) { - s_dists[i] = EMPTY_DIST; - } - if (DIST_STATS && !threadIdx.x) dist_calc_counter = 0; - if (OVERFLOW_STATS && !threadIdx.x) s_overflow_counter = 0; 
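The DistReduce and DistQueryAndHalfReduce typedefs above are cub::BlockReduce instantiations (their template arguments are garbled in this hunk). A self-contained example of the reduction pattern, assuming `<float, BLOCK_DIM_X>` and full blocks:

```cpp
#include <cub/block/block_reduce.cuh>

// Each thread contributes one partial value; the block-wide sum is valid in
// thread 0 only, matching the "only needed by thread 0" comments above.
template <int BLOCK_DIM_X>
__global__ void block_sum(const float* in, float* out)
{
  using Reduce = cub::BlockReduce<float, BLOCK_DIM_X>;
  __shared__ typename Reduce::TempStorage temp_storage;

  const float partial = in[blockIdx.x * BLOCK_DIM_X + threadIdx.x];
  const float total = Reduce(temp_storage).Sum(partial);
  if (threadIdx.x == 0)
    out[blockIdx.x] = total;
}
```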
- if (!threadIdx.x) { - s_prioQ_head = 0; - s_visited_head = 0; - } - __syncthreads(); - } - - __device__ __forceinline__ SimpleKNNSymCache(const BaseT* d_base, - const KeyT n, - const ValueT xi_criteria) - : s_storage(CachePrivateTmpStorage()), - d_base(d_base), - xi(xi_criteria), - s_prioQ_head(PrioQRingPrivateTmpStorage()), - s_visited_head(CacheRingPrivateTmpStorage()), - s_overflow_counter(OverflowPrivateTmpStorage()), - s_sync(SyncPrivateTmpStorage()) { - initSharedStorage(); - init(); - loadQueryPos(d_base + static_cast(n) * D); - } - - __device__ __forceinline__ void loadQueryPos(const BaseT* d_query) { - ValueT r_query_norm = 0.0f; - for (int item = 0; item < DIST_ITEMS_PER_THREAD; ++item) { - const int read_dim = item * BLOCK_DIM_X + threadIdx.x; - if (read_dim < D) { - r_query[item] = *(d_query + read_dim); - if (measure == Cosine) r_query_norm += r_query[item] * r_query[item]; - } - } - if (measure == Cosine) { - // only needed by thread 0 - query_norm = DistReduce(s_storage.dist_reduce).Sum(r_query_norm); - } - } - - __device__ __forceinline__ void init_start_point(const KeyT other_n, - const KeyT* d_translation) { - init(); - const KeyT s = - (d_translation == nullptr) ? other_n : d_translation[other_n]; - DistQueryAndHalf r_norms(0.0f, 0.0f); - for (int item = 0; item < DIST_ITEMS_PER_THREAD; ++item) { - const int read_dim = item * BLOCK_DIM_X + threadIdx.x; - if (read_dim < D) { - r_half[item] = - r_query[item] + - (0.5f - EPS) * ((d_base[static_cast(s) * D + read_dim] - - r_query[item])); - if (measure == Cosine) { - r_norms.dist_query += r_query[item] * r_query[item]; - r_norms.dist_half += r_half[item] * r_half[item]; - } - } - } - __syncthreads(); - if (measure == Cosine) { - DistQueryAndHalf norms = - DistQueryAndHalfReduce(s_storage.dist_query_half_reduce) - .Reduce(r_norms, DistSum()); - if (!threadIdx.x) { - query_norm = norms.dist_query; - half_norm = norms.dist_half; - } - } - const DistQueryAndHalf dists = distance_synced(other_n); - criteria_dist = dists.dist_half + xi; - if (!threadIdx.x) { - // Add start point to best list... - s_cache[0] = other_n; - s_dists[0] = dists.dist_query; - // ... and and prioQ. - s_cache[BEST_SIZE] = other_n; - s_dists[BEST_SIZE] = dists.dist_query; - } - } - - struct DistSum { - __host__ __device__ __forceinline__ DistQueryAndHalf - operator()(const DistQueryAndHalf& a, const DistQueryAndHalf& b) const { - return DistQueryAndHalf(a.dist_query + b.dist_query, - a.dist_half + b.dist_half); - } - }; - - /** - * Calculates synced distance of base vector to other_id vector. - * - * [parallel call]: - * ValueT dist = cache.distance(other_id) - * - * Return: - * ValueT distance - * - * Note: distance valid in all threads. 
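init_start_point() above places a second reference point just short of half-way between the query and the start point (EPS pulls it toward the query), so symmetric-link candidates can be filtered against both distances at once. Scalar paraphrase of the per-dimension computation:

```cpp
// With eps = 0.1f, the reference point sits 40% of the way from query to base,
// i.e. slightly on the query's side of the midpoint.
float half_coordinate(float query, float base, float eps = 0.1f)
{
  return query + (0.5f - eps) * (base - query);
}
```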
- */ - __device__ __forceinline__ DistQueryAndHalf - distance_synced(const KeyT other_id) { - DistQueryAndHalf r_diff(0.f, 0.f); - ValueT r_norm_other = 0.0f; - for (int item = 0; item < DIST_ITEMS_PER_THREAD; ++item) { - const int read_dim = item * BLOCK_DIM_X + threadIdx.x; - if (read_dim < D) { - const ValueT p = d_base[static_cast(other_id) * D + read_dim]; - if (measure == Euclidean) { - const ValueT dist_query = r_query[item] - p; - r_diff.dist_query += dist_query * dist_query; - const ValueT dist_half = r_half[item] - p; - r_diff.dist_half += dist_half * dist_half; - } else if (measure == Cosine) { - const ValueT dist_query = r_query[item] * p; - r_diff.dist_query += dist_query; - const ValueT dist_half = r_half[item] * p; - r_diff.dist_half += dist_half; - r_norm_other += p * p; - } - } - } - - DistQueryAndHalf aggregate = - DistQueryAndHalfReduce(s_storage.dist_query_half_reduce) - .Reduce(r_diff, DistSum()); - if (measure == Cosine) { - // need to normalize by the vectors' lengths (in high dimensions, no - // vector has length 1.0f) - const ValueT norm_other = - DistReduce(s_storage.dist_reduce).Sum(r_norm_other); - const ValueT query_norm_sqr = norm_other * query_norm; - const ValueT half_norm_sqr = norm_other * half_norm; - // use negative dot product, as larger values are closer to each other - // otherwise, we would need to adjust each and every distance comparison - // in the code - if (!threadIdx.x) { - if (query_norm_sqr > 0.0f) - aggregate.dist_query = - fabs(1.0f - aggregate.dist_query / sqrt(query_norm_sqr)); - else - aggregate.dist_query = 1.0f; - // while this could be computed in parallel to the query distance, - // the necessary shuffling and synchronization costs more. - if (half_norm_sqr > 0.0f) - aggregate.dist_half = - fabs(1.0f - aggregate.dist_half / sqrt(half_norm_sqr)); - else - aggregate.dist_half = 1.0f; - } - } - if (!threadIdx.x) { - if (DIST_STATS) dist_calc_counter++; - s_sync.dist = aggregate; - } - __syncthreads(); - - return s_sync.dist; - } - - __device__ __forceinline__ bool criteria(const ValueT dist) { - return (dist < (s_dists[0] + xi)); - } - - __device__ __forceinline__ bool criteria(const DistQueryAndHalf& dist) { - return ((dist.dist_query < (s_dists[0] + xi)) && - (dist.dist_half < criteria_dist)); - } - - __device__ __forceinline__ bool is_end(int tid) { - const int prev_prioQ_ring = - (s_prioQ_head - 1 < 0) ? PRIOQ_SIZE - 1 : s_prioQ_head - 1; - return tid == BEST_END || tid == BEST_SIZE + prev_prioQ_ring; - } - - __device__ __forceinline__ void push(const KeyT key, const ValueT dist) { - __syncthreads(); - // Register for insertion in best and prioq - KeyT r_cache[SORTED_ITEMS_PER_THREAD]; - ValueT r_dists[SORTED_ITEMS_PER_THREAD]; - - int r_write_item_best = -1; - int r_write_item_prioQ = -1; - if (!threadIdx.x) s_sync.flag = true; - __syncthreads(); - - // Load items for insertion. - for (int item = 0; item < SORTED_ITEMS_PER_THREAD && s_sync.flag; ++item) { - const int idx = item * BLOCK_DIM_X + threadIdx.x; - if (idx < SORTED_SIZE) { - r_cache[item] = s_cache[idx]; - r_dists[item] = s_dists[idx]; - if (r_cache[item] == key) s_sync.flag = false; - } - } - __syncthreads(); - // TODO(fabi) return on s_sync.flag = true? - for (int item = 0; item < SORTED_ITEMS_PER_THREAD && s_sync.flag; ++item) { - const int idx = item * BLOCK_DIM_X + threadIdx.x; - if (idx < SORTED_SIZE) { - if (r_dists[item] >= dist) { - // Don't move if no entry or end of best or prioq. 
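The cosine branch of distance_synced() above normalizes the block-reduced dot product by both vector lengths and maps the result so that smaller values mean closer, mirroring the Euclidean comparisons elsewhere in the code. Scalar paraphrase:

```cpp
#include <cmath>

// dot = reduced dot product of the two vectors; norm_*_sqr = their squared lengths.
// Returns 0 for identical directions, growing as the vectors diverge.
float cosine_distance(float dot, float norm_a_sqr, float norm_b_sqr)
{
  const float norm_sqr = norm_a_sqr * norm_b_sqr;
  if (norm_sqr <= 0.0f)
    return 1.0f;  // degenerate vectors are treated as maximally distant
  return std::fabs(1.0f - dot / std::sqrt(norm_sqr));
}
```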
- if ((r_cache[item] != EMPTY_KEY) && !is_end(idx)) { - const int idx_next = (idx + 1 == SORTED_SIZE) ? BEST_SIZE : idx + 1; - s_cache[idx_next] = r_cache[item]; - s_dists[idx_next] = r_dists[item]; - } - - // Find insert points. - const int idx_prev = idx - 1; - const ValueT dist_prev = - ((idx_prev == -1) || (idx_prev == BEST_SIZE + s_prioQ_head - 1)) - ? -1.f - : (idx_prev == BEST_END) ? s_dists[SORTED_SIZE - 1] - : s_dists[idx_prev]; - if (dist_prev < dist) { - if (idx < BEST_SIZE) - r_write_item_best = item; - else - r_write_item_prioQ = item; - } - } - } - } - __syncthreads(); - - // Insert into best and prioq. - if (r_write_item_best >= 0) { - const int idx = r_write_item_best * BLOCK_DIM_X + threadIdx.x; - s_cache[idx] = key; - s_dists[idx] = dist; - } - if (r_write_item_prioQ >= 0) { - const int idx = r_write_item_prioQ * BLOCK_DIM_X + threadIdx.x; - s_cache[idx] = key; - s_dists[idx] = dist; - } - } - - __device__ __forceinline__ KeyT pop() { - __syncthreads(); - - if (!threadIdx.x) { - const int head_idx_prioQ = BEST_SIZE + s_prioQ_head; - const ValueT dist = s_dists[head_idx_prioQ]; - if (dist == EMPTY_DIST) { - // Pop on empty prioQ. - s_sync.cache = EMPTY_KEY; - } else { - if (!criteria(dist)) { - s_sync.cache = EMPTY_KEY; - } else { - const KeyT key = s_cache[head_idx_prioQ]; - s_sync.cache = key; - const int head_idx_visited = SORTED_SIZE + s_visited_head; - s_cache[head_idx_visited] = key; - s_visited_head = (s_visited_head + 1) % VISITED_SIZE; - } - s_cache[head_idx_prioQ] = EMPTY_KEY; - s_dists[head_idx_prioQ] = EMPTY_DIST; - // Move ring-buffer head forward. - s_prioQ_head = (s_prioQ_head + 1) % PRIOQ_SIZE; - } - } - __syncthreads(); - return s_sync.cache; - } - - __device__ __forceinline__ void fetch(KeyT* s_keys, const KeyT* d_translation, - int len, bool debug = false) { - __syncthreads(); - for (int item = 0; item < CACHE_ITEMS_PER_THREAD; ++item) { - const int i = item * BLOCK_DIM_X + threadIdx.x; - if (i < CACHE_SIZE) { - const KeyT n = s_cache[i]; - for (int k = 0; n != EMPTY_KEY && k < len; k++) { - if (n == s_keys[k]) { - s_keys[k] = EMPTY_KEY; - } - } - } - } - - for (int k = 0; k < len; k++) { - __syncthreads(); - const KeyT other_n = s_keys[k]; - if (other_n == EMPTY_KEY) continue; - const KeyT other_m = - (d_translation == nullptr) ? other_n : d_translation[other_n]; - const DistQueryAndHalf dist = distance_synced(other_m); - if (criteria(dist)) { - push(other_n, dist.dist_query); - __syncthreads(); - } - } - __syncthreads(); - } - - __device__ __forceinline__ void write_best_graph(KeyT* d_buffer, const KeyT n, - int K, int offset = 1) { - for (int i = threadIdx.x; i < K; i += BLOCK_DIM_X) { - const KeyT idx = s_cache[i + offset]; - d_buffer[n * K + i] = (idx != EMPTY_KEY) ? idx : n; - } - } - - __device__ __forceinline__ void write_best(KeyT* d_buffer, const KeyT n, - int stride) { - for (int i = threadIdx.x; i < KQuery; i += BLOCK_DIM_X) { - const KeyT idx = s_cache[i]; - d_buffer[n * stride + i] = idx; - } - } - - __device__ __forceinline__ float get_nn1_dist() { - if (measure == Euclidean) { - return sqrtf(s_dists[1]); - } else if (measure == Cosine) { - return s_dists[1]; - } - // TODO(fabi): restructure or error. - return 0; - } - - __device__ __forceinline__ int get_dist_stats() { return dist_calc_counter; } - __device__ __forceinline__ int get_overflow_stats() { - return s_overflow_counter; - } - - /** - * Prints first 'len' elements in the Cache. 
[parallel call]: - * cash.print(8); - * - */ - __device__ __forceinline__ void print(int len = CACHE_SIZE) { - __syncthreads(); - if (!threadIdx.x) printf("print \n"); - if (!threadIdx.x) { - printf("Cache: ring: %d KQuery: %f (+xi -> %f) \n", s_prioQ_head, - s_dists[KQuery - 1], s_dists[KQuery - 1] + xi); - for (int i = 0; i < len; ++i) { - if (i < BEST_SIZE) { - printf("%d -> %d %f \n", i, s_cache[i], s_dists[i]); - } else { - if (i < SORTED_SIZE) { - printf("%d -> %d %f | ", i, s_cache[i], s_dists[i]); - if (i - BEST_SIZE == s_prioQ_head) printf("X"); - printf("\n"); - } else { - printf("%d -> %d | ", i, s_cache[i]); - if (i - SORTED_SIZE == s_visited_head) printf("X"); - printf("\n"); - } - } - } - } - __syncthreads(); - } -}; - -#endif // INCLUDE_GGNN_CACHE_CUDA_SIMPLE_KNN_SYM_CACHE_CUH_ diff --git a/include/ggnn/construction/graph_buffer.cuh b/include/ggnn/construction/graph_buffer.cuh new file mode 100644 index 0000000..21a2370 --- /dev/null +++ b/include/ggnn/construction/graph_buffer.cuh @@ -0,0 +1,96 @@ +/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
Lensch
+// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf
+
+#ifndef INCLUDE_GGNN_GRAPH_BUFFER_CUH
+#define INCLUDE_GGNN_GRAPH_BUFFER_CUH
+
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+
+namespace ggnn {
+
+/**
+ * GGNN graph buffer data
+ * auxiliary data needed for graph construction once per GPU
+ *
+ * @param KeyT datatype of dataset indices
+ * @param ValueT distance value type
+ */
+template <typename KeyT, typename ValueT>
+struct GraphBuffer {
+  struct PartSizes {
+    PartSizes(const GraphConfig& config)
+        : graph_buffer_size{align8(static_cast<size_t>(config.N) * config.KBuild * sizeof(KeyT))},
+          nn1_dist_buffer_size{align8(static_cast<size_t>(config.N) * sizeof(ValueT))},
+          rng_size{align8(static_cast<size_t>(config.N) * sizeof(float))},
+          sym_buffer_size{align8(static_cast<size_t>(config.N) * config.KF * sizeof(KeyT))},
+          sym_atomic_size{align8(static_cast<size_t>(config.N) * sizeof(uint32_t))}
+    {
+    }
+
+    const size_t graph_buffer_size;
+    const size_t nn1_dist_buffer_size;
+    const size_t rng_size;
+    const size_t sym_buffer_size;
+    const size_t sym_atomic_size;
+
+    size_t getBufferSize() const
+    {
+      const size_t merge_size = nn1_dist_buffer_size + graph_buffer_size;
+      const size_t select_size = nn1_dist_buffer_size + rng_size;
+      const size_t sym_size = sym_buffer_size + sym_atomic_size;
+
+      // NOTE: we're ignoring the size for stats, which should be lower
+      return std::max({merge_size, select_size, sym_size});
+    }
+  };
+
+  /// distance to nearest known neighbor per point
+  ValueT* nn1_dist_buffer{nullptr};
+
+  // BUFFER
+  KeyT* graph_buffer{nullptr};
+  KeyT* sym_buffer{nullptr};
+
+  float* rng{nullptr};
+
+  uint32_t* sym_atomic{nullptr};
+
+  // cub buffer
+  std::byte* temp_storage_cub{nullptr};
+
+  size_t temp_storage_bytes_cub{0};
+
+  Dataset<std::byte> memory{};
+
+  GraphBuffer() = default;
+  GraphBuffer(const GraphConfig& graph_config, Dataset<std::byte>&& memory);
+
+  GraphBuffer(const GraphBuffer& other) = delete;
+  GraphBuffer(GraphBuffer&& other) noexcept = default;
+  GraphBuffer& operator=(const GraphBuffer& other) = delete;
+  GraphBuffer& operator=(GraphBuffer&& other) noexcept = default;
+};
+
+};  // namespace ggnn
+
+#endif  // INCLUDE_GGNN_GRAPH_BUFFER_CUH
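
The PartSizes members above describe one shared allocation that construction re-partitions per
phase: merge needs the neighborhood buffer plus the nn1 distances, select needs the RNG values
plus the nn1 distances, and sym needs the inverse-link buffer plus its atomic counters, so
getBufferSize() only has to return the maximum of the three layouts. A minimal host-side sketch
of that computation, assuming align8 rounds up to the next multiple of 8 bytes and using
hypothetical values for N, KBuild, KF, KeyT, and ValueT (none of these are taken from this diff):

  #include <algorithm>
  #include <cstddef>
  #include <cstdint>
  #include <cstdio>

  // assumed semantics of align8: round up to the next multiple of 8 bytes
  static size_t align8(size_t size) { return ((size + 7) / 8) * 8; }

  int main() {
    const size_t N = 1000000;        // hypothetical shard size
    const uint32_t KBuild = 24;      // hypothetical build degree
    const uint32_t KF = KBuild / 2;  // inverse-link capacity

    const size_t graph_buffer = align8(N * KBuild * sizeof(int32_t));  // KeyT = int32_t assumed
    const size_t nn1_dist = align8(N * sizeof(float));                 // ValueT = float assumed
    const size_t rng = align8(N * sizeof(float));
    const size_t sym_buffer = align8(N * KF * sizeof(int32_t));
    const size_t sym_atomic = align8(N * sizeof(uint32_t));

    // the three construction phases reuse the same allocation
    const size_t merge_size = nn1_dist + graph_buffer;
    const size_t select_size = nn1_dist + rng;
    const size_t sym_size = sym_buffer + sym_atomic;
    printf("buffer: %zu bytes\n", std::max({merge_size, select_size, sym_size}));
  }
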
diff --git a/include/ggnn/construction/graph_construction.cuh b/include/ggnn/construction/graph_construction.cuh
new file mode 100644
index 0000000..b0bbc66
--- /dev/null
+++ b/include/ggnn/construction/graph_construction.cuh
@@ -0,0 +1,63 @@
+/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch
+// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf
+
+#ifndef INCLUDE_GGNN_GRAPH_CONSTRUCTION_H
+#define INCLUDE_GGNN_GRAPH_CONSTRUCTION_H
+
+#include 
+#include 
+
+#include 
+
+namespace ggnn {
+
+template <typename KeyT, typename ValueT>
+struct GPUInstance;
+
+/**
+ * Wrapper for graph construction kernels.
+ */
+template <typename KeyT, typename ValueT>
+class GraphConstruction {
+ public:
+  using GPUInstance = ggnn::GPUInstance<KeyT, ValueT>;
+  using Graph = ggnn::Graph<KeyT, ValueT>;
+
+  GraphConstruction() = default;
+  GraphConstruction(GPUInstance& gpu_instance, float tau_build, const DistanceMeasure measure);
+  virtual ~GraphConstruction() = default;
+  GraphConstruction(const GraphConstruction&) = delete;
+  GraphConstruction(GraphConstruction&&) noexcept = default;
+  GraphConstruction& operator=(const GraphConstruction&) = delete;
+  GraphConstruction& operator=(GraphConstruction&&) noexcept = default;
+
+  virtual void build(Graph& graph, const Dataset<std::byte>& base, const cudaStream_t stream)
+  {
+    pimpl->build(graph, base, stream);
+  }
+  virtual void refine(Graph& graph, const Dataset<std::byte>& base, const cudaStream_t stream)
+  {
+    pimpl->refine(graph, base, stream);
+  }
+
+ private:
+  std::unique_ptr<GraphConstruction> pimpl;
+};
+
+};  // namespace ggnn
+
+#endif  // INCLUDE_GGNN_GRAPH_CONSTRUCTION_H
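
GraphConstruction above is a thin wrapper: a CUDA-specific implementation derives from it and is
owned through the pimpl pointer, so the build() and refine() calls in this header only forward.
This keeps kernel code out of user-facing headers. A stripped-down sketch of the same forwarding
pattern, with hypothetical names (Builder, GPUBuilder) standing in for the GGNN types:

  #include <cstdio>
  #include <memory>

  class Builder {  // public wrapper, analogous to GraphConstruction
   public:
    Builder();  // fills pimpl with a device-specific implementation
    virtual ~Builder() = default;
    virtual void build() { pimpl->build(); }  // forwards to the hidden implementation

   protected:
    struct MakeImpl {};  // tag constructor so the implementation skips pimpl creation
    explicit Builder(MakeImpl) {}

   private:
    std::unique_ptr<Builder> pimpl;
  };

  // the implementation; in GGNN this would live in a CUDA translation unit
  class GPUBuilder : public Builder {
   public:
    GPUBuilder() : Builder(MakeImpl{}) {}
    void build() override { std::puts("running construction kernels"); }
  };

  Builder::Builder() : pimpl{std::make_unique<GPUBuilder>()} {}

  int main() {
    Builder b;
    b.build();  // dispatches through pimpl to GPUBuilder::build()
  }
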
diff --git a/include/ggnn/construction/merge_layer.cuh b/include/ggnn/construction/merge_layer.cuh
new file mode 100644
index 0000000..567e9c8
--- /dev/null
+++ b/include/ggnn/construction/merge_layer.cuh
@@ -0,0 +1,93 @@
+/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch
+// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf
+
+#ifndef INCLUDE_GGNN_MERGE_LAYER_CUH
+#define INCLUDE_GGNN_MERGE_LAYER_CUH
+
+#include 
+#include 
+
+#include 
+#include 
+#include 
+
+#include 
+
+namespace ggnn {
+
+template <typename T>
+__global__ void merge(const T kernel);
+
+template <typename KeyT, typename ValueT, typename BaseT, uint32_t BLOCK_SIZE,
+          uint32_t DIST_ITEMS_PER_THREAD>
+struct MergeKernel {
+  static constexpr uint32_t BLOCK_DIM_X = BLOCK_SIZE;
+
+  static constexpr uint32_t MAX_ITERATIONS = 200;
+  static constexpr uint32_t CACHE_SIZE = 256;
+  static constexpr uint32_t MIN_PRIOQ_SIZE = 16;
+
+  void launch(const uint32_t N, const cudaStream_t stream = 0)
+  {
+    CHECK_GT(layer_top, layer_btm);
+    VLOG(1) << "MergeKernel -- Layer: " << layer_top << " -> " << layer_btm << " | N: " << N
+            << "\n";
+    uint32_t sm_size = CACHE_SIZE * sizeof(KeyT) + SORTED_SIZE * sizeof(ValueT);
+    CHECK_LT(SORTED_SIZE, CACHE_SIZE);
+    CHECK_LE(D, BLOCK_DIM_X * DIST_ITEMS_PER_THREAD);
+
+    merge<<<N, BLOCK_DIM_X, sm_size, stream>>>((*this));
+  }
+
+  // determine the start of the top-layer segment (always 0 for layer_top = L-1)
+  __device__ __forceinline__ uint32_t get_top_seg_offset(const KeyT n) const;
+
+  __device__ __forceinline__ void operator()() const;
+
+  const uint32_t D;
+  const DistanceMeasure measure;
+  const uint32_t KBuild;
+  const uint32_t SORTED_SIZE =
+      std::max(CACHE_SIZE < 512U ? 64U : 32U, next_multiple(KBuild + 1 + MIN_PRIOQ_SIZE));
+  const uint32_t S;
+
+  const BaseT* d_base;        // [Nall,D]
+  const KeyT* d_selection;    // [Sall]
+  const KeyT* d_translation;  // [Nall]
+
+  const KeyT* d_graph;   // [N,K]
+  KeyT* d_graph_buffer;  // [N,K]
+
+  const float* d_nn1_stats;  // [sum,max]
+  float* d_nn1_dist_buffer;  // [N0]
+
+  const uint32_t layer_top;  // layer to start from
+  const uint32_t layer_btm;  // layer to merge
+
+  const uint32_t G;          // growth factor
+  const uint32_t S0;         // segment size on layer 0
+  const uint32_t S0_offset;  // segment size offset on layer 0
+
+  const std::array Ns_offsets;   // start position of graph layer
+  const std::array STs_offsets;  // start position of translation layer
+
+  const float tau_build;
+};
+
+};  // namespace ggnn
+
+#endif  // INCLUDE_GGNN_MERGE_LAYER_CUH
diff --git a/include/ggnn/construction/sym_buffer_merge_layer.cuh b/include/ggnn/construction/sym_buffer_merge_layer.cuh
new file mode 100644
index 0000000..84943df
--- /dev/null
+++ b/include/ggnn/construction/sym_buffer_merge_layer.cuh
@@ -0,0 +1,58 @@
+/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch
+// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf
+
+#ifndef INCLUDE_GGNN_SYM_BUFFER_MERGE_LAYER_CUH
+#define INCLUDE_GGNN_SYM_BUFFER_MERGE_LAYER_CUH
+
+#include 
+#include 
+
+#include 
+#include 
+
+namespace ggnn {
+
+template <typename T>
+__global__ void sym_buffer_merge(const T kernel, const uint32_t N);
+
+template <typename KeyT>
+struct SymBufferMergeKernel {
+  static constexpr uint32_t BLOCK_DIM_X = 128;
+
+  void launch(const uint32_t N, const cudaStream_t stream = 0)
+  {
+    VLOG(2) << "SymBufferMergeKernel -- N: " << N;
+    dim3 block(KF, POINTS_PER_BLOCK);
+    size_t sm_size = sizeof(KeyT) * POINTS_PER_BLOCK * KF * 2 + sizeof(bool) * POINTS_PER_BLOCK;
+    sym_buffer_merge<<<(N - 1) / POINTS_PER_BLOCK + 1, block, sm_size, stream>>>((*this), N);
+  }
+
+  __device__ __forceinline__ void operator()(uint32_t N) const;
+
+  const uint32_t KBuild;
+  const uint32_t KF{KBuild / 2};
+
+  const uint32_t POINTS_PER_BLOCK = BLOCK_DIM_X / KF;
+  const uint32_t KL = KBuild - KF;
+
+  const KeyT* d_sym_buffer;      // [N, KF]
+  const uint32_t* d_sym_atomic;  // [N]
+  KeyT* d_graph;                 // [N, K]
+};
+
+};  // namespace ggnn
+
+#endif  // INCLUDE_GGNN_SYM_BUFFER_MERGE_LAYER_CUH
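
SymBufferMergeKernel packs several points into each thread block: threadIdx.x walks one point's
KF inverse-link slots while threadIdx.y selects one of POINTS_PER_BLOCK points, and the shared
memory holds two KF-wide KeyT staging rows plus one flag per point. A host-side sketch mirroring
the launch() arithmetic above, with illustrative values for KBuild, N, and KeyT (assumptions,
not values from this diff):

  #include <cstdint>
  #include <cstdio>

  int main() {
    const uint32_t BLOCK_DIM_X = 128;
    const uint32_t KBuild = 24;      // hypothetical build degree
    const uint32_t KF = KBuild / 2;  // 12 inverse-link slots per point
    const uint32_t POINTS_PER_BLOCK = BLOCK_DIM_X / KF;  // 10 points per block
    const uint32_t N = 100000;       // hypothetical point count

    const uint32_t grid = (N - 1) / POINTS_PER_BLOCK + 1;  // ceil(N / POINTS_PER_BLOCK)
    const size_t sm_size = sizeof(int32_t) * POINTS_PER_BLOCK * KF * 2  // KeyT = int32_t assumed
                           + sizeof(bool) * POINTS_PER_BLOCK;
    printf("grid: %u  block: (%u, %u)  smem: %zu bytes\n", grid, KF, POINTS_PER_BLOCK, sm_size);
  }
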
diff --git a/include/ggnn/construction/sym_query_layer.cuh b/include/ggnn/construction/sym_query_layer.cuh
new file mode 100644
index 0000000..ca8123a
--- /dev/null
+++ b/include/ggnn/construction/sym_query_layer.cuh
@@ -0,0 +1,75 @@
+/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch
+// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf
+
+#ifndef INCLUDE_GGNN_SYM_QUERY_LAYER_CUH
+#define INCLUDE_GGNN_SYM_QUERY_LAYER_CUH
+
+#include 
+#include 
+
+#include 
+#include 
+
+namespace ggnn {
+
+template <typename T>
+__global__ void sym(const T kernel);
+
+template <typename KeyT, typename ValueT, typename BaseT, uint32_t BLOCK_SIZE,
+          uint32_t DIST_ITEMS_PER_THREAD>
+struct SymQueryKernel {
+  static constexpr uint32_t BLOCK_DIM_X = BLOCK_SIZE;
+
+  static constexpr uint32_t MAX_PER_PATH_ITERATIONS = 20;
+  static constexpr uint32_t CACHE_SIZE = 128;
+  static constexpr uint32_t MIN_PRIOQ_SIZE = 16;
+
+  void launch(const uint32_t N, const cudaStream_t stream = 0)
+  {
+    VLOG(1) << "SymQueryKernel -- N: " << N;
+    uint32_t sm_size = CACHE_SIZE * sizeof(KeyT) + sorted_size * sizeof(ValueT);
+
+    CHECK_LT(sorted_size, CACHE_SIZE);
+    CHECK_LE(D, BLOCK_DIM_X * DIST_ITEMS_PER_THREAD);
+
+    sym<<<N, BLOCK_DIM_X, sm_size, stream>>>((*this));
+  }
+
+  __device__ __forceinline__ void operator()() const;
+
+  const uint32_t D;
+  const DistanceMeasure measure;
+  const uint32_t KBuild;
+  // best size is KF = KBuild/2
+  const uint32_t sorted_size =
+      std::max(CACHE_SIZE < 512U ? 64U : 32U, next_multiple(KBuild / 2 + MIN_PRIOQ_SIZE));
+
+  const BaseT* d_base;        // [N0,D]
+  const KeyT* d_graph;        // [N,K]
+  const KeyT* d_translation;  // [N] or nullptr if on base layer
+
+  const float* d_nn1_stats;
+
+  const float tau_build;
+
+  KeyT* d_sym_buffer;      // [N,KF]
+  uint32_t* d_sym_atomic;  // [N]
+};
+
+};  // namespace ggnn
+
+#endif  // INCLUDE_GGNN_SYM_QUERY_LAYER_CUH
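
MergeKernel and SymQueryKernel size their sorted cache section with the same formula: the
best-list plus a minimal priority queue, rounded up via next_multiple, with a floor of 64 entries
for small caches (32 otherwise). The rounding granularity of next_multiple is not visible in this
diff; the sketch below assumes rounding up to a multiple of 32 (one warp) purely for illustration:

  #include <algorithm>
  #include <cstdint>
  #include <cstdio>

  // assumption: next_multiple rounds up to the next multiple of 32
  static uint32_t next_multiple32(uint32_t value) { return (value + 31u) / 32u * 32u; }

  int main() {
    const uint32_t CACHE_SIZE = 128;  // as in SymQueryKernel
    const uint32_t MIN_PRIOQ_SIZE = 16;
    for (uint32_t KBuild : {16u, 24u, 40u}) {  // hypothetical build degrees
      const uint32_t sorted_size =
          std::max(CACHE_SIZE < 512u ? 64u : 32u, next_multiple32(KBuild / 2 + MIN_PRIOQ_SIZE));
      // shared memory: keys for the whole cache, distances only for the sorted section
      const size_t sm = CACHE_SIZE * sizeof(int32_t) + sorted_size * sizeof(float);
      printf("KBuild %2u -> sorted_size %u, smem %zu bytes\n", KBuild, sorted_size, sm);
    }
  }
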
diff --git a/include/ggnn/construction/top_merge_layer.cuh b/include/ggnn/construction/top_merge_layer.cuh
new file mode 100644
index 0000000..016b8d7
--- /dev/null
+++ b/include/ggnn/construction/top_merge_layer.cuh
@@ -0,0 +1,66 @@
+/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch
+// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf
+
+#ifndef INCLUDE_GGNN_TOP_MERGE_LAYER_CUH
+#define INCLUDE_GGNN_TOP_MERGE_LAYER_CUH
+
+#include 
+#include 
+
+#include 
+
+namespace ggnn {
+
+template <typename T>
+__global__ void top(const T kernel);
+
+template <typename KeyT, typename ValueT, typename BaseT, uint32_t BLOCK_SIZE,
+          uint32_t DIST_ITEMS_PER_THREAD>
+struct TopMergeKernel {
+  static constexpr uint32_t BLOCK_DIM_X = BLOCK_SIZE;
+
+  void launch(const uint32_t N, const cudaStream_t stream = 0)
+  {
+    VLOG(1) << "TopMergeKernel -- Layer: " << layer << " | N: " << N << "\n";
+    uint32_t sm_size = KBuild * sizeof(ValueT) + KBuild * sizeof(KeyT);
+
+    CHECK_LE(D, BLOCK_DIM_X * DIST_ITEMS_PER_THREAD);
+
+    top<<<N, BLOCK_DIM_X, sm_size, stream>>>((*this));
+  }
+
+  __device__ __forceinline__ void operator()() const;
+
+  const uint32_t D;
+  const DistanceMeasure measure;
+  const uint32_t KBuild;
+
+  const BaseT* d_base;
+  const KeyT* d_translation;
+
+  KeyT* d_graph;
+  ValueT* d_nn1_dist_buffer;
+
+  const uint32_t S;
+  const uint32_t S_offset;
+
+  const uint32_t layer;
+};
+
+};  // namespace ggnn
+
+#endif  // INCLUDE_GGNN_TOP_MERGE_LAYER_CUH
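
TopMergeKernel brute-forces the top layer: the shared memory reserved in launch() holds KBuild
distances plus KBuild keys per block, i.e. one fixed-size sorted best-list for the block's point.
The device-side operator() is defined elsewhere; as a rough CPU model (hypothetical, for
illustration only), inserting into such a best-list looks like this:

  #include <cstdint>
  #include <cstdio>
  #include <vector>

  // fixed-size sorted best-list: keeps the K smallest distances seen so far
  struct BestList {
    std::vector<float> dist;
    std::vector<int32_t> key;
    explicit BestList(uint32_t K) : dist(K, 1e38f), key(K, -1) {}

    void insert(int32_t k, float d) {
      if (d >= dist.back()) return;       // worse than the current worst: ignore
      size_t i = dist.size() - 1;
      while (i > 0 && dist[i - 1] > d) {  // shift worse entries down
        dist[i] = dist[i - 1];
        key[i] = key[i - 1];
        --i;
      }
      dist[i] = d;
      key[i] = k;
    }
  };

  int main() {
    BestList best(4);
    const float d[] = {0.9f, 0.1f, 0.5f, 0.3f, 0.7f, 0.2f};
    for (int32_t k = 0; k < 6; ++k)
      best.insert(k, d[k]);
    for (size_t i = 0; i < best.key.size(); ++i)
      printf("%zu: key %d dist %.1f\n", i, best.key[i], best.dist[i]);
  }
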
diff --git a/include/ggnn/construction/wrs_select_layer.cuh b/include/ggnn/construction/wrs_select_layer.cuh
new file mode 100644
index 0000000..e5af1d4
--- /dev/null
+++ b/include/ggnn/construction/wrs_select_layer.cuh
@@ -0,0 +1,71 @@
+/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch
+// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf
+
+#ifndef INCLUDE_GGNN_WRS_SELECT_LAYER_CUH
+#define INCLUDE_GGNN_WRS_SELECT_LAYER_CUH
+
+#include 
+#include 
+
+#include 
+
+namespace ggnn {
+
+template <typename T>
+__global__ void select(const T kernel);
+
+/*
+ * Selection of K Points per B for Layers.
+ */
+template <typename KeyT>
+struct WRSSelectionKernel {
+  static constexpr uint32_t BLOCK_DIM_X = 128;
+  static constexpr uint32_t ITEMS_PER_THREAD = 2;
+
+  void launch(const uint32_t B, const cudaStream_t stream = 0)
+  {
+    VLOG(2) << "SelectionKernel -- B: " << B;  // number of blocks to work on
+
+    CHECK_LE(S + (S_offset > 0), ITEMS_PER_THREAD * BLOCK_DIM_X);
+    CHECK_LE(SG + (SG_offset > 0), ITEMS_PER_THREAD * BLOCK_DIM_X);
+
+    select<<<B, BLOCK_DIM_X, 0, stream>>>((*this));
+  }
+
+  __device__ __forceinline__ void operator()() const;
+
+  KeyT* d_selection;
+  KeyT* d_translation;
+  const KeyT* d_translation_layer;
+  const float* d_nn1_dist_buffer;
+  const float* d_rng;
+
+  const uint32_t Sglob;     // segment/block size in upper layer (global segment size)
+  const uint32_t S;         // segment/block size in current layer
+  const uint32_t S_offset;  // number of blocks with S+1 elements (can only be > 0 for base layer)
+
+  const uint32_t G;   // growth factor
+  const uint32_t SG;  // S/G = number of points contributed per segment from lower to upper layer
+  const uint32_t SG_offset;  // S%G = number of segments which contribute an additional point to the
+                             // upper segment
+
+  const uint32_t layer;  // bottom layer to select from
+};
+
+};  // namespace ggnn
+
+#endif  // INCLUDE_GGNN_WRS_SELECT_LAYER_CUH
diff --git a/include/ggnn/cuda_knn_ggnn.cuh b/include/ggnn/cuda_knn_ggnn.cuh
deleted file mode 100644
index 6ce456c..0000000
--- a/include/ggnn/cuda_knn_ggnn.cuh
+++ /dev/null
@@ -1,493 +0,0 @@
-/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
- http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A.
Lensch - -#ifndef INCLUDE_GGNN_CUDA_KNN_GGNN_CUH_ -#define INCLUDE_GGNN_CUDA_KNN_GGNN_CUH_ - -#include -#include -#include - -#include -#include -#include - -#include "cub/cub.cuh" -#include "ggnn/cuda_knn_ggnn_gpu_instance.cuh" -#include "ggnn/query/cuda_knn_query_layer.cuh" -#include "ggnn/query/cuda_knn_bf_query_layer.cuh" -#include "ggnn/query/cuda_knn_stats_query_layer.cuh" -#include "ggnn/query/cuda_knn_no_slack_query_layer.cuh" -#include "ggnn/utils/cuda_knn_utils.cuh" -#include "ggnn/utils/cuda_knn_constants.cuh" -#include "ggnn/utils/cuda_knn_dataset.cuh" -#include "ggnn/utils/cuda_knn_ggnn_results.cuh" - - -// for storing generated ground truth data -#include "io/storer_ann.hpp" - -// only needed for file_exists check -#include - -inline bool file_exists(const std::string& name) { - struct stat buffer; - return (stat(name.c_str(), &buffer) == 0); -} - -/** - * GGNN single-GPU wrapper - * - * @param measure distance measure: Euclidean or Cosine - * @param KeyT datatype of dataset indices (needs to be able to represent - * N_base, signed integer required) - * @param ValueT distance value type - * @param GAddrT address type used to access neighborhood vectors (needs to be - * able to represent N_all*K) - * @param BaseT datatype of dataset vector elements - * @param BAddrT address type used to access dataset vectors (needs to be able - * to represent N_base*D) - * @param D dimension of dataset - * @param KBuild neighbors per node in the GGNN graph - * @param KF maximum number of inverse links per node in the GGNN graph - * @param KQuery number of nearest neighbors to retrieve during query - * @param S segment size - */ -template -struct GGNN { - using Dataset = Dataset; - using GGNNGPUInstance = GGNNGPUInstance; - using GGNNResults = GGNNResults; - - Dataset dataset; - GGNNGPUInstance ggnn_gpu_instance; - GGNNResults ggnn_results {&dataset}; - - GGNN(const std::string& basePath, const std::string& queryPath, - const std::string& gtPath, const int L, const float tau_build, - const size_t N_base = std::numeric_limits::max()) - : dataset{basePath, queryPath, file_exists(gtPath) ? gtPath : "", N_base}, - ggnn_gpu_instance{[](){int device; cudaGetDevice(&device); return device;}(), &dataset, dataset.N_base, L, true, tau_build} { - CHECK_EQ(dataset.D, D) << "DIM needs to be the same"; - - const auto& shard = ggnn_gpu_instance.ggnn_shards.at(0); - ggnn_gpu_instance.loadShardBaseDataAsync(0, 0); - cudaStreamSynchronize(shard.stream); - - if (gtPath.empty() || !file_exists(gtPath)) { - generateGTUsingBF(); - if (!gtPath.empty()) { - LOG(INFO) << "exporting brute-forced ground truth data."; - IVecsStorer gt_storer(gtPath, dataset.K_gt, - dataset.N_query); - gt_storer.store(dataset.gt, dataset.N_query); - } - } - } - - void ggnnMain(const std::string& graph_filename, const int refinement_iterations) { - const bool export_graph = - !graph_filename.empty() && !file_exists(graph_filename); - const bool import_graph = - !graph_filename.empty() && file_exists(graph_filename); - const bool perform_build = export_graph || !import_graph; - - if (perform_build) { - std::vector construction_times; - construction_times.reserve(refinement_iterations+1); - - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - - LOG(INFO) << "Starting Graph construction... 
(tau=" << ggnn_gpu_instance.tau_build << ")"; - - cudaEventRecord(start); - build(); - cudaEventRecord(stop); - - cudaEventSynchronize(stop); - float milliseconds = 0; - cudaEventElapsedTime(&milliseconds, start, stop); - construction_times.push_back(milliseconds); - - for (int refinement_step = 0; refinement_step < refinement_iterations; - ++refinement_step) { - DLOG(INFO) << "Refinement step " << refinement_step; - refine(); - cudaEventRecord(stop); - - cudaEventSynchronize(stop); - - float elapsed_milliseconds = 0; - cudaEventElapsedTime(&elapsed_milliseconds, start, stop); - construction_times.push_back(elapsed_milliseconds); - } - cudaEventDestroy(start); - cudaEventDestroy(stop); - - for (int refinement_step = 0; - refinement_step < construction_times.size(); refinement_step++) { - const float elapsed_milliseconds = construction_times[refinement_step]; - const float elapsed_seconds = elapsed_milliseconds / 1000.0f; - const int number_of_points = ggnn_gpu_instance.N_shard; - - LOG(INFO) << "Graph construction + " << refinement_step << " refinement step(s)"; - LOG(INFO) << " -- secs: " << elapsed_seconds; - LOG(INFO) << " -- points: " << number_of_points; - LOG(INFO) << " -- ms/point: " - << elapsed_milliseconds / number_of_points; - } - - if (export_graph) { - write(graph_filename); - } - } - - if (import_graph) { - read(graph_filename); - } - } - - /** - * reset the graph and prepare for a subset of size N - */ - void reinit_graph_for_subset(KeyT N) { - CHECK_LE(N, dataset.N_base); - ggnn_gpu_instance.N_shard = N; - ggnn_gpu_instance.computeGraphParameters(); - ggnn_gpu_instance.copyConstantsToGPU(); - - dataset.top1DuplicateEnd.clear(); - dataset.topKDuplicateEnd.clear(); - } - - void read(const std::string& filename) { - auto& ggnn_host = ggnn_gpu_instance.ggnn_cpu_buffers.at(0); - auto& ggnn_device = ggnn_gpu_instance.ggnn_shards.at(0); - - ggnn_host.load(filename); - - ggnn_host.uploadAsync(ggnn_device); - cudaStreamSynchronize(ggnn_device.stream); - } - - void write(const std::string& filename) { - auto& ggnn_host = ggnn_gpu_instance.ggnn_cpu_buffers.at(0); - auto& ggnn_device = ggnn_gpu_instance.ggnn_shards.at(0); - - ggnn_host.downloadAsync(ggnn_device); - cudaStreamSynchronize(ggnn_device.stream); - - ggnn_host.store(filename); - } - - void evaluateKNNGraph() { - CHECK_EQ(dataset.N_base, dataset.N_query) << "the base needs to be loaded as the query set."; - CHECK_GE(KBuild/2, KQuery) << "there aren't as many nearest neighbors in the graph as queried for."; - CHECK_GE(dataset.K_gt, KQuery+1) << "need one additional ground truth entry to exclude the point itself."; - - KeyT* const original_gt = dataset.gt; - dataset.top1DuplicateEnd.clear(); - dataset.topKDuplicateEnd.clear(); - dataset.gt = new KeyT[static_cast(dataset.N_query)*dataset.K_gt]; - - // shift ground truth left by one to exclude the point itself - std::copy_n(original_gt+1, static_cast(dataset.N_query)*dataset.K_gt-1, dataset.gt); - - dataset.template checkForDuplicatesInGroundTruth(KQuery); - - auto& ggnn_host = ggnn_gpu_instance.ggnn_cpu_buffers.at(0); - auto& ggnn_device = ggnn_gpu_instance.ggnn_shards.at(0); - - ggnn_host.downloadAsync(ggnn_device); - cudaStreamSynchronize(ggnn_device.stream); - - // simply copy the neighbors from the graph into the results - for (size_t n=0; n - void queryLayer() { - dataset.template checkForDuplicatesInGroundTruth(KQuery); - - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - float milliseconds = 0; - - const auto& shard = 
ggnn_gpu_instance.ggnn_shards.at(0); - - cudaEventRecord(start, shard.stream); - ggnn_gpu_instance.template queryLayer(); - cudaEventRecord(stop, shard.stream); - ggnn_gpu_instance.ggnn_query.sortAsync(shard.stream); - ggnn_results.loadAsync(ggnn_gpu_instance.ggnn_query, 0, shard.stream); - - cudaEventSynchronize(stop); - - cudaEventElapsedTime(&milliseconds, start, stop); - VLOG(0) << "[GPU: " << ggnn_gpu_instance.gpu_id << "] query part: " << 0 << " => ms: " << milliseconds << " [" << dataset.N_query << " points query -> " << milliseconds*1000.0f/dataset.N_query << " us/point] \n"; - - cudaEventDestroy(start); - cudaEventDestroy(stop); - - cudaStreamSynchronize(shard.stream); - ggnn_results.merge(); - ggnn_results.evaluateResults(); - } - - template - void noSlackQueryLayer() { - dataset.template checkForDuplicatesInGroundTruth(KQuery); - - auto& shard = ggnn_gpu_instance.ggnn_shards.at(0); - - typedef NoSlackQueryKernel - QueryKernel; - - KeyT* m_query_results; - cudaMallocManaged(&m_query_results, - dataset.N_query * KQuery * sizeof(KeyT)); - int* m_dist_statistics = nullptr; - if (DIST_STATS) - cudaMallocManaged(&m_dist_statistics, dataset.N_query * sizeof(int)); - - QueryKernel query_kernel; - query_kernel.d_base = shard.d_base; - query_kernel.d_query = ggnn_gpu_instance.ggnn_query.d_query; - - query_kernel.d_graph = shard.d_graph; - query_kernel.d_query_results = ggnn_gpu_instance.ggnn_query.d_query_result_ids; - - query_kernel.d_translation = shard.d_translation; - - query_kernel.d_nn1_stats = shard.d_nn1_stats; - - query_kernel.N = dataset.N_query; - query_kernel.N_offset = 0; - - query_kernel.d_dist_stats = m_dist_statistics; - - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - float milliseconds = 0; - - cudaEventRecord(start, shard.stream); - query_kernel.launch(shard.stream); - cudaEventRecord(stop, shard.stream); - ggnn_gpu_instance.ggnn_query.sortAsync(shard.stream); - ggnn_results.loadAsync(ggnn_gpu_instance.ggnn_query, 0, shard.stream); - - cudaEventSynchronize(stop); - - cudaEventElapsedTime(&milliseconds, start, stop); - VLOG(0) << "[GPU: " << ggnn_gpu_instance.gpu_id << "] query part: " << 0 << " => ms: " << milliseconds << " [" << dataset.N_query << " points query -> " << milliseconds*1000.0f/dataset.N_query << " us/point] \n"; - - cudaEventDestroy(start); - cudaEventDestroy(stop); - - cudaStreamSynchronize(shard.stream); - ggnn_results.merge(); - ggnn_results.evaluateResults(); - - CHECK_CUDA(cudaPeekAtLastError()); - CHECK_CUDA(cudaDeviceSynchronize()); - CHECK_CUDA(cudaPeekAtLastError()); - } - - /// verbose query with additional logging - /// templated mainly to avoid compilation when not used - template - void queryLayerDebug() { - dataset.template checkForDuplicatesInGroundTruth(KQuery); - - auto& shard = ggnn_gpu_instance.ggnn_shards.at(0); - - /* - typedef QueryKernel - */ - typedef StatsQueryKernel - QueryKernel; - - - KeyT* m_query_results; - cudaMallocManaged(&m_query_results, - dataset.N_query * KQuery * sizeof(KeyT)); - ValueT* m_query_results_dists; - cudaMallocManaged(&m_query_results_dists, - dataset.N_query * KQuery * sizeof(ValueT)); - int* m_dist_statistics; - cudaMallocManaged(&m_dist_statistics, dataset.N_query * sizeof(int)); - - ValueT* m_dist_1_best_stats; - ValueT* m_dist_k_best_stats; - cudaMallocManaged(&m_dist_1_best_stats, - dataset.N_query * (MAX_ITERATIONS+1) * sizeof(ValueT)); - cudaMallocManaged(&m_dist_k_best_stats, - dataset.N_query * (MAX_ITERATIONS+1) * sizeof(ValueT)); - 
cudaMemset(m_dist_1_best_stats, -1, dataset.N_query * (MAX_ITERATIONS+1) * sizeof(ValueT)); - cudaMemset(m_dist_k_best_stats, -1, dataset.N_query * (MAX_ITERATIONS+1) * sizeof(ValueT)); - - const KeyT debug_query_id = -1; - KeyT* m_debug_query_visited_ids; - if (debug_query_id > 0) { - cudaMallocManaged(&m_debug_query_visited_ids, MAX_ITERATIONS * sizeof(KeyT)); - cudaMemset(m_debug_query_visited_ids, -1, MAX_ITERATIONS * sizeof(KeyT)); - } - - QueryKernel query_kernel; - query_kernel.d_base = shard.d_base; - query_kernel.d_query = ggnn_gpu_instance.ggnn_query.d_query; - - query_kernel.d_graph = shard.d_graph; - query_kernel.d_query_results = m_query_results; - query_kernel.d_query_results_dists = m_query_results_dists; - - query_kernel.d_dist_1_best_stats = m_dist_1_best_stats; - query_kernel.d_dist_k_best_stats = m_dist_k_best_stats; - query_kernel.d_debug_query_visited_ids = m_debug_query_visited_ids; - query_kernel.debug_query_id = debug_query_id; - - query_kernel.d_translation = shard.d_translation; - - query_kernel.d_nn1_stats = shard.d_nn1_stats; - - //query_kernel.N_base = dataset.N_base; - query_kernel.N = dataset.N_query; - query_kernel.N_offset = 0; - - query_kernel.d_dist_stats = m_dist_statistics; - - CHECK_CUDA(cudaPeekAtLastError()); - CHECK_CUDA(cudaDeviceSynchronize()); - CHECK_CUDA(cudaPeekAtLastError()); - - time_launcher(0, &query_kernel, query_kernel.N); - - CHECK_CUDA(cudaPeekAtLastError()); - CHECK_CUDA(cudaDeviceSynchronize()); - CHECK_CUDA(cudaPeekAtLastError()); - - std::ofstream distance_stats_file("distances_k_best.csv", std::ofstream::out); - distance_stats_file << "top-layer;"; - for (int j=0; j::infinity(); - int last_improvement = 0; - for (int j=0; j 0) { - // compute distance matrix for multi dimensional scaling - std::vector distance_matrix; - // wasteful, but easier than indexing a triangle matrix - distance_matrix.resize(MAX_ITERATIONS*MAX_ITERATIONS, std::numeric_limits::infinity()); - for (int i=0; i(m_debug_query_visited_ids[i], m_debug_query_visited_ids[j]); - } - } - - std::vector distances_to_query; - distances_to_query.resize(MAX_ITERATIONS); - std::ofstream visited_distance_matrix_file("visited_distance_matrix.csv", std::ofstream::out); - visited_distance_matrix_file << ValueT(0); - for (int i=0; i(m_debug_query_visited_ids[i], query_kernel.debug_query_id); - visited_distance_matrix_file << ';' << distances_to_query[i]; - } - visited_distance_matrix_file << std::endl; - for (int i=0; i(gt_index, i)); - } - - std::copy_n(m_query_results, static_cast(dataset.N_query)*KQuery, ggnn_results.h_sorted_ids); - ggnn_results.evaluateResults(); - - cudaFree(m_query_results); - cudaFree(m_query_results_dists); - cudaFree(m_dist_statistics); - cudaFree(m_dist_1_best_stats); - cudaFree(m_dist_k_best_stats); - if (debug_query_id > 0) - cudaFree(m_debug_query_visited_ids); - - CHECK_CUDA(cudaPeekAtLastError()); - CHECK_CUDA(cudaDeviceSynchronize()); - CHECK_CUDA(cudaPeekAtLastError()); - } - - void generateGTUsingBF() { - ggnn_gpu_instance.generateGTUsingBF(0); - } - - void build() { - ggnn_gpu_instance.build(0); - } - - void refine() { - ggnn_gpu_instance.refine(); - } -}; - -#endif // INCLUDE_GGNN_CUDA_KNN_GGNN_CUH_ diff --git a/include/ggnn/cuda_knn_ggnn_gpu_instance.cuh b/include/ggnn/cuda_knn_ggnn_gpu_instance.cuh deleted file mode 100644 index 8cbaf0d..0000000 --- a/include/ggnn/cuda_knn_ggnn_gpu_instance.cuh +++ /dev/null @@ -1,776 +0,0 @@ -/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved. 
-Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch - -#ifndef INCLUDE_GGNN_CUDA_KNN_GGNN_GPU_INSTANCE_CUH_ -#define INCLUDE_GGNN_CUDA_KNN_GGNN_GPU_INSTANCE_CUH_ - -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include - -#include "ggnn/graph/cuda_knn_ggnn_graph_device.cuh" -#include "ggnn/graph/cuda_knn_ggnn_graph_host.cuh" -#include "ggnn/graph/cuda_knn_ggnn_graph_buffer.cuh" -#include "ggnn/merge/cuda_knn_merge_layer.cuh" -#include "ggnn/merge/cuda_knn_top_merge_layer.cuh" -#include "ggnn/query/cuda_knn_query_layer.cuh" -#include "ggnn/query/cuda_knn_ggnn_query.cuh" -#include "ggnn/query/cuda_knn_bf_query_layer.cuh" -#include "ggnn/query/cuda_knn_stats_query_layer.cuh" -#include "ggnn/select/cuda_knn_wrs_select_layer.cuh" -#include "ggnn/sym/cuda_knn_sym_buffer_merge_layer.cuh" -#include "ggnn/sym/cuda_knn_sym_query_layer.cuh" -#include "ggnn/utils/cuda_knn_utils.cuh" -#include "ggnn/utils/cuda_knn_constants.cuh" -#include "ggnn/utils/cuda_knn_dataset.cuh" - -template -__global__ void divide(ValueT* res, ValueT* input, ValueT N) { - res[threadIdx.x] = input[threadIdx.x]/N; -} - -/** - * GGNN core operations (shared between single-GPU and multi-GPU version) - * - * @param measure distance measure: Euclidean or Cosine - * @param KeyT datatype of dataset indices (needs to be able to represent - * N_base, signed integer required) - * @param ValueT distance value type - * @param GAddrT address type used to access neighborhood vectors (needs to be - * able to represent N_all*K) - * @param BaseT datatype of dataset vector elements - * @param BAddrT address type used to access dataset vectors (needs to be able - * to represent N_base*D) - * @param D dimension of dataset - * @param KBuild neighbors per node in the GGNN graph - * @param KF maximum number of inverse links per node in the GGNN graph - * @param KQuery number of nearest neighbors to retrieve during query - * @param S segment size - */ -template -struct GGNNGPUInstance { - /// number of base points per shard - int N_shard; - /// number of layers - int L; - /// growth factor (number of sub-graphs merged together per layer) - int G; - /// segment size in base layer - int S0; - /// number of segments in base layer with one additional element - int S0_off; - /// slack factor for symmetric linking - float tau_build; - - /// total number of neighborhoods in the graph - int N_all; - /// total number of selection/translation entries - int ST_all; - - /// neighborhoods per layer - std::array Ns; // [L] - /// start of neighborhoods per layer - std::array Ns_offsets; // [L] - /// start of selection/translation per layer - std::array STs_offsets; // [L] - - typedef GGNNGraphDevice GGNNGraphDevice; - typedef GGNNGraphHost GGNNGraphHost; - - const Dataset* dataset; - GGNNGraphBuffer* ggnn_buffer {nullptr}; - GGNNQuery ggnn_query; - - // Graph Shards resident on the GPU - 
std::vector ggnn_shards; - // Graph Shards resident on the CPU (for swapping, loading, and storing) - std::vector ggnn_cpu_buffers; - - curandGenerator_t gen; - - //TODO (lukas): merge the buffer-code in here? - - // CUDA GPU id associated with this instance - const int gpu_id; - - // number of shards that need to be processed by this instance - const int num_parts; - - GGNNGPUInstance(const int gpu_id, const Dataset* dataset, - const int N_shard, const int L, - const bool enable_construction, const float tau_build, - const int num_parts=1, const int num_cpu_buffers=1) : - N_shard{N_shard}, L{L}, tau_build{tau_build}, - dataset{dataset}, gpu_id{gpu_id}, - ggnn_query{dataset->N_query, D, KQuery, num_parts}, - num_parts{num_parts} - { - CHECK_LE(L, MAX_LAYER); - - LOG(INFO) << "GGNNGPUInstance(): CUDA device id: " << gpu_id; - { - int current_gpu_id; - cudaGetDevice(¤t_gpu_id); - CHECK_EQ(current_gpu_id, gpu_id) << "cudaSetDevice() needs to be called in advance!"; - } - - ggnn_query.loadQueriesAsync(dataset->h_query, 0); - - computeGraphParameters(); - - CHECK_LE(static_cast(N_all) * static_cast(KBuild), - static_cast(std::numeric_limits::max())) - << "address type is insufficient to address the requested graph."; - - copyConstantsToGPU(); - - // allocate CPU memory first (fail early if out of memory) - ggnn_cpu_buffers.reserve(num_cpu_buffers); - for (int i=0; i < num_cpu_buffers; i++) - ggnn_cpu_buffers.emplace_back(N_shard, KBuild, N_all, ST_all); - - //TODO (lukas): merge the buffer-code in here? - - if (enable_construction) - ggnn_buffer = new GGNNGraphBuffer{N_shard, KBuild, KF}; - - curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT); - curandSetPseudoRandomGeneratorSeed(gen, 1234ULL); - - int max_shards; - { - size_t free, total; - CHECK_CUDA(cudaMemGetInfo(&free, &total)); - - size_t size_per_shard = getSizePerShard(); - - max_shards = free/size_per_shard; - LOG(INFO) << "remaining device memory (" << free/(1024.0f*1024.0f*1024.0f) - << " GB) suffices for " << max_shards << " shards (" - << size_per_shard/(1024.0f*1024.0f*1024.0f) << " GB each)."; - - CHECK_GT(max_shards, 0) << "use smaller shards."; - } - - const int num_shards = min(max_shards, num_parts); - ggnn_shards.reserve(num_shards); - - for (int i=0; i < num_shards; i++) { - ggnn_shards.emplace_back(N_shard, D, KBuild, N_all, ST_all); - } - - CHECK_CUDA(cudaPeekAtLastError()); - CHECK_CUDA(cudaDeviceSynchronize()); - CHECK_CUDA(cudaPeekAtLastError()); - } - - GGNNGPUInstance(const GGNNGPUInstance& other) - : dataset{nullptr}, ggnn_query{0, D, KQuery}, - gpu_id{0}, N_shard{0}, num_parts{0} { - // this exists to allow using vector::emplace_back - // when it triggers a reallocation, this code will be called. - // always make sure that enough memory is reserved ahead of time. - LOG(FATAL) << "copying is not supported. reserve()!"; - } - - ~GGNNGPUInstance() { - CHECK_CUDA(cudaSetDevice(gpu_id)); - ggnn_shards.clear(); - - delete ggnn_buffer; - - CHECK_CUDA(cudaPeekAtLastError()); - CHECK_CUDA(cudaDeviceSynchronize()); - CHECK_CUDA(cudaPeekAtLastError()); - } - - void computeGraphParameters() { - /// theoretical growth factor (number of sub-graphs merged together per - /// layer) - const float growth = powf(N_shard / static_cast(S), 1.f / (L - 1)); - - const int Gf = growth; - const int Gc = growth + 1; - - const float S0f = N_shard / (pow(Gf, (L - 1))); - const float S0c = N_shard / (pow(Gc, (L - 1))); - - const bool is_floor = - (growth > 0) && ((S0c < KBuild) || (fabs(S0f - S) < fabs(S0c - S))); - - G = (is_floor) ? 
Gf : Gc; - S0 = (is_floor) ? S0f : S0c; - S0_off = N_shard - pow(G, L - 1) * S0; - - VLOG(1) << "GGNNGPUInstance(): N: " << N_shard << ", L: " << L - << ", G: " << G << ", S: " << S << ", S0: " << S0 - << ", S0_off: " << S0_off << ", K: " << KBuild << ", KF: " << KF; - - N_all = 0; - ST_all = 0; - int N_current = N_shard; - for (int l = 0; l < L; l++) { - Ns[l] = N_current; - Ns_offsets[l] = N_all; - STs_offsets[l] = ST_all; - N_all += N_current; - if (l) { - ST_all += N_current; - N_current /= G; - } - else { - N_current = S; - for (int i=2;i(N_all) * KBuild * sizeof(KeyT); - const size_t selection_translation_size = ST_all * sizeof(KeyT); - // const size_t nn1_dist_buffer_size = N * sizeof(ValueT); - const size_t nn1_stats_size = 2 * sizeof(ValueT); - const size_t total_graph_size = graph_size + 2 * selection_translation_size - + nn1_stats_size; - const size_t base_size = static_cast(N_shard) * D * sizeof(BaseT); - - return total_graph_size + base_size; - } - - void copyConstantsToGPU() const { - CHECK_CUDA(cudaSetDevice(gpu_id)); - VLOG(2) << "GGNNGPUInstance::copyConstantsToGPU().\n"; - - cudaMemcpyToSymbol(c_Ns, Ns.data(), L * sizeof(int)); - cudaMemcpyToSymbol(c_Ns_offsets, Ns_offsets.data(), L * sizeof(int)); - - cudaMemcpyToSymbol(c_G, &G, sizeof(int)); - cudaMemcpyToSymbol(c_L, &L, sizeof(int)); - cudaMemcpyToSymbol(c_S0, &S0, sizeof(int)); - cudaMemcpyToSymbol(c_S0_offset, &S0_off, sizeof(int)); - - cudaMemcpyToSymbol(c_tau_build, &tau_build, sizeof(float)); - cudaMemcpyToSymbol(c_STs_offsets, STs_offsets.data(), L * sizeof(int)); - } - - // graph utilities - - int getNs(const int layer) const { return Ns[layer]; } - - int getS(const int layer) const { return layer ? S : S0; } - - int getS_offset(const int layer) const { return layer ? 0 : S0_off; } - - KeyT* getGraph(const int shard, const int layer) { - return &ggnn_shards.at(shard%ggnn_shards.size()).d_graph[static_cast(Ns_offsets[layer]) * KBuild]; - } - - KeyT* getSelection(const int shard, const int layer) { - if (!layer) { - // there is no selection for layer 0 - return nullptr; - } - return &ggnn_shards.at(shard%ggnn_shards.size()).d_selection[STs_offsets[layer]]; - } - - KeyT* getTranslation(const int shard, const int layer) { - if (!layer) { - // there is no translation for layer 0 - return nullptr; - } - return &ggnn_shards.at(shard%ggnn_shards.size()).d_translation[STs_offsets[layer]]; - } - - // io - - void waitForDiskIO(const int shard_id) { - auto& cpu_buffer = ggnn_cpu_buffers[shard_id%ggnn_cpu_buffers.size()]; - if (cpu_buffer.disk_io_thread.joinable()) - cpu_buffer.disk_io_thread.join(); - } - - void loadPartAsync(const std::string graph_dir, const int part_id, const int shard_id) { - waitForDiskIO(shard_id); - auto& cpu_buffer = ggnn_cpu_buffers[shard_id%ggnn_cpu_buffers.size()]; - auto load_part = [this, graph_dir, part_id, shard_id]() -> void { - CHECK_CUDA(cudaSetDevice(gpu_id)); - auto& shard = ggnn_shards.at(shard_id%ggnn_shards.size()); - auto& cpu_buffer = ggnn_cpu_buffers[shard_id%ggnn_cpu_buffers.size()]; - - cudaStreamSynchronize(shard.stream); - - if (shard.current_part_id == part_id) { - VLOG(4) << "[GPU: " << gpu_id << "] part " << part_id << " is already loaded on shard " << shard_id; - return; - } - - shard.current_part_id = part_id; - - loadShardBaseDataAsync(part_id, shard_id); - - if (cpu_buffer.current_part_id == part_id) { - VLOG(4) << "[GPU: " << gpu_id << "] part " << part_id << " is already loaded on cpu buffer " << shard_id%ggnn_cpu_buffers.size(); - } - else { - const std::string 
part_filename = graph_dir + "part_" + std::to_string(part_id) + ".ggnn"; - cpu_buffer.load(part_filename); - VLOG(2) << "[GPU: " << gpu_id << "] loaded part " << part_id << " from " << part_filename.c_str(); - cpu_buffer.current_part_id = part_id; - } - - cpu_buffer.uploadAsync(shard); - cudaStreamSynchronize(shard.stream); - VLOG(4) << "[GPU: " << gpu_id << "] uploaded part " << part_id; - }; - cpu_buffer.disk_io_thread = std::thread(load_part); - } - - void uploadPartAsync(const int part_id, const int shard_id) { - waitForDiskIO(shard_id); - auto& cpu_buffer = ggnn_cpu_buffers[shard_id%ggnn_cpu_buffers.size()]; - auto upload_part = [this, part_id, shard_id]() -> void { - CHECK_CUDA(cudaSetDevice(gpu_id)); - auto& shard = ggnn_shards.at(shard_id%ggnn_shards.size()); - auto& cpu_buffer = ggnn_cpu_buffers[shard_id%ggnn_cpu_buffers.size()]; - - cudaStreamSynchronize(shard.stream); - - if (shard.current_part_id == part_id) { - VLOG(4) << "[GPU: " << gpu_id << "] part " << part_id << " is already loaded on shard " << shard_id; - return; - } - - shard.current_part_id = part_id; - CHECK_EQ(cpu_buffer.current_part_id, part_id); - - loadShardBaseDataAsync(part_id, shard_id); - cpu_buffer.uploadAsync(shard); - cudaStreamSynchronize(shard.stream); - VLOG(4) << "[GPU: " << gpu_id << "] uploaded part " << part_id; - }; - cpu_buffer.disk_io_thread = std::thread(upload_part); - } - - void storePartAsync(const std::string graph_dir, const int part_id, const int shard_id) { - waitForDiskIO(shard_id); - auto& cpu_buffer = ggnn_cpu_buffers[shard_id%ggnn_cpu_buffers.size()]; - auto store_part = [this, graph_dir, part_id, shard_id]() -> void { - CHECK_CUDA(cudaSetDevice(gpu_id)); - auto& shard = ggnn_shards.at(shard_id%ggnn_shards.size()); - auto& cpu_buffer = ggnn_cpu_buffers[shard_id%ggnn_cpu_buffers.size()]; - - if (cpu_buffer.current_part_id == part_id) { - VLOG(4) << "[GPU: " << gpu_id << "] part " << part_id << " is already downloaded"; - } - else { - cpu_buffer.downloadAsync(shard); - cudaStreamSynchronize(shard.stream); - VLOG(4) << "[GPU: " << gpu_id << "] downloaded part " << part_id; - } - - const std::string part_filename = graph_dir + "part_" + std::to_string(part_id) + ".ggnn"; - cpu_buffer.store(part_filename); - VLOG(2) << "[GPU: " << gpu_id << "] stored part " << part_id << " to " << part_filename.c_str(); - }; - cpu_buffer.disk_io_thread = std::thread(store_part); - } - - void downloadPartAsync(const int part_id, const int shard_id) { - waitForDiskIO(shard_id); - auto& cpu_buffer = ggnn_cpu_buffers[shard_id%ggnn_cpu_buffers.size()]; - auto download_part = [this, part_id, shard_id]() -> void { - CHECK_CUDA(cudaSetDevice(gpu_id)); - auto& shard = ggnn_shards.at(shard_id%ggnn_shards.size()); - auto& cpu_buffer = ggnn_cpu_buffers[shard_id%ggnn_cpu_buffers.size()]; - - cpu_buffer.downloadAsync(shard); - cudaStreamSynchronize(shard.stream); - cpu_buffer.current_part_id = part_id; - VLOG(4) << "[GPU: " << gpu_id << "] downloaded part " << part_id; - }; - cpu_buffer.disk_io_thread = std::thread(download_part); - } - - void loadShardBaseDataAsync(const int part_id, const int shard_id) { - const size_t memsize = static_cast(N_shard) * D * sizeof(BaseT); - const size_t N_offset = static_cast(N_shard) * part_id; - auto& shard = ggnn_shards.at(shard_id%ggnn_shards.size()); - CHECK_CUDA(cudaMemcpyAsync(shard.d_base, dataset->h_base + N_offset * D, - memsize, cudaMemcpyHostToDevice, shard.stream)); - } - - void generateGTUsingBF(const int shard_id = 0) { - CHECK_CUDA(cudaSetDevice(gpu_id)); - const auto& 
shard = ggnn_shards.at(shard_id%ggnn_shards.size()); - - const int KGT = 100; - KeyT* m_gt = nullptr; - CHECK_CUDA(cudaMallocManaged(&m_gt, sizeof(KeyT)*KGT*dataset->N_query)); - - CHECK_LE(dataset->K_gt, KGT) << "The brute force query is set to " << KGT << " neighbors, but the dataset is configured for " << dataset->K_gt << "."; - - typedef BruteForceQueryKernel - QueryKernel; - - LOG(INFO) << "Running brute force query to determine ground truth"; - - QueryKernel query_kernel; - query_kernel.d_base = shard.d_base; - query_kernel.d_query = ggnn_query.d_query; - - query_kernel.d_query_results = m_gt; - - query_kernel.N_base = N_shard; // this applies to potential subsets - query_kernel.N = dataset->N_query; - query_kernel.N_offset = 0; - - time_launcher(0, &query_kernel, query_kernel.N, shard.stream); - - cudaStreamSynchronize(shard.stream); - - if (dataset->K_gt == KGT) { - std::copy_n(m_gt, KGT*dataset->N_query, dataset->gt); - } - else { - const size_t stride_results = static_cast(dataset->N_query)*KGT; - const size_t stride_dest = static_cast(dataset->N_query)*dataset->K_gt; - for (int n=0; nN_query; ++n) { - std::copy_n(m_gt+n*stride_results, dataset->K_gt, dataset->gt+n*stride_dest); - } - } - - CHECK_CUDA(cudaFree(m_gt)); - - CHECK_CUDA(cudaPeekAtLastError()); - CHECK_CUDA(cudaDeviceSynchronize()); - CHECK_CUDA(cudaPeekAtLastError()); - } - - // graph operations - - template - void queryLayer(const int shard_id = 0) const { - CHECK_CUDA(cudaSetDevice(gpu_id)); - const auto& shard = ggnn_shards.at(shard_id%ggnn_shards.size()); - - typedef QueryKernel - QueryKernel; - - int* m_dist_statistics = nullptr; - if (DIST_STATS) - cudaMallocManaged(&m_dist_statistics, dataset->N_query * sizeof(int)); - - QueryKernel query_kernel; - query_kernel.d_base = shard.d_base; - query_kernel.d_query = ggnn_query.d_query; - - query_kernel.d_graph = shard.d_graph; - query_kernel.d_query_results = ggnn_query.d_query_result_ids; - query_kernel.d_query_results_dists = ggnn_query.d_query_result_dists; - - query_kernel.d_translation = shard.d_translation; - - query_kernel.d_nn1_stats = shard.d_nn1_stats; - - query_kernel.N = dataset->N_query; - query_kernel.N_offset = 0; - - query_kernel.d_dist_stats = m_dist_statistics; - - query_kernel.part = shard_id; - query_kernel.num_parts = num_parts; - query_kernel.N_base = N_shard; - - query_kernel.launch(shard.stream); - - if (DIST_STATS) - cudaFree(m_dist_statistics); - } - - void select(const int layer, const int shard_id = 0) { - CHECK_CUDA(cudaSetDevice(gpu_id)); - const auto& shard = ggnn_shards.at(shard_id%ggnn_shards.size()); - - typedef WRSSelectionKernel SelectionKernel; - - SelectionKernel select_kernel; - - select_kernel.d_selection = getSelection(shard_id, layer + 1); - select_kernel.d_translation = getTranslation(shard_id, layer + 1); - - select_kernel.d_translation_layer = getTranslation(shard_id, layer); - - select_kernel.layer = layer; - - select_kernel.S = getS(layer); - select_kernel.S_offset = getS_offset(layer); - - const int SG = S / G; - const int SG_offset = S - SG * G; - - select_kernel.SG = SG; - select_kernel.SG_offset = SG_offset; - - select_kernel.B = pow(G, L - 1 - layer); - select_kernel.B_offset = 0; - - select_kernel.d_rng = ggnn_buffer->d_rng; - select_kernel.d_nn1_dist_buffer = ggnn_buffer->d_nn1_dist_buffer; - - /* Generate n floats on device */ - curandGenerateUniform(gen, ggnn_buffer->d_rng, getNs(layer)); - - time_launcher(2, &select_kernel, getNs(layer), shard.stream); - } - - void top(const int layer, const int shard_id = 0) { 
- CHECK_CUDA(cudaSetDevice(gpu_id)); - const auto& shard = ggnn_shards.at(shard_id%ggnn_shards.size()); - - typedef TopMergeKernel - TopMergeKernel; - - TopMergeKernel top_kernel; - top_kernel.d_base = shard.d_base; - top_kernel.d_translation = getTranslation(shard_id, layer); - top_kernel.d_graph = getGraph(shard_id, layer); - top_kernel.d_nn1_dist_buffer = ggnn_buffer->d_nn1_dist_buffer; - - top_kernel.layer = layer; - - top_kernel.N = getNs(layer); - top_kernel.N_offset = 0; - - top_kernel.S = getS(layer); - top_kernel.S_offset = getS_offset(layer); - - time_launcher(2, &top_kernel, getNs(layer), shard.stream); - } - - void mergeLayer(const int layer_top, const int layer_btm, const int shard_id = 0) { - CHECK_CUDA(cudaSetDevice(gpu_id)); - const auto& shard = ggnn_shards.at(shard_id%ggnn_shards.size()); - - typedef MergeKernel - MergeKernel; - - const size_t graph_buffer_size = - static_cast(getNs(layer_btm)) * KBuild * - sizeof(KeyT); - - MergeKernel merge_kernel; - merge_kernel.d_base = shard.d_base; - - merge_kernel.d_graph = shard.d_graph; - merge_kernel.d_graph_buffer = ggnn_buffer->d_graph_buffer; - - merge_kernel.d_translation = shard.d_translation; - merge_kernel.d_selection = shard.d_selection; - - merge_kernel.d_nn1_stats = shard.d_nn1_stats; - merge_kernel.d_nn1_dist_buffer = ggnn_buffer->d_nn1_dist_buffer; - - merge_kernel.N = getNs(layer_btm); - merge_kernel.N_offset = 0; - - merge_kernel.layer_top = layer_top; - merge_kernel.layer_btm = layer_btm; - - time_launcher(2, &merge_kernel, getNs(layer_btm), shard.stream); - - cudaMemcpyAsync((void*)getGraph(shard_id, layer_btm), (void*)ggnn_buffer->d_graph_buffer, - graph_buffer_size, cudaMemcpyDeviceToDevice, shard.stream); - }; - - void merge(const int layer_top, const int layer_btm, const int shard_id = 0) { - CHECK_CUDA(cudaSetDevice(gpu_id)); - const auto& shard = ggnn_shards.at(shard_id%ggnn_shards.size()); - - VLOG(2) << "merge: " << layer_top << layer_btm << std::endl; - if (layer_top == layer_btm) - top(layer_btm, shard_id); - else - mergeLayer(layer_top, layer_btm, shard_id); - - if (!layer_btm) - computeNN1Stats(shard_id); - }; - - void computeNN1Stats(const int shard_id = 0) { - CHECK_CUDA(cudaSetDevice(gpu_id)); - const auto& shard = ggnn_shards.at(shard_id%ggnn_shards.size()); - - CHECK_CUDA(cub::DeviceReduce::Sum(ggnn_buffer->d_temp_storage_sum, - ggnn_buffer->temp_storage_bytes_sum, - ggnn_buffer->d_nn1_dist_buffer, - &shard.d_nn1_stats[0], N_shard, - shard.stream)); - - divide<<<1, 1, 0, shard.stream>>>(shard.d_nn1_stats, - shard.d_nn1_stats, ValueT(N_shard)); - - CHECK_CUDA(cub::DeviceReduce::Max(ggnn_buffer->d_temp_storage_max, - ggnn_buffer->temp_storage_bytes_max, - ggnn_buffer->d_nn1_dist_buffer, - &shard.d_nn1_stats[1], N_shard, - shard.stream)); - - if(VLOG_IS_ON(2)) - { - ValueT h_nn1_stats[2]; - cudaMemcpyAsync(h_nn1_stats, shard.d_nn1_stats, 2*sizeof(ValueT), cudaMemcpyDeviceToHost, shard.stream); - cudaStreamSynchronize(shard.stream); - VLOG(2) << "mean: " << h_nn1_stats[0] << " | max: " << h_nn1_stats[1] << std::endl; - } - } - - void sym(const int layer, const int shard_id = 0) { - CHECK_CUDA(cudaSetDevice(gpu_id)); - const auto& shard = ggnn_shards.at(shard_id%ggnn_shards.size()); - - typedef SymQueryKernel - SymQueryKernel; - - cudaMemsetAsync( - ggnn_buffer->d_sym_buffer, -1, - static_cast(static_cast(getNs(layer))) * - KF * sizeof(KeyT), shard.stream); - - cudaMemsetAsync(ggnn_buffer->d_sym_atomic, 0, getNs(layer) * sizeof(int), shard.stream); - - SymQueryKernel sym_kernel; - - sym_kernel.d_base 
= shard.d_base; - sym_kernel.d_graph = getGraph(shard_id, layer); - sym_kernel.d_translation = getTranslation(shard_id, layer); - - sym_kernel.d_sym_atomic = ggnn_buffer->d_sym_atomic; - sym_kernel.d_sym_buffer = ggnn_buffer->d_sym_buffer; - - sym_kernel.d_nn1_stats = shard.d_nn1_stats; - sym_kernel.d_stats = ggnn_buffer->d_statistics; - - sym_kernel.layer = layer; - - sym_kernel.N = getNs(layer); - - sym_kernel.N_offset = 0; - - // CHECK_CUDA(cudaPeekAtLastError()); - // CHECK_CUDA(cudaDeviceSynchronize()); - // CHECK_CUDA(cudaPeekAtLastError()); - - time_launcher(2, &sym_kernel, getNs(layer), shard.stream); - - // CHECK_CUDA(cudaPeekAtLastError()); - // CHECK_CUDA(cudaDeviceSynchronize()); - // CHECK_CUDA(cudaPeekAtLastError()); - - typedef SymBufferMergeKernel - SymBufferMergeKernel; - SymBufferMergeKernel sym_buffer_merge_kernel; - - sym_buffer_merge_kernel.d_sym_buffer = ggnn_buffer->d_sym_buffer; - sym_buffer_merge_kernel.d_sym_atomic = ggnn_buffer->d_sym_atomic; - sym_buffer_merge_kernel.d_graph = getGraph(shard_id, layer); - - sym_buffer_merge_kernel.N = getNs(layer); - sym_buffer_merge_kernel.N_offset = 0; - - time_launcher(3, &sym_buffer_merge_kernel, getNs(layer), shard.stream); - - // CHECK_CUDA(cudaPeekAtLastError()); - // CHECK_CUDA(cudaDeviceSynchronize()); - // CHECK_CUDA(cudaPeekAtLastError()); - - if(VLOG_IS_ON(2)){ - int* h_sym_atomic; - //int* h_statistics; - - CHECK_CUDA(cudaMallocHost(&h_sym_atomic, static_cast(getNs(layer)) * sizeof(int))); - //CHECK_CUDA(cudaMallocHost(&h_statistics, static_cast(getNs(layer)) * sizeof(int))); - - cudaMemcpyAsync(h_sym_atomic, ggnn_buffer->d_sym_atomic, static_cast(getNs(layer)) * sizeof(int), cudaMemcpyDeviceToHost, shard.stream); - //cudaMemcpyAsync(h_statistics, ggnn_buffer->d_statistics, static_cast(getNs(layer)) * sizeof(int), cudaMemcpyDeviceToHost, shard.stream); - - cudaStreamSynchronize(shard.stream); - - int c = 0; - int m = 0; - // int unconnected = 0; - for (int i = 0; i < getNs(layer); i++) { - if (h_sym_atomic[i] > KF) c++; - m += (h_sym_atomic[i] > KF) ? KF : h_sym_atomic[i]; - // unconnected += h_statistics[i]; - } - VLOG(2) << "Layer " << layer - << " [N: " << getNs(layer) - << "] | overflow: " << c << " (" << c / float(getNs(layer)) - << ") | added_links: " << m << " (" << m / float(getNs(layer)) - << ") || unconnected: OVERFLOW_STATS currently not computed. 
)\n"; - - cudaFreeHost(h_sym_atomic); - } - - // cudaFree(d_sym_buffer); - // cudaFree(m_sym_atomic); - // cudaFree(m_statistics); - - // CHECK_CUDA(cudaPeekAtLastError()); - // CHECK_CUDA(cudaDeviceSynchronize()); - // CHECK_CUDA(cudaPeekAtLastError()); - }; - - void build(const int part_id, const int shard_id = 0) { - CHECK(ggnn_buffer) << "the construction buffer is not allocated."; - - VLOG(1) << "build(): part_id: " << part_id << " shard_id: " << shard_id; - for (int layer_top = 0; layer_top < L; layer_top++) { - for (int layer_btm = layer_top; layer_btm >= 0; layer_btm--) { - VLOG(2) << "layer_top: " << layer_top << " -> layer_btm: " << layer_btm << std::endl; - - merge(layer_top, layer_btm, shard_id); - - if (layer_top < (L - 1) && layer_top == layer_btm) - select(layer_top, shard_id); - - sym(layer_btm, shard_id); - } - } - } - - void refine(const int shard_id = 0) { - for (int layer = L - 2; layer >= 0; layer--) { - merge(L - 1, layer, shard_id); - sym(layer, shard_id); - } - } -}; - -#endif // INCLUDE_GGNN_CUDA_KNN_GGNN_GPU_INSTANCE_CUH_ diff --git a/include/ggnn/cuda_knn_ggnn_multi_gpu.cuh b/include/ggnn/cuda_knn_ggnn_multi_gpu.cuh deleted file mode 100644 index 0a8d607..0000000 --- a/include/ggnn/cuda_knn_ggnn_multi_gpu.cuh +++ /dev/null @@ -1,568 +0,0 @@ -/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch - -#ifndef INCLUDE_GGNN_CUDA_KNN_GGNN_MULTI_GPU_CUH_ -#define INCLUDE_GGNN_CUDA_KNN_GGNN_MULTI_GPU_CUH_ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "cub/cub.cuh" -#include "ggnn/cuda_knn_ggnn_gpu_instance.cuh" -#include "ggnn/graph/cuda_knn_ggnn_graph_device.cuh" -#include "ggnn/graph/cuda_knn_ggnn_graph_host.cuh" -#include "ggnn/query/cuda_knn_query_layer.cuh" -#include "ggnn/query/cuda_knn_ggnn_query.cuh" -#include "ggnn/query/cuda_knn_bf_query_layer.cuh" -#include "ggnn/query/cuda_knn_stats_query_layer.cuh" -#include "ggnn/utils/cuda_knn_utils.cuh" -#include "ggnn/utils/cuda_knn_constants.cuh" -#include "ggnn/utils/cuda_knn_dataset.cuh" -#include "ggnn/utils/cuda_knn_ggnn_results.cuh" - -// only needed for getTotalSystemMemory() -#include - -size_t getTotalSystemMemory() -{ - size_t pages = sysconf(_SC_PHYS_PAGES); - // this excludes memory used for caching files... 
- //size_t free_pages = sysconf(_SC_AVPHYS_PAGES); - size_t page_size = sysconf(_SC_PAGE_SIZE); - return pages * page_size; -} - - -/** - * GGNN multi-GPU wrapper - * - * @param measure distance measure: Euclidean or Cosine - * @param KeyT datatype of dataset indices (needs to be able to represent - * N_base, signed integer required) - * @param ValueT distance value type - * @param GAddrT address type used to access neighborhood vectors (needs to be - * able to represent N_all*K) - * @param BaseT datatype of dataset vector elements - * @param BAddrT address type used to access dataset vectors (needs to be able - * to represent N_base*D) - * @param D dimension of dataset - * @param KBuild neighbors per node in the GGNN graph - * @param KF maximum number of inverse links per node in the GGNN graph - * @param KQuery number of nearest neighbors to retrieve during query - * @param S segment size - */ -template -struct GGNNMultiGPU { - - using Dataset = Dataset; - using GGNNGPUInstance = GGNNGPUInstance; - using GGNNResults = GGNNResults; - - Dataset dataset; - - /// one instance per GPU - std::vector ggnn_gpu_instances; - - int num_parts {0}; - bool swap_to_disk {false}; - bool swap_to_ram {false}; - bool process_shards_back_to_front {false}; - std::string graph_dir; - - const int L; - const float tau_build; - - const bool generate_gt; - - GGNNMultiGPU(const std::string& basePath, const std::string& queryPath, - const std::string& gtPath, const int L, const float tau_build, const size_t N_base = std::numeric_limits::max()) - : dataset{basePath, queryPath, gtPath, N_base}, - L{L}, - tau_build{tau_build}, - generate_gt{gtPath.empty()} { - CHECK_EQ(dataset.D, D) << "DIM needs to be the same"; - } - - void ggnnMain(const std::vector& gpus, const std::string& mode, - const int N_shard, const std::string& graph_dir, - const int refinement_iterations, - const bool grid_search) { - - const bool build = mode.find('b') != std::string::npos; - const bool store = build && mode.find('s') != std::string::npos; - const bool load = !build && mode.find('l') != std::string::npos; - const bool query = mode.find('q') != std::string::npos; - - { - std::string mode("Mode: "); - if (build) - mode += "BUILD"; - else if (load) - mode += "LOAD"; - if (store) - mode += " AND STORE"; - if (query) - mode += " AND QUERY"; - VLOG(0) << mode; - } - - configure(gpus, build, N_shard, graph_dir); - - if (build) { - this->build(refinement_iterations); - if (store) - this->store(); - } - else if (load) - this->load(); - if (query) { - if (grid_search) { - for (int i=0; i<70; ++i) - this->query(i*0.01f); - for (int i=7; i<=20; ++i) - this->query(i*0.1f); - } - else { - this->query(0.3f); - this->query(0.4f); - this->query(0.5f); - this->query(0.6f); - } - } - } - - static size_t computeGraphSize(const int N_shard, const int L) { - /// theoretical growth factor (number of sub-graphs merged together per - /// layer) - const float growth = powf(N_shard / static_cast(S), 1.f / (L - 1)); - - const int Gf = growth; - const int Gc = growth + 1; - - const float S0f = N_shard / (pow(Gf, (L - 1))); - const float S0c = N_shard / (pow(Gc, (L - 1))); - - const bool is_floor = - (growth > 0) && ((S0c < KBuild) || (fabs(S0f - S) < fabs(S0c - S))); - - const int G = (is_floor) ? Gf : Gc; - const int S0 = (is_floor) ? 
S0f : S0c; - const int S0_off = N_shard - pow(G, L - 1) * S0; - - int N_all = 0; - int ST_all = 0; - - int N_current = N_shard; - for (int l = 0; l < L; l++) { - N_all += N_current; - if (l) { - ST_all += N_current; - N_current /= G; - } - else { - N_current = S; - for (int i=2;i size_t {return ((size+7)/8)*8;}; - - const size_t graph_size = align8(static_cast(N_all) * KBuild * sizeof(KeyT)); - const size_t selection_translation_size = align8(ST_all * sizeof(KeyT)); - // const size_t nn1_dist_buffer_size = N * sizeof(ValueT); - const size_t nn1_stats_size = align8(2 * sizeof(ValueT)); - const size_t total_graph_size = graph_size + 2 * selection_translation_size + nn1_stats_size; - - return total_graph_size; - } - - void configure(const std::vector& gpu_ids={0}, bool enable_construction=true, - int N_shard=-1, const std::string graph_dir="") { - ggnn_gpu_instances.clear(); - - CHECK(!graph_dir.empty()); - if (graph_dir.back() == '/') - this->graph_dir = graph_dir; - else - this->graph_dir = graph_dir+'/'; - - const int num_gpus = gpu_ids.size(); - // determine shard sizes and number of iterations - if (N_shard < 0) - N_shard = dataset.N_base/num_gpus; - const int num_iterations = dataset.N_base/(N_shard * num_gpus); - num_parts = num_gpus*num_iterations; - CHECK_EQ(N_shard*num_gpus*num_iterations, dataset.N_base) << "N_shard x num_gpus xnum_iterations needs to be equal to N_base, for now."; - - // determine number of cpu-side buffers - const size_t total_graph_size = computeGraphSize(N_shard, L); - const size_t total_memory = getTotalSystemMemory(); - // guess the available memory (assume 1/8 used elsewhere, subtract dataset) - const size_t available_memory = total_memory-total_memory/8-sizeof(ValueT)*static_cast(dataset.N_base)*D; - - const int max_parts_per_gpu = available_memory/(total_graph_size*num_gpus); - LOG(INFO) << "estimated remaining host memory (" << available_memory/(1024.0f*1024.0f*1024.0f) - << " GB) suffices for " << max_parts_per_gpu << " parts per GPU (" - << total_graph_size/(1024.0f*1024.0f*1024.0f) << " GB each)."; - - CHECK_GT(max_parts_per_gpu, 0) << "use smaller shards."; - - const int num_cpu_buffers_per_gpu = min(num_iterations, max_parts_per_gpu); - - swap_to_disk = num_cpu_buffers_per_gpu < num_iterations; - - ggnn_gpu_instances.reserve(num_gpus); - - VLOG(4) << "allocating shards..."; - for (int device_i=0; device_i build_times(num_parts); - VLOG(0) << "GGNN::build()" - << " | num_gpus: " << num_gpus - << " | N_shard: " << N_shard - << " | num_iterations: " << num_iterations; - - std::vector threads; - threads.reserve(num_gpus); - - for (int device_i = 0; device_i < num_gpus; device_i++) { - std::thread t([&, device_i]() { - auto& gpu_instance = ggnn_gpu_instances.at(device_i); - const int gpu_id = gpu_instance.gpu_id; - const int num_gpu_buffers = gpu_instance.ggnn_shards.size(); - const int num_cpu_buffers = gpu_instance.ggnn_cpu_buffers.size(); - CHECK_CUDA(cudaSetDevice(gpu_id)); - - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - - // printf("[gpu: %d] N_shard: %d \n", gpu_id, N_shard); - VLOG(1) << "[GPU: " << gpu_id << "] N_shard: " << N_shard; - - if (swap_to_disk) { - for (int i = 0; i < num_gpu_buffers; i++) - gpu_instance.loadShardBaseDataAsync(device_i * num_iterations + i, i); - } - if (swap_to_ram) { - for (int i = 0; i < num_cpu_buffers; i++) - gpu_instance.ggnn_cpu_buffers[i].current_part_id = -1; - } - - for (int i = 0; i < num_iterations; i++) - { - const int part_id = device_i * num_iterations + i; - - auto& 
shard = gpu_instance.ggnn_shards.at(i%gpu_instance.ggnn_shards.size()); - - cudaStreamSynchronize(shard.stream); - - cudaEventRecord(start, shard.stream); - gpu_instance.build(part_id, i); - - for (int refinement_step = 0; refinement_step < refinement_iterations; - ++refinement_step) { - DLOG(INFO) << "Refinement step " << refinement_step; - gpu_instance.refine(i); - } - cudaEventRecord(stop, shard.stream); - - cudaEventSynchronize(stop); - float milliseconds = 0; - cudaEventElapsedTime(&milliseconds, start, stop); - VLOG(0) << "[GPU: " << gpu_id << "] part: " << part_id << " => seconds: " << milliseconds/1000.f << " [" << N_shard << " points build -> " << milliseconds*1000.0f/N_shard << " us/point] \n"; - build_times[part_id] = milliseconds; - - if (swap_to_disk || swap_to_ram) { - if (swap_to_disk) - gpu_instance.storePartAsync(graph_dir, part_id, i); - else - gpu_instance.downloadPartAsync(part_id, i); - - if (i+num_gpu_buffers < num_iterations) - gpu_instance.loadShardBaseDataAsync(part_id+num_gpu_buffers, i+num_gpu_buffers); - } - } - - if (swap_to_disk || swap_to_ram) - { - for (int i = 0; i < num_iterations; i++) - gpu_instance.waitForDiskIO(i); - } - - VLOG(0) << "[GPU: " << gpu_id << "] build() done."; - }); - threads.push_back(std::move(t)); - } - - for (auto&& t : threads) { - t.join(); - } - - float build_time_ms = 0.f; - for (auto&& b : build_times) - { - build_time_ms += static_cast(b); - } - - VLOG(0) << "Combined build time: " << build_time_ms/1000.f << " s \n"; - - process_shards_back_to_front = true; - } - - void store() { - CHECK(!ggnn_gpu_instances.empty()) << "configure() the multi-GPU setup first!"; - if (swap_to_disk) { - VLOG(4) << "graph should already be stored on-the-fly"; - return; - } - - const int num_gpus = int(ggnn_gpu_instances.size()); - const int num_iterations = int(num_parts/ggnn_gpu_instances.size()); - - std::vector threads; - threads.reserve(num_gpus); - - for (int device_i = 0; device_i < num_gpus; device_i++) { - std::thread t([&, device_i]() { - auto& gpu_instance = ggnn_gpu_instances.at(device_i); - const int gpu_id = gpu_instance.gpu_id; - - for (int i = 0; i < num_iterations; i++) { - const int part_id = device_i * num_iterations + i; - gpu_instance.storePartAsync(graph_dir, part_id, i); - } - for (int i = 0; i < num_iterations; i++) { - gpu_instance.waitForDiskIO(i); - } - - VLOG(0) << "[GPU: " << gpu_id << "] store() done."; - }); - threads.push_back(std::move(t)); - } - - for (auto&& t : threads) { - t.join(); - } - } - - void load() { - CHECK(!ggnn_gpu_instances.empty()) << "configure() the multi-GPU setup first!"; - if (swap_to_disk) { - VLOG(4) << "graph will be loaded on-the-fly"; - return; - } - - const int num_gpus = int(ggnn_gpu_instances.size()); - const int num_iterations = int(num_parts/ggnn_gpu_instances.size()); - - std::vector threads; - threads.reserve(num_gpus); - - for (int device_i = 0; device_i < num_gpus; device_i++) { - std::thread t([&, device_i]() { - auto& gpu_instance = ggnn_gpu_instances.at(device_i); - const int gpu_id = gpu_instance.gpu_id; - - for (int i = 0; i < num_iterations; i++) { - const int part_id = device_i * num_iterations + i; - gpu_instance.loadPartAsync(graph_dir, part_id, i); - } - for (int i = 0; i < num_iterations; i++) - gpu_instance.waitForDiskIO(i); - - VLOG(0) << "[GPU: " << gpu_id << "] load() done."; - }); - threads.push_back(std::move(t)); - } - - for (auto&& t : threads) { - t.join(); - } - } - - void query(const float tau_query) { - CHECK(!ggnn_gpu_instances.empty()) << "configure() the 
multi-GPU setup first!"; - - dataset.template checkForDuplicatesInGroundTruth(KQuery); - - const int num_gpus = int(ggnn_gpu_instances.size()); - const int N_shard = ggnn_gpu_instances[0].N_shard; - const int num_iterations = int(num_parts/ggnn_gpu_instances.size()); - - VLOG(0) << "GGNN::query()" - << " | tau_query: " << tau_query - << " | num_gpus: " << num_gpus - << " | N_shard: " << N_shard - << " | num_iterations: " << num_iterations; - - GGNNResults ggnn_results{&dataset, num_gpus, num_iterations}; - - std::vector threads; - threads.reserve(num_gpus); - - for (int device_i = 0; device_i < num_gpus; device_i++) { - std::thread t([&, device_i]() { - auto& gpu_instance = ggnn_gpu_instances.at(device_i); - const int gpu_id = gpu_instance.gpu_id; - const int num_gpu_buffers = gpu_instance.ggnn_shards.size(); - const int num_cpu_buffers = gpu_instance.ggnn_cpu_buffers.size(); - const int prefetch_amount = min(num_cpu_buffers, num_gpu_buffers); - CHECK_CUDA(cudaSetDevice(gpu_id)); - - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - float milliseconds = 0; - - cudaMemcpyToSymbol(c_tau_query, &tau_query, sizeof(float)); - - CHECK_CUDA(cudaPeekAtLastError()); - CHECK_CUDA(cudaDeviceSynchronize()); - CHECK_CUDA(cudaPeekAtLastError()); - - if (swap_to_disk || swap_to_ram) { - // initially, prefetch for the entire gpu - for (int i = 0; i < num_gpu_buffers; i++) { - const int j = process_shards_back_to_front ? num_iterations-i-1 : i; - const int part_id = device_i * num_iterations + j; - gpu_instance.loadPartAsync(graph_dir, part_id, j); - } - } - - // TODO: warmup (here or in another function?) - - for (int i = 0; i < num_iterations; i++) - { - const int j = process_shards_back_to_front ? num_iterations-i-1 : i; - const int part_id = device_i * num_iterations + j; - - auto& shard = gpu_instance.ggnn_shards.at(j%gpu_instance.ggnn_shards.size()); - - if (swap_to_disk || swap_to_ram) { - auto begin = std::chrono::high_resolution_clock::now(); - gpu_instance.waitForDiskIO(j); - auto end = std::chrono::high_resolution_clock::now(); - auto cpu_us = std::chrono::duration_cast(end - begin); - VLOG(0) << "[GPU: " << gpu_id << "] shard-swap delay: " << cpu_us.count()*0.001f << " ms."; - } - - cudaStreamSynchronize(shard.stream); - - cudaEventRecord(start, shard.stream); - gpu_instance.template queryLayer<32, 400, 448, 64>(j); - cudaEventRecord(stop, shard.stream); - - if (swap_to_disk || swap_to_ram) { - // start the upload for the next shard after starting the current query - // then, it should be able to overlap - // prefetch only as much in parallel as there are cpu buffers - if (process_shards_back_to_front) { - if (j-prefetch_amount < num_iterations-num_gpu_buffers && j-prefetch_amount >= 0) { - gpu_instance.loadPartAsync(graph_dir, part_id-prefetch_amount, j-prefetch_amount); - } - } - else if (j+prefetch_amount >= num_gpu_buffers && j+prefetch_amount < num_iterations) { - gpu_instance.loadPartAsync(graph_dir, part_id+prefetch_amount, j+prefetch_amount); - } - } - - cudaEventSynchronize(stop); - - cudaEventElapsedTime(&milliseconds, start, stop); - VLOG(0) << "[GPU: " << gpu_id << "] query part: " << part_id << " => ms: " << milliseconds << " [" << dataset.N_query << " points query -> " << milliseconds*1000.0f/dataset.N_query << " us/point] \n"; - } - - const cudaStream_t shard0Stream = gpu_instance.ggnn_shards.at(0).stream; - - cudaEventRecord(start, shard0Stream); - gpu_instance.ggnn_query.sortAsync(shard0Stream); - cudaEventRecord(stop, shard0Stream); - 
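      // (Added note: same timing idiom as the per-shard queries above. Both
      // events are recorded on shard 0's stream, so they bracket exactly the
      // sortAsync() call, and the elapsed time is only valid once the
      // cudaEventSynchronize(stop) below has returned.)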
cudaEventSynchronize(stop); - - cudaEventElapsedTime(&milliseconds, start, stop); - if(num_iterations > 1) { - VLOG(0) << "[GPU: " << device_i << "] query sort: " << " => ms: " << milliseconds << " [" << dataset.N_query << " points query -> " << milliseconds*1000.0f/dataset.N_query << " us/point] \n"; - } - - ggnn_results.loadAsync(gpu_instance.ggnn_query, device_i, shard0Stream); - cudaStreamSynchronize(shard0Stream); - - cudaEventDestroy(start); - cudaEventDestroy(stop); - - CHECK_CUDA(cudaPeekAtLastError()); - CHECK_CUDA(cudaDeviceSynchronize()); - CHECK_CUDA(cudaPeekAtLastError()); - - VLOG(0) << "[GPU: " << gpu_id << "] query() done."; - }); - threads.push_back(std::move(t)); - } - - for (auto&& t : threads) { - t.join(); - } - - // CPU Zone: - ggnn_results.merge(); - ggnn_results.evaluateResults(); - - // process the shards in reverse order during the next query for improved cache utilization - process_shards_back_to_front = !process_shards_back_to_front; - } -}; // GGNN - -#endif // INCLUDE_GGNN_CUDA_KNN_GGNN_MULTI_GPU_CUH_ diff --git a/include/ggnn/cuda_utils/check.cuh b/include/ggnn/cuda_utils/check.cuh new file mode 100644 index 0000000..43bcdf6 --- /dev/null +++ b/include/ggnn/cuda_utils/check.cuh @@ -0,0 +1,35 @@ +/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch +// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf + +#ifndef INCLUDE_GGNN_CHECK_CUH +#define INCLUDE_GGNN_CHECK_CUH + +#include + +#include + +namespace ggnn { + +#define CHECK_CUDA(instruction) \ + { \ + const cudaError res = instruction; \ + CHECK_EQ(res, cudaSuccess) << #instruction << " " << cudaGetErrorString(res); \ + } + +}; // namespace ggnn + +#endif // INCLUDE_GGNN_CHECK_CUH diff --git a/include/ggnn/cuda_utils/distance.cuh b/include/ggnn/cuda_utils/distance.cuh new file mode 100644 index 0000000..3a44350 --- /dev/null +++ b/include/ggnn/cuda_utils/distance.cuh @@ -0,0 +1,168 @@ +/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
Lensch +// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf + +#ifndef INCLUDE_GGNN_DISTANCE_CUH +#define INCLUDE_GGNN_DISTANCE_CUH + +#include + +#include +#include + +#include + +namespace ggnn { + +/** + * Distance calculates the distance/difference between the base vector and + * other_id vector. + */ +template +struct Distance { + const uint32_t D; + const DistanceMeasure measure; + + // only valid in thread 0, only needed if measure == Cosine + ValueT r_query_norm; + + using AddrT = size_t; + + struct DistanceAndNorm { + ValueT dist{0.0f}; + ValueT norm{0.0f}; + + struct Sum { + __host__ __device__ __forceinline__ DistanceAndNorm operator()(const DistanceAndNorm& a, + const DistanceAndNorm& b) const + { + return {a.dist + b.dist, a.norm + b.norm}; + } + }; + }; + + using BlockReduceDist = cub::BlockReduce; + using BlockReduceDistAndNorm = cub::BlockReduce; + + union TempStorage { + typename BlockReduceDist::TempStorage dist_temp_storage; + typename BlockReduceDistAndNorm::TempStorage dist_and_norm_temp_storage; + }; + + const BaseT* d_base; + BaseT r_query[DIST_ITEMS_PER_THREAD]; + + TempStorage& s_temp_storage; + __device__ __forceinline__ TempStorage& PrivateTmpStorage() + { + __shared__ TempStorage s_tmp; + return s_tmp; + } + ValueT& s_dist; + __device__ __forceinline__ ValueT& DistTmpStorage() + { + __shared__ ValueT s_dist; + return s_dist; + } + + __device__ __forceinline__ Distance(const uint32_t D, DistanceMeasure measure, + const BaseT* d_base, const BaseT* d_query, const KeyT n) + : D(D), + measure(measure), + d_base(d_base), + s_temp_storage(PrivateTmpStorage()), + s_dist(DistTmpStorage()) + { + loadQueryPos(d_query + static_cast(n) * D); + } + + __device__ __forceinline__ Distance(const uint32_t D, DistanceMeasure measure, + const BaseT* d_base, const KeyT n) + : D(D), + measure(measure), + d_base(d_base), + s_temp_storage(PrivateTmpStorage()), + s_dist(DistTmpStorage()) + { + loadQueryPos(d_base + static_cast(n) * D); + } + + __device__ __forceinline__ void loadQueryPos(const BaseT* d_query) + { + ValueT query_norm = 0.0f; + for (uint32_t item = 0; item < DIST_ITEMS_PER_THREAD; ++item) { + const uint32_t read_dim = item * BLOCK_DIM_X + threadIdx.x; + r_query[item] = (read_dim < D) ? d_query[read_dim] : 0; + if (measure == DistanceMeasure::Cosine) + query_norm += static_cast(r_query[item]) * static_cast(r_query[item]); + } + if (measure == DistanceMeasure::Cosine) { + // only needed by thread 0 + r_query_norm = BlockReduceDist(s_temp_storage.dist_temp_storage).Sum(query_norm); + } + } + + __device__ __forceinline__ ValueT distance_synced(const KeyT other_id) + { + BaseT r_other[DIST_ITEMS_PER_THREAD]; + + for (uint32_t item = 0; item < DIST_ITEMS_PER_THREAD; ++item) { + const uint32_t read_dim = item * BLOCK_DIM_X + threadIdx.x; + r_other[item] = (read_dim < D) ? d_base[static_cast(other_id) * D + read_dim] : 0; + } + ValueT dist{0.0f}; + if (measure == DistanceMeasure::Euclidean) { + for (uint32_t item = 0; item < DIST_ITEMS_PER_THREAD; ++item) { + const uint32_t read_dim = item * BLOCK_DIM_X + threadIdx.x; + const ValueT diff = + (read_dim < D) ? 
static_cast(r_other[item]) - static_cast(r_query[item]) + : 0; + dist += diff * diff; + } + dist = BlockReduceDist(s_temp_storage.dist_temp_storage).Sum(dist); + if (!threadIdx.x) + s_dist = dist; + } + if (measure == DistanceMeasure::Cosine) { + DistanceAndNorm dist_and_norm{0.0f, 0.0f}; + for (uint32_t item = 0; item < DIST_ITEMS_PER_THREAD; ++item) { + const uint32_t read_dim = item * BLOCK_DIM_X + threadIdx.x; + dist_and_norm.dist += + (read_dim < D) ? static_cast(r_other[item]) * static_cast(r_query[item]) + : 0; + dist_and_norm.norm += + (read_dim < D) ? static_cast(r_other[item]) * static_cast(r_other[item]) + : 0; + } + dist_and_norm = BlockReduceDistAndNorm(s_temp_storage.dist_and_norm_temp_storage) + .Reduce(dist_and_norm, DistanceAndNorm::Sum()); + if (!threadIdx.x) { + // need to normalize by the vectors' lengths (in high dimensions, no vector has length 1.0f) + const ValueT norm_sqr = r_query_norm * dist_and_norm.norm; + // use negative dot product, as larger values are closer to each other + s_dist = (norm_sqr > 0.0f) ? fabs(1.0f - dist_and_norm.dist / sqrtf(norm_sqr)) : 1.0f; + } + } + __syncthreads(); + + return s_dist; + } +}; + +}; // namespace ggnn + +#endif // INCLUDE_GGNN_DISTANCE_CUH diff --git a/include/ggnn/cuda_utils/k_best_list.cuh b/include/ggnn/cuda_utils/k_best_list.cuh new file mode 100644 index 0000000..abed733 --- /dev/null +++ b/include/ggnn/cuda_utils/k_best_list.cuh @@ -0,0 +1,146 @@ +/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Authors: Fabian Groh, Patrick Wieschollek, Hendrik P.A. Lensch +// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf + +#ifndef INCLUDE_GGNN_K_BEST_LIST_CUH +#define INCLUDE_GGNN_K_BEST_LIST_CUH + +#include +#include + +namespace ggnn { + +/** + * KBestList stores the K best elements in parallel. + */ +template +struct KBestList { + const uint32_t BEST_SIZE; + + ValueT* s_dists; + KeyT* s_ids; + + static constexpr KeyT EMPTY_KEY = -1; + + __device__ __forceinline__ void initSharedStorage(uint32_t BEST_SIZE) + { + extern __shared__ ValueT shared_kBestList[]; + s_dists = shared_kBestList; + s_ids = reinterpret_cast(&s_dists[BEST_SIZE]); + } + + __device__ __forceinline__ void init() + { + for (uint32_t i = 0; i < BEST_SIZE; i += BLOCK_DIM_X) { + const uint32_t k = i + threadIdx.x; + if (k < BEST_SIZE) { + s_dists[k] = std::numeric_limits::infinity(); + s_ids[k] = EMPTY_KEY; + } + } + __syncthreads(); + } + + __device__ __forceinline__ KBestList(uint32_t BEST_SIZE) : BEST_SIZE(BEST_SIZE) + { + initSharedStorage(BEST_SIZE); + init(); + } + + __device__ __forceinline__ ValueT worst() + { + return s_dists[BEST_SIZE - 1]; + } + + /** + * Enters element with dist and id to list. [parallel call]: + * On same distances the entry is placed to the left. + * + * `list.add_unique(dist, id)` + * + * Note: __syncthreads() need before next 'list' call. 
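 *
 * Usage sketch (illustrative; assumes template parameters
 * <ValueT, KeyT, BLOCK_DIM_X> and a kernel launch with enough dynamic shared
 * memory for BEST_SIZE distances plus BEST_SIZE ids):
 *
 *   KBestList<float, int32_t, 128> best(BEST_SIZE);
 *   best.add_unique(dist, id);  // all threads of the block participate
 *   __syncthreads();            // required before the next list call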
+ * + */ + __device__ __forceinline__ void add_unique(ValueT dist, KeyT id) + { + // process blocks from right to left (we shift to the right) + for (uint32_t i = ((BEST_SIZE - 1) / BLOCK_DIM_X) * BLOCK_DIM_X;; i -= BLOCK_DIM_X) { + const uint32_t k = i + threadIdx.x; + ValueT r_dist; + KeyT r_id; + // read current value + if (k < BEST_SIZE) { + r_dist = s_dists[k]; + r_id = s_ids[k]; + } + __syncthreads(); + if (k < BEST_SIZE) { + // shift and enter new point if new distance is smalller + if (dist < r_dist) { + // shift current value to next position + if (k < (BEST_SIZE - 1)) { + s_dists[k + 1] = r_dist; + s_ids[k + 1] = r_id; + } + + // enter new point if left index is smaller and will not shift into my position + if (!k || s_dists[k - 1] <= dist) { + s_dists[k] = dist; + s_ids[k] = id; + } + } + } + if (!i) + break; + } + } + + /** + * Transforms all ids w.r.t. a transformation list. [parallel call]: + * + * `list.transform(transform_list)` + * + * Note: __syncthreads() need before next 'list' call. + * + */ + __device__ __forceinline__ void transform(const KeyT* transform) + { + for (int i = 0; i < BEST_SIZE; i += BLOCK_DIM_X) { + const uint32_t k = i + threadIdx.x; + if (k < BEST_SIZE) { + const KeyT id = s_ids[k]; + if (id != EMPTY_KEY) + s_ids[k] = transform[id]; + } + } + } + + __device__ __forceinline__ void print(int len = -1) + { + __syncthreads(); + if (!threadIdx.x) { + printf("KBestList: \n"); + for (int i = 0; i < BEST_SIZE && (len < 0 || i < len); i++) { + printf("(%d -> %f [%d]) ", i, s_dists[i], s_ids[i]); + } + printf("\n"); + } + } +}; + +}; // namespace ggnn + +#endif // INCLUDE_GGNN_K_BEST_LIST_CUH diff --git a/include/ggnn/cuda_utils/simple_knn_cache.cuh b/include/ggnn/cuda_utils/simple_knn_cache.cuh new file mode 100644 index 0000000..2678925 --- /dev/null +++ b/include/ggnn/cuda_utils/simple_knn_cache.cuh @@ -0,0 +1,403 @@ +/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
Lensch +// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf + +#ifndef INCLUDE_GGNN_SIMPLE_KNN_CACHE_CUH +#define INCLUDE_GGNN_SIMPLE_KNN_CACHE_CUH + +#include +#include + +#include +#include +#include +#include + +namespace ggnn { + +template +struct SimpleKNNCache { + static constexpr KeyT EMPTY_KEY = static_cast(-1); + static constexpr ValueT EMPTY_DIST = std::numeric_limits::infinity(); + + private: + using Distance = ggnn::Distance; + + public: + const uint32_t BEST_SIZE; + const uint32_t SORTED_SIZE; + const uint32_t CACHE_SIZE; + + KeyT* s_cache; + ValueT* s_dists; + uint32_t r_prioQ_head; + uint32_t r0_visited_head; + + bool& s_sync; + Distance rs_dist_calc; + + ValueT r_xi; + + // # threadIdx.x == 0 stats registers only + uint32_t dist_calc_counter; + + __device__ __forceinline__ void initSharedStorage() + { + extern __shared__ KeyT shared_cache[]; + + s_cache = shared_cache; + s_dists = reinterpret_cast( + &s_cache[CACHE_SIZE]); // cacheSize = numIds in the cache, after that the dists start + } + + __device__ __forceinline__ bool& SyncPrivateTmpStorage() + { + __shared__ bool s_sync_tmp; + return s_sync_tmp; + } + + __device__ __forceinline__ void init() + { + for (uint32_t i = threadIdx.x; i < CACHE_SIZE; i += BLOCK_DIM_X) { + s_cache[i] = EMPTY_KEY; + if (i < SORTED_SIZE) + s_dists[i] = EMPTY_DIST; + } + r_prioQ_head = BEST_SIZE; + if (!threadIdx.x) { + if constexpr (DIST_STATS) + dist_calc_counter = 0; + r0_visited_head = SORTED_SIZE; + } + __syncthreads(); + } + + __device__ __forceinline__ SimpleKNNCache(const uint32_t D, const DistanceMeasure measure, + const uint32_t BEST_SIZE, const uint32_t SORTED_SIZE, + const uint32_t CACHE_SIZE, const BaseT* d_base, + const KeyT n, const ValueT xi_criteria) + : BEST_SIZE{BEST_SIZE}, + SORTED_SIZE{SORTED_SIZE}, + CACHE_SIZE(CACHE_SIZE), + s_sync(SyncPrivateTmpStorage()), + rs_dist_calc(D, measure, d_base, n), + r_xi(xi_criteria) + { + initSharedStorage(); + init(); + } + + // dieses hier: + __device__ __forceinline__ SimpleKNNCache(const uint32_t D, const DistanceMeasure measure, + const uint32_t BEST_SIZE, const uint32_t SORTED_SIZE, + const uint32_t CACHE_SIZE, const BaseT* d_base, + const BaseT* d_query, const KeyT n, + const ValueT xi_criteria) + : BEST_SIZE{BEST_SIZE}, + SORTED_SIZE{SORTED_SIZE}, + CACHE_SIZE(CACHE_SIZE), + s_sync(SyncPrivateTmpStorage()), + rs_dist_calc(D, measure, d_base, d_query, n), + r_xi(xi_criteria) + { + initSharedStorage(); + init(); + } + + __device__ __forceinline__ ValueT criteria() const + { + return s_dists[BEST_SIZE - 1] + r_xi; + } + + __device__ __forceinline__ void push(const KeyT key, const ValueT dist) + { + __syncthreads(); + // Register for insertion in best and prioq + + // check for duplicates + { + if (!threadIdx.x) + s_sync = false; + + __syncthreads(); + + for (uint32_t idx = threadIdx.x; idx < SORTED_SIZE && !s_sync; idx += BLOCK_DIM_X) { + if (s_cache[idx] == key) + s_sync = true; + } + + __syncthreads(); + if (s_sync) + return; + } + + const uint32_t head_idx_prioQ = r_prioQ_head; + const uint32_t head_idx_in_prioQ = head_idx_prioQ - BEST_SIZE; + + // process blocks from right to left (we shift to the right) + { + KeyT r_cache; + ValueT r_dists; + + uint32_t idx; + bool active = false; + + // start with the last block + uint32_t block_start = ((SORTED_SIZE + BLOCK_DIM_X - 1) / BLOCK_DIM_X) * BLOCK_DIM_X; + + while (true) { + // shift + if (active) { + // Don't move if no entry or end of best or prioq. 
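        // Layout reminder (added commentary): s_cache is split into three regions:
        //   [0, BEST_SIZE)             sorted best list
        //   [BEST_SIZE, SORTED_SIZE)   priority queue, a ring buffer starting at r_prioQ_head
        //   [SORTED_SIZE, CACHE_SIZE)  visited list (keys only, no distances)
        // The shift below moves entries one slot to the right within their region.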
+ if (r_cache != EMPTY_KEY) { + const uint32_t idx_next = (idx + 1 == SORTED_SIZE) ? BEST_SIZE : idx + 1; + const bool has_next = idx_next != BEST_SIZE && idx_next != head_idx_prioQ; + if (has_next) { + s_cache[idx_next] = r_cache; + s_dists[idx_next] = r_dists; + } + } + + // Find insert points. + const bool has_prev = idx != 0 && idx != head_idx_prioQ; + const uint32_t idx_prev = idx != BEST_SIZE ? idx - 1 : SORTED_SIZE - 1; + if (!has_prev || s_dists[idx_prev] < dist) { + // insert into best list and priority queue + s_cache[idx] = key; + s_dists[idx] = dist; + } + } + + if (!block_start) + break; + + // update index + block_start -= BLOCK_DIM_X; + idx = block_start + threadIdx.x; + active = idx < SORTED_SIZE; + + // read + if (active) { + // handle ringbuffer addresses + // TODO: reorder between threads to fix bank conflicts + if (idx >= BEST_SIZE) { + idx = (idx + head_idx_in_prioQ < SORTED_SIZE) + ? idx + head_idx_in_prioQ + : idx + head_idx_in_prioQ - SORTED_SIZE + BEST_SIZE; + } + + r_cache = s_cache[idx]; + r_dists = s_dists[idx]; + + // shift all elements with larger/equal distance to the right + active &= r_dists >= dist; + } + + __syncthreads(); + } + } + } + + __device__ __forceinline__ KeyT pop() + { + __syncthreads(); + const uint32_t head_idx_prioQ = r_prioQ_head; + const KeyT key = s_cache[head_idx_prioQ]; + // Pop on empty prioQ. + const ValueT dist = s_dists[head_idx_prioQ]; + __syncthreads(); + if (key == EMPTY_KEY || dist >= criteria()) + return EMPTY_KEY; + + if (!threadIdx.x) { + // update visited list + const uint32_t head_idx_visited = r0_visited_head; + s_cache[head_idx_visited] = key; + r0_visited_head = (head_idx_visited + 1) >= CACHE_SIZE ? SORTED_SIZE : head_idx_visited + 1; + // remove from prioQ + s_cache[head_idx_prioQ] = EMPTY_KEY; + s_dists[head_idx_prioQ] = EMPTY_DIST; + } + // Move ring-buffer head forward. + r_prioQ_head = (head_idx_prioQ + 1) >= SORTED_SIZE ? BEST_SIZE : head_idx_prioQ + 1; + __syncthreads(); + return key; + } + + template + __device__ __forceinline__ void fetch( + std::conditional_t* s_keys, const KeyT* d_translation, + uint32_t len) + { + if constexpr (filter_known_keys) { + __syncthreads(); + // filter known indices in the cache + for (uint32_t i = threadIdx.x; i < CACHE_SIZE; i += BLOCK_DIM_X) { + const KeyT n = s_cache[i]; + if (n == EMPTY_KEY) { + if (i >= SORTED_SIZE) + break; + continue; + } + for (uint32_t k = 0; k < len; ++k) { + if (s_keys[k] == n) + s_keys[k] = EMPTY_KEY; + } + } + } + + __syncthreads(); + + KeyT n_cache; + uint32_t mask = 0; + + for (uint32_t k = 0; k < len || mask;) { + if (!mask) { + uint32_t idx = k + (threadIdx.x % 32); // assuming block dim x is a multiple of 32 + n_cache = idx < len ? s_keys[idx] : EMPTY_KEY; + mask = __ballot_sync(0xffffffff, n_cache != EMPTY_KEY); + k += 32; + continue; + } + + const uint32_t first = __ffs(mask) - 1; + mask ^= 1 << first; + const KeyT other_n = __shfl_sync(0xffffffff, n_cache, first); + + const KeyT other_m = (d_translation) ? 
d_translation[other_n] : other_n; + const ValueT dist = rs_dist_calc.distance_synced(other_m); + + if (dist < criteria()) + push(other_n, dist); + } + + __syncthreads(); + } + + __device__ __forceinline__ void fetch_unfiltered(const KeyT* s_keys, const KeyT* d_translation, + const uint32_t len) + { + fetch(s_keys, d_translation, len); + } + + __device__ __forceinline__ void transform(const KeyT* transform) + { + __syncthreads(); + + for (uint32_t i = threadIdx.x; i < CACHE_SIZE; i += BLOCK_DIM_X) { + if (i < BEST_SIZE) { + // transform best + KeyT key = s_cache[i]; + if (key != EMPTY_KEY) + key = transform[key]; + s_cache[i] = key; + + // copy best into prio queue + if (i + BEST_SIZE < SORTED_SIZE) { + s_cache[i + BEST_SIZE] = key; + s_dists[i + BEST_SIZE] = s_dists[i]; + } + } + else if (i < 2 * BEST_SIZE && i < SORTED_SIZE) { + // do nothing (handled by previous threads) + } + else { + // reset remainder of the prio queue and visited cache + s_cache[i] = EMPTY_KEY; + if (i < SORTED_SIZE) + s_dists[i] = EMPTY_DIST; + } + } + + // reset heads. + r_prioQ_head = BEST_SIZE; + if (!threadIdx.x) { + r0_visited_head = SORTED_SIZE; + } + + __syncthreads(); + } + + __device__ __forceinline__ void write_best(KeyT* d_buffer, const KeyT n, uint32_t stride) + { +#pragma unroll + for (uint32_t i = threadIdx.x; i < BEST_SIZE; i += BLOCK_DIM_X) { + const KeyT idx = s_cache[i]; + d_buffer[static_cast(n) * stride + i] = idx; + } + } + + __device__ __forceinline__ void write_best(KeyT* d_buffer, const KeyT n, uint32_t stride, + uint32_t idx_offset) + { +#pragma unroll + for (uint32_t i = threadIdx.x; i < BEST_SIZE; i += BLOCK_DIM_X) { + const KeyT idx = s_cache[i]; + d_buffer[static_cast(n) * stride + i] = idx + idx_offset; + } + } + + __device__ __forceinline__ uint32_t get_dist_stats() + { + return dist_calc_counter; + } + + /** + * Prints first 'len' elements in the Cache. [parallel call]: + * cash.print(8); + * + */ + __device__ __forceinline__ void print(uint32_t len = -1U) + { + if (len == -1U) + len = CACHE_SIZE; + __syncthreads(); + if (!threadIdx.x) + printf("print \n"); + if (!threadIdx.x) { + printf("Cache: ring: %d BEST_SIZE: %f (+xi -> %f) \n", r_prioQ_head, s_dists[BEST_SIZE - 1], + s_dists[BEST_SIZE - 1] + r_xi); + for (uint32_t i = 0; i < len; ++i) { + if (i < BEST_SIZE) { + printf("%d -> %d %f \n", i, s_cache[i], s_dists[i]); + } + else { + if (i < SORTED_SIZE) { + printf("%d -> %d %f | ", i, s_cache[i], s_dists[i]); + if (i == r_prioQ_head) + printf("X"); + printf("\n"); + } + else { + printf("%d -> %d | ", i, s_cache[i]); + if (i == r0_visited_head) + printf("X"); + printf("\n"); + } + } + } + } + __syncthreads(); + } +}; + +}; // namespace ggnn + +// for checking for warnings with clangd - will be instantiated implicitly on demand +// template struct SimpleKNNCache; + +#endif // INCLUDE_GGNN_SIMPLE_KNN_CACHE_CUH diff --git a/include/ggnn/cuda_utils/simple_knn_sym_cache.cuh b/include/ggnn/cuda_utils/simple_knn_sym_cache.cuh new file mode 100644 index 0000000..3a0e5a2 --- /dev/null +++ b/include/ggnn/cuda_utils/simple_knn_sym_cache.cuh @@ -0,0 +1,492 @@ +/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch +// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf + +#ifndef INCLUDE_GGNN_SIMPLE_KNN_SYM_CACHE_CUH +#define INCLUDE_GGNN_SIMPLE_KNN_SYM_CACHE_CUH + +#include +#include + +#include +#include +#include + +#include + +namespace ggnn { + +template +struct SimpleKNNSymCache { + using AddrT = size_t; + + static constexpr KeyT EMPTY_KEY = static_cast(-1); + static constexpr ValueT EMPTY_DIST = std::numeric_limits::infinity(); + static constexpr float EPS = 0.1f; + + private: + const uint32_t D; + const DistanceMeasure measure; + + struct DistQueryAndHalf { + ValueT dist_query; + ValueT dist_half; + + struct Sum { + __host__ __device__ __forceinline__ DistQueryAndHalf + operator()(const DistQueryAndHalf& a, const DistQueryAndHalf& b) const + { + return {a.dist_query + b.dist_query, a.dist_half + b.dist_half}; + } + }; + }; + + typedef cub::BlockReduce DistReduce; + typedef cub::BlockReduce DistQueryAndHalfReduce; + + struct CacheTempStorage { + typename DistReduce::TempStorage dist_reduce; + typename DistQueryAndHalfReduce::TempStorage dist_query_half_reduce; + }; + + public: + const uint32_t BEST_SIZE; + const uint32_t SORTED_SIZE; + const uint32_t CACHE_SIZE; + + KeyT* s_cache; + ValueT* s_dists; + uint32_t r_prioQ_head; + uint32_t r0_visited_head; + + CacheTempStorage& s_storage; + bool& s_sync; + DistQueryAndHalf& s_dist; + + ValueT r_criteria_half; + ValueT r_xi; + + const BaseT* d_base; + BaseT r_query[DIST_ITEMS_PER_THREAD]; + ValueT r_half[DIST_ITEMS_PER_THREAD]; + + // only valid in thread 0 + ValueT r0_query_norm; + ValueT r0_half_norm; + + // # threadIdx.x == 0 stats registers only + uint32_t dist_calc_counter; + + __device__ __forceinline__ void initSharedStorage() + { + extern __shared__ KeyT shared_cache[]; + + s_cache = shared_cache; + s_dists = reinterpret_cast(&s_cache[CACHE_SIZE]); + } + + __device__ __forceinline__ CacheTempStorage& CachePrivateTmpStorage() + { + __shared__ CacheTempStorage cache_tmp_storage; + return cache_tmp_storage; + } + + __device__ __forceinline__ bool& SyncPrivateTmpStorage() + { + __shared__ bool s_sync_tmp; + return s_sync_tmp; + } + + __device__ __forceinline__ DistQueryAndHalf& DistTmpStorage() + { + __shared__ DistQueryAndHalf s_dist; + return s_dist; + } + + __device__ __forceinline__ SimpleKNNSymCache(const uint32_t D, const DistanceMeasure measure, + const uint32_t BEST_SIZE, const uint32_t SORTED_SIZE, + const uint32_t CACHE_SIZE, const BaseT* d_base, + const KeyT n, const ValueT xi_criteria) + : D(D), + measure(measure), + BEST_SIZE{BEST_SIZE}, + SORTED_SIZE{SORTED_SIZE}, + CACHE_SIZE{CACHE_SIZE}, + s_storage(CachePrivateTmpStorage()), + d_base(d_base), + r_xi(xi_criteria), + s_sync(SyncPrivateTmpStorage()), + s_dist(DistTmpStorage()) + { + initSharedStorage(); + // init(); // will be initialized later with init_start_point + if constexpr (DIST_STATS) + if (!threadIdx.x) + dist_calc_counter = 0; + loadQueryPos(d_base + static_cast(n) * D); + } + + __device__ __forceinline__ 
void loadQueryPos(const BaseT* d_query) + { + ValueT query_norm = 0.0f; + for (uint32_t item = 0; item < DIST_ITEMS_PER_THREAD; ++item) { + const uint32_t read_dim = item * BLOCK_DIM_X + threadIdx.x; + r_query[item] = (read_dim < D) ? d_query[read_dim] : 0; + if (measure == DistanceMeasure::Cosine) + query_norm += static_cast(r_query[item]) * static_cast(r_query[item]); + } + if (measure == DistanceMeasure::Cosine) { + // only needed by thread 0 + r0_query_norm = DistReduce(s_storage.dist_reduce).Sum(query_norm); + __syncthreads(); + } + } + + __device__ __forceinline__ void init_start_point(const KeyT other_n, const KeyT* d_translation) + { + const KeyT other_m = (d_translation == nullptr) ? other_n : d_translation[other_n]; + DistQueryAndHalf norms{0.0f, 0.0f}; + for (uint32_t item = 0; item < DIST_ITEMS_PER_THREAD; ++item) { + const uint32_t read_dim = item * BLOCK_DIM_X + threadIdx.x; + r_half[item] = (read_dim < D) ? d_base[static_cast(other_m) * D + read_dim] : 0; + } + for (uint32_t item = 0; item < DIST_ITEMS_PER_THREAD; ++item) { + const uint32_t read_dim = item * BLOCK_DIM_X + threadIdx.x; + if (read_dim < D) { + r_half[item] = static_cast(r_query[item]) + + (0.5f - EPS) * (static_cast(r_half[item]) - r_query[item]); + if (measure == DistanceMeasure::Cosine) { + norms.dist_query += static_cast(r_query[item]) * r_query[item]; + norms.dist_half += r_half[item] * r_half[item]; + } + } + } + __syncthreads(); + if (measure == DistanceMeasure::Cosine) { + DistQueryAndHalf norms_sum = DistQueryAndHalfReduce(s_storage.dist_query_half_reduce) + .Reduce(norms, DistQueryAndHalf::Sum()); + if (!threadIdx.x) { + r0_query_norm = norms_sum.dist_query; + r0_half_norm = norms_sum.dist_half; + } + __syncthreads(); + } + const DistQueryAndHalf dists = distance_synced(other_m); + r_criteria_half = dists.dist_half + r_xi; + + // clear cache and add start point to best list and prioQ + for (uint32_t i = threadIdx.x; i < CACHE_SIZE; i += BLOCK_DIM_X) { + s_cache[i] = (i == 0 || i == BEST_SIZE) ? other_n : EMPTY_KEY; + if (i < SORTED_SIZE) + s_dists[i] = (i == 0 || i == BEST_SIZE) ? dists.dist_query : EMPTY_DIST; + } + r_prioQ_head = BEST_SIZE; + if (!threadIdx.x) + r0_visited_head = SORTED_SIZE; + __syncthreads(); + } + + /** + * Calculates synced distance of base vector to other_id vector. + * + * [parallel call]: + * ValueT dist = cache.distance(other_id) + * + * Return: + * ValueT distance + * + * Note: distance valid in all threads. + */ + __device__ __forceinline__ DistQueryAndHalf distance_synced(const KeyT other_id) + { + BaseT r_other[DIST_ITEMS_PER_THREAD]; + + for (uint32_t item = 0; item < DIST_ITEMS_PER_THREAD; ++item) { + const uint32_t read_dim = item * BLOCK_DIM_X + threadIdx.x; + r_other[item] = (read_dim < D) ? 
d_base[static_cast(other_id) * D + read_dim] : 0; + } + + DistQueryAndHalf dist{0.f, 0.f}; + ValueT norm_other = 0.0f; + if (measure == DistanceMeasure::Euclidean) { + for (uint32_t item = 0; item < DIST_ITEMS_PER_THREAD; ++item) { + const uint32_t read_dim = item * BLOCK_DIM_X + threadIdx.x; + if (read_dim < D) { + const ValueT dist_query = + static_cast(r_query[item]) - static_cast(r_other[item]); + dist.dist_query += dist_query * dist_query; + const ValueT dist_half = + static_cast(r_half[item]) - static_cast(r_other[item]); + dist.dist_half += dist_half * dist_half; + } + } + } + else if (measure == DistanceMeasure::Cosine) { + for (uint32_t item = 0; item < DIST_ITEMS_PER_THREAD; ++item) { + const uint32_t read_dim = item * BLOCK_DIM_X + threadIdx.x; + if (read_dim < D) { + const ValueT dist_query = + static_cast(r_query[item]) * static_cast(r_other[item]); + dist.dist_query += dist_query; + const ValueT dist_half = + static_cast(r_half[item]) * static_cast(r_other[item]); + dist.dist_half += dist_half; + norm_other += static_cast(r_other[item]) * static_cast(r_other[item]); + } + } + } + + dist = DistQueryAndHalfReduce(s_storage.dist_query_half_reduce) + .Reduce(dist, DistQueryAndHalf::Sum()); + if (measure == DistanceMeasure::Cosine) { + __syncthreads(); + // need to normalize by the vectors' lengths (in high dimensions, no + // vector has length 1.0f) + norm_other = DistReduce(s_storage.dist_reduce).Sum(norm_other); + if (!threadIdx.x) { + const ValueT query_norm_sqr = norm_other * r0_query_norm; + const ValueT half_norm_sqr = norm_other * r0_half_norm; + // use negative dot product, as larger values are closer to each other + // otherwise, we would need to adjust each and every distance comparison + // in the code + dist.dist_query = + (query_norm_sqr > 0.0f) ? fabs(1.0f - dist.dist_query / sqrtf(query_norm_sqr)) : 1.0f; + // while this could be computed in parallel to the query distance, + // the necessary shuffling and synchronization costs more. + dist.dist_half = + (half_norm_sqr > 0.0f) ? fabs(1.0f - dist.dist_half / sqrtf(half_norm_sqr)) : 1.0f; + } + } + + if (!threadIdx.x) { + if constexpr (DIST_STATS) + dist_calc_counter++; + s_dist = dist; + } + __syncthreads(); + + return s_dist; + } + + __device__ __forceinline__ ValueT criteria_sym() + { + return s_dists[0] + r_xi; + } + + __device__ __forceinline__ void push(const KeyT key, const ValueT dist) + { + __syncthreads(); + // Register for insertion in best and prioq + + // check for duplicates + { + if (!threadIdx.x) + s_sync = false; + + __syncthreads(); + + for (uint32_t idx = threadIdx.x; idx < SORTED_SIZE && !s_sync; idx += BLOCK_DIM_X) { + if (s_cache[idx] == key) + s_sync = true; + } + + __syncthreads(); + if (s_sync) + return; + } + + const uint32_t head_idx_prioQ = r_prioQ_head; + const uint32_t head_idx_in_prioQ = head_idx_prioQ - BEST_SIZE; + + // process blocks from right to left (we shift to the right) + { + KeyT r_cache; + ValueT r_dists; + + uint32_t idx; + bool active = false; + + // start with the last block + uint32_t block_start = ((SORTED_SIZE + BLOCK_DIM_X - 1) / BLOCK_DIM_X) * BLOCK_DIM_X; + + while (true) { + // shift + if (active) { + // Don't move if no entry or end of best or prioq. + if (r_cache != EMPTY_KEY) { + const uint32_t idx_next = (idx + 1 == SORTED_SIZE) ? BEST_SIZE : idx + 1; + const bool has_next = idx_next != BEST_SIZE && idx_next != head_idx_prioQ; + if (has_next) { + s_cache[idx_next] = r_cache; + s_dists[idx_next] = r_dists; + } + } + + // Find insert points. 
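          // (Added note: this insertion logic mirrors SimpleKNNCache::push()
          //  one-to-one; only the entry criteria differ. Here, fetch() also
          //  checks dist_half against r_criteria_half before pushing.)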
+ const bool has_prev = idx != 0 && idx != head_idx_prioQ; + const uint32_t idx_prev = idx != BEST_SIZE ? idx - 1 : SORTED_SIZE - 1; + if (!has_prev || s_dists[idx_prev] < dist) { + // insert into best list and priority queue + s_cache[idx] = key; + s_dists[idx] = dist; + } + } + + if (!block_start) + break; + + // update index + block_start -= BLOCK_DIM_X; + idx = block_start + threadIdx.x; + active = idx < SORTED_SIZE; + + // read + if (active) { + // handle ringbuffer addresses + // TODO: reorder between threads to fix bank conflicts + if (idx >= BEST_SIZE) { + idx = (idx + head_idx_in_prioQ < SORTED_SIZE) + ? idx + head_idx_in_prioQ + : idx + head_idx_in_prioQ - SORTED_SIZE + BEST_SIZE; + } + + r_cache = s_cache[idx]; + r_dists = s_dists[idx]; + + // shift all elements with larger/equal distance to the right + active &= r_dists >= dist; + } + + __syncthreads(); + } + } + } + + __device__ __forceinline__ KeyT pop() + { + __syncthreads(); + const uint32_t head_idx_prioQ = r_prioQ_head; + const KeyT key = s_cache[head_idx_prioQ]; + // Pop on empty prioQ. + const ValueT dist = s_dists[head_idx_prioQ]; + __syncthreads(); + if (key == EMPTY_KEY || dist >= criteria_sym()) + return EMPTY_KEY; + + if (!threadIdx.x) { + // update visited list + const uint32_t head_idx_visited = r0_visited_head; + s_cache[head_idx_visited] = key; + r0_visited_head = (head_idx_visited + 1) >= CACHE_SIZE ? SORTED_SIZE : head_idx_visited + 1; + // remove from prioQ + s_cache[head_idx_prioQ] = EMPTY_KEY; + s_dists[head_idx_prioQ] = EMPTY_DIST; + } + // Move ring-buffer head forward. + r_prioQ_head = (head_idx_prioQ + 1) >= SORTED_SIZE ? BEST_SIZE : head_idx_prioQ + 1; + __syncthreads(); + return key; + } + + template + __device__ __forceinline__ void fetch(KeyT* s_keys, const KeyT* d_translation, uint32_t len) + { + if constexpr (filter_known_keys) { + __syncthreads(); + for (uint32_t i = threadIdx.x; i < CACHE_SIZE; i += BLOCK_DIM_X) { + const KeyT n = s_cache[i]; + if (n != EMPTY_KEY) { + for (uint32_t k = 0; k < len; ++k) { + if (s_keys[k] == n) + s_keys[k] = EMPTY_KEY; + } + } + } + } + + __syncthreads(); + + for (uint32_t k = 0; k < len; ++k) { + const KeyT other_n = s_keys[k]; + if (other_n == EMPTY_KEY) + continue; + + const KeyT other_m = (d_translation == nullptr) ? other_n : d_translation[other_n]; + const DistQueryAndHalf dist = distance_synced(other_m); + + if (dist.dist_query < criteria_sym() && dist.dist_half < r_criteria_half) + push(other_n, dist.dist_query); + } + + __syncthreads(); + } + + __device__ __forceinline__ void write_best(KeyT* d_buffer, const KeyT n, uint32_t stride) + { + for (uint32_t i = threadIdx.x; i < BEST_SIZE; i += BLOCK_DIM_X) { + const KeyT idx = s_cache[i]; + d_buffer[n * stride + i] = idx; + } + } + + __device__ __forceinline__ uint32_t get_dist_stats() + { + return dist_calc_counter; + } + + /** + * Prints first 'len' elements in the Cache. 
[parallel call]: + * cash.print(8); + * + */ + __device__ __forceinline__ void print(uint32_t len = -1U) + { + if (len == -1U) + len = CACHE_SIZE; + __syncthreads(); + if (!threadIdx.x) + printf("print \n"); + if (!threadIdx.x) { + printf("Cache: ring: %d BEST_SIZE: %f (+xi -> %f) \n", r_prioQ_head, s_dists[BEST_SIZE - 1], + s_dists[BEST_SIZE - 1] + r_xi); + for (uint32_t i = 0; i < len; ++i) { + if (i < BEST_SIZE) { + printf("%d -> %d %f \n", i, s_cache[i], s_dists[i]); + } + else { + if (i < SORTED_SIZE) { + printf("%d -> %d %f | ", i, s_cache[i], s_dists[i]); + if (i == r_prioQ_head) + printf("X"); + printf("\n"); + } + else { + printf("%d -> %d | ", i, s_cache[i]); + if (i == r0_visited_head) + printf("X"); + printf("\n"); + } + } + } + } + __syncthreads(); + } +}; + +}; // namespace ggnn + +#endif // INCLUDE_GGNN_SIMPLE_KNN_SYM_CACHE_CUH diff --git a/include/ggnn/graph/cuda_knn_ggnn_graph_buffer.cuh b/include/ggnn/graph/cuda_knn_ggnn_graph_buffer.cuh deleted file mode 100644 index 8934351..0000000 --- a/include/ggnn/graph/cuda_knn_ggnn_graph_buffer.cuh +++ /dev/null @@ -1,128 +0,0 @@ -/* Copyright 2021 ComputerGraphics Tuebingen. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
Lensch -// - -#ifndef INCLUDE_GGNN_GRAPH_CUDA_KNN_GGNN_GRAPH_BUFFER_CUH_ -#define INCLUDE_GGNN_GRAPH_CUDA_KNN_GGNN_GRAPH_BUFFER_CUH_ - -#include - -#include -#include -#include - -#include "ggnn/utils/cuda_knn_utils.cuh" - -/** - * GGNN graph buffer data - * auxiliary data needed for graph construction once per GPU - * - * @param KeyT datatype of dataset indices - * @param ValueT distance value type - */ -template -struct GGNNGraphBuffer { - /// distance to nearest known neighbor per point - ValueT* d_nn1_dist_buffer; - - //BUFFER - KeyT* d_graph_buffer; - KeyT* d_sym_buffer; - - float* d_rng; - - int* d_sym_atomic; - int* d_statistics; - - // cub buffer - void* d_temp_storage_sum; - void* d_temp_storage_max; - - size_t temp_storage_bytes_sum{0}; - size_t temp_storage_bytes_max{0}; - - char* d_memory; - - GGNNGraphBuffer(const int N, const int K, const int KF) { - // just to make sure that everything is sufficiently aligned - auto align8 = [](size_t size) -> size_t {return ((size+7)/8)*8;}; - - const size_t graph_buffer_size = align8(static_cast(N) * K * sizeof(KeyT)); - const size_t sym_buffer_size = align8(static_cast(N) * KF * sizeof(KeyT)); - const size_t rng_size = align8(static_cast(N) * sizeof(float)); - const size_t sym_atomic_size = align8(static_cast(N) * sizeof(int)); - const size_t sym_statistics_size = align8(static_cast(N) * sizeof(int)); - const size_t nn1_dist_buffer_size = align8(N * sizeof(ValueT)); - - // stats - { - ValueT* d_nn1_stats_unused, *d_nn1_dist_buffer_unused; - - CHECK_CUDA(cudaMalloc(&d_nn1_stats_unused, nn1_dist_buffer_size+2*sizeof(ValueT))); - d_nn1_dist_buffer_unused = d_nn1_stats_unused+2; - - cub::DeviceReduce::Sum(nullptr, temp_storage_bytes_sum, - d_nn1_dist_buffer_unused, &d_nn1_stats_unused[0], N); - cub::DeviceReduce::Max(nullptr, temp_storage_bytes_max, - d_nn1_dist_buffer_unused, &d_nn1_stats_unused[1], N); - - temp_storage_bytes_sum = align8(temp_storage_bytes_sum); - temp_storage_bytes_sum = align8(temp_storage_bytes_max); - - CHECK_CUDA(cudaFree(d_nn1_stats_unused)); - } - - // const size_t total_size = graph_buffer_size + sym_buffer_size + rng_size + sym_atomic_size + sym_statistics_size + nn1_dist_buffer_size + temp_storage_bytes_sum + temp_storage_bytes_max; - - // this will work as long as the construction code remains as is - const size_t merge_size = nn1_dist_buffer_size + graph_buffer_size; - const size_t select_size = nn1_dist_buffer_size + rng_size; - const size_t stats_size = nn1_dist_buffer_size + temp_storage_bytes_sum + temp_storage_bytes_max; - const size_t sym_size = sym_buffer_size + sym_atomic_size + sym_statistics_size; - - const size_t overlapped_size = max(max(merge_size, select_size), max(stats_size, sym_size)); - - VLOG(2) << "GGNNGraphBuffer(): allocating GPU memory... 
(" - << overlapped_size/(1024.0f*1024.0f*1024.0f) << " GB total).\n"; - - { - size_t free, total; - CHECK_CUDA(cudaMemGetInfo(&free, &total)); - CHECK_GE(free, overlapped_size) << "out of memory."; - } - - CHECK_CUDA(cudaMalloc(&d_memory, overlapped_size)); - - d_nn1_dist_buffer = reinterpret_cast(d_memory); - d_graph_buffer = reinterpret_cast(d_memory + nn1_dist_buffer_size); - d_rng = reinterpret_cast(d_memory + nn1_dist_buffer_size); - d_temp_storage_sum = d_memory + nn1_dist_buffer_size; - d_temp_storage_max = d_memory + nn1_dist_buffer_size + temp_storage_bytes_sum; - d_sym_buffer = reinterpret_cast(d_memory); - d_sym_atomic = reinterpret_cast(d_memory + sym_buffer_size); - d_statistics = reinterpret_cast(d_memory + sym_buffer_size + sym_atomic_size); - - CHECK_CUDA(cudaPeekAtLastError()); - CHECK_CUDA(cudaDeviceSynchronize()); - CHECK_CUDA(cudaPeekAtLastError()); - - - VLOG(2) << "GGNNGraphBuffer(): done.\n"; - } - - ~GGNNGraphBuffer() { - CHECK_CUDA(cudaFree(d_memory)); - } -}; - -#endif // INCLUDE_GGNN_GRAPH_CUDA_KNN_GGNN_GRAPH_BUFFER_CUH_ diff --git a/include/ggnn/graph/cuda_knn_ggnn_graph_device.cuh b/include/ggnn/graph/cuda_knn_ggnn_graph_device.cuh deleted file mode 100644 index c94a8f1..0000000 --- a/include/ggnn/graph/cuda_knn_ggnn_graph_device.cuh +++ /dev/null @@ -1,123 +0,0 @@ -/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch -// - -#ifndef INCLUDE_GGNN_GRAPH_CUDA_KNN_GGNN_GRAPH_DEVICE_CUH_ -#define INCLUDE_GGNN_GRAPH_CUDA_KNN_GGNN_GRAPH_DEVICE_CUH_ - -#include -#include -#include -#include -#include - -#include -#include -#include - -#include - -#include "ggnn/utils/cuda_knn_utils.cuh" -#include "ggnn/utils/cuda_knn_constants.cuh" - -/** - * GGNN graph data (on the GPU) - * - * @param KeyT datatype of dataset indices - * @param BaseT datatype of dataset values - * @param ValueT distance value type - */ -template -struct GGNNGraphDevice { - /// neighborhood vectors - KeyT* d_graph; - /// translation of upper layer points into lowest layer - KeyT* d_translation; - /// translation of upper layer points into one layer below - KeyT* d_selection; - - /// average and maximum distance to nearest known neighbors - ValueT* d_nn1_stats; - - /// base data pointer for the shard. 
- BaseT* d_base; - - /// combined memory pool - char* d_memory; - size_t base_size {0}; - size_t total_graph_size {0}; - - int current_part_id {-1}; - - cudaStream_t stream; - - GGNNGraphDevice(const int N, const int D, const int K, const int N_all, const int ST_all) { - // just to make sure that everything is sufficiently aligned - auto align8 = [](size_t size) -> size_t {return ((size+7)/8)*8;}; - - const size_t graph_size = align8(static_cast(N_all) * K * sizeof(KeyT)); - const size_t selection_translation_size = align8(ST_all * sizeof(KeyT)); - const size_t nn1_stats_size = align8(2 * sizeof(ValueT)); - total_graph_size = graph_size + 2 * selection_translation_size + nn1_stats_size; - base_size = align8(static_cast(N) * D * sizeof(BaseT)); - - const size_t total_size = base_size+total_graph_size; - - VLOG(2) << "GGNNGraphDevice(): allocating GPU memory... (" - << total_graph_size/(1024.0f*1024.0f*1024.0f) << " GB graph + " - << base_size/(1024.0f*1024.0f*1024.0f) << " GB base)\n"; - - { - size_t free, total; - CHECK_CUDA(cudaMemGetInfo(&free, &total)); - CHECK_GE(free, total_size) << "out of memory."; - } - - CHECK_CUDA(cudaMalloc(&d_memory, total_size)); - - size_t pos = 0; - d_base = reinterpret_cast(d_memory+pos); - pos += base_size; - d_graph = reinterpret_cast(d_memory+pos); - pos += graph_size; - d_translation = reinterpret_cast(d_memory+pos); - pos += selection_translation_size; - d_selection = reinterpret_cast(d_memory+pos); - pos += selection_translation_size; - d_nn1_stats = reinterpret_cast(d_memory+pos); - pos += nn1_stats_size; - - CHECK_EQ(pos, total_size); - - CHECK_CUDA(cudaStreamCreate(&stream)); - - CHECK_CUDA(cudaPeekAtLastError()); - CHECK_CUDA(cudaDeviceSynchronize()); - CHECK_CUDA(cudaPeekAtLastError()); - } - - GGNNGraphDevice(const GGNNGraphDevice& other) { - // this exists to allow using vector::emplace_back - // when it triggers a reallocation, this code will be called. - // always make sure that enough memory is reserved ahead of time. - LOG(FATAL) << "copying is not supported. reserve()!"; - } - - ~GGNNGraphDevice() { - cudaFree(d_memory); - - CHECK_CUDA(cudaStreamDestroy(stream)); - } -}; - -#endif // INCLUDE_GGNN_GRAPH_CUDA_KNN_GGNN_GRAPH_DEVICE_CUH_ diff --git a/include/ggnn/graph/cuda_knn_ggnn_graph_host.cuh b/include/ggnn/graph/cuda_knn_ggnn_graph_host.cuh deleted file mode 100644 index 2055f9e..0000000 --- a/include/ggnn/graph/cuda_knn_ggnn_graph_host.cuh +++ /dev/null @@ -1,147 +0,0 @@ -/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -// Authors: Fabian Groh, Lukas Rupert, Patrick Wieschollek, Hendrik P.A. 
Lensch -// - -#ifndef INCLUDE_GGNN_GRAPH_CUDA_KNN_GGNN_GRAPH_HOST_CUH_ -#define INCLUDE_GGNN_GRAPH_CUDA_KNN_GGNN_GRAPH_HOST_CUH_ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "ggnn/utils/cuda_knn_utils.cuh" -#include "ggnn/graph/cuda_knn_ggnn_graph_device.cuh" - -/** - * GGNN graph data (on the CPU) - * - * @param KeyT datatype of dataset indices - * @param BaseT datatype of dataset values - * @param ValueT distance value type - */ -template -struct GGNNGraphHost { - typedef GGNNGraphDevice GGNNGraphDevice; - - /// neighborhood vectors - KeyT* h_graph; - /// translation of upper layer points into lowest layer - KeyT* h_translation; - /// translation of upper layer points into one layer below - KeyT* h_selection; - - /// average and maximum distance to nearest known neighbors - ValueT* h_nn1_stats; - - /// combined memory pool - char* h_memory; - - size_t total_graph_size; - - int current_part_id {-1}; - - std::thread disk_io_thread; - - GGNNGraphHost(const int N, const int K, const int N_all, const int ST_all) { - // just to make sure that everything is sufficiently aligned - auto align8 = [](size_t size) -> size_t {return ((size+7)/8)*8;}; - - const size_t graph_size = align8(static_cast(N_all) * K * sizeof(KeyT)); - const size_t selection_translation_size = align8(ST_all * sizeof(KeyT)); - // const size_t nn1_dist_buffer_size = N * sizeof(ValueT); - const size_t nn1_stats_size = align8(2 * sizeof(ValueT)); - total_graph_size = graph_size + 2 * selection_translation_size + nn1_stats_size; - - VLOG(1) << "GGNNGraphHost(): N: " << N << ", K: " << K - << ", N_all: " << N_all << ", ST_all: " << ST_all - << " (" << total_graph_size/(1024.0f*1024.0f*1024.0f) <<" GB total)\n"; - - CHECK_CUDA(cudaMallocHost(&h_memory, total_graph_size)); - - size_t pos = 0; - h_graph = reinterpret_cast(h_memory+pos); - pos += graph_size; - h_translation = reinterpret_cast(h_memory+pos); - pos += selection_translation_size; - h_selection = reinterpret_cast(h_memory+pos); - pos += selection_translation_size; - h_nn1_stats = reinterpret_cast(h_memory+pos); - pos += nn1_stats_size; - - CHECK_EQ(pos, total_graph_size); - - CHECK_CUDA(cudaPeekAtLastError()); - CHECK_CUDA(cudaDeviceSynchronize()); - CHECK_CUDA(cudaPeekAtLastError()); - } - - GGNNGraphHost(const GGNNGraphHost& other) { - // this exists to allow using vector::emplace_back - // when it triggers a reallocation, this code will be called. - // always make sure that enough memory is reserved ahead of time. - LOG(FATAL) << "copying is not supported. 
reserve()!"; - } - - ~GGNNGraphHost() { - cudaFreeHost(h_memory); - } - - void downloadAsync(const GGNNGraphDevice& graph) { - cudaMemcpyAsync(h_graph, graph.d_graph, total_graph_size, cudaMemcpyDeviceToHost, graph.stream); - } - - void uploadAsync(GGNNGraphDevice& graph) { - cudaMemcpyAsync(graph.d_graph, h_graph, total_graph_size, cudaMemcpyHostToDevice, graph.stream); - } - - void store(const std::string& filename){ - std::ofstream outFile; - - outFile.open(filename, std::ofstream::out | std::ofstream::binary | - std::ofstream::trunc); - - CHECK(outFile.is_open()) << "Unable to open " << filename; - - outFile.write(h_memory, total_graph_size); - - outFile.close(); - } - - void load(const std::string& filename){ - std::ifstream inFile; - - inFile.open(filename, std::ifstream::in | std::ifstream::binary); - - CHECK(inFile.is_open()) << "Unable to open " << filename; - - inFile.seekg(0, std::ifstream::end); - size_t filesize = inFile.tellg(); - inFile.seekg(0, std::ifstream::beg); - - CHECK_EQ(filesize, total_graph_size) << "Error on loading" << filename << - ". File size of GGNNGraph does not match the expected size."; - - inFile.read(h_memory, total_graph_size); - - inFile.close(); - } -}; - -#endif // INCLUDE_GGNN_GRAPH_CUDA_KNN_GGNN_GRAPH_HOST_CUH_ diff --git a/include/ggnn/merge/cuda_knn_merge_layer.cuh b/include/ggnn/merge/cuda_knn_merge_layer.cuh deleted file mode 100644 index 843311f..0000000 --- a/include/ggnn/merge/cuda_knn_merge_layer.cuh +++ /dev/null @@ -1,187 +0,0 @@ -/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
Lensch - -#ifndef INCLUDE_GGNN_MERGE_CUDA_KNN_MERGE_LAYER_CUH_ -#define INCLUDE_GGNN_MERGE_CUDA_KNN_MERGE_LAYER_CUH_ - -#include -#include - -#include -#include - -#include "ggnn/cache/cuda_simple_knn_cache.cuh" -#include "ggnn/utils/cuda_knn_constants.cuh" -#include "ggnn/utils/cuda_knn_utils.cuh" - -template -__global__ void -merge(const T kernel) { - kernel(); -} - -template -struct MergeKernel { - static constexpr int KL = K - KF; - static constexpr int MAX_ITERATIONS = 200; - - // this allows for loop unrolling - static constexpr int ITERATIONS_FOR_K = (K+BLOCK_DIM_X-1)/BLOCK_DIM_X; - static constexpr int ITERATIONS_FOR_S = (S+BLOCK_DIM_X-1)/BLOCK_DIM_X; - - static constexpr int KQuery = K; - static constexpr int SORTED_SIZE = 128; - // keep the visited list just long enough to keep track of all visited points - static constexpr int CACHE_SIZE = ((SORTED_SIZE+MAX_ITERATIONS+BLOCK_DIM_X-1) - /BLOCK_DIM_X)*BLOCK_DIM_X; - - static constexpr int BEST_SIZE = KQuery; - static constexpr int VISITED_SIZE = CACHE_SIZE - SORTED_SIZE; - static constexpr int PRIOQ_SIZE = SORTED_SIZE - BEST_SIZE; - - static constexpr bool DIST_STATS = false; - static constexpr bool OVERFLOW_STATS = false; - - typedef SimpleKNNCache - Cache; - - void launch(const cudaStream_t stream = 0) { - CHECK_GT(layer_top, layer_btm); - VLOG(1) << "MergeKernel -- Layer: " << layer_top << " -> " << layer_btm - << " | N: " << N << " [" << N_offset << " " << N_offset+N << "] \n"; - merge<<>>((*this)); - } - - // determine the start of the top-layer segment (always 0 for layer_top = L-1) - __device__ __forceinline__ int get_top_seg_offset(const KeyT n) const { - int seg_btm; - if (!layer_btm) { - seg_btm = n / (c_S0 + 1); - if (seg_btm >= c_S0_offset) - seg_btm = c_S0_offset + (n - (c_S0_offset * (c_S0 + 1))) / c_S0; - } else { - seg_btm = n / S; - } - - int powG = c_G; //assuming layer_top > layer_btm (which should always be the case) - for (int i=1; i(blockIdx.x); - - const KeyT m = - (!layer_btm) ? n : d_translation[c_STs_offsets[layer_btm] + n]; - - Cache cache(d_base, m, xi); - - const int s_offset = get_top_seg_offset(n); - - __shared__ KeyT s_knn[(K > S) ? K : S]; - - for (int i=0; i < ITERATIONS_FOR_S; ++i) { - const int s = i*BLOCK_DIM_X+threadIdx.x; - if (s < S) { - s_knn[s] = s_offset + s; - } - } - __syncthreads(); - cache.fetch(s_knn, &d_translation[c_STs_offsets[layer_top]], S); - - for (int layer = layer_top - 1; layer >= layer_btm; layer--) { - __syncthreads(); - - cache.transform(&d_selection[c_STs_offsets[layer + 1]]); - __syncthreads(); - - if (layer == layer_btm) { - if (!threadIdx.x) s_knn[0] = n; - __syncthreads(); - cache.fetch( - s_knn, (layer > 0) ? &d_translation[c_STs_offsets[layer]] : nullptr, - 1); - } - __syncthreads(); - - for (int ite = 0; ite < MAX_ITERATIONS; ++ite) { - __syncthreads(); - const KeyT anchor = cache.pop(); - if (anchor == Cache::EMPTY_KEY) { - break; - } - for (int i=0; i < ITERATIONS_FOR_K; ++i) { - const int k = i*BLOCK_DIM_X+threadIdx.x; - if (k < K) { - s_knn[k] = - d_graph[(static_cast(c_Ns_offsets[layer]) + anchor) - * K + k]; - } - } - __syncthreads(); - - cache.fetch(s_knn, - (!layer) ? nullptr : &d_translation[c_STs_offsets[layer]], - K); - - } - } - - __syncthreads(); - - for (int i=0; i < ITERATIONS_FOR_K; ++i) { - const int k = i*BLOCK_DIM_X+threadIdx.x; - if (k < K) { - const KeyT idx = cache.s_cache[k + 1]; - d_graph_buffer[static_cast(n) * K + k] = - (idx != Cache::EMPTY_KEY) ? 
idx : n; - } - } - - if (!layer_btm && !threadIdx.x) { - d_nn1_dist_buffer[n] = cache.get_nn1_dist(); - } - } - - const BaseT* d_base; // [Nall,D] - const KeyT* d_translation; // [Nall] - const KeyT* d_selection; // [Sall] - - const KeyT* d_graph; // [N,K] - KeyT* d_graph_buffer; // [N,K] - - const float* d_nn1_stats; // [sum,max] - float* d_nn1_dist_buffer; // [N0] - - int N; // number of points to work on - int N_offset; // gpu offset in N - - int layer_btm; // layer to merge - int layer_top; // layer to start -}; - -#endif // INCLUDE_GGNN_MERGE_CUDA_KNN_MERGE_LAYER_CUH_ diff --git a/include/ggnn/merge/cuda_knn_top_merge_layer.cuh b/include/ggnn/merge/cuda_knn_top_merge_layer.cuh deleted file mode 100644 index 83c52ed..0000000 --- a/include/ggnn/merge/cuda_knn_top_merge_layer.cuh +++ /dev/null @@ -1,114 +0,0 @@ -/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch - -#ifndef INCLUDE_GGNN_MERGE_CUDA_KNN_TOP_MERGE_LAYER_CUH_ -#define INCLUDE_GGNN_MERGE_CUDA_KNN_TOP_MERGE_LAYER_CUH_ - -#include -#include - -#include -#include - -#include "ggnn/utils/cuda_knn_constants.cuh" -#include "ggnn/utils/cuda_knn_k_best_list.cuh" -#include "ggnn/utils/cuda_knn_distance.cuh" -#include "ggnn/utils/cuda_knn_utils.cuh" - -template -__global__ void -top(const T kernel) { - kernel(); -} - -template -struct TopMergeKernel { - static constexpr int K_BEST = K; - - // this allows for loop unrolling - static constexpr int ITERATIONS_FOR_K = (K+BLOCK_DIM_X-1)/BLOCK_DIM_X; - - void launch(const cudaStream_t stream = 0) { - VLOG(1) << "TopMergeKernel -- Layer: " << layer << " | N: " << N << " [" << N_offset << " " << N_offset+N << "]\n"; - - top<<>>((*this)); - } - - __device__ __forceinline__ void operator()() const { - typedef Distance Distance; - typedef KBestList KBestList; - - const int n = N_offset + blockIdx.x; - const int m = (!layer) ? n : d_translation[n]; - - Distance distCalc(d_base, m); - KBestList best; - - const int S_plus_offset = S_offset * (S + 1); - const int S_actual = (!layer && n < S_plus_offset) ? S + 1 : S; - - const int start = - (layer || n < S_plus_offset) - ? (n / S_actual) * S_actual - : S_plus_offset + ((n - S_plus_offset) / S_actual) * S_actual; - const int end = start + S_actual; - - for (int other_n = start; other_n < end; other_n++) { - __syncthreads(); - const int other_m = (layer) ? 
d_translation[other_n] : other_n; - - if (m == other_m) continue; - ValueT dist = distCalc.distance_synced(other_m); - - best.add_unique(dist, other_n); - } - - for (int i=0; i < ITERATIONS_FOR_K; ++i) { - const int k = i*BLOCK_DIM_X+threadIdx.x; - if (k < K) { - const GAddrT addr = static_cast(n) * K + k; - d_graph[addr] = best.ids[k]; - } - } - if (!threadIdx.x) { - if (measure == Euclidean) { - d_nn1_dist_buffer[n] = sqrt(best.dists[1]); - } - else if (measure == Cosine) { - d_nn1_dist_buffer[n] = best.dists[1]; - } - } - } - - int N_offset; - int N; - - int S; - int S_offset; - - int layer; - - const BaseT* d_base; - const KeyT* d_translation; - - KeyT* d_graph; - ValueT* d_nn1_dist_buffer; -}; - -#endif // INCLUDE_GGNN_MERGE_CUDA_KNN_TOP_MERGE_LAYER_CUH_ diff --git a/include/ggnn/query/bf_query_layer.cuh b/include/ggnn/query/bf_query_layer.cuh new file mode 100644 index 0000000..7f066e5 --- /dev/null +++ b/include/ggnn/query/bf_query_layer.cuh @@ -0,0 +1,69 @@ +/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch +// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf + +#ifndef INCLUDE_GGNN_BF_QUERY_LAYER_CUH +#define INCLUDE_GGNN_BF_QUERY_LAYER_CUH + +#include +#include + +#include +#include + +namespace ggnn { + +template +__global__ void bf_query(const T kernel); + +/** + * query which loops through all points and can be used to create ground truth data + */ +template +struct BruteForceQueryKernel { + static constexpr uint32_t BLOCK_DIM_X = BLOCK_SIZE; + static constexpr uint32_t DIST_ITEMS_PER_THREAD = 4; + + void launch(const uint32_t N, const cudaStream_t stream = 0) + { + VLOG(1) << "BruteForceQueryKernel -- KQuery: " << KQuery; + + CHECK_LE(D, BLOCK_DIM_X * DIST_ITEMS_PER_THREAD); + + const size_t sm_size = KQuery * (sizeof(KeyT) + sizeof(ValueT)); + + bf_query<<>>((*this)); + } + + __device__ __forceinline__ void operator()() const; + + const uint32_t D; + const DistanceMeasure measure; + const uint32_t KQuery; + + KeyT N_base; // number of base points + + const BaseT* d_base; // [Nall,D] + const BaseT* d_query; // [Nq,D] + + KeyT* d_query_results; // [Nq,KQuery] + ValueT* d_query_results_dists; // [Nq,KQuery] +}; + +}; // namespace ggnn + +#endif // INCLUDE_GGNN_BF_QUERY_LAYER_CUH diff --git a/include/ggnn/query/cuda_knn_bf_query_layer.cuh b/include/ggnn/query/cuda_knn_bf_query_layer.cuh deleted file mode 100644 index 86a8cce..0000000 --- a/include/ggnn/query/cuda_knn_bf_query_layer.cuh +++ /dev/null @@ -1,94 +0,0 @@ -/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
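The brute-force kernel declared above exists to generate ground-truth results: every query point is compared against every base point and the KQuery closest survive. A CPU sketch of that logic using a fixed-size sorted best-list (the role KBestList plays on the GPU); squared Euclidean distance and all names here are illustrative, not the library's API:

#include <cmath>
#include <cstdint>
#include <vector>

void bf_query_cpu(const float* base, uint32_t N_base, const float* query,
                  uint32_t D, uint32_t KQuery,
                  std::vector<uint32_t>& ids, std::vector<float>& dists) {
  ids.assign(KQuery, 0);
  dists.assign(KQuery, INFINITY);
  for (uint32_t n = 0; n < N_base; ++n) {
    float dist = 0.0f;  // squared Euclidean distance to base point n
    for (uint32_t d = 0; d < D; ++d) {
      const float diff = query[d] - base[n * D + d];
      dist += diff * diff;
    }
    // insert into the sorted best-list if it beats the current worst entry
    if (dist < dists[KQuery - 1]) {
      uint32_t k = KQuery - 1;
      while (k > 0 && dists[k - 1] > dist) {
        dists[k] = dists[k - 1];
        ids[k] = ids[k - 1];
        --k;
      }
      dists[k] = dist;
      ids[k] = n;
    }
  }
}

The GPU version parallelizes the distance loop across the thread block and deduplicates candidates; this sketch keeps only the core scan-and-insert structure.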
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch - -#ifndef INCLUDE_GGNN_QUERY_CUDA_KNN_BF_QUERY_LAYER_CUH_ -#define INCLUDE_GGNN_QUERY_CUDA_KNN_BF_QUERY_LAYER_CUH_ - -#include -#include - -#include -#include -#include - -#include "ggnn/utils/cuda_knn_k_best_list.cuh" -#include "ggnn/utils/cuda_knn_distance.cuh" -#include "ggnn/utils/cuda_knn_constants.cuh" -#include "ggnn/utils/cuda_knn_utils.cuh" - -template -__global__ void -bf_query(const T kernel) { - kernel(); -} - -/** - * query which loops through all points and can be used to create ground truth data - */ -template -struct BruteForceQueryKernel { - typedef Distance Distance; - typedef KBestList KBestList; - - static constexpr int ITERATIONS_FOR_K_QUERY = (KQuery+BLOCK_DIM_X-1)/BLOCK_DIM_X; - - void launch(const cudaStream_t stream = 0) { - DLOG(INFO) << "BruteForceQueryKernel -- KQuery: " << KQuery; - bf_query<<>>((*this)); - } - - __device__ __forceinline__ void operator()() const { - const KeyT n = N_offset + static_cast(blockIdx.x); - - Distance distCalc(d_base, d_query, n); - KBestList best; - __syncthreads(); - - for (KeyT i=0; i(n) * KQuery + k; - d_query_results[addr] = best.ids[k]; - if (WRITE_DISTS) - d_query_results_dists[addr] = best.dists[k]; - } - } - } - - const BaseT* d_base; // [Nall,D] - const BaseT* d_query; // [Nq,D] - - KeyT* d_query_results; // [Nq,KQuery] - ValueT* d_query_results_dists; // [Nq,KQuery] - - int N_base; // number of base points - int N; // number of points to query for -> Nq - int N_offset; // gpu offset in N -}; - -#endif // INCLUDE_GGNN_QUERY_CUDA_KNN_BF_QUERY_LAYER_CUH_ diff --git a/include/ggnn/query/cuda_knn_ggnn_query.cuh b/include/ggnn/query/cuda_knn_ggnn_query.cuh deleted file mode 100644 index 09cec3a..0000000 --- a/include/ggnn/query/cuda_knn_ggnn_query.cuh +++ /dev/null @@ -1,136 +0,0 @@ -/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
Lensch -// - -#ifndef INCLUDE_GGNN_UTILS_CUDA_KNN_GGNN_QUERY_CUH_ -#define INCLUDE_GGNN_UTILS_CUDA_KNN_GGNN_QUERY_CUH_ - -#include -#include -#include - -#include -#include - -#include -#include "ggnn/utils/cuda_knn_utils.cuh" - -/** - * GGNN graph data - * - * @param KeyT datatype of dataset indices - * @param ValueT dists value type - * @param BaseT base value type - */ - -template -struct GGNNQuery { - /// query vectors - BaseT* d_query{nullptr}; - KeyT* d_query_result_ids{nullptr}; - ValueT* d_query_result_dists{nullptr}; - - KeyT* d_query_result_ids_sorted{nullptr}; - ValueT* d_query_result_dists_sorted{nullptr}; - - /// number of dataset vectors - const int N_query; - /// dimension of vectors in the dataset and query - const int D; - /// number of nearest neighbors per ground truth entry - const int K_query; - - const int num_parts; - - // Sort buffer: - void* d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - int num_items = 0; - int num_segments = 0; - - int* d_offsets{nullptr}; - - - GGNNQuery(const int N_query, const int D, const int K_query, const int num_parts = 1) : N_query{N_query}, D{D}, K_query{K_query}, num_parts{num_parts} { - CHECK_CUDA(cudaMalloc( - &d_query, static_cast(N_query) * D * sizeof(BaseT))); - - CHECK_CUDA(cudaMalloc( - &d_query_result_ids, static_cast(N_query) * K_query * num_parts * sizeof(KeyT))); - CHECK_CUDA(cudaMalloc( - &d_query_result_ids_sorted, static_cast(N_query) * K_query * num_parts * sizeof(KeyT))); - - CHECK_CUDA(cudaMalloc( - &d_query_result_dists, static_cast(N_query) * K_query * num_parts * sizeof(ValueT))); - CHECK_CUDA(cudaMalloc( - &d_query_result_dists_sorted, static_cast(N_query) * K_query * num_parts * sizeof(ValueT))); - - num_items = static_cast(N_query) * K_query * num_parts; - num_segments = N_query; - - if (num_parts > 1) { - const size_t segments_size = (num_segments+1)*sizeof(int); - int* h_offsets = (int*) malloc(segments_size); - CHECK_CUDA(cudaMalloc(&d_offsets, segments_size)); - - for (int i = 0; i < (num_segments + 1); i++) { - h_offsets[i] = i * K_query * num_parts; - } - CHECK_CUDA(cudaMemcpy(d_offsets, h_offsets, segments_size, cudaMemcpyHostToDevice)); - free(h_offsets); - - cub::DeviceSegmentedRadixSort::SortPairs(nullptr, temp_storage_bytes, - d_query_result_dists, d_query_result_dists_sorted, d_query_result_ids, d_query_result_ids_sorted, - num_items, num_segments, d_offsets, d_offsets + 1); - - CHECK_CUDA(cudaMalloc(&d_temp_storage, temp_storage_bytes)); - } - } - - ~GGNNQuery() { - cudaFree(d_query); - cudaFree(d_query_result_ids); - cudaFree(d_query_result_dists); - } - - GGNNQuery(const GGNNQuery&) = delete; - GGNNQuery(GGNNQuery&&) = delete; - GGNNQuery& operator=(const GGNNQuery&) = delete; - GGNNQuery& operator=(GGNNQuery&&) = delete; - - void loadQueriesAsync(const BaseT* h_query, const cudaStream_t stream){ - CHECK_CUDA(cudaMemcpyAsync(d_query,h_query, - static_cast(N_query) * D * sizeof(BaseT), - cudaMemcpyHostToDevice, stream)); - } - - - void sortAsync(const cudaStream_t stream){ - if(num_parts == 1){ - CHECK_CUDA(cudaMemcpyAsync(d_query_result_ids_sorted, d_query_result_ids, - static_cast(N_query) * K_query * sizeof(KeyT), - cudaMemcpyDeviceToHost, stream)); - CHECK_CUDA(cudaMemcpyAsync(d_query_result_dists_sorted, d_query_result_dists, - static_cast(N_query) * K_query * sizeof(ValueT), - cudaMemcpyDeviceToHost, stream)); - - } - else { - cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, - d_query_result_dists, d_query_result_dists_sorted, d_query_result_ids, 
d_query_result_ids_sorted, - num_items, num_segments, d_offsets, d_offsets + 1, 0, sizeof(ValueT)*8, stream); - } - } -}; - -#endif // INCLUDE_GGNN_UTILS_CUDA_KNN_GGNN_QUERY_CUH_ diff --git a/include/ggnn/query/cuda_knn_no_slack_query_layer.cuh b/include/ggnn/query/cuda_knn_no_slack_query_layer.cuh deleted file mode 100644 index a9e590d..0000000 --- a/include/ggnn/query/cuda_knn_no_slack_query_layer.cuh +++ /dev/null @@ -1,146 +0,0 @@ -/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch - -#ifndef INCLUDE_GGNN_QUERY_CUDA_KNN_NO_SLACK_QUERY_LAYER_CUH_ -#define INCLUDE_GGNN_QUERY_CUDA_KNN_NO_SLACK_QUERY_LAYER_CUH_ - -#include -#include - -#include -#include -#include - -// #include "ggnn/cache/cuda_knn_sorted_buffer_cache.cuh" -#include "ggnn/cache/cuda_simple_knn_cache_no_slack.cuh" -#include "ggnn/utils/cuda_knn_constants.cuh" -#include "ggnn/utils/cuda_knn_utils.cuh" - - -template -__global__ void -no_slack_query(const T kernel) { - kernel(); -} - -template -struct NoSlackQueryKernel { - static constexpr int KL = K - KF; - static constexpr int KS = (K > S) ? 
K : S; - - static_assert(BEST_SIZE >= KQuery, "best size needs to be at least KQuery"); - static constexpr int VISITED_SIZE = CACHE_SIZE - SORTED_SIZE; - static constexpr int PRIOQ_SIZE = SORTED_SIZE - BEST_SIZE; - - static constexpr int ITERATIONS_FOR_K = (K+BLOCK_DIM_X-1)/BLOCK_DIM_X; - static constexpr int ITERATIONS_FOR_S = (S+BLOCK_DIM_X-1)/BLOCK_DIM_X; - - typedef SimpleKNNCacheNoSlack - Cache; - - void launch(const cudaStream_t stream = 0) { - VLOG(1) << "NoSlackQueryKernel -- BLOCK_DIM_X: " << BLOCK_DIM_X - << " || KQuery: " << KQuery - << " MAX_ITERATIONS: " << MAX_ITERATIONS - << " CACHE_SIZE: " << CACHE_SIZE - << " SORTED_SIZE: " << SORTED_SIZE - << " || BEST_SIZE: " << BEST_SIZE - << " PRIOQ_SIZE: " << PRIOQ_SIZE - << " VISITED_SIZE: " << VISITED_SIZE; - no_slack_query<<>>((*this)); - } - - __device__ __forceinline__ void operator()() const { - const KeyT n = N_offset + static_cast(blockIdx.x); - - Cache cache(d_base, d_query, n); - __syncthreads(); - - - __shared__ KeyT s_knn[KS]; - for (int i=0; i < ITERATIONS_FOR_S; ++i) { - const int s = i*BLOCK_DIM_X+threadIdx.x; - if (s < S) - s_knn[s] = d_translation[c_STs_offsets[c_L - 1]+s]; - } - __syncthreads(); - - cache.fetch(s_knn, nullptr, S); - __syncthreads(); - - for (int ite = 0; ite < MAX_ITERATIONS; ++ite) { - __syncthreads(); - - const KeyT anchor = cache.pop(); - if (anchor == Cache::EMPTY_KEY) { - break; - } - __syncthreads(); - - for (int i=0; i < ITERATIONS_FOR_K; ++i) { - const int k = i*BLOCK_DIM_X+threadIdx.x; - if (k < K) - s_knn[k] = - d_graph[static_cast(anchor) * K + k]; - } - - __syncthreads(); - cache.fetch(s_knn, nullptr, K); - - } // end iterations - - __syncthreads(); - cache.write_best(d_query_results, n, KQuery); - - if (WRITE_DISTS) { - if (threadIdx.x < KQuery) { - d_query_results_dists[n * KQuery + threadIdx.x] = - cache.s_dists[threadIdx.x]; - } - } - - if (DIST_STATS) { - if (!threadIdx.x) { - d_dist_stats[n] = cache.get_dist_stats(); - } - } - } - - const BaseT* d_base; // [Nall,D] - const BaseT* d_query; // [Nq,D] - const KeyT* d_translation; // [Nall] - - const KeyT* d_graph; // [Nall,K] - KeyT* d_query_results; // [Nq,KQuery] - ValueT* d_query_results_dists; // [Nq,KQuery] - - const float* d_nn1_stats; // [sum,max] - - int* d_dist_stats; // [Nq] - - int N; // number of points to query for -> Nq - int N_offset; // gpu offset in N -}; - -#endif // INCLUDE_GGNN_QUERY_CUDA_KNN_NO_SLACK_QUERY_LAYER_CUH_ diff --git a/include/ggnn/query/cuda_knn_query_layer.cuh b/include/ggnn/query/cuda_knn_query_layer.cuh deleted file mode 100644 index a5237cc..0000000 --- a/include/ggnn/query/cuda_knn_query_layer.cuh +++ /dev/null @@ -1,151 +0,0 @@ -/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
Lensch - -#ifndef INCLUDE_GGNN_QUERY_CUDA_KNN_QUERY_LAYER_CUH_ -#define INCLUDE_GGNN_QUERY_CUDA_KNN_QUERY_LAYER_CUH_ - -#include -#include - -#include -#include -#include - -// #include "ggnn/cache/cuda_knn_sorted_buffer_cache.cuh" -#include "ggnn/cache/cuda_simple_knn_cache.cuh" -#include "ggnn/utils/cuda_knn_constants.cuh" -#include "ggnn/utils/cuda_knn_utils.cuh" - -template -__global__ void query(const T kernel) { - kernel(); -} - -template -struct QueryKernel { - static constexpr int KL = K - KF; - static constexpr int KS = (K > S) ? K : S; - - static constexpr int BEST_SIZE = KQuery; - static constexpr int VISITED_SIZE = CACHE_SIZE - SORTED_SIZE; - static constexpr int PRIOQ_SIZE = SORTED_SIZE - BEST_SIZE; - - static constexpr int ITERATIONS_FOR_K = (K + BLOCK_DIM_X - 1) / BLOCK_DIM_X; - static constexpr int ITERATIONS_FOR_S = (S + BLOCK_DIM_X - 1) / BLOCK_DIM_X; - - typedef SimpleKNNCache - Cache; - - void launch(const cudaStream_t stream = 0) { - VLOG(1) << "QueryKernel -- BLOCK_DIM_X: " << BLOCK_DIM_X - << " || KQuery: " << KQuery << " MAX_ITERATIONS: " << MAX_ITERATIONS - << " CACHE_SIZE: " << CACHE_SIZE << " SORTED_SIZE: " << SORTED_SIZE - << " || BEST_SIZE: " << BEST_SIZE << " PRIOQ_SIZE: " << PRIOQ_SIZE - << " VISITED_SIZE: " << VISITED_SIZE; - query<<>>((*this)); - } - - __device__ __forceinline__ void operator()() const { - const float xi = - (measure == Euclidean) - ? (d_nn1_stats[1] * d_nn1_stats[1]) * c_tau_query * c_tau_query - : d_nn1_stats[1] * c_tau_query; - - const KeyT n = N_offset + static_cast(blockIdx.x); - - Cache cache(d_base, d_query, n, xi); - __syncthreads(); - - __shared__ KeyT s_knn[KS]; - for (int i = 0; i < ITERATIONS_FOR_S; ++i) { - const int s = i * BLOCK_DIM_X + threadIdx.x; - if (s < S) s_knn[s] = d_translation[c_STs_offsets[c_L - 1] + s]; - } - __syncthreads(); - - cache.fetch(s_knn, nullptr, S); - __syncthreads(); - - for (int ite = 0; ite < MAX_ITERATIONS; ++ite) { - __syncthreads(); - - if (measure == Euclidean) { - cache.xi = min(xi, cache.s_dists[0] * c_tau_query * c_tau_query); - } else if (measure == Cosine) { - cache.xi = min(xi, cache.s_dists[0] * c_tau_query); - } - - const KeyT anchor = cache.pop(); - if (anchor == Cache::EMPTY_KEY) { - break; - } - __syncthreads(); - - for (int i = 0; i < ITERATIONS_FOR_K; ++i) { - const int k = i * BLOCK_DIM_X + threadIdx.x; - if (k < K) s_knn[k] = d_graph[static_cast(anchor) * K + k]; - } - - __syncthreads(); - cache.fetch(s_knn, nullptr, K); - } // end iterations - - __syncthreads(); - cache.write_best(d_query_results, n * num_parts + part, KQuery, - part * N_base); - - if (WRITE_DISTS) { - if (threadIdx.x < KQuery) { - d_query_results_dists[(n * num_parts + part) * KQuery + threadIdx.x] = - cache.s_dists[threadIdx.x]; - } - } - - if (DIST_STATS) { - if (!threadIdx.x) { - d_dist_stats[n] = cache.get_dist_stats(); - } - } - } - - const BaseT* d_base; // [Nall,D] - const BaseT* d_query; // [Nq,D] - const KeyT* d_translation; // [Nall] - - const KeyT* d_graph; // [Nall,K] - KeyT* d_query_results; // [Nq,KQuery] - ValueT* d_query_results_dists; // [Nq,KQuery] - - const float* d_nn1_stats; // [sum,max] - - int* d_dist_stats; // [Nq] - - int N; // number of points to query for -> Nq - int N_offset; // gpu offset in N - int N_base; // number of points in the dataset - - int num_parts {1}; - int part {0}; -}; - -#endif // INCLUDE_GGNN_QUERY_CUDA_KNN_QUERY_LAYER_CUH_ diff --git a/include/ggnn/query/cuda_knn_stats_query_layer.cuh b/include/ggnn/query/cuda_knn_stats_query_layer.cuh deleted file mode 100644 index 
998c970..0000000 --- a/include/ggnn/query/cuda_knn_stats_query_layer.cuh +++ /dev/null @@ -1,161 +0,0 @@ -/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch - -#ifndef INCLUDE_GGNN_QUERY_CUDA_KNN_STATS_QUERY_LAYER_CUH_ -#define INCLUDE_GGNN_QUERY_CUDA_KNN_STATS_QUERY_LAYER_CUH_ - -#include -#include - -#include -#include -#include - -#include "ggnn/cache/cuda_simple_knn_cache.cuh" -#include "ggnn/utils/cuda_knn_constants.cuh" -#include "ggnn/utils/cuda_knn_utils.cuh" - -template -__global__ void -stats_query(const T kernel) { - kernel(); -} - -template -struct StatsQueryKernel { - static constexpr int KL = K - KF; - static constexpr int KS = (K > S) ? K : S; - - static constexpr int BEST_SIZE = KQuery; - static constexpr int VISITED_SIZE = CACHE_SIZE - SORTED_SIZE; - static constexpr int PRIOQ_SIZE = SORTED_SIZE - BEST_SIZE; - - typedef SimpleKNNCache - Cache; - - void launch(const cudaStream_t stream = 0) { - DLOG(INFO) << "StatsQueryKernel -- BLOCK_DIM_X: " << BLOCK_DIM_X - << " || KQuery: " << KQuery - << " MAX_ITERATIONS: " << MAX_ITERATIONS - << " CACHE_SIZE: " << CACHE_SIZE - << " SORTED_SIZE: " << SORTED_SIZE - << " || BEST_SIZE: " << BEST_SIZE - << " PRIOQ_SIZE: " << PRIOQ_SIZE - << " VISITED_SIZE: " << VISITED_SIZE; - stats_query<<>>((*this)); - } - - __device__ __forceinline__ void operator()() const { - const float xi = (measure == Euclidean) ? 
- (d_nn1_stats[1] * d_nn1_stats[1]) * c_tau_query * c_tau_query : d_nn1_stats[1]*c_tau_query; - - const KeyT n = N_offset + static_cast(blockIdx.x); - - Cache cache(d_base, d_query, n, xi); - __syncthreads(); - - - __shared__ KeyT s_knn[KS]; - if (threadIdx.x < S) - s_knn[threadIdx.x] = d_translation[c_STs_offsets[c_L - 1]+threadIdx.x]; - __syncthreads(); - - cache.fetch(s_knn, nullptr, S); - __syncthreads(); - - if (!threadIdx.x) { - d_dist_1_best_stats[n*(MAX_ITERATIONS+1)] = cache.s_dists[0]; - d_dist_k_best_stats[n*(MAX_ITERATIONS+1)] = cache.s_dists[KQuery-1]; - } - - for (int ite = 0; ite < MAX_ITERATIONS; ++ite) { - __syncthreads(); - - if (measure == Euclidean) { - cache.xi = min(xi, cache.s_dists[0] * c_tau_query * c_tau_query); - } - else if (measure == Cosine) { - cache.xi = min(xi, cache.s_dists[0] * c_tau_query); - } - - const KeyT anchor = cache.pop(); - if (anchor == Cache::EMPTY_KEY) { - break; - } - if (blockIdx.x == debug_query_id && !threadIdx.x) - d_debug_query_visited_ids[ite] = anchor; - __syncthreads(); - - if (threadIdx.x < K) { - s_knn[threadIdx.x] = - d_graph[static_cast(anchor) * K + threadIdx.x]; - } - - __syncthreads(); - cache.fetch(s_knn, nullptr, K); - - if (!threadIdx.x) { - d_dist_1_best_stats[n*(MAX_ITERATIONS+1)+ite+1] = cache.s_dists[0]; - d_dist_k_best_stats[n*(MAX_ITERATIONS+1)+ite+1] = cache.s_dists[KQuery-1]; - } - } // end iterations - - __syncthreads(); - cache.write_best(d_query_results, n, KQuery); - - if (WRITE_DISTS) { - if (threadIdx.x < KQuery) { - d_query_results_dists[n * KQuery + threadIdx.x] = - cache.s_dists[threadIdx.x]; - } - } - - if (DIST_STATS) { - if (!threadIdx.x) { - d_dist_stats[n] = cache.get_dist_stats(); - } - } - } - - const BaseT* d_base; // [Nall,D] - const BaseT* d_query; // [Nq,D] - const KeyT* d_translation; // [Nall] - - const KeyT* d_graph; // [Nall,K] - KeyT* d_query_results; // [Nq,KQuery] - ValueT* d_query_results_dists; // [Nq,KQuery] - - const float* d_nn1_stats; // [sum,max] - - int* d_dist_stats; // [Nq] - ValueT* d_dist_1_best_stats; //[Nq*MAX_ITERATIONS] - ValueT* d_dist_k_best_stats; //[Nq*MAX_ITERATIONS] - KeyT* d_debug_query_visited_ids; //[MAX_ITERATIONS] - KeyT debug_query_id; // query for which to fill d_debug_query_visited_ids - - int N; // number of points to query for -> Nq - int N_offset; // gpu offset in N -}; - -#endif // INCLUDE_GGNN_QUERY_CUDA_KNN_STATS_QUERY_LAYER_CUH_ diff --git a/include/ggnn/query/query_kernels.cuh b/include/ggnn/query/query_kernels.cuh new file mode 100644 index 0000000..ca24254 --- /dev/null +++ b/include/ggnn/query/query_kernels.cuh @@ -0,0 +1,65 @@ +/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
Lensch +// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf + +#ifndef INCLUDE_GGNN_QUERY_KERNELS_CUH +#define INCLUDE_GGNN_QUERY_KERNELS_CUH + +#include +#include + +#include +#include + +namespace ggnn { + +template +struct GPUInstance; + +template +class QueryKernels { + public: + using GPUInstance = ggnn::GPUInstance; + using Graph = ggnn::Graph; + using Results = ggnn::Results; + + QueryKernels() = default; + QueryKernels(const DistanceMeasure measure); + virtual ~QueryKernels() = default; + QueryKernels(const QueryKernels&) = delete; + QueryKernels(QueryKernels&&) noexcept = default; + QueryKernels& operator=(const QueryKernels&) = delete; + QueryKernels& operator=(QueryKernels&&) noexcept = default; + + virtual void query(const GPUInstance& gpu_instance, const uint32_t shard_id, + const Dataset& query, const uint32_t KQuery, const uint32_t max_iters, + const float tau_query, Results& results) + { + pimpl->query(gpu_instance, shard_id, query, KQuery, max_iters, tau_query, results); + } + virtual void bruteForceQuery(const Dataset& base, const Dataset& query, + const uint32_t KQuery, Results& results, cudaStream_t stream = 0) + { + pimpl->bruteForceQuery(base, query, KQuery, results, stream); + } + + private: + std::unique_ptr pimpl; +}; + +}; // namespace ggnn + +#endif // INCLUDE_GGNN_QUERY_KERNELS_CUH diff --git a/include/ggnn/query/query_layer.cuh b/include/ggnn/query/query_layer.cuh new file mode 100644 index 0000000..83dbdbd --- /dev/null +++ b/include/ggnn/query/query_layer.cuh @@ -0,0 +1,90 @@ +/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
Lensch +// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf + +#ifndef INCLUDE_GGNN_QUERY_LAYER_CUH +#define INCLUDE_GGNN_QUERY_LAYER_CUH + +#include +#include + +#include + +namespace ggnn { + +template +__global__ void query(const T kernel); + +template +struct QueryKernel { + static constexpr uint32_t BLOCK_DIM_X = BLOCK_SIZE; + static constexpr uint32_t DIST_ITEMS_PER_THREAD = 4; + + static constexpr uint32_t MAX_SM = 48 * 1024; + + void launch(const uint32_t N, const cudaStream_t stream = 0) + { + VLOG(1) << "QueryKernel -- BLOCK_DIM_X: " << BLOCK_DIM_X << " || KQuery: " << KQuery + << " MAX_ITERATIONS: " << max_iterations << " CACHE_SIZE: " << cache_size + << " SORTED_SIZE: " << sorted_size; + uint32_t sm_size = cache_size * sizeof(KeyT) + sorted_size * sizeof(ValueT); + CHECK_LT(KQuery, sorted_size); + CHECK_LT(sorted_size, cache_size); + CHECK_LT(sm_size, MAX_SM); + + CHECK_LE(D, BLOCK_DIM_X * DIST_ITEMS_PER_THREAD); + + query<<>>((*this)); + } + + __device__ __forceinline__ void operator()() const; + + const uint32_t D; + const DistanceMeasure measure; + + const uint32_t KQuery; + const uint32_t sorted_size; + const uint32_t cache_size; + + const float tau_query; + const uint32_t max_iterations; + + KeyT N_base; // number of points in the dataset + + const uint32_t KBuild; + const uint32_t num_starting_points; + + const BaseT* d_base; // [Nall,D] + const BaseT* d_query; // [Nq,D] + + const KeyT* d_graph; // [Nall,K] + const KeyT* d_starting_points; // [S] + + const float* d_nn1_stats; // [sum,max] + + KeyT* d_query_results; // [Nq,KQuery] + ValueT* d_query_results_dists; // [Nq,KQuery] + + uint32_t* d_dist_stats; // [Nq] + + const uint32_t shards_per_gpu{1}; + const uint32_t on_gpu_shard_id{0}; +}; + +}; // namespace ggnn + +#endif // INCLUDE_GGNN_QUERY_LAYER_CUH diff --git a/include/ggnn/select/cuda_knn_wrs_select_layer.cuh b/include/ggnn/select/cuda_knn_wrs_select_layer.cuh deleted file mode 100644 index 49d76a1..0000000 --- a/include/ggnn/select/cuda_knn_wrs_select_layer.cuh +++ /dev/null @@ -1,123 +0,0 @@ -/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch - -#ifndef INCLUDE_GGNN_SELECT_CUDA_KNN_WRS_SELECT_LAYER_CUH_ -#define INCLUDE_GGNN_SELECT_CUDA_KNN_WRS_SELECT_LAYER_CUH_ - -#include - -#include -#include - -#include -#include - -#include "ggnn/utils/cuda_knn_constants.cuh" -#include "ggnn/utils/cuda_knn_utils.cuh" - -template -__global__ void -select(const T kernel) { - kernel(); -} - -/* - * Selection of K Points per B for Layers. 
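Unlike the 2019 kernels, whose cache sizes were compile-time template parameters, the new QueryKernel above sizes its shared-memory cache at runtime and passes the byte count as the third launch parameter, after checking it against the 48 KiB default limit. A sketch of that dynamic-shared-memory launch pattern; the kernel body and sizes are illustrative only:

#include <cmath>
#include <cstdint>
#include <cstdio>

__global__ void cache_kernel(uint32_t cache_size, uint32_t sorted_size) {
  // dynamic shared memory: one extern array, carved into keys and distances
  extern __shared__ char sm[];
  int32_t* s_cache = reinterpret_cast<int32_t*>(sm);
  float* s_dists = reinterpret_cast<float*>(sm + cache_size * sizeof(int32_t));
  for (uint32_t i = threadIdx.x; i < cache_size; i += blockDim.x)
    s_cache[i] = -1;
  for (uint32_t i = threadIdx.x; i < sorted_size; i += blockDim.x)
    s_dists[i] = INFINITY;
}

int main() {
  const uint32_t cache_size = 512, sorted_size = 256;
  const size_t sm_size = cache_size * sizeof(int32_t) + sorted_size * sizeof(float);
  // the byte count must stay below the per-block limit (48 KiB by default)
  cache_kernel<<<1, 128, sm_size>>>(cache_size, sorted_size);
  cudaDeviceSynchronize();
  printf("launch: %s\n", cudaGetErrorString(cudaGetLastError()));
  return 0;
}

Moving these sizes out of the template arguments means one compiled kernel serves all cache configurations, which is presumably what lets the new library interface accept KQuery, max_iters, and tau_query as plain runtime arguments.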
- */ -template -struct WRSSelectionKernel { - static constexpr int ITEMS_PER_THREAD = (2 * Sglob - 1) / BLOCK_DIM_X + 1; - typedef cub::BlockRadixSort - BlockRadixSort; - - void launch(const cudaStream_t stream = 0) { - VLOG(2) << "SelectionKernel -- B: " << B << " | B_offset: " << B_offset; - select<<>>((*this)); - } - - __device__ __forceinline__ void operator()() const { - __shared__ typename BlockRadixSort::TempStorage temp_storage; - const int b = B_offset + blockIdx.x; - - const int S_current = S + int(b < S_offset); - const int start = b*S + min(b, S_offset); - - ValueT keys[ITEMS_PER_THREAD]; - KeyT values[ITEMS_PER_THREAD]; - - for (int item = 0; item < ITEMS_PER_THREAD; item++) { - const int i = item * BLOCK_DIM_X + threadIdx.x; - if (i < S_current) { - const KeyT n = start + i; - const float e = - (-1 * logf(d_rng[n])) / - // the top merge kernel is configured to output the matching values for the current layer - // otherwise, we would need to translate n to the bottom layer - (d_nn1_dist_buffer[n] - + std::numeric_limits::epsilon()); - keys[item] = e; - values[item] = n; - } else { //FIXME: if this happens, the following sym query will fail - keys[item] = -1.f; - values[item] = -1; - } - } - - BlockRadixSort(temp_storage).SortDescendingBlockedToStriped(keys, values); - - __syncthreads(); - - // block index / growth ==> index of the upper segment - const int s_segment = b / c_G; - // b % c_G ==> n-th segment contributing to the upper segment - const int sg_segment = b - s_segment * c_G; - - // number of points contributed by the current block - const int SG_current = SG + int(sg_segment < SG_offset); - - const int s_offset = s_segment * Sglob + sg_segment*SG + min(sg_segment, SG_offset); - - for (int item = 0; item < ITEMS_PER_THREAD; item++) { - const int s = threadIdx.x + item * BLOCK_DIM_X; - if (s < SG_current) { - const KeyT n = values[item]; - - d_selection[s_offset + s] = n; - d_translation[s_offset + s] = (!layer) ? n : d_translation_layer[n]; - } - } - } - - int B; // number of blocks to work on - int B_offset; - - int S; // segment/block size in current layer - int S_offset; // number of blocks with S+1 elements = remainder in division by block size (can only be non-zero for the base-layer) - - int SG; // S/G = number of points contributed from current segment to upper segment - int SG_offset; // S%G = number of segments which contribute an additional point to the upper segment - - int layer; - - const KeyT* d_translation_layer; - const float* d_nn1_dist_buffer; - const float* d_rng; - - KeyT* d_selection; - KeyT* d_translation; -}; - -#endif // INCLUDE_GGNN_SELECT_CUDA_KNN_WRS_SELECT_LAYER_CUH_ diff --git a/include/ggnn/sym/cuda_knn_sym_buffer_merge_layer.cuh b/include/ggnn/sym/cuda_knn_sym_buffer_merge_layer.cuh deleted file mode 100644 index fd94ea3..0000000 --- a/include/ggnn/sym/cuda_knn_sym_buffer_merge_layer.cuh +++ /dev/null @@ -1,112 +0,0 @@ -/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
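The selection kernel above implements weighted reservoir sampling in the style of Efraimidis and Spirtes: each candidate draws the key e = -log(u) / w, the keys are sorted in descending order, and the top entries per segment are kept. Since e is exponentially distributed with rate w, candidates with smaller w tend to draw larger keys; in the kernel, w is the distance to the nearest neighbor plus an epsilon. A small CPU sketch of the same selection rule, with illustrative names (the kernel uses cub::BlockRadixSort instead of std::partial_sort, and assumes S <= the candidate count):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>
#include <numeric>
#include <random>
#include <vector>

// select S of the w.size() candidates, weighted by key = -log(u) / w
std::vector<uint32_t> wrs_select(const std::vector<float>& w, uint32_t S) {
  std::mt19937 gen{42};
  std::uniform_real_distribution<float> uniform(0.0f, 1.0f);

  std::vector<float> keys(w.size());
  for (size_t i = 0; i < w.size(); ++i)
    keys[i] = -std::log(uniform(gen) + 1e-20f) /
              (w[i] + std::numeric_limits<float>::epsilon());

  std::vector<uint32_t> idx(w.size());
  std::iota(idx.begin(), idx.end(), 0u);
  // keep the S candidates with the largest keys, i.e. the sampled winners
  std::partial_sort(idx.begin(), idx.begin() + S, idx.end(),
                    [&](uint32_t a, uint32_t b) { return keys[a] > keys[b]; });
  idx.resize(S);
  return idx;
}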
-==============================================================================*/ -// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch - -#ifndef INCLUDE_GGNN_SYM_CUDA_KNN_SYM_BUFFER_MERGE_LAYER_CUH_ -#define INCLUDE_GGNN_SYM_CUDA_KNN_SYM_BUFFER_MERGE_LAYER_CUH_ - -#include -#include - -#include -#include - -#include "ggnn/utils/cuda_knn_utils.cuh" - -template -__global__ void -sym_buffer_merge(const T kernel) { - kernel(); -} - -template -struct SymBufferMergeKernel { - static constexpr int POINTS_PER_BLOCK = BLOCK_DIM_X / KF; - static constexpr int KL = K - KF; - - void launch(const cudaStream_t stream = 0) { - VLOG(2) << "SymBufferMergeKernel -- N: " << N << " [" << N_offset << " " << N_offset+N << "]\n"; - dim3 block(KF, POINTS_PER_BLOCK); - sym_buffer_merge<<<(N - 1) / POINTS_PER_BLOCK + 1, block, 0, stream>>>((*this)); - } - - __device__ __forceinline__ void operator()() const { - const GAddrT n = N_offset + blockIdx.x * POINTS_PER_BLOCK + threadIdx.y; - const int kf = threadIdx.x; - - if (n >= N) return; - - __shared__ KeyT s_sym_buffer[POINTS_PER_BLOCK * KF]; // inverse links which need to be added to the graph - __shared__ KeyT s_graph_buffer[POINTS_PER_BLOCK * KF]; // current contents of the graph's foreign/inverse link storage - __shared__ bool s_found[POINTS_PER_BLOCK]; // whether the foreign link in the graph exists in the list of inverse links to be added - - // number of inverse links to be entered per point (only valid for threadIdx.x == 0) - int r_num_links; - if (!threadIdx.x) { - r_num_links = d_sym_atomic[N_offset + blockIdx.x * POINTS_PER_BLOCK + threadIdx.y]; - } - - const GAddrT addr_graph = n * K + KL + kf; - const int tid = threadIdx.y * KF + threadIdx.x; - //# load buffer - s_sym_buffer[tid] = d_sym_buffer[n * KF + kf]; - s_graph_buffer[tid] = d_graph[addr_graph]; - - // add existing foreign links to the inverse link list if there is still room - for (int i = 0; i < KF; i++) { - if (!threadIdx.x) { - // only search if there is a spot where we could add another link - s_found[threadIdx.y] = r_num_links >= KF; - } - __syncthreads(); - - KeyT r_graph; - - if (!s_found[threadIdx.y]) - { - // read all requested inverse links per point - const KeyT r_sym_buffer = s_sym_buffer[tid]; - // read existing foreign link i per point from graph - r_graph = s_graph_buffer[threadIdx.y * KF + i]; - // existing foreign link exists in requested inverse link list? ==> found - if (r_graph == r_sym_buffer) s_found[threadIdx.y] = true; - } - __syncthreads(); - - // if there is still room and the existing foreign link is not part of the requested inverse links, add it - if (!threadIdx.x && !s_found[threadIdx.y]) { - s_sym_buffer[threadIdx.y * KF + r_num_links] = r_graph; - ++r_num_links; - } - } - - __syncthreads(); - - // store requested inverse links and added previous foreign links in the graph's foreign link list. - // if there aren't enough links, store the points own index (to avoid entries with -1) - const KeyT res = s_sym_buffer[tid]; - d_graph[addr_graph] = (res >= 0) ? 
res : n; - } - - const KeyT* d_sym_buffer; // [N, KF] - const int* d_sym_atomic; // [N] - KeyT* d_graph; // [N, K] - - int N; // number of points to work on - int N_offset; -}; - -#endif // INCLUDE_GGNN_SYM_CUDA_KNN_SYM_BUFFER_MERGE_LAYER_CUH_ diff --git a/include/ggnn/sym/cuda_knn_sym_query_layer.cuh b/include/ggnn/sym/cuda_knn_sym_query_layer.cuh deleted file mode 100644 index dd6fa11..0000000 --- a/include/ggnn/sym/cuda_knn_sym_query_layer.cuh +++ /dev/null @@ -1,188 +0,0 @@ -/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch - -#ifndef INCLUDE_GGNN_SYM_CUDA_KNN_SYM_QUERY_LAYER_CUH_ -#define INCLUDE_GGNN_SYM_CUDA_KNN_SYM_QUERY_LAYER_CUH_ - -#include -#include - -#include -#include - -#include "ggnn/cache/cuda_simple_knn_sym_cache.cuh" -#include "ggnn/utils/cuda_knn_utils.cuh" - -template -__global__ void -sym(const T kernel) { - kernel(); -} - -template -struct SymQueryKernel { - static constexpr int KL = K - KF; - - // this allows for loop unrolling - static constexpr int ITERATIONS_FOR_K = (K+BLOCK_DIM_X-1)/BLOCK_DIM_X; - static constexpr int ITERATIONS_FOR_KL = (KL+BLOCK_DIM_X-1)/BLOCK_DIM_X; - - static constexpr int MAX_PER_PATH_ITERATIONS = 20; - - static constexpr int KQuery = KL; - static constexpr int CACHE_SIZE = 256; - static constexpr int SORTED_SIZE = 128; - - static constexpr int BEST_SIZE = KQuery; - static constexpr int VISITED_SIZE = CACHE_SIZE - SORTED_SIZE; - static constexpr int PRIOQ_SIZE = SORTED_SIZE - BEST_SIZE; - - static constexpr bool DIST_STATS = false; - static constexpr bool OVERFLOW_STATS = false; - - typedef SimpleKNNSymCache - Cache; - - void launch(const cudaStream_t stream = 0) { - VLOG(1) << "SymQueryKernel -- Layer: " << layer << " | N: " << N << " [" - << N_offset << " " << N_offset + N << "]"; - sym<<>>((*this)); - - } - - __device__ __forceinline__ void operator()() const { - const float xi = - (measure == Euclidean) - ? (d_nn1_stats[0] * d_nn1_stats[0]) * c_tau_build * c_tau_build - : d_nn1_stats[0] * c_tau_build; - - const KeyT n = N_offset + static_cast(blockIdx.x); - - Cache cache(d_base, (layer) ? d_translation[n] : n, xi); - - int counter = 0; - - __shared__ KeyT s_knn[K]; - __shared__ KeyT s_sym_ids[KL]; - __shared__ bool s_connected; - - // fetch neighbors in local neighbor list - for (int i=0; i < ITERATIONS_FOR_KL; ++i) { - const int kl = i*BLOCK_DIM_X+threadIdx.x; - if (kl < KL) { - const KeyT sym_n = d_graph[static_cast(n) * K + kl]; - s_sym_ids[kl] = sym_n; - } - } - for (int k = 0; k < KL; k++) { - __syncthreads(); - if (!threadIdx.x) s_connected = false; - - // search for k-th local neighbor - cache.init_start_point(s_sym_ids[k], (layer) ? 
d_translation : nullptr); - - bool result = false; - - for (int ite = 0; ite < MAX_PER_PATH_ITERATIONS && !result; ++ite) { - __syncthreads(); - - const KeyT anchor = cache.pop(); - - if (anchor == Cache::EMPTY_KEY) { - break; - } - - // fetch neighbors at anchor point + points in sym buffer - for (int i=0; i < ITERATIONS_FOR_K; ++i) { - const int k = i*BLOCK_DIM_X+threadIdx.x; - if (k < K) { - const KeyT other_id = - (k < KL) - ? d_graph[static_cast(anchor) * K + k] - : d_sym_buffer[static_cast(anchor) * KF + - k - KL]; - if (other_id == n) { - s_connected = true; - } - s_knn[k] = other_id; - } - } - __syncthreads(); - - - // stop if the original index n has been found as a neighbor - if(s_connected){ - result = true; - } - else - { - cache.fetch(s_knn, (layer) ? d_translation : nullptr, K); - } - - } // end per k iteration - - if (!result) { - // we need to add a symmetric link to the original index n - if (!threadIdx.x) { - for (int i = 0; i < BEST_SIZE && !result; i++) { - // try to enter the symmetric link at the i-th nearest neighbor - // found on the path - const KeyT other_n = cache.s_cache[i]; - if (other_n == Cache::EMPTY_KEY) break; - const int pos = atomicAdd(&d_sym_atomic[other_n], 1); - if (pos < KF) { - d_sym_buffer[static_cast(other_n) * KF + pos] = n; - // cache.set_connected(other_n); - result = true; - } - } - // could not add a link, increment the counter - if (!result) { - counter++; - } - } - } - } // end k neighbors - - if (OVERFLOW_STATS && !threadIdx.x) { - d_stats[n] = counter; - } - } - - const BaseT* d_base; // [N0,D] - const KeyT* d_graph; // [N,K] - const KeyT* d_translation; // [N] or nullptr if on base layer - - int* d_sym_atomic; // [N] - KeyT* d_sym_buffer; // [N,KF] - - const float* d_nn1_stats; - int* d_stats; // number of links which could not be established - - // although this provides no additional useful information to the kernel, - // it compiles to a faster version than checking for d_translation == nullptr - int layer; - - int N; // number of points to work on - int N_offset; -}; - -#endif // INCLUDE_GGNN_SYM_CUDA_KNN_SYM_QUERY_LAYER_CUH_ diff --git a/include/ggnn/utils/cuda_knn_constants.cuh b/include/ggnn/utils/cuda_knn_constants.cuh deleted file mode 100644 index 404b0ae..0000000 --- a/include/ggnn/utils/cuda_knn_constants.cuh +++ /dev/null @@ -1,61 +0,0 @@ -/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
Lensch - -#ifndef INCLUDE_GGNN_CUDA_KNN_CONSTANTS_CUH_ -#define INCLUDE_GGNN_CUDA_KNN_CONSTANTS_CUH_ - -#include -#include - -#include "cuda_knn_utils.cuh" - -static constexpr int MAX_LAYER = 20; - -__constant__ int c_Ns[MAX_LAYER]; -__constant__ int c_Ns_offsets[MAX_LAYER]; -__constant__ int c_STs_offsets[MAX_LAYER]; - -__constant__ int c_G; -__constant__ int c_L; - -__constant__ float c_tau_build; -__constant__ float c_tau_query; - -__constant__ int c_S0; -__constant__ int c_S0_offset; - -struct ConstantInfoKernel { - void launch() { - printf("launch ConstantInfoKernel devId: %d L: %d \n", dev_id, L); - launcher<<>>((*this)); - } - - __device__ __forceinline__ void operator()() const { - const int l = blockIdx.x; - if (!threadIdx.x) { - printf( - "l: %d dev: %d -> N: %d | Noff: %d | SToff: %d " - " | G: %d | L: %d | S0: %d S0_offset: %d\n", - l, dev_id, c_Ns[l], c_Ns_offsets[l], c_STs_offsets[l], c_G, c_L, c_S0, - c_S0_offset); - } - } - - int L; - int dev_id; -}; - -#endif // INCLUDE_GGNN_CUDA_KNN_CONSTANTS_CUH_ diff --git a/include/ggnn/utils/cuda_knn_dataset.cuh b/include/ggnn/utils/cuda_knn_dataset.cuh deleted file mode 100644 index 743de10..0000000 --- a/include/ggnn/utils/cuda_knn_dataset.cuh +++ /dev/null @@ -1,339 +0,0 @@ -/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch -// - -#ifndef INCLUDE_GGNN_UTILS_CUDA_KNN_DATASET_CUH_ -#define INCLUDE_GGNN_UTILS_CUDA_KNN_DATASET_CUH_ - -#include -#include -#include - -#include -#include -#include - -#include "io/loader_ann.hpp" -#include "io/storer_ann.hpp" - -/** - * KNN database data that will be shared with the GPU - * and some utilities to load (and store) that data - * - * @param KeyT datatype of dataset indices - * @param BaseT datatype of dataset vector elements - * @param BAddrT address type used to access dataset vectors (needs to be able - * to represent N_base*D) - */ -template -struct Dataset { - /// dataset vectors - BaseT* h_base{nullptr}; - - /// query vectors - BaseT* h_query{nullptr}; - /// ground truth indices in the dataset for the given queries - KeyT* gt{nullptr}; - - /// number of dataset vectors - int N_base{0}; - /// number of query vectors (and ground truth indices) - int N_query{0}; - /// dimension of vectors in the dataset and query - int D{0}; - /// number of nearest neighbors per ground truth entry - int K_gt{0}; - - // indices within the ground truth list per point up to which result ids - // need to be compared. 
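The deleted cuda_knn_constants.cuh above kept the per-layer graph parameters in __constant__ memory, where every kernel can read them through the broadcast-friendly constant cache. A minimal sketch of that declare, upload, read cycle with illustrative names:

#include <cstdio>

__constant__ float c_tau;   // search slack parameter
__constant__ int c_levels;  // number of graph layers

__global__ void print_constants() {
  if (!threadIdx.x && !blockIdx.x)
    printf("tau: %f, levels: %d\n", c_tau, c_levels);
}

int main() {
  const float tau = 0.5f;
  const int levels = 4;
  // host-to-constant copies go through cudaMemcpyToSymbol
  cudaMemcpyToSymbol(c_tau, &tau, sizeof(tau));
  cudaMemcpyToSymbol(c_levels, &levels, sizeof(levels));
  print_constants<<<1, 32>>>();
  cudaDeviceSynchronize();
  return 0;
}

Constant memory is limited (64 KiB on current devices), so it suits exactly this kind of small, read-only parameter block shared by all kernels.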
- // without duplicates in the dataset, each entry should just be 1 / KQuery - std::vector top1DuplicateEnd; - std::vector topKDuplicateEnd; - - Dataset(const std::string& basePath, const std::string& queryPath, - const std::string& gtPath, const size_t N_base = std::numeric_limits::max()) { - - VLOG(1) << "N_base: " << N_base; - - bool success = loadBase(basePath, 0, N_base) && loadQuery(queryPath) && loadGT(gtPath); - - if (!success) - throw std::runtime_error( - "failed to load dataset (see previous log entries for details).\n"); - } - - //TODO(fabi): cleanup. - ~Dataset() { - freeBase(); - freeQuery(); - freeGT(); - } - - Dataset(const Dataset&) = delete; - Dataset(Dataset&&) = delete; - Dataset& operator=(const Dataset&) = delete; - Dataset& operator=(Dataset&&) = delete; - - void freeBase() { - cudaFreeHost(h_base); - h_base = nullptr; - N_base = 0; - if (!h_query) D = 0; - } - - void freeQuery() { - cudaFreeHost(h_query); - h_query = nullptr; - if (!gt) N_query = 0; - if (!h_base) D = 0; - } - - void freeGT() { - free(gt); - gt = nullptr; - if (!h_query) N_query = 0; - K_gt = 0; - } - - /// load base vectors from file - bool loadBase(const std::string& base_file, size_t from = 0, - size_t num = std::numeric_limits::max()) { - freeBase(); - XVecsLoader base_loader(base_file); - - num = std::min(num, base_loader.Num() - from); - CHECK_GT(num, 0) << "The requested range contains no vectors."; - - N_base = num; - if (D == 0) { - D = base_loader.Dim(); - } - CHECK_EQ(D, base_loader.Dim()) << "Dimension mismatch"; - - const size_t dataset_max_index = - static_cast(N_base) * static_cast(D); - CHECK_LT(dataset_max_index, std::numeric_limits::max()) - << "Address type is insufficient to address " - "the requested dataset. aborting"; - - const size_t base_memsize = static_cast(N_base) * D * sizeof(BaseT); - - CHECK_CUDA(cudaMallocHost(&h_base, base_memsize, cudaHostAllocPortable | cudaHostAllocWriteCombined)); - - base_loader.load(h_base, from, num); - - return true; - } - - /// load query vectors from file - bool loadQuery(const std::string& query_file, KeyT from = 0, - KeyT num = std::numeric_limits::max()) { - freeQuery(); - XVecsLoader query_loader(query_file); - - num = std::min(num, query_loader.Num() - from); - CHECK_GT(num, 0) << "The requested range contains no vectors."; - - if (N_query == 0) { - N_query = num; - } - CHECK_EQ(N_query, num) << "Number mismatch"; - - if (D == 0) { - D = query_loader.Dim(); - } - CHECK_EQ(D, query_loader.Dim()) << "Dimension mismatch"; - - const size_t dataset_max_index = - static_cast(N_query) * static_cast(D); - CHECK_LT(dataset_max_index, std::numeric_limits::max()) - << "Address type is insufficient to address " - "the requested dataset. aborting"; - - - const size_t query_memsize = static_cast(N_query) * D * sizeof(BaseT); - - CHECK_CUDA(cudaMallocHost(&h_query, query_memsize, cudaHostAllocPortable)); - - query_loader.load(h_query, from, num); - - return true; - } - - /// load ground truth indices from file - bool loadGT(const std::string& gt_file, KeyT from = 0, - KeyT num = std::numeric_limits::max()) { - freeGT(); - - - if (gt_file.empty()) { - LOG(INFO) << "No ground truth file loaded. Make sure to compute it yourself before evaluating any queries."; - - CHECK_GT(N_query, 0) << "Cannot determine the number of GT entries which need to be computed if the query is not yet loaded."; - K_gt = 100; - - //TODO(fabi): move out of if branch. 
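-      // reserve an uninitialized [N_query, K_gt] index buffer matching the
-      // layout of the file-loaded case below, so a later brute-force
-      // ground-truth computation can fill it in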
- gt = (KeyT*) malloc(static_cast(N_query) * K_gt * sizeof(KeyT)); - CHECK(gt); - - return true; - } - - XVecsLoader gt_loader(gt_file); - - num = std::min(num, gt_loader.Num() - from); - CHECK_GT(num, 0) << "The requested range contains no vectors."; - - if (N_query == 0) { - N_query = num; - } - CHECK_EQ(N_query, num) << "Number mismatch"; - - K_gt = gt_loader.Dim(); - - const size_t dataset_max_index = - static_cast(N_query) * static_cast(K_gt); - CHECK_LT(dataset_max_index, std::numeric_limits::max()) - << "Address type is insufficient to address " - "the requested dataset. aborting"; - - gt = (KeyT*) malloc(static_cast(N_query) * K_gt * sizeof(KeyT)); - CHECK(gt); - - gt_loader.load(gt, from, num); - return true; - } - - template - ValueT compute_distance_query(KeyT index, KeyT query) const { - CHECK_GE(index, 0); - CHECK_GE(query, 0); - CHECK_LT(index, N_base); - CHECK_LT(query, N_query); - - ValueT distance = 0.0f, index_norm = 0.0f, query_norm = 0.0f; - for (int d=0; d(query)*D+d] - -h_base [static_cast(index)*D+d]) - *(h_query[static_cast(query)*D+d] - -h_base [static_cast(index)*D+d]); - } - else if (measure == Cosine) { - distance += h_query[static_cast(query)*D+d] - *h_base [static_cast(index)*D+d]; - query_norm += h_query[static_cast(query)*D+d] - *h_query[static_cast(query)*D+d]; - index_norm += h_base [static_cast(index)*D+d] - *h_base [static_cast(index)*D+d]; - } - } - if (measure == Euclidean) { - distance = sqrtf(distance); - } - else if (measure == Cosine) { - if (index_norm*query_norm > 0.0f) - distance = fabs(1.0f-distance/sqrtf(index_norm*query_norm)); - else - distance = 1.0f; - } - return distance; - }; - - template - ValueT compute_distance_base_to_base(KeyT a, KeyT b) const { - CHECK_GE(a, 0); - CHECK_GE(b, 0); - CHECK_LT(a, N_base); - CHECK_LT(b, N_base); - - ValueT distance = 0.0f, a_norm = 0.0f, b_norm = 0.0f; - for (int d=0; d(b)*D+d]-h_base[static_cast(a)*D+d]) - *(h_base[static_cast(b)*D+d]-h_base[static_cast(a)*D+d]); - } - else if (measure == Cosine) { - distance += h_base[static_cast(b)*D+d]*h_base[static_cast(a)*D+d]; - b_norm += h_base[static_cast(b)*D+d]*h_base[static_cast(b)*D+d]; - a_norm += h_base[static_cast(a)*D+d]*h_base[static_cast(a)*D+d]; - } - } - if (measure == Euclidean) { - distance = sqrtf(distance); - } - else if (measure == Cosine) { - if (a_norm*b_norm > 0.0f) - distance = fabs(1.0f-distance/sqrtf(a_norm*b_norm)); - else - distance = 1.0f; - } - return distance; - }; - - template - void checkForDuplicatesInGroundTruth(const int KQuery) { - if (!top1DuplicateEnd.empty() || !topKDuplicateEnd.empty()) - return; - VLOG(2) << "searching for duplicates in the ground truth indices."; - - const float Epsilon = 0.000001f; - - size_t total_num_duplicates_top_1 = 0, total_num_duplicates_top_k = 0; - uint8_t max_dup_top_1 = 0, max_dup_top_k = 0; - - for (int n = 0; n < N_query; n++) { - const ValueT gt_dist1 = compute_distance_query(gt[n * K_gt], n); - uint8_t num_duplicates_top_1 = 0, num_duplicates_top_k = 0; - for (int k=1; k < K_gt; ++k) { - const ValueT gt_dist_k = compute_distance_query(gt[n * K_gt + k], n); - if (gt_dist_k-gt_dist1 > Epsilon) - break; - ++num_duplicates_top_1; - } - total_num_duplicates_top_1 += num_duplicates_top_1; - if (num_duplicates_top_1 > max_dup_top_1) - max_dup_top_1 = num_duplicates_top_1; - top1DuplicateEnd.push_back(1+num_duplicates_top_1); - - if (KQuery <= K_gt) { - const ValueT gt_distKQuery = compute_distance_query(gt[n * K_gt + KQuery-1], n); - for (int k=KQuery; k < K_gt; ++k) { - const ValueT gt_dist_k = 
compute_distance_query(gt[n * K_gt + k], n); - if (gt_dist_k-gt_distKQuery > Epsilon) - break; - ++num_duplicates_top_k; - } - - total_num_duplicates_top_k += num_duplicates_top_k; - if (num_duplicates_top_k > max_dup_top_k) - max_dup_top_k = num_duplicates_top_k; - topKDuplicateEnd.push_back(KQuery+num_duplicates_top_k); - } - else - topKDuplicateEnd.push_back(K_gt); - } - - VLOG(2) << "found " << total_num_duplicates_top_1 << " duplicates for c@1." - << " max: " << uint32_t(max_dup_top_1); - if (KQuery <= K_gt) { - VLOG(2) << "found " << total_num_duplicates_top_k << " duplicates for c@" - << KQuery << "." - << " max: " << uint32_t(max_dup_top_k); - } - } -}; - -#endif // INCLUDE_GGNN_UTILS_CUDA_KNN_DATASET_CUH_ diff --git a/include/ggnn/utils/cuda_knn_distance.cuh b/include/ggnn/utils/cuda_knn_distance.cuh deleted file mode 100644 index 6dfa00a..0000000 --- a/include/ggnn/utils/cuda_knn_distance.cuh +++ /dev/null @@ -1,192 +0,0 @@ -/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch - -#ifndef INCLUDE_GGNN_UTILS_CUDA_KNN_DISTANCE_CUH_ -#define INCLUDE_GGNN_UTILS_CUDA_KNN_DISTANCE_CUH_ - -#include -#include - -#include -#include - -// helper structs to avoid having the register when not necessary -struct Nothing { -}; -template -struct QueryNorm { - // only valid in thread 0, only needed if measure == Cosine - ValueT query_norm; -}; - -/** - * Distance calculates the distance/difference between the base vector and - * other_id vector. 
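- *
- * For reference, the host-side equivalents (cf. Dataset::compute_distance_query)
- * are, as a sketch:
- *   Euclidean: dist = sqrtf(sum_d (q[d]-b[d])*(q[d]-b[d]))
- *   Cosine:    dist = fabs(1 - dot(q,b)/sqrtf(|q|^2 * |b|^2)),
- *              or 1 if either norm is zero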
- */ -template -struct Distance : std::conditional, Nothing>::type { - enum { ITEMS_PER_THREAD = (D - 1) / BLOCK_DIM_X + 1 }; - - struct DistanceAndNorm { - ValueT r_dist; - ValueT r_norm; - - __device__ __forceinline__ DistanceAndNorm(const ValueT dist, const ValueT norm) - : r_dist(dist), r_norm(norm) {} - - __device__ __forceinline__ DistanceAndNorm() {} - - struct Sum { - __host__ __device__ __forceinline__ DistanceAndNorm operator()(const DistanceAndNorm& a, - const DistanceAndNorm& b) const { - return DistanceAndNorm(a.r_dist + b.r_dist, a.r_norm + b.r_norm); - } - }; - }; - - typedef cub::BlockReduce BlockReduceDist; - typedef typename std::conditional, BlockReduceDist>::type BlockReduceDistAndNorm; - - union TempStorage { - typename BlockReduceDist::TempStorage dist_temp_storage; - typename BlockReduceDistAndNorm::TempStorage dist_and_norm_temp_storage; - ValueT dist; - }; - - const BaseT* d_base; - BaseT r_query[ITEMS_PER_THREAD]; - - TempStorage& s_temp_storage; - __device__ __forceinline__ TempStorage& PrivateTmpStorage() { - __shared__ TempStorage s_tmp; - return s_tmp; - } - - /** - * Distance dist_calc(d_base, d_query, blockIdx.x); - */ - __device__ __forceinline__ Distance(const BaseT* d_base, const BaseT* d_query, const KeyT n) - : d_base(d_base), s_temp_storage(PrivateTmpStorage()) { - loadQueryPos(d_query+static_cast(n) * D); - } - - /** - * Distance dist_calc(d_base, blockIdx.x); - */ - __device__ __forceinline__ Distance(const BaseT* d_base, const KeyT n) - : d_base(d_base), s_temp_storage(PrivateTmpStorage()) { - loadQueryPos(d_base+static_cast(n) * D); - } - - template ::type = 0> // euclidean distance version - __device__ __forceinline__ void loadQueryPos(const BaseT* d_query) - { - for (int item = 0; item < ITEMS_PER_THREAD; ++item) { - const int read_dim = item * BLOCK_DIM_X + threadIdx.x; - if (read_dim < D) { - r_query[item] = *(d_query+read_dim); - } - } - } - template ::type = 0> // cosine similarity version - __device__ __forceinline__ void loadQueryPos(const BaseT* d_query) - { - ValueT r_query_norm = 0.0f; - for (int item = 0; item < ITEMS_PER_THREAD; ++item) { - const int read_dim = item * BLOCK_DIM_X + threadIdx.x; - if (read_dim < D) { - r_query[item] = *(d_query+read_dim); - r_query_norm += r_query[item]*r_query[item]; - } - } - // only needed by thread 0 - this->query_norm = BlockReduceDist(s_temp_storage.dist_temp_storage).Sum(r_query_norm); - } - - /** - * Calculates distance of base vector to other_id vector. - * - * [parallel call]: - * ValueT dist = distCalc.distance(other_id) - * - * Return: - * ValueT distance - * - * Note: distance only valid in first thread. 
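- *
- * A minimal usage sketch (template arguments omitted):
- *
- *   Distance dist_calc(d_base, d_query, blockIdx.x); // load query, all threads
- *   ValueT dist = dist_calc.distance(other_id);      // block-wide reduction
- *   if (!threadIdx.x) use(dist);                     // defined in thread 0 only;
- *                                                    // call distance_synced() when
- *                                                    // every thread needs the value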
- */ - template ::type = 0> // euclidean distance version - __device__ __forceinline__ ValueT distance(const KeyT other_id) { - ValueT r_dist = 0.0f; - for (int item = 0; item < ITEMS_PER_THREAD; ++item) { - const int read_dim = item * BLOCK_DIM_X + threadIdx.x; - if (read_dim < D) { - ValueT pos_other = - r_query[item] - d_base[static_cast(other_id) * D + read_dim]; - r_dist += pos_other * pos_other; - } - } - - return BlockReduceDist(s_temp_storage.dist_temp_storage).Sum(r_dist); - } - template ::type = 0> // cosine similarity version - __device__ __forceinline__ ValueT distance(const KeyT other_id) { - DistanceAndNorm r_dist_and_norm(0.0f, 0.0f); - for (int item = 0; item < ITEMS_PER_THREAD; ++item) { - const int read_dim = item * BLOCK_DIM_X + threadIdx.x; - if (read_dim < D) { - r_dist_and_norm.r_dist += r_query[item] * d_base[static_cast(other_id) * D + read_dim]; - r_dist_and_norm.r_norm += d_base[static_cast(other_id) * D + read_dim]*d_base[static_cast(other_id) * D + read_dim]; - } - } - - DistanceAndNorm dist_and_norm = BlockReduceDistAndNorm(s_temp_storage.dist_and_norm_temp_storage).Reduce(r_dist_and_norm, DistanceAndNorm::Sum()); - // need to normalize by the vectors' lengths (in high dimensions, no vector has length 1.0f) - ValueT norm_sqr = this->query_norm*dist_and_norm.r_norm; - // use negative dot product, as larger values are closer to each other - if (!threadIdx.x) { - if (norm_sqr > 0.0f) - dist_and_norm.r_dist = fabs(1.0f-dist_and_norm.r_dist/sqrt(norm_sqr)); - else - dist_and_norm.r_dist = 1.0f; - } - - return dist_and_norm.r_dist; - } - - /** - * Calculates synced distance of base vector to other_id vector. - * - * [parallel call]: - * ValueT dist = distCalc.distance(other_id) - * - * Return: - * ValueT distance - * - * Note: distance valid in all threads. - */ - __device__ __forceinline__ ValueT distance_synced(const KeyT other_id) { - ValueT dist = distance(other_id); - if (!threadIdx.x) - s_temp_storage.dist = dist; - __syncthreads(); - - return s_temp_storage.dist; - } -}; - -#endif // INCLUDE_GGNN_UTILS_CUDA_KNN_DISTANCE_CUH_ diff --git a/include/ggnn/utils/cuda_knn_ggnn_results.cuh b/include/ggnn/utils/cuda_knn_ggnn_results.cuh deleted file mode 100644 index 48d44bb..0000000 --- a/include/ggnn/utils/cuda_knn_ggnn_results.cuh +++ /dev/null @@ -1,219 +0,0 @@ -/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
Lensch -// - -#ifndef INCLUDE_GGNN_UTILS_CUDA_KNN_GGNN_RESULTS_CUH_ -#define INCLUDE_GGNN_UTILS_CUDA_KNN_GGNN_RESULTS_CUH_ - -#include -#include -#include -#include - -#include -#include -#include - -#include "ggnn/utils/cuda_knn_dataset.cuh" -#include "ggnn/query/cuda_knn_ggnn_query.cuh" - -template -struct GGNNResults { - KeyT* h_sorted_ids_gpu; - ValueT* h_sorted_dists_gpu; - - KeyT* h_sorted_ids; - - const Dataset* dataset; - - const int num_gpus; - const int num_iterations; - - const int num_results_per_gpu; - const int num_results; - - GGNNResults(const Dataset* dataset, - const int num_gpus = 1, const int num_iterations = 1) - : dataset{dataset}, - num_gpus{num_gpus}, - num_iterations{num_iterations}, - num_results_per_gpu{dataset->N_query * KQuery * num_iterations}, - num_results{num_results_per_gpu * num_gpus} { - CHECK_CUDA(cudaMallocHost(&h_sorted_ids_gpu, num_results * sizeof(KeyT), - cudaHostAllocPortable)); - CHECK_CUDA(cudaMallocHost(&h_sorted_dists_gpu, num_results * sizeof(ValueT), - cudaHostAllocPortable)); - - h_sorted_ids = (KeyT*)malloc(dataset->N_query * KQuery * sizeof(KeyT)); - } - - ~GGNNResults() { - cudaFreeHost(h_sorted_ids_gpu); - cudaFreeHost(h_sorted_dists_gpu); - free(h_sorted_ids); - } - - GGNNResults(const GGNNResults&) = delete; - GGNNResults(GGNNResults&&) = delete; - GGNNResults& operator=(const GGNNResults&) = delete; - GGNNResults& operator=(GGNNResults&&) = delete; - - void loadAsync(const GGNNQuery& ggnn_query, - const int gpu_index, const cudaStream_t stream) { - CHECK_CUDA(cudaMemcpyAsync(h_sorted_ids_gpu + num_results_per_gpu * gpu_index, - ggnn_query.d_query_result_ids_sorted, - num_results_per_gpu * sizeof(KeyT), cudaMemcpyDeviceToHost, stream)); - CHECK_CUDA(cudaMemcpyAsync(h_sorted_dists_gpu + num_results_per_gpu * gpu_index, - ggnn_query.d_query_result_dists_sorted, - num_results_per_gpu * sizeof(ValueT), cudaMemcpyDeviceToHost, stream)); - } - - void merge() { - // If there is only one gpu, do just copy over the results to the expected memory. 
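-    // Layout: h_sorted_ids_gpu holds num_gpus consecutive segments, each with
-    // N_query rows of KQuery * num_iterations sorted candidate ids local to
-    // that GPU's shard. The merge below reduces them to the global top-KQuery
-    // per query, mapping local ids back via partition * N_partition + local id.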
- if(num_gpus == 1) { - if(num_iterations == 1){ - std::copy_n(h_sorted_ids_gpu, num_results_per_gpu, h_sorted_ids); - } - else{ - for (int n = 0; n < dataset->N_query; n++) { - std::copy_n(h_sorted_ids_gpu + n * KQuery * num_iterations, KQuery, h_sorted_ids + n * KQuery); - } - } - return; - } - const int N_partition = dataset->N_base / num_gpus; - const int stride = KQuery * num_iterations; - - auto start = std::chrono::steady_clock::now(); - - auto mergeResultPart = [&](int begin, int end) -> void { - struct KeyDistPartition { - KeyT key; - ValueT dist; - int partition; - - KeyDistPartition(KeyT key, ValueT dist, int partition) - : key(key), dist(dist), partition(partition) {} - }; - auto compare_heap = [](const KeyDistPartition& a, - const KeyDistPartition& b) -> bool { - return a.dist >= b.dist; - }; - - const int num_parts = num_gpus; - std::vector part_offsets(num_parts, 1); - - std::vector heap; - heap.reserve(num_parts); - for (int n = begin; n < end; ++n) { - heap.clear(); - std::fill(part_offsets.begin(), part_offsets.end(), 1); - // fill heap with min per partition - for (int part_id = 0; part_id < num_parts; ++part_id) { - const size_t pos = (part_id * dataset->N_query + n) * stride; - heap.emplace_back(h_sorted_ids_gpu[pos], h_sorted_dists_gpu[pos], - part_id); - } - std::make_heap(heap.begin(), heap.end(), compare_heap); - // pop min and insert from popped partition until full - // we can safely assume not to run out of bounds within each partition - for (int k = 0; k < KQuery; ++k) { - const KeyDistPartition top = heap.front(); - h_sorted_ids[n * KQuery + k] = top.partition * N_partition + top.key; - if (k == KQuery - 1) break; - - std::pop_heap(heap.begin(), heap.end(), compare_heap); - heap.pop_back(); - const size_t pos = (top.partition * dataset->N_query + n) * stride + - part_offsets[top.partition]; - ++part_offsets[top.partition]; - heap.emplace_back(h_sorted_ids_gpu[pos], h_sorted_dists_gpu[pos], - top.partition); - std::push_heap(heap.begin(), heap.end(), compare_heap); - } - } - }; - std::vector mergeThreads; - - int num_threads = std::min(dataset->N_query, - int(std::thread::hardware_concurrency())); - int elements_per_bin = (dataset->N_query+num_threads-1)/num_threads; - mergeThreads.reserve(num_threads); - for (int i=0; iN_query, (i+1)*elements_per_bin)); - } - for (auto&& t : mergeThreads) { - t.join(); - } - - auto end = std::chrono::steady_clock::now(); - auto cpu_ms = - std::chrono::duration_cast(end - start); - VLOG(0) << "[CPU] partial merge completed. 
" << cpu_ms.count() << " ms."; - } - - void evaluateResults() { - int c1 = 0; - int c1_including_duplicates = 0; - int cKQuery = 0; - int cKQuery_including_duplicates = 0; - int rKQuery = 0; - int rKQuery_including_duplicates = 0; - - for (int n = 0; n < dataset->N_query; n++) { - const uint8_t endTop1 = dataset->top1DuplicateEnd.at(n); - const uint8_t endTopK = dataset->topKDuplicateEnd.at(n); - - CHECK_LE(endTopK, dataset->K_gt); - - for (int k_result = 0; k_result < KQuery; k_result++) { - KeyT q = h_sorted_ids[n * KQuery + k_result]; - CHECK_GE(q, 0) << "n: " << n << " k: " << k_result; - CHECK_LT(q, dataset->N_base) << "n: " << n << " k: " << k_result; - for (int k_gt = 0; k_gt < endTopK; k_gt++) { - KeyT gt = dataset->gt[n * dataset->K_gt + k_gt]; - if (q == gt) { - if (!k_gt) { - if (!k_result) ++c1; - if (k_gt < KQuery) ++rKQuery; - ++rKQuery_including_duplicates; - } - if (k_gt < endTop1) { - if (!k_result) ++c1_including_duplicates; - } - if (k_gt < KQuery) ++cKQuery; - ++cKQuery_including_duplicates; - continue; - } - } - } - } - - const float inv_num_points = 1.0f / dataset->N_query; - - LOG(INFO) << "c@1 (=r@1): " << c1 * inv_num_points - << " +duplicates: " << c1_including_duplicates * inv_num_points; - if (KQuery <= dataset->K_gt) { - LOG(INFO) << "c@" << KQuery << ": " << cKQuery * inv_num_points / KQuery - << " +duplicates: " - << cKQuery_including_duplicates * inv_num_points / KQuery; - } - LOG(INFO) << "r@" << KQuery << ": " << rKQuery * inv_num_points - << " +duplicates: " - << rKQuery_including_duplicates * inv_num_points; - } -}; - -#endif // INCLUDE_GGNN_UTILS_CUDA_KNN_GGNN_RESULTS_CUH_ diff --git a/include/ggnn/utils/cuda_knn_k_best_list.cuh b/include/ggnn/utils/cuda_knn_k_best_list.cuh deleted file mode 100644 index de0c6d4..0000000 --- a/include/ggnn/utils/cuda_knn_k_best_list.cuh +++ /dev/null @@ -1,224 +0,0 @@ -/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -// Authors: Fabian Groh, Patrick Wieschollek, Hendrik P.A. Lensch - -#ifndef INCLUDE_GGNN_UTILS_CUDA_KNN_K_BEST_LIST_CUH_ -#define INCLUDE_GGNN_UTILS_CUDA_KNN_K_BEST_LIST_CUH_ - -#include -#include - -#include -#include - -/** - * KBestList stores the K best elements in parallel. 
- */ -template -struct KBestList { - // this allows for loop unrolling - static constexpr int ITERATIONS_FOR_K = (K+BLOCK_DIM_X-1)/BLOCK_DIM_X; - - ValueT* dists; - KeyT* ids; - - static constexpr KeyT EMPTY_KEY = -1; - - __device__ __forceinline__ void initSharedStorage() { - __shared__ ValueT s_dists[K]; - __shared__ KeyT s_ids[K]; - dists = reinterpret_cast(s_dists); - ids = reinterpret_cast(s_ids); - } - - __device__ __forceinline__ void init() { - for (int i=0; i < ITERATIONS_FOR_K; ++i) { - const int k = i*BLOCK_DIM_X+threadIdx.x; - if (k < K) { - dists[k] = std::numeric_limits::infinity(); - ids[k] = EMPTY_KEY; - } - } - __syncthreads(); - } - - __device__ __forceinline__ KBestList() { - initSharedStorage(); - init(); - } - - __device__ __forceinline__ ValueT worst() { return dists[K - 1]; } - - /** - * Enters element with dist and id to list. [parallel call]: - * Only enters the object if the id is not already in the list. - * On same distances the entry is placed to the right. - * - * `list.add(dist, id)` - * - * Note: __syncthreads() need before next 'list' call. - * - */ - __device__ __forceinline__ void add(ValueT dist, KeyT id) { - __shared__ bool s_enter; - if (!threadIdx.x) s_enter = true; - __syncthreads(); - ValueT r_dist[ITERATIONS_FOR_K]; - KeyT r_id[ITERATIONS_FOR_K]; - for (int i=0; i < ITERATIONS_FOR_K; ++i) { - const int k = i*BLOCK_DIM_X+threadIdx.x; - if (k < K) { - r_dist[i] = dists[threadIdx.x]; - r_id[i] = ids[threadIdx.x]; - if (r_id[i] == id) s_enter = false; - } - } - __syncthreads(); - if (!s_enter) - return; - for (int i=0; i < ITERATIONS_FOR_K; ++i) { - const int k = i*BLOCK_DIM_X+threadIdx.x; - if (k < K) { - if (r_dist[i] > dist) { - if (k < (K - 1)) { - dists[k + 1] = r_dist[i]; - ids[k + 1] = r_id[i]; - } - - if (!k || dists[k - 1] <= dist) { - dists[k] = dist; - ids[k] = id; - } - } - } - } - } - - /** - * Enters element with dist and id to list. [parallel call]: - * Only enters the object if the id is not already in the list. - * On same distances the entry is placed to the left. - * - * `list.add_priority(dist, id)` - * - * Note: __syncthreads() need before next 'list' call. - * - */ - __device__ __forceinline__ void add_priority(ValueT dist, KeyT id) { - __shared__ bool s_enter; - if (!threadIdx.x) s_enter = true; - __syncthreads(); - ValueT r_dist[ITERATIONS_FOR_K]; - KeyT r_id[ITERATIONS_FOR_K]; - for (int i=0; i < ITERATIONS_FOR_K; ++i) { - const int k = i*BLOCK_DIM_X+threadIdx.x; - if (k < K) { - r_dist[i] = dists[threadIdx.x]; - r_id[i] = ids[threadIdx.x]; - if (r_id[i] == id) s_enter = false; - } - } - __syncthreads(); - if (!s_enter) - return; - - for (int i=0; i < ITERATIONS_FOR_K; ++i) { - const int k = i*BLOCK_DIM_X+threadIdx.x; - if (k < K) { - if (r_dist[i] >= dist) { - if (k < (K - 1)) { - dists[k + 1] = r_dist[i]; - ids[k + 1] = r_id[i]; - } - - if (!k || dists[k - 1] < dist) { - dists[k] = dist; - ids[k] = id; - } - } - } - } - } - - /** - * Enters a (assumed to be) unique element with dist and id to list. [parallel - * call]: - * There is no check if the element is already in the list. - * - * `list.add_unique(dist, unique_id)` - * - * Note: __syncthreads() need before next 'list' call. 
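- *
- * In summary, the three insertion variants differ only in duplicate handling
- * and tie-breaking at equal distance:
- *   add(dist, id)          - skips ids already in the list, ties go right
- *   add_priority(dist, id) - skips ids already in the list, ties go left
- *   add_unique(dist, id)   - no duplicate check, ties go right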
- * - */ - __device__ __forceinline__ void add_unique(ValueT dist, KeyT unique_id) { - ValueT r_dist[ITERATIONS_FOR_K]; - KeyT r_id[ITERATIONS_FOR_K]; - for (int i=0; i < ITERATIONS_FOR_K; ++i) { - const int k = i*BLOCK_DIM_X+threadIdx.x; - if (k < K) { - r_dist[i] = dists[k]; - r_id[i] = ids[k]; - } - } - __syncthreads(); - for (int i=0; i < ITERATIONS_FOR_K; ++i) { - const int k = i*BLOCK_DIM_X+threadIdx.x; - if (k < K) { - if (r_dist[i] > dist) { - if (k < (K - 1)) { - dists[k + 1] = r_dist[i]; - ids[k + 1] = r_id[i]; - } - - if (!k || dists[k - 1] <= dist) { - dists[k] = dist; - ids[k] = unique_id; - } - } - } - } - } - - /** - * Transforms all ids w.r.t. a transformation list. [parallel call]: - * - * `list.transform(transform_list)` - * - * Note: __syncthreads() need before next 'list' call. - * - */ - __device__ __forceinline__ void transform(const KeyT* transform) { - for (int i=0; i < ITERATIONS_FOR_K; ++i) { - const int k = i*BLOCK_DIM_X+threadIdx.x; - if (k < K) { - const KeyT id = ids[k]; - if (id >= 0) ids[k] = transform[id]; - } - } - } - - __device__ __forceinline__ void print(int len = -1) { - __syncthreads(); - if (!threadIdx.x) { - printf("KBestList: \n"); - for (int i = 0; i < K && (len < 0 || i < len); i++) { - printf("(%d -> %f [%d]) ", i, dists[i], ids[i]); - } - printf("\n"); - } - } -}; - -#endif // INCLUDE_GGNN_UTILS_CUDA_KNN_K_BEST_LIST_CUH_ diff --git a/include/ggnn/utils/cuda_knn_utils.cuh b/include/ggnn/utils/cuda_knn_utils.cuh deleted file mode 100644 index 1253921..0000000 --- a/include/ggnn/utils/cuda_knn_utils.cuh +++ /dev/null @@ -1,73 +0,0 @@ -/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
Lensch -#ifndef INCLUDE_GGNN_UTILS_CUDA_KNN_UTILS_CUH_ -#define INCLUDE_GGNN_UTILS_CUDA_KNN_UTILS_CUH_ - -#include -#include - -#include -#include - -enum DistanceMeasure : int { - Euclidean = 0, - Cosine = 1 -}; - -template -__global__ void launcher(const T kernel) { - kernel(); -} - -#define CHECK_CUDA(ans) \ - { gpuAssert((ans), __FILE__, __LINE__); } -inline void gpuAssert(cudaError_t code, const char* file, int line, - bool abort = true) { - if (code != cudaSuccess) { - if (abort) - LOG(FATAL) << "GPUassert: " << cudaGetErrorString(code) << " " << file << " " << line << "\n"; - else - LOG(ERROR) << "GPUassert: " << cudaGetErrorString(code) << " " << file << " " << line << "\n"; - } -} - -template -float time_launcher(const int log_level, T* kernel, int N, cudaStream_t stream = 0) { - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - - cudaEventRecord(start, stream); - kernel->launch(stream); - cudaEventRecord(stop, stream); - cudaEventSynchronize(stop); - - float milliseconds = 0; - cudaEventElapsedTime(&milliseconds, start, stop); - - VLOG(log_level) << milliseconds << " ms for " << N << " queries -> " << milliseconds*1000.0f/N << " us/query \n"; - cudaEventDestroy(start); - cudaEventDestroy(stop); - - return milliseconds; -} - -template -void launcher(const int log_level, T* kernel, int N, cudaStream_t stream = 0) { - kernel->launch(stream); -} - -#endif // INCLUDE_GGNN_UTILS_CUDA_KNN_UTILS_CUH_ diff --git a/include/ggnn/utils/hnswlib_loader.hpp b/include/ggnn/utils/hnswlib_loader.hpp deleted file mode 100644 index 6cfce17..0000000 --- a/include/ggnn/utils/hnswlib_loader.hpp +++ /dev/null @@ -1,123 +0,0 @@ -/* Copyright 2021 ComputerGraphics Tuebingen. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch - -#ifndef HNSWLIB_LOADER_HPP_ -#define HNSWLIB_LOADER_HPP_ - -#include -#include - -#include -#include -#include - -/** - * HNSW loader (for known parameters only to allow for easy access to the data) - * @param ValueT datatype of the dataset, e.g. 
char, int, float - * @param D dimension of the dataset - * @param M maximum number of neighbors in the graph = K/2 - * - * based on Hnswlib https://github.com/nmslib/hnswlib.git - * commit bbddf198ffc607e321a65fad159cd4b984da651b - */ -template -struct HNSWLoader { - struct __attribute__((__packed__)) HNSWLevel0Element { - uint16_t link_count; - uint16_t padding; - uint32_t links[M * 2]; - ValueT base_vector[D]; - size_t label; // unaligned - }; - - /* - // not relevant for GGNN-query - struct HNSWUpperLevelElement { - uint16_t link_count; - uint16_t padding; - uint32_t links[M]; - }; - */ - - struct HNSWHeader { // based on HierarchicalNSW::saveIndex() - size_t offsetLevel0_; - size_t max_elements_; - size_t cur_element_count; - size_t size_data_per_element_; - size_t label_offset_; - size_t offsetData_; - int32_t maxlevel_; - uint32_t enterpoint_node_; - size_t maxM_; - size_t maxM0_; - size_t M_; - double mult_; - size_t ef_construction_; - - bool verify() { - CHECK_EQ(M_, M); - CHECK_EQ(maxM0_, 2 * M_); - const size_t size_links_level0_ = - maxM0_ * sizeof(uint32_t) + sizeof(uint32_t); - const size_t data_size_ = D * sizeof(ValueT); - CHECK_EQ(size_data_per_element_, - size_links_level0_ + data_size_ + sizeof(size_t)); - CHECK_EQ(size_data_per_element_, sizeof(HNSWLevel0Element)); - return true; - } - } hnsw_header; - - std::vector data_level0_memory_; // [N]; - - /* - // not relevant for GGNN-query - std::vector element_levels; // [N] - std::vector linkLists_; // [sum(element_levels)] - */ - - HNSWLoader(const std::string& filename) { - // open, verify, load L0, done. (ignore upper levels) - std::ifstream hnsw_index_file(filename, - std::ios_base::in | std::ios_base::binary); - CHECK(hnsw_index_file.is_open()); - - hnsw_index_file.seekg(0, std::ios_base::end); - size_t filesize = hnsw_index_file.tellg(); - hnsw_index_file.seekg(0, std::ios_base::beg); - - CHECK_GT(filesize, sizeof(HNSWHeader)); - - hnsw_index_file.read(reinterpret_cast(&hnsw_header), - sizeof(HNSWHeader)); - - CHECK(hnsw_header.verify()); - CHECK_GE(filesize, sizeof(HNSWHeader) + sizeof(HNSWLevel0Element) * - hnsw_header.cur_element_count); - - data_level0_memory_.resize(hnsw_header.cur_element_count); - - hnsw_index_file.read( - reinterpret_cast(data_level0_memory_.data()), - sizeof(HNSWLevel0Element) * hnsw_header.cur_element_count); - - hnsw_index_file.close(); - - LOG(INFO) << "read HNSW base layer containing " - << hnsw_header.cur_element_count << " elements."; - } -}; - -#endif // HNSWLIB_LOADER_HPP_ diff --git a/include/io/loader.hpp b/include/io/loader.hpp deleted file mode 100644 index 99eb91c..0000000 --- a/include/io/loader.hpp +++ /dev/null @@ -1,65 +0,0 @@ -/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
Lensch - -#ifndef INCLUDE_IO_LOADER_HPP_ -#define INCLUDE_IO_LOADER_HPP_ - -#include -#include -#include - -template -class Loader { - public: - Loader() : dimension(0), num_elements(0) {} - - explicit Loader(const std::string& path) : path(path) { - hnd = new std::ifstream(path, std::ios_base::in | std::ios_base::binary); - - if (!hnd->good()) { - hnd->close(); - throw std::runtime_error("Dataset file " + path + " does not exists"); - } - } - - virtual ~Loader() { - try { - if (hnd->is_open()) { - this->hnd->close(); - } - delete hnd; - } catch (...) { - std::cout << "could not close \n"; - } - } - - /** - * load vectors - * @param skip number of vectors to skip (not bytes, not values) - * @param num number of elements to read - */ - virtual void load(ValueT* dst, size_t skip, size_t num) = 0; - - int32_t Dim() const { return dimension; } - int32_t Num() const { return num_elements; } - std::string Path() const { return path; } - - protected: - std::string path; - std::ifstream* hnd; - - int32_t dimension; - int32_t num_elements; -}; - -#endif // INCLUDE_IO_LOADER_HPP_ diff --git a/include/io/loader_ann.hpp b/include/io/loader_ann.hpp deleted file mode 100644 index 6c0e019..0000000 --- a/include/io/loader_ann.hpp +++ /dev/null @@ -1,71 +0,0 @@ -/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
Lensch - -#ifndef INCLUDE_IO_LOADER_ANN_HPP_ -#define INCLUDE_IO_LOADER_ANN_HPP_ - -#include -#include -#include "loader.hpp" - -template -class XVecsLoader : public Loader { - public: - explicit XVecsLoader(const std::string& path) : Loader(path) { - // find dimension - this->hnd->seekg(0, std::ios::beg); - this->hnd->read(reinterpret_cast(&this->dimension), sizeof(int)); - - size_t stride = sizeof(uint32_t) + this->dimension * sizeof(ValueT); - - // calc file size - this->hnd->seekg(0, std::ios::beg); - std::streampos fsize = this->hnd->tellg(); - this->hnd->seekg(0, std::ios::end); - fsize = this->hnd->tellg() - fsize; - - this->num_elements = fsize / stride; - this->hnd->seekg(0, std::ios::beg); - - DLOG(INFO) << "Open " << path << " with " << this->num_elements << " " - << this->dimension << "-dim vectors."; - } - - void load(ValueT* dst, size_t skip, size_t num) override { - DLOG(INFO) << "Loading " << num << " vectors starting at " << skip - << " ..."; - - size_t stride = 1 * sizeof(uint32_t) + this->dimension * sizeof(ValueT); - this->hnd->seekg(stride * skip); - - int32_t dim; - - for (size_t n = 0; n < num; ++n) { - // skip dimension - this->hnd->read(reinterpret_cast(&dim), sizeof(int32_t)); - CHECK_EQ(dim, this->dimension) << "dimension mismatch"; - - this->hnd->read(reinterpret_cast(dst), - this->dimension * sizeof(ValueT)); - dst += this->dimension; - } - - DLOG(INFO) << "Done"; - } -}; - -using FVecsLoader = XVecsLoader; -using IVecsLoader = XVecsLoader; -using BVecsLoader = XVecsLoader; - -#endif // INCLUDE_IO_LOADER_ANN_HPP_ diff --git a/include/io/storer.hpp b/include/io/storer.hpp deleted file mode 100644 index 52332c6..0000000 --- a/include/io/storer.hpp +++ /dev/null @@ -1,60 +0,0 @@ -/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
Lensch - -#ifndef INCLUDE_IO_STORER_HPP_ -#define INCLUDE_IO_STORER_HPP_ - -#include -#include - -template -class Storer { - public: - Storer() {} - - explicit Storer(std::string path, uint dimension, uint num_elements) - : path(path), dimension(dimension), num_elements(num_elements) { - hnd = new std::ofstream(path, std::ios_base::out | std::ios_base::binary | - std::ios_base::trunc); - - if (!hnd->good()) { - hnd->close(); - throw std::runtime_error("Not able to write to path: " + path); - } - } - - virtual ~Storer() { - this->hnd->close(); - delete hnd; - } - - /** - * load vectors - * @param skip number of vectors to skip (not bytes, not values) - * @param num number of elements to read - */ - virtual void store(ValueT* dst, size_t num) = 0; - - uint Dim() const { return dimension; } - uint Num() const { return num_elements; } - std::string Path() const { return path; } - - protected: - std::string path; - std::ofstream* hnd; - - uint dimension; - uint num_elements; -}; - -#endif // INCLUDE_IO_STORER_HPP_ diff --git a/include/io/storer_ann.hpp b/include/io/storer_ann.hpp deleted file mode 100644 index 23445df..0000000 --- a/include/io/storer_ann.hpp +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
Lensch - -#ifndef INCLUDE_IO_STORER_ANN_HPP_ -#define INCLUDE_IO_STORER_ANN_HPP_ - -#include -#include -#include "storer.hpp" - -template -class XVecsStorer : public Storer { - public: - explicit XVecsStorer(std::string path, uint dimension, uint num_elements) - : Storer(path, dimension, num_elements) {} - - void store(ValueT* dst, size_t num) override { - for (uint n = 0; n < num; ++n) { - this->hnd->write(reinterpret_cast(&this->dimension), sizeof(int)); - this->hnd->write(reinterpret_cast(&dst[n * this->dimension]), - sizeof(ValueT) * this->dimension); - } - } -}; - -using FVecsStorer = XVecsStorer; -using IVecsStorer = XVecsStorer; - -#endif // INCLUDE_IO_STORER_ANN_HPP_ diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..9d675b9 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,51 @@ +[project] # Project metadata +name = "GGNN" +readme = "README.md" +requires-python = ">=3.8" +license = { "file" = "LICENSE" } +authors = [ + {"name" = "Lukas Ruppert", "email" = "lukas.ruppert@uni-tuebingen.de"}, + {"name" = "Deborah Kornwolf", "email" = "deborah.kornwolf@uni-tuebingen.de"},] +keywords = ["kNN", "cuda", "nearest-neighbor", "graph", "GPU"] +classifiers = [ + "Development Status :: 4 - Beta", + "Environment :: GPU :: NVIDIA CUDA :: 12", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: Apache Software License", + "Operating System :: POSIX :: Linux", + "Programming Language :: C++", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Scientific/Engineering", + "Topic :: Software Development", + "Topic :: Software Development :: Libraries", + "Topic :: Software Development :: Libraries :: Python Modules" +] + +dependencies = [] +dynamic = ["version", "description"] + +[project.urls] +Homepage = "https://github.com/cgtuebingen/ggnn" +# TODO: Documentation = "https://ggnn.readthedocs.io/" + +[build-system] # How pip and other frontends should build this project +requires = ["py-build-cmake~=0.3.3", "nanobind", "typing_extensions"] +build-backend = "py_build_cmake.build" + +[tool.py-build-cmake.module] # Where to find the Python module to package +directory = "python-src" + +[tool.py-build-cmake.sdist] # What to include in source distributions +include = ["CMakeLists.txt", "python-src/*", "src/*", "include/*"] + +[tool.py-build-cmake.cmake] # How to build the CMake project +build_type = "Release" +source_path = "." +build_args = ["-j"] +install_components = ["python_modules"] diff --git a/python-src/ggnn/__init__.py b/python-src/ggnn/__init__.py new file mode 100644 index 0000000..a17388f --- /dev/null +++ b/python-src/ggnn/__init__.py @@ -0,0 +1,6 @@ +"""GGNN: Graph-Based GPU Nearest Neighbor Search""" + +__version__ = '0.9.0' + +# import C++ module +from .GGNN import * diff --git a/src/deep1b_multi_gpu.cu b/src/deep1b_multi_gpu.cu deleted file mode 100644 index 7e54b34..0000000 --- a/src/deep1b_multi_gpu.cu +++ /dev/null @@ -1,173 +0,0 @@ -/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch -// - -#ifndef CUDA_API_PER_THREAD_DEFAULT_STREAM -#define CUDA_API_PER_THREAD_DEFAULT_STREAM -#endif - -#include -#include -#include -#include -#include -#include -#include - -#include - -// only needed for file_exists check -#include - -inline bool file_exists(const std::string& name) { - struct stat buffer; - return (stat(name.c_str(), &buffer) == 0); -} - -#include -#include - -#include "ggnn/cuda_knn_ggnn_multi_gpu.cuh" -#include "ggnn/utils/cuda_knn_constants.cuh" - -DEFINE_string( - mode, "bq", - "Mode: bq -> build_and_query, bs -> build_and_store, lq -> load_and_query"); -DEFINE_string(base_filename, "", "path to file with base vectors"); -DEFINE_string(query_filename, "", "path to file with perform_query vectors"); -DEFINE_string(groundtruth_filename, "", - "path to file with groundtruth vectors"); -DEFINE_string(graph_filename, "", - "path to file that contains the serialized graph"); -DEFINE_string(graph_dir, "./", "directory to store and load ggnn graph files."); -DEFINE_double(tau, 0.5, "Parameter tau"); -DEFINE_int32(factor, 1000000, "Factor"); -DEFINE_int32(base, 1, "N_base: base x factor"); -DEFINE_int32(shard, 1, "N_shard: shard x factor"); -DEFINE_int32(refinement_iterations, 2, "Number of refinement iterations"); -DEFINE_string(gpu_ids, "0", "GPU id"); -DEFINE_bool(grid_search, false, - "Perform queries for a wide range of parameters."); - -int main(int argc, char* argv[]) { - google::InitGoogleLogging(argv[0]); - google::LogToStderr(); - - gflags::SetUsageMessage( - "GGNN: Graph-based GPU Nearest Neighbor Search\n" - "by Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
" - "Lensch\n" - "(c) 2020 Computer Graphics University of Tuebingen"); - gflags::SetVersionString("1.0.0"); - google::ParseCommandLineFlags(&argc, &argv, true); - - LOG(INFO) << "Reading files"; - CHECK(file_exists(FLAGS_base_filename)) - << "File for base vectors has to exist"; - CHECK(file_exists(FLAGS_query_filename)) - << "File for perform_query vectors has to exist"; - - CHECK_GE(FLAGS_tau, 0) << "Tau has to be bigger or equal 0."; - CHECK_GE(FLAGS_refinement_iterations, 0) - << "The number of refinement iterations has to be non-negative."; - - // #################################################################### - // compile-time configuration - // - // data types - // - /// data type for addressing points (needs to be able to represent N) - using KeyT = int32_t; - /// data type of the dataset (e.g., char, int, float) - using BaseT = float; - /// data type of computed distances - using ValueT = float; - /// data type for addressing base-vectors (needs to be able to represent N*D) - using BAddrT = uint64_t; - /// data type for addressing the graph (needs to be able to represent - /// N*KBuild) - using GAddrT = uint64_t; - // - // dataset configuration (here: DEEP1B) - // - /// dimension of the dataset - const int D = 96; - /// distance measure (Euclidean or Cosine) - const DistanceMeasure measure = Euclidean; - // - // search-graph configuration - // - /// number of neighbors per point in the graph - const int KBuild = 24; - /// maximum number of inverse/symmetric links (KBuild / 2 usually works best) - const int KF = KBuild / 2; - /// segment/batch size (needs to be > KBuild-KF) - const int S = 32; - /// graph height / number of layers (4 usually performs best) - const int L = 4; - // - // query configuration - // - /// number of neighbors to search for - const int KQuery = 10; - - static_assert(KBuild - KF < S, - "there are not enough points to fill the local neighbor list!"); - - LOG(INFO) << "Using the following parameters " << KBuild << " (KBuild) " << KF - << " (KF) " << S << " (S) " << L << " (L) " << D << " (D) "; - - std::istringstream iss(FLAGS_gpu_ids); - std::vector results(std::istream_iterator{iss}, - std::istream_iterator()); - - int numGpus; - cudaGetDeviceCount(&numGpus); - - std::vector gpus; - for (auto&& r : results) { - int gpu_id = atoi(r.c_str()); - printf("GPU %d: ", gpu_id); - { - CHECK_GE(gpu_id, 0) << "This GPU does not exist"; - CHECK_LT(gpu_id, numGpus) << "This GPU does not exist"; - - cudaDeviceProp prop; - cudaGetDeviceProperties(&prop, gpu_id); - printf("Found device name: %s\n", prop.name); - - gpus.push_back(gpu_id); - } - } - - const size_t N_base = FLAGS_base * FLAGS_factor; - const int N_shard = FLAGS_shard * FLAGS_factor; - - typedef GGNNMultiGPU - GGNN; - GGNN ggnn{ - FLAGS_base_filename, - FLAGS_query_filename, - file_exists(FLAGS_groundtruth_filename) ? FLAGS_groundtruth_filename : "", - L, - static_cast(FLAGS_tau), - N_base}; - - ggnn.ggnnMain(gpus, FLAGS_mode, N_shard, FLAGS_graph_dir, - FLAGS_refinement_iterations, FLAGS_grid_search); - - printf("done! \n"); - gflags::ShutDownCommandLineFlags(); - return 0; -} diff --git a/src/ggnn/base/data.cu b/src/ggnn/base/data.cu new file mode 100644 index 0000000..dcae768 --- /dev/null +++ b/src/ggnn/base/data.cu @@ -0,0 +1,217 @@ +/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch +// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf + +#include + +#include + +#include + +#include +#include +#include +#include + +#include + +namespace ggnn { + +namespace detail { + +DataLocation disown(DataLocation location) +{ + switch (location) { + case DataLocation::FOREIGN_GPU: + DLOG(WARNING) << "ownership has already been removed"; + case DataLocation::GPU: + case DataLocation::MANAGED: + return DataLocation::FOREIGN_GPU; + + case DataLocation::FOREIGN_CPU: + DLOG(WARNING) << "ownership has already been removed"; + case DataLocation::CPU_PINNED: + case DataLocation::CPU_MALLOC: + return DataLocation::FOREIGN_CPU; + default:; + } + return DataLocation::UNKNOWN; +}; + +size_t dataSize(DataType type) +{ + switch (type) { + case DataType::BYTE: + case DataType::UINT8: + return 1; + case DataType::INT32: + case DataType::UINT32: + case DataType::FLOAT: + return 4; + default: + LOG(FATAL) << "size for data type " << type << " is unknown."; + } + return 0; +} + +}; // namespace detail + +std::byte* Allocator::cudaMallocChecked(const size_t size) +{ + CHECK_CUDA(cudaPeekAtLastError()); + std::byte* placeholder; + const cudaError_t result = cudaMalloc(&placeholder, size); + CHECK_EQ(result, cudaSuccess) << "failed to allocate " << size << " bytes of GPU memory."; + return placeholder; +} +std::byte* Allocator::cudaMallocManagedChecked(const size_t size) +{ + CHECK_CUDA(cudaPeekAtLastError()); + std::byte* placeholder; + const cudaError_t result = cudaMallocManaged(&placeholder, size); + CHECK_EQ(result, cudaSuccess) << "failed to allocate " << size << " bytes of managed GPU memory."; + return placeholder; +} +std::byte* Allocator::cudaMallocHostChecked(const size_t size, const unsigned int flags) +{ + CHECK_CUDA(cudaPeekAtLastError()); + std::byte* placeholder; + const cudaError_t result = cudaMallocHost(&placeholder, size, flags); + CHECK_EQ(result, cudaSuccess) << "failed to allocate " << size << " bytes of pinned CPU memory."; + return placeholder; +} + +std::byte* Allocator::mallocChecked(const size_t size) +{ + CHECK_CUDA(cudaPeekAtLastError()); + std::byte* placeholder = reinterpret_cast(std::malloc(size)); + CHECK_NOTNULL(placeholder); + return placeholder; +} + +void Allocator::allocateData(Allocation& alloc, uint32_t flags) +{ + alloc.mem = [&alloc, &flags]() -> void* { + switch (alloc.location) { + case DataLocation::GPU: + CHECK_CUDA(cudaSetDevice(alloc.gpu_id)); + return cudaMallocChecked(alloc.required_size_bytes()); + break; + case DataLocation::MANAGED: + CHECK_CUDA(cudaSetDevice(alloc.gpu_id)); + return cudaMallocManagedChecked(alloc.required_size_bytes()); + break; + case DataLocation::CPU_PINNED: + return cudaMallocHostChecked(alloc.required_size_bytes(), flags); + break; + case DataLocation::CPU_MALLOC: + return mallocChecked(alloc.required_size_bytes()); + break; + case DataLocation::FOREIGN_GPU: + case DataLocation::FOREIGN_CPU: + case DataLocation::UNKNOWN: + LOG(ERROR) << "cannot allocate data to 
unknown or foreign location."; + } + return nullptr; + }(); +} + +void Allocator::freeData(Allocation& alloc) +{ + if (alloc.mem) { + switch (alloc.location) { + case DataLocation::GPU: + case DataLocation::MANAGED: + CHECK_CUDA(cudaSetDevice(alloc.gpu_id)); + CHECK_CUDA(cudaFree(alloc.mem)); + break; + case DataLocation::CPU_PINNED: + CHECK_CUDA(cudaFreeHost(alloc.mem)); + break; + case DataLocation::CPU_MALLOC: + std::free(alloc.mem); + break; + case DataLocation::FOREIGN_CPU: + case DataLocation::FOREIGN_GPU: + break; // noop + default: + LOG(WARNING) << "cannot free data from unknown origin."; + } + alloc.mem = nullptr; + } +} + +std::ostream& operator<<(std::ostream& stream, DataType type) +{ + switch (type) { + case DataType::UNKNOWN: + stream << "unknown"; + break; + case DataType::BYTE: + stream << "byte"; + break; + case DataType::UINT8: + stream << "uint8"; + break; + case DataType::INT32: + stream << "int32"; + break; + case DataType::UINT32: + stream << "uint32"; + break; + case DataType::FLOAT: + stream << "float"; + break; + } + return stream; +} + +std::ostream& operator<<(std::ostream& stream, DataLocation location) +{ + switch (location) { + case DataLocation::UNKNOWN: + stream << "unknown"; + break; + case DataLocation::GPU: + stream << "GPU"; + break; + case DataLocation::MANAGED: + stream << "managed"; + break; + case DataLocation::CPU_MALLOC: + stream << "CPU (malloc)"; + break; + case DataLocation::CPU_PINNED: + stream << "CPU (pinned)"; + break; + case DataLocation::FOREIGN_CPU: + stream << "CPU (foreign)"; + break; + case DataLocation::FOREIGN_GPU: + stream << "GPU (foreign)"; + break; + } + return stream; +} + +std::ostream& operator<<(std::ostream& stream, Allocation alloc) +{ + stream << "[alloc: " << alloc.N << "x" << alloc.D << ", " << alloc.type << ", " << alloc.location + << "]"; + return stream; +} + +}; // namespace ggnn diff --git a/src/ggnn/base/dataset.cu b/src/ggnn/base/dataset.cu new file mode 100644 index 0000000..627abc5 --- /dev/null +++ b/src/ggnn/base/dataset.cu @@ -0,0 +1,342 @@ +/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
Lensch +// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf + +#include +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +namespace ggnn { + +GenericDataset::GenericDataset(GenericDataset&& other) noexcept : Allocation{other} +{ + other.releaseOwnership(); +} + +GenericDataset& GenericDataset::operator=(GenericDataset&& other) noexcept +{ + Allocator::freeData(*this); + Allocation::operator=(other); + other.releaseOwnership(); + return *this; +} + +GenericDataset::~GenericDataset() +{ + Allocator::freeData(*this); +} + +GenericDataset GenericDataset::reference() const +{ + return Allocation{.N = N, + .D = D, + .type = type, + .location = detail::disown(location), + .gpu_id = gpu_id, + .mem = mem}; +} +GenericDataset GenericDataset::referenceRange(uint64_t from, uint64_t num) const +{ + CHECK_LE(from + num, N); + return Allocation{.N = num, + .D = D, + .type = type, + .location = detail::disown(location), + .gpu_id = gpu_id, + .mem = reinterpret_cast(mem) + from * D * detail::dataSize(type)}; +} + +template +Dataset Dataset::empty(const uint64_t N, const uint32_t D, bool pin_memory) +{ + Allocation alloc{.N = N, + .D = D, + .type = DataType_v, + .location = pin_memory ? DataLocation::CPU_PINNED : DataLocation::CPU_MALLOC}; + Allocator::allocateData(alloc); + + Dataset result{alloc}; + CHECK_NOTNULL(result.data()); + return result; +} + +template +Dataset Dataset::emptyOnGPU(const uint64_t N, const uint32_t D, int32_t gpu_id) +{ + Allocation alloc{ + .N = N, .D = D, .type = DataType_v, .location = DataLocation::GPU, .gpu_id = gpu_id}; + Allocator::allocateData(alloc); + + Dataset result{alloc}; + CHECK_NOTNULL(result.data()); + return result; +} + +template +Dataset Dataset::copy(const std::span& data, uint32_t D, bool pin_memory) +{ + const uint32_t N = data.size() / D; + CHECK_EQ(N * D, data.size()); + + Dataset result{empty(N, D, pin_memory)}; + std::copy(data.begin(), data.end(), result.data()); + + return result; +} + +GenericDataset GenericDataset::load(const std::filesystem::path& path, uint32_t from, uint32_t num, + bool pin_memory) +{ + if (path.string().ends_with(".fvecs")) + return GenericDataset{Dataset::load(path, from, num, pin_memory)}; + else if (path.string().ends_with(".bvecs")) + return GenericDataset{Dataset::load(path, from, num, pin_memory)}; + else if (path.string().ends_with(".ivecs")) + return GenericDataset{Dataset::load(path, from, num, pin_memory)}; + LOG(FATAL) << "Could not guess file type from " << path + << ". 
fvecs, bvecs, or ivecs file required."; + // to avoid "missing return" - the above LOG(FATAL) will already call abort + abort(); +} + +template +Dataset Dataset::load(const std::filesystem::path& path, uint32_t from, uint32_t num, + bool pin_memory) +{ + std::ifstream file{path, std::ios_base::in | std::ios_base::binary}; + CHECK(file) << "Unable to open file " << path << " for reading."; + + uint32_t D{}; + // read dimension + file.read(reinterpret_cast(&D), sizeof(D)); + + const size_t in_vec_size = sizeof(uint32_t) + D * sizeof(T); + + // calc file size + file.seekg(0, std::ios::end); + std::streampos fsize = file.tellg(); + + uint32_t N = fsize / in_vec_size; + file.seekg(0, std::ios::beg); + + VLOG(1) << "Opened dataset file " << path << " containing " << N << "x" << D << " vectors."; + + if (num != -1U) { + N = std::min(N - from, num); + CHECK_EQ(N, num) << "Dataset contains fewer vectors than requested."; + } + + // read in blocks of 1'000 vectors from the file + const uint32_t blocksize = 1'000; + const uint32_t num_blocks = (N + blocksize - 1) / blocksize; + const bool report_progress = N >= 100'000'000; + const uint32_t report_every = 1'000; + + if (report_progress) + std::cout << "\n[CPU] Allocating memory..." << std::flush; + Dataset result{empty(N, D, pin_memory)}; + std::vector buffer(blocksize * in_vec_size); + + for (uint32_t block = 0; block < num_blocks; ++block) { + const size_t out_pos = static_cast(block) * blocksize; + const size_t in_pos = from + out_pos; + + file.seekg(in_vec_size * in_pos); + file.read(reinterpret_cast(buffer.data()), + std::min(blocksize, N - block * blocksize) * in_vec_size); + + for (size_t i = 0; i < blocksize && out_pos + i < N; ++i) + std::copy_n(buffer.data() + in_vec_size * i + sizeof(uint32_t), D * sizeof(T), + reinterpret_cast(&result[(out_pos + i) * D])); + + if (report_progress) { + if (block % (report_every * 10) == 0) { + std::cout << "\r["; + std::cout.fill('0'); + std::cout.width(2); + std::cout << block * 100 / num_blocks; + std::cout << "%] Loading...\033[K" << std::flush; + } + else if (block % report_every == 0) + std::cout << '.' 
<< std::flush; + } + } + + if (report_progress) + std::cout << "\r\033[K" << std::flush; + + CHECK(file) << "Failed to read vectors from " << path << "."; + + return result; +} + +template +Dataset Dataset::referenceCPUData(T* data, const uint64_t N, const uint32_t D) +{ + return GenericDataset{Allocation{ + .N = N, .D = D, .type = DataType_v, .location = DataLocation::FOREIGN_CPU, .mem = data}}; +} + +template +Dataset Dataset::referenceGPUData(T* data, const uint64_t N, const uint32_t D, int32_t gpu_id) +{ + return GenericDataset{Allocation{.N = N, + .D = D, + .type = DataType_v, + .location = DataLocation::FOREIGN_GPU, + .gpu_id = gpu_id, + .mem = data}}; +} + +template +void Dataset::store(const std::filesystem::path& path) const +{ + std::ofstream file{path, std::ios_base::out | std::ios_base::binary | std::ios_base::trunc}; + CHECK(file) << "Unable to open file " << path << " for writing."; + + static_assert(std::is_same_v); + for (size_t n = 0; n < N; ++n) { + file.write(reinterpret_cast(&D), sizeof(uint32_t)); + file.write(reinterpret_cast(this->data() + n * D), sizeof(T) * D); + } +} + +template +void Dataset::copyTo(Dataset& other, cudaStream_t stream) const +{ + CHECK_NOTNULL(this->data()); + CHECK_NOTNULL(other.data()); + CHECK_GE(other.size_bytes(), this->size_bytes()); + if (this->isCPUAccessible() && other.isCPUAccessible()) { + std::copy(this->begin(), this->end(), other.begin()); + } + else { + cudaMemcpyKind copyType = cudaMemcpyDefault; + if (this->isGPUAccessible() && other.isGPUAccessible()) { + if (this->gpu_id == other.gpu_id) { + copyType = cudaMemcpyDeviceToDevice; + } + else { + // TODO: cudaMemcpyPeer where possible + VLOG(4) << "Copying between different GPUs is not supported - copying through CPU instead."; + Dataset temp = Dataset::empty(N, D, true); + copyTo(temp, stream); + temp.copyTo(other, stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + return; + } + } + else if (this->isCPUAccessible() && other.isGPUAccessible()) + copyType = cudaMemcpyHostToDevice; + else if (this->isGPUAccessible() && other.isCPUAccessible()) + copyType = cudaMemcpyDeviceToHost; + CHECK_CUDA(cudaMemcpyAsync(other.data(), this->data(), this->size_bytes(), copyType, stream)); + } +} + +template +void Dataset::copyRangeTo(uint64_t from, uint64_t num, Dataset& other, cudaStream_t stream) const +{ + CHECK_NOTNULL(this->data()); + CHECK_NOTNULL(other.data()); + const size_t copySize = num * D * sizeof(T); + CHECK_GE(other.size_bytes(), copySize); + if (this->isCPUAccessible() && other.isCPUAccessible()) { + std::copy_n(this->begin() + from * D, num * D, other.begin()); + } + else { + cudaMemcpyKind copyType = cudaMemcpyDefault; + if (this->isGPUAccessible() && other.isGPUAccessible()) { + if (this->gpu_id == other.gpu_id) { + copyType = cudaMemcpyDeviceToDevice; + } + else { + // TODO: cudaMemcpyPeer where possible + VLOG(4) << "Copying between different GPUs is not supported - copying through CPU instead."; + Dataset temp = Dataset::empty(num, D, true); + copyRangeTo(from, num, temp, stream); + temp.copyTo(other, stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + return; + } + } + else if (this->isCPUAccessible() && other.isGPUAccessible()) + copyType = cudaMemcpyHostToDevice; + else if (this->isGPUAccessible() && other.isCPUAccessible()) + copyType = cudaMemcpyDeviceToHost; + CHECK_CUDA(cudaMemcpyAsync(other.data(), this->data() + from * D, copySize, copyType, stream)); + } +} + +template +Dataset Dataset::clone(cudaStream_t stream) const +{ + Dataset result; + switch 
(location) { + case DataLocation::FOREIGN_GPU: + case DataLocation::GPU: + case DataLocation::MANAGED: + result = Dataset::emptyOnGPU(N, D, gpu_id); + break; + case DataLocation::FOREIGN_CPU: + case DataLocation::CPU_MALLOC: + case DataLocation::CPU_PINNED: + result = Dataset::empty(N, D, location == DataLocation::CPU_PINNED); + break; + default: + break; + } + CHECK_NOTNULL(result.data()); + copyTo(result, stream); + return result; +} + +template +Dataset Dataset::referenceOnGPU(int gpu_id, cudaStream_t stream) const +{ + if (this->isGPUAccessible() && this->gpu_id == gpu_id) + return Dataset{reference()}; + + Dataset result = emptyOnGPU(N, D, gpu_id); + copyTo(result, stream); + return result; +} + +template struct Dataset; +template struct Dataset; +template struct Dataset; +template struct Dataset; +template struct Dataset; + +}; // namespace ggnn diff --git a/src/ggnn/base/eval.cpp b/src/ggnn/base/eval.cpp new file mode 100644 index 0000000..d1b08ac --- /dev/null +++ b/src/ggnn/base/eval.cpp @@ -0,0 +1,246 @@ +/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
Lensch
+// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf
+
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+namespace ggnn {
+
+template <typename ValueT, typename BaseT>
+ValueT compute_distance(const std::span<const BaseT>& a, const std::span<const BaseT>& b,
+                        const DistanceMeasure measure)
+{
+  CHECK_EQ(a.size(), b.size());
+  const size_t D = a.size();
+  ValueT distance = 0.0f, a_norm = 0.0f, b_norm = 0.0f;
+  for (size_t d = 0; d < D; ++d) {
+    if (measure == DistanceMeasure::Euclidean) {
+      distance += (static_cast<ValueT>(a[d]) - static_cast<ValueT>(b[d])) *
+                  (static_cast<ValueT>(a[d]) - static_cast<ValueT>(b[d]));
+    }
+    else if (measure == DistanceMeasure::Cosine) {
+      distance += static_cast<ValueT>(a[d]) * static_cast<ValueT>(b[d]);
+      a_norm += static_cast<ValueT>(a[d]) * static_cast<ValueT>(a[d]);
+      b_norm += static_cast<ValueT>(b[d]) * static_cast<ValueT>(b[d]);
+    }
+  }
+  if (measure == DistanceMeasure::Euclidean) {
+    distance = std::sqrt(distance);
+  }
+  else if (measure == DistanceMeasure::Cosine) {
+    if (a_norm * b_norm > 0.0f)
+      distance = std::fabs(1.0f - distance / std::sqrt(a_norm * b_norm));
+    else
+      distance = 1.0f;
+  }
+  return distance;
+}
+
+std::ostream& operator<<(std::ostream& os, const Evaluation& eval)
+{
+  os << "c@1 (=r@1): " << eval.c1;
+  if (!std::isnan(eval.c1_dup))
+    os << " +duplicates: " << eval.c1_dup << '\n';
+  else
+    os << " (duplicates unknown)\n";
+  os << "c@" << eval.KQuery << ": " << eval.cKQuery;
+  if (!std::isnan(eval.cKQuery_dup))
+    os << " +duplicates: " << eval.cKQuery_dup << '\n';
+  else
+    os << " (duplicates unknown)\n";
+  os << "r@" << eval.KQuery << ": " << eval.rKQuery;
+  if (!std::isnan(eval.rKQuery_dup))
+    os << " +duplicates: " << eval.rKQuery_dup;
+  else
+    os << " (duplicates unknown)";
+
+  return os;
+}
+
+template <typename KeyT, typename ValueT>
+Evaluator<KeyT, ValueT>::Evaluator(const GenericDataset& base, const GenericDataset& query,
+                                   const Dataset<KeyT>& gt, const uint32_t KQuery,
+                                   const DistanceMeasure measure)
+    : KQuery{KQuery}, measure{measure}, gt{gt.clone()}
+{
+  if (!base.N || !query.N) {
+    LOG(WARNING)
+        << "Cannot check for duplicates in ground truth indices: No base and/or query data given.";
+    return;
+  }
+  if (!base.isCPUAccessible() || !query.isCPUAccessible()) {
+    LOG(WARNING)
+        << "Cannot check for duplicates in ground truth indices: Data is not CPU-accessible.";
+    return;
+  }
+
+  if (!gt.isCPUAccessible())
+    throw std::runtime_error("Ground truth data needs to be given on the CPU for evaluation.");
+
+  if (!gt_duplicates.top1DuplicateEnd.empty() || !gt_duplicates.topKDuplicateEnd.empty())
+    return;
+
+  CHECK_EQ(base.type, query.type);
+
+  auto compute_distance_base_to_query = [&](size_t base_idx, size_t query_idx) -> ValueT {
+    switch (base.type) {
+      case DataType::FLOAT: {
+        Dataset<float> b = base.reference();
+        Dataset<float> q = query.reference();
+        return compute_distance<ValueT, float>(
+            {&b.at(static_cast<size_t>(base_idx) * base.D), base.D},
+            {&q.at(static_cast<size_t>(query_idx) * query.D), query.D}, measure);
+      }
+      case DataType::UINT8: {
+        Dataset<uint8_t> b = base.reference();
+        Dataset<uint8_t> q = query.reference();
+        return compute_distance<ValueT, uint8_t>(
+            {&b.at(static_cast<size_t>(base_idx) * base.D), base.D},
+            {&q.at(static_cast<size_t>(query_idx) * query.D), query.D}, measure);
+      }
+      default:
+        break;
+    }
+    throw std::runtime_error("unsupported data type");
+  };
+
+  VLOG(2) << "searching for duplicates in the ground truth indices.";
+  const float Epsilon = 0.000001f;
+  size_t total_num_duplicates_top_1 = 0, total_num_duplicates_top_k = 0;
+  uint32_t max_dup_top_1 = 0, max_dup_top_k = 0;
+  for (uint32_t n = 0; n < query.N; n++) {
+    const ValueT gt_dist1 =
compute_distance_base_to_query(gt[n * gt.D], n); + uint32_t num_duplicates_top_1 = 0, num_duplicates_top_k = 0; + for (uint32_t k = 1; k < gt.D; ++k) { + const ValueT gt_dist_k = compute_distance_base_to_query(gt[n * gt.D + k], n); + if (gt_dist_k - gt_dist1 > Epsilon) + break; + ++num_duplicates_top_1; + } + total_num_duplicates_top_1 += num_duplicates_top_1; + if (num_duplicates_top_1 > max_dup_top_1) + max_dup_top_1 = num_duplicates_top_1; + gt_duplicates.top1DuplicateEnd.push_back(1 + num_duplicates_top_1); + if (KQuery <= gt.D) { + const ValueT gt_distKQuery = compute_distance_base_to_query(gt[n * gt.D + KQuery - 1], n); + for (uint32_t k = KQuery; k < gt.D; ++k) { + const ValueT gt_dist_k = compute_distance_base_to_query(gt[n * gt.D + k], n); + if (gt_dist_k - gt_distKQuery > Epsilon) + break; + ++num_duplicates_top_k; + } + total_num_duplicates_top_k += num_duplicates_top_k; + if (num_duplicates_top_k > max_dup_top_k) + max_dup_top_k = num_duplicates_top_k; + gt_duplicates.topKDuplicateEnd.push_back(KQuery + num_duplicates_top_k); + } + else + gt_duplicates.topKDuplicateEnd.push_back(gt.D); + } + VLOG(2) << "found " << total_num_duplicates_top_1 << " duplicates for c@1." + << " max: " << max_dup_top_1; + if (KQuery <= gt.D) { + VLOG(2) << "found " << total_num_duplicates_top_k << " duplicates for c@" << KQuery << "." + << " max: " << max_dup_top_k; + } +} + +template +Evaluation Evaluator::evaluateResults(const Dataset& results) +{ + CHECK_GE(gt.N, results.N); + + if (!gt.D) + throw std::runtime_error("No ground truth data loaded. cannot compute accuracy."); + if (!results.isCPUAccessible()) + throw std::runtime_error("Results need to be given on the CPU for evaluation."); + + const bool has_duplicate_info = + (!gt_duplicates.top1DuplicateEnd.empty() && !gt_duplicates.topKDuplicateEnd.empty()); + + uint32_t c1 = 0; + uint32_t c1_dup = 0; + uint32_t cKQuery = 0; + uint32_t cKQuery_dup = 0; + uint32_t rKQuery = 0; + uint32_t rKQuery_dup = 0; + + for (uint32_t n = 0; n < results.N; n++) { + const uint32_t endTop1 = has_duplicate_info ? gt_duplicates.top1DuplicateEnd.at(n) : 1; + const uint32_t endTopK = has_duplicate_info ? gt_duplicates.topKDuplicateEnd.at(n) : KQuery; + + CHECK_LE(endTopK, gt.D); + + for (uint32_t k_result = 0; k_result < KQuery; k_result++) { + const KeyT q = results[n * KQuery + k_result]; + for (uint32_t k_gt = 0; k_gt < endTopK; k_gt++) { + const KeyT gt_key = gt[n * gt.D + k_gt]; + if (q == gt_key) { + if (!k_gt) { + if (!k_result) + ++c1; + if (k_gt < KQuery) + ++rKQuery; + ++rKQuery_dup; + } + if (k_gt < endTop1) { + if (!k_result) + ++c1_dup; + } + if (k_gt < KQuery) + ++cKQuery; + ++cKQuery_dup; + continue; + } + } + } + } + + const float inv_num_queries = 1.0f / static_cast(results.N); + const float inv_num_results = 1.0f / static_cast(results.N * KQuery); + + return Evaluation{ + .KQuery = KQuery, + .c1 = static_cast(c1) * inv_num_queries, + .c1_dup = has_duplicate_info ? static_cast(c1_dup) * inv_num_queries + : std::numeric_limits::quiet_NaN(), + .cKQuery = static_cast(cKQuery) * inv_num_results, + .cKQuery_dup = has_duplicate_info ? static_cast(cKQuery_dup) * inv_num_results + : std::numeric_limits::quiet_NaN(), + .rKQuery = static_cast(rKQuery) * inv_num_queries, + .rKQuery_dup = has_duplicate_info ? 
static_cast(rKQuery_dup) * inv_num_queries + : std::numeric_limits::quiet_NaN(), + }; +} + +GGNN_EVAL(GGNN_KEYS, GGNN_VALUES, GGNN_INSTANTIATE_STRUCT, Evaluator); + +}; // namespace ggnn diff --git a/src/ggnn/base/ggnn.cu b/src/ggnn/base/ggnn.cu new file mode 100644 index 0000000..f8c2f4e --- /dev/null +++ b/src/ggnn/base/ggnn.cu @@ -0,0 +1,563 @@ +/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch +// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +namespace ggnn { + +struct GGNNConfig { + std::filesystem::path graph_dir{}; + size_t cpu_memory_limit{-1UL}; + std::vector gpu_ids{}; + uint32_t N_shard{}; + bool return_results_on_gpu{}; +}; + +template +struct GGNNImplBase : public GGNNConfig, public GGNN { + using GGNN = ggnn::GGNN; + using Graph = ggnn::Graph; + + GGNNImplBase(const GGNNConfig& config = {}) : GGNNConfig{config}, GGNN{1} {} + + void setWorkingDirectory(const std::filesystem::path& dir) override + { + graph_dir = dir.empty() ? 
std::filesystem::current_path() : std::filesystem::absolute(dir);
+    if (std::filesystem::create_directories(graph_dir))
+      VLOG(1) << "Created working directory " << graph_dir << ".";
+    else
+      VLOG(1) << "Using working directory " << graph_dir << ".";
+  }
+
+  void setCPUMemoryLimit(const size_t memory_limit) override
+  {
+    cpu_memory_limit = memory_limit;
+    VLOG(1) << "Set CPU memory limit to " << sizeInGB(cpu_memory_limit) << " GiB.";
+  }
+
+  void setGPUs(const std::span<const int>& gpu_ids) override
+  {
+    int num_physical_gpus;
+    cudaGetDeviceCount(&num_physical_gpus);
+
+    for (int gpu_id : gpu_ids) {
+      if (gpu_id < 0 || gpu_id >= num_physical_gpus)
+        throw std::out_of_range("Invalid GPU index " + std::to_string(gpu_id) + " given.");
+    }
+
+    this->gpu_ids.assign(gpu_ids.begin(), gpu_ids.end());
+  }
+
+  void setShardSize(const uint32_t N_shard) override
+  {
+    this->N_shard = N_shard;
+  }
+
+  void setReturnResultsOnGPU(const bool return_results_on_gpu) override
+  {
+    this->return_results_on_gpu = return_results_on_gpu;
+  }
+
+  void build(const uint32_t /*KBuild*/, const float /*tau_build*/,
+             const uint32_t /*refinement_iterations*/, const DistanceMeasure /*measure*/) override
+  {
+    throw std::runtime_error("The base needs to be set before building a graph.");
+  }
+
+  const Graph& getGraph(const uint32_t /*global_shard_id*/) override
+  {
+    throw std::runtime_error("No graph has been built or loaded yet.");
+  }
+};
+
+template <typename KeyT, typename ValueT, typename BaseT>
+struct GGNNImpl : public GGNNImplBase<KeyT, ValueT> {
+  using GGNN = ggnn::GGNN<KeyT, ValueT>;
+  using GGNNImplBase = ggnn::GGNNImplBase<KeyT, ValueT>;
+  using GPUInstance = ggnn::GPUInstance<KeyT, ValueT, BaseT>;
+  using Results = ggnn::Results<KeyT, ValueT>;
+  using Graph = ggnn::Graph<KeyT, ValueT>;
+
+  using GGNNImplBase::cpu_memory_limit;
+  using GGNNImplBase::gpu_ids;
+  using GGNNImplBase::graph_dir;
+  using GGNNImplBase::N_shard;
+  using GGNNImplBase::return_results_on_gpu;
+
+  GGNNImpl(const GGNNConfig& config) : GGNNImplBase{config} {}
+
+  /// base data or reference to it
+  Dataset<BaseT> base{};
+
+  /// one instance per GPU
+  std::vector<GPUInstance> gpu_instances{};
+
+  void setBaseImpl(Dataset<BaseT>&& base)
+  {
+    // TODO: clear all base copies on GPU instances instead?
+    if (!gpu_instances.empty())
+      throw std::runtime_error("The base cannot be changed once the GPU instances are set up.");
+    this->base = std::move(base);
+  }
+
+  void prepare(uint32_t KBuild)
+  {
+    // TODO: remove the existing GPU instances instead?
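+    // prepare() derives the per-shard graph parameters and creates one
+    // GPUInstance per configured GPU. With G GPUs and shard size N_shard, the
+    // base must split evenly: base.N == N_shard * G * num_shards_per_gpu.
+    // Illustrative example (assumed numbers, not from this code): 10M base
+    // vectors on 2 GPUs with N_shard = 2'500'000 give 2 shards per GPU, and
+    // the CPU memory limit is split evenly between both instances.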
+ if (!gpu_instances.empty()) + throw std::runtime_error("A graph has already been built or loaded."); + + GraphParameters graph_params{.N = static_cast(base.N), .D = base.D, .KBuild = KBuild}; + + if (N_shard > 0) { + CHECK_EQ(base.N % N_shard, 0) + << "The base dataset needs to be evenly divisible by the shard size."; + graph_params.N = N_shard; + } + + CHECK_GT(graph_params.N, 0); + CHECK_GE(graph_params.D, GGNN::MIN_D); + CHECK_LE(graph_params.D, GGNN::MAX_D); + CHECK_GE(graph_params.KBuild, GGNN::MIN_KBUILD); + CHECK_LE(graph_params.KBuild, GGNN::MAX_KBUILD); + + if (gpu_ids.empty()) { + gpu_ids.resize(1); + cudaGetDevice(&gpu_ids.at(0)); + VLOG(3) << "Auto-selecting current GPU: " << gpu_ids.at(0) << "."; + } + + const size_t num_gpus = gpu_ids.size(); + const uint32_t num_shards_per_gpu = base.N / (static_cast(graph_params.N) * num_gpus); + CHECK_EQ(graph_params.N * num_gpus * num_shards_per_gpu, base.N) + << "base.N needs to be evenly divisible by (N_shard x num_gpus)."; + + const size_t cpu_memory_per_gpu = cpu_memory_limit / num_gpus; + + const GraphConfig graph_config{graph_params}; + + gpu_instances.reserve(num_gpus); + for (uint32_t device_i = 0; device_i < num_gpus; ++device_i) { + const int gpu_id = gpu_ids[device_i]; + CHECK_CUDA(cudaSetDevice(gpu_id)); + + gpu_instances.emplace_back(GPUContext{gpu_id}, + ShardingConfiguration{.N_shard = graph_config.N, + .device_index = device_i, + .num_shards = num_shards_per_gpu, + .cpu_memory_limit = cpu_memory_per_gpu}, + graph_config); + } + + VLOG(2) << "GGNN multi-GPU setup configured."; + } + + void build(const uint32_t KBuild, const float tau_build, const uint32_t refinement_iterations, + const DistanceMeasure measure) override + { + if (!base.data()) + throw std::runtime_error("The base needs to be set before building a graph."); + + prepare(KBuild); + + VLOG(0) << "GGNN::build() started."; + + const auto begin = std::chrono::steady_clock::now(); + + const size_t num_gpus = gpu_ids.size(); + std::vector build_threads; + build_threads.reserve(num_gpus); + float build_time_ms = 0.0f; + + for (auto& gpu_instance : gpu_instances) + build_threads.emplace_back([&]() -> void { + const float build_time = gpu_instance.build( + base, graph_dir, GraphParameters{gpu_instance.shard_config.N_shard, base.D, KBuild}, + tau_build, refinement_iterations, measure); + build_time_ms += build_time; + }); + for (auto& build_thread : build_threads) + build_thread.join(); + + const auto end = std::chrono::steady_clock::now(); + + const auto wall_time_ms = + std::chrono::duration_cast(end - begin).count(); + + VLOG(0) << "GGNN::build() completed."; + VLOG(0) << "Sum of shard build times: " << build_time_ms * 0.001f << " s"; + VLOG(0) << "Wall time: " << static_cast(wall_time_ms) * 0.001f << " s"; + } + + void store() override + { + if (gpu_instances.empty()) + throw std::runtime_error("There is no graph to store."); + + const uint32_t num_gpus = static_cast(gpu_instances.size()); + std::vector store_threads; + store_threads.reserve(num_gpus); + + for (auto& gpu_instance : gpu_instances) + store_threads.emplace_back([&]() -> void { gpu_instance.store(graph_dir); }); + for (auto& thread : store_threads) + thread.join(); + } + + void load(const uint32_t KBuild) override + { + if (!base.data()) + throw std::runtime_error("The base needs to be set before loading a graph."); + + prepare(KBuild); + + const uint32_t num_gpus = static_cast(gpu_instances.size()); + std::vector load_threads; + load_threads.reserve(num_gpus); + + for (auto& gpu_instance : 
gpu_instances) + load_threads.emplace_back([&]() -> void { + gpu_instance.load(base, graph_dir, + GraphParameters{gpu_instance.shard_config.N_shard, base.D, KBuild}); + }); + for (auto& thread : load_threads) + thread.join(); + } + + Results queryImpl(const Dataset& query, const uint32_t KQuery, const float tau_query, + const uint32_t max_iterations, const DistanceMeasure measure) + { + if (gpu_instances.empty()) + throw std::runtime_error("There is no graph to query."); + + GPUInstance& gpu_instance_0 = gpu_instances.at(0); + + const uint32_t N_query = query.N; + const uint32_t num_gpus = static_cast(gpu_instances.size()); + const uint32_t N_shard = gpu_instance_0.shard_config.N_shard; + const uint32_t num_shards_per_gpu = gpu_instance_0.shard_config.num_shards; + + using ResultMerger = ggnn::ResultMerger; + ResultMerger result_merger{static_cast(query.N), KQuery, num_gpus, + num_shards_per_gpu}; + + VLOG(0) << "GGNN::query()" << " | N_query: " << N_query << " | tau_query: " << tau_query + << " | num_gpus: " << num_gpus << " | N_shard: " << N_shard + << " | num_iterations: " << num_shards_per_gpu; + + if (return_results_on_gpu) { + if (gpu_instances.size() > 1) + throw std::runtime_error( + "Returning query results on GPU is only possible when using a single GPU."); + Results d_results = + gpu_instance_0.query(query, graph_dir, KQuery, max_iterations, tau_query, measure); + return d_results; + } + + std::vector query_threads; + query_threads.reserve(num_gpus); + + auto run_query = [&](uint32_t device_i) -> void { + GPUInstance& gpu_instance = gpu_instances.at(device_i); + Results d_results = + gpu_instance.query(query, graph_dir, KQuery, max_iterations, tau_query, measure); + auto& h_results = result_merger.partial_results_per_gpu.at(device_i); + // TODO: use a stream assigned to this GPU? + // cudaStream_t shard0Stream = gpu_instance_0.getGPUGraphBuffer(0).stream.get(); + d_results.ids.copyTo(h_results.ids, 0); + d_results.dists.copyTo(h_results.dists, 0); + cudaStreamSynchronize(0); + }; + + for (uint32_t device_i = 0; device_i < num_gpus; device_i++) + query_threads.emplace_back(run_query, device_i); + for (auto& thread : query_threads) + thread.join(); + + // CPU Zone: + return std::move(result_merger).merge(N_shard); + } + + Results bfQueryImpl(const Dataset& query, const uint32_t KGT, + const DistanceMeasure measure) + { + if (!base.data()) + throw std::runtime_error("There is no base dataset loaded which could be queried."); + + if (gpu_instances.size() > 1) + throw std::runtime_error("The brute-force query only supports a single GPU."); + + // TODO: use a stream assigned to this GPU? + const cudaStream_t stream = 0; + const int32_t gpu_id = gpu_instances.empty() ? gpu_ids.empty() ? 0 : gpu_ids.at(0) + : gpu_instances.at(0).gpu_ctx.gpu_id; + cudaSetDevice(gpu_id); + + Dataset d_base = + gpu_instances.empty() + ? 
base.referenceOnGPU(gpu_id, stream) + : Dataset{gpu_instances.at(0).getGPUBaseShard(0).base.reference()}; + Dataset d_query = query.referenceOnGPU(gpu_id, stream); + + Results d_results = {Dataset::emptyOnGPU(query.N, KGT, gpu_id), + Dataset::emptyOnGPU(query.N, KGT, gpu_id)}; + + QueryKernels query_kernels{measure}; + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + float milliseconds = 0; + + cudaEventRecord(start, stream); + query_kernels.bruteForceQuery(d_base, d_query, KGT, d_results, stream); + cudaEventRecord(stop, stream); + + cudaEventSynchronize(stop); + + cudaEventElapsedTime(&milliseconds, start, stop); + VLOG(0) << "[GPU: " << 0 << "] brute-force query: => ms: " << milliseconds << " [" << query.N + << " points query -> " << milliseconds * 1000.0f / static_cast(query.N) + << " us/point] \n"; + + cudaEventDestroy(start); + cudaEventDestroy(stop); + + if (return_results_on_gpu) + return d_results; + + using ResultMerger = ggnn::ResultMerger; + ResultMerger result_merger{static_cast(query.N), KGT}; + + auto& h_results = result_merger.partial_results_per_gpu.at(0); + d_results.ids.copyTo(h_results.ids, stream); + d_results.dists.copyTo(h_results.dists, stream); + + cudaStreamSynchronize(stream); + + return std::move(result_merger).merge(base.N); + } + + const Graph& getGraph(const uint32_t global_shard_id) override + { + if (gpu_instances.empty()) + throw std::runtime_error("There is no graph."); + + for (auto& gpu_instance : gpu_instances) { + if (gpu_instance.hasPart(global_shard_id)) { + if (return_results_on_gpu) { + const auto& gpu_graph_shard = gpu_instance.getGPUGraphShard(graph_dir, global_shard_id); + CHECK_EQ(gpu_graph_shard.global_shard_id, global_shard_id); + return gpu_graph_shard.graph; + } + else { + const auto& cpu_graph_shard = gpu_instance.getCPUGraphShard(graph_dir, global_shard_id); + CHECK_EQ(cpu_graph_shard.global_shard_id, global_shard_id); + return cpu_graph_shard.graph; + } + } + } + throw std::runtime_error("Shard " + std::to_string(global_shard_id) + " does not exist."); + } +}; + +template +GGNN::GGNN() : pimpl{new GGNNImplBase{}} +{ +} + +template +void GGNN::setWorkingDirectory(const std::filesystem::path& dir) +{ + pimpl->setWorkingDirectory(dir); +} + +template +void GGNN::setCPUMemoryLimit(const size_t memory_limit) +{ + pimpl->setCPUMemoryLimit(memory_limit); +} + +template +void GGNN::setGPUs(const std::span& gpu_ids) +{ + pimpl->setGPUs(gpu_ids); +} + +template +void GGNN::setShardSize(const uint32_t N_shard) +{ + pimpl->setShardSize(N_shard); +} + +template +void GGNN::setReturnResultsOnGPU(const bool gpu_only) +{ + pimpl->setReturnResultsOnGPU(gpu_only); +} + +template +void GGNN::setBase(GenericDataset&& base) +{ + // TODO: check pimpl, set new one based on data type + GGNNImpl* impl_uint8_t = + dynamic_cast*>(pimpl.get()); + GGNNImpl* impl_float = + dynamic_cast*>(pimpl.get()); + GGNNImplBase* impl_base = dynamic_cast*>(pimpl.get()); + CHECK_NOTNULL(impl_base); + + switch (base.type) { + case DataType::FLOAT: + CHECK_EQ(impl_uint8_t, nullptr) + << "base has already been set with a different data type (uint8_t)"; + if (!impl_float) { + impl_float = new GGNNImpl{*static_cast(impl_base)}; + pimpl.reset(impl_float); + } + impl_float->setBaseImpl(std::move(base)); + return; + case DataType::UINT8: + CHECK_EQ(impl_float, nullptr) + << "base has already been set with a different data type (float)"; + if (!impl_uint8_t) { + impl_uint8_t = new GGNNImpl{*static_cast(impl_base)}; + pimpl.reset(impl_uint8_t); + } + 
impl_uint8_t->setBaseImpl(std::move(base));
+      return;
+    default:
+      break;
+  }
+
+  throw std::runtime_error("unsupported datatype for base");
+}
+
+template <typename KeyT, typename ValueT>
+void GGNN<KeyT, ValueT>::setBaseReference(const GenericDataset& base)
+{
+  setBase(base.reference());
+}
+
+template <typename KeyT, typename ValueT>
+void GGNN<KeyT, ValueT>::build(const uint32_t KBuild, const float tau_build,
+                               const uint32_t refinement_iterations, const DistanceMeasure measure)
+{
+  pimpl->build(KBuild, tau_build, refinement_iterations, measure);
+}
+
+template <typename KeyT, typename ValueT>
+void GGNN<KeyT, ValueT>::store()
+{
+  pimpl->store();
+}
+
+template <typename KeyT, typename ValueT>
+void GGNN<KeyT, ValueT>::load(const uint32_t KBuild)
+{
+  pimpl->load(KBuild);
+}
+
+template <typename KeyT, typename ValueT>
+Results<KeyT, ValueT> GGNN<KeyT, ValueT>::query(const GenericDataset& query, const uint32_t KQuery,
+                                                const float tau_query,
+                                                const uint32_t max_iterations,
+                                                const DistanceMeasure measure)
+{
+  switch (query.type) {
+    case DataType::FLOAT: {
+      GGNNImpl<KeyT, ValueT, float>* impl =
+          dynamic_cast<GGNNImpl<KeyT, ValueT, float>*>(pimpl.get());
+      CHECK_NOTNULL(impl);  // query data type does not match base data type or base not set
+      return impl->queryImpl(query.reference(), KQuery, tau_query, max_iterations, measure);
+    }
+    case DataType::UINT8: {
+      GGNNImpl<KeyT, ValueT, uint8_t>* impl =
+          dynamic_cast<GGNNImpl<KeyT, ValueT, uint8_t>*>(pimpl.get());
+      CHECK_NOTNULL(impl);  // query data type does not match base data type or base not set
+      return impl->queryImpl(query.reference(), KQuery, tau_query, max_iterations, measure);
+    }
+    default:
+      break;
+  }
+  throw std::runtime_error("unsupported datatype for query");
+}
+
+template <typename KeyT, typename ValueT>
+Results<KeyT, ValueT> GGNN<KeyT, ValueT>::bfQuery(const GenericDataset& query, const uint32_t KGT,
+                                                  const DistanceMeasure measure)
+{
+  switch (query.type) {
+    case DataType::FLOAT: {
+      GGNNImpl<KeyT, ValueT, float>* impl =
+          dynamic_cast<GGNNImpl<KeyT, ValueT, float>*>(pimpl.get());
+      CHECK_NOTNULL(impl);  // query data type does not match base data type or base not set
+      return impl->bfQueryImpl(query.reference(), KGT, measure);
+    }
+    case DataType::UINT8: {
+      GGNNImpl<KeyT, ValueT, uint8_t>* impl =
+          dynamic_cast<GGNNImpl<KeyT, ValueT, uint8_t>*>(pimpl.get());
+      CHECK_NOTNULL(impl);  // query data type does not match base data type or base not set
+      return impl->bfQueryImpl(query.reference(), KGT, measure);
+    }
+    default:
+      break;
+  }
+  throw std::runtime_error("unsupported datatype for brute-force query");
+}
+
+template <typename KeyT, typename ValueT>
+const Graph<KeyT, ValueT>& GGNN<KeyT, ValueT>::getGraph(const uint32_t on_gpu_shard_id)
+{
+  return pimpl->getGraph(on_gpu_shard_id);
+}
+
+GGNN_EVAL(GGNN_KEYS, GGNN_VALUES, GGNN_BASES, GGNN_INSTANTIATE_STRUCT, GGNNImpl);
+GGNN_EVAL(GGNN_KEYS, GGNN_VALUES, GGNN_INSTANTIATE_STRUCT, GGNNImplBase);
+GGNN_EVAL(GGNN_KEYS, GGNN_VALUES, GGNN_INSTANTIATE_CLASS, GGNN);
+
+}; // namespace ggnn
diff --git a/src/ggnn/base/gpu_instance.cu b/src/ggnn/base/gpu_instance.cu
new file mode 100644
index 0000000..9069019
--- /dev/null
+++ b/src/ggnn/base/gpu_instance.cu
@@ -0,0 +1,800 @@
+/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A.
Lensch +// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf + +#include + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace ggnn { + +void CUDAStreamDeleter::operator()(cudaStream_t stream) +{ + if (stream) + CHECK_CUDA(cudaStreamDestroy(stream)); +} +void CUDAEventDeleter::operator()(cudaEvent_t event) +{ + if (event) + CHECK_CUDA(cudaEventDestroy(event)); +} + +int GPUContext::getCurrentGPUId() +{ + int device; + cudaGetDevice(&device); + return device; +} + +void GPUContext::activate() const +{ + CHECK_CUDA(cudaSetDevice(gpu_id)); +} + +CudaStream GPUContext::createStream() +{ + activate(); + cudaStream_t new_stream; + CHECK_CUDA(cudaStreamCreate(&new_stream)); + return CudaStream{new_stream}; +} +CudaEvent GPUContext::createEvent() +{ + activate(); + cudaEvent_t new_event; + CHECK_CUDA(cudaEventCreate(&new_event)); + return CudaEvent{new_event}; +} + +template +void GPUInstance::CPUGraphBuffer::load( + const std::filesystem::path& part_filename, const uint32_t global_shard_id) +{ + std::ifstream inFile{part_filename, std::ifstream::in | std::ifstream::binary}; + CHECK(inFile.is_open()) << "Unable to open " << part_filename; + + inFile.seekg(0, std::ifstream::end); + size_t filesize = inFile.tellg(); + inFile.seekg(0, std::ifstream::beg); + CHECK_EQ(filesize, graph.memory.size_bytes()) + << "Error on loading" << part_filename + << ". File size of GGNNGraph does not match the expected size."; + + inFile.read(reinterpret_cast(graph.memory.data()), graph.memory.size_bytes()); + inFile.close(); + + this->global_shard_id = global_shard_id; +} + +template +void GPUInstance::CPUGraphBuffer::store( + const std::filesystem::path& part_filename) const +{ + std::ofstream outFile{part_filename, + std::ofstream::out | std::ofstream::binary | std::ofstream::trunc}; + CHECK(outFile.is_open()) << "Unable to open " << part_filename; + outFile.write(reinterpret_cast(graph.memory.data()), graph.memory.size_bytes()); + outFile.close(); +} + +template +void GPUInstance::CPUGraphBuffer::upload(GPUGraphBuffer& gpu_buffer) const +{ + gpu_buffer.global_shard_id = global_shard_id; + const cudaStream_t stream = gpu_buffer.stream.get(); + graph.memory.copyTo(gpu_buffer.graph.memory, stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); +} + +template +void GPUInstance::CPUGraphBuffer::download(const GPUGraphBuffer& gpu_buffer) +{ + global_shard_id = gpu_buffer.global_shard_id; + const cudaStream_t stream = gpu_buffer.stream.get(); + gpu_buffer.graph.memory.copyTo(graph.memory, stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); +} + +template +void GPUInstance::allocateGraph(const GraphConfig& config, + const bool reserve_construction_memory) +{ + gpu_ctx.activate(); + + { + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, gpu_ctx.gpu_id); + LOG(INFO) << "[GPU: " << shard_config.device_index + << "] GPUInstance(): CUDA device id: " << gpu_ctx.gpu_id << " " << prop.name; + } + + // deallocate old shards + h_buffers.clear(); + d_buffers.clear(); + d_base_buffers.clear(); + + graph_config = config; + + using GraphPartSizes = typename ggnn::Graph::PartSizes; + const size_t graph_size = GraphPartSizes{graph_config}.getGraphSize(); + + const uint32_t max_gpu_buffers = [this, graph_size, reserve_construction_memory]() -> uint32_t { + size_t free, total; + CHECK_CUDA(cudaMemGetInfo(&free, &total)); + + if 
(reserve_construction_memory) { + using GraphBufferPartSizes = typename ggnn::GraphBuffer::PartSizes; + const size_t construction_size = GraphBufferPartSizes{graph_config}.getBufferSize(); + CHECK_GT(free, construction_size) + << "GPU memory does not suffice for the construction buffer"; + VLOG(4) << "reserving " << sizeInGB(construction_size) + << " GB of device memory for graph construction."; + free -= construction_size; + } + + const size_t size_per_shard = graph_config.getBaseSize(sizeof(BaseT)) + graph_size; + + const uint32_t max_shards = static_cast(free / size_per_shard); + VLOG(4) << "remaining device memory (" << sizeInGB(free) << " GB) suffices for " << max_shards + << " shards (" << sizeInGB(size_per_shard) << " GB each)."; + + CHECK_GT(max_shards, 0) + << "GPU memory does not suffice for a single shard. use smaller shards."; + + return max_shards; + }(); + + const uint32_t num_gpu_buffers = std::min(max_gpu_buffers, shard_config.num_shards); + + d_buffers.reserve(num_gpu_buffers); + d_base_buffers.reserve(num_gpu_buffers); + + // allocate CPU memory first (fail early if out of memory) + const uint32_t max_cpu_buffers = [this, graph_size]() -> uint32_t { + if (shard_config.cpu_memory_limit == std::numeric_limits::max()) + return std::numeric_limits::max(); + + const size_t size_per_shard = graph_size; + + const uint32_t max_shards = + static_cast(shard_config.cpu_memory_limit / size_per_shard); + VLOG(4) << "assigned CPU memory (" << sizeInGB(shard_config.cpu_memory_limit) + << " GB) suffices for " << max_shards << " shards (" << sizeInGB(size_per_shard) + << " GB each)."; + + CHECK_GT(max_shards, 0) + << "CPU memory does not suffice for a single shard. use smaller shards."; + + return max_shards; + }(); + + // NOTE: even a single buffer is not strictly necessary when running everything purely on the GPU + const bool all_shards_on_gpu = num_gpu_buffers == shard_config.num_shards; + const uint32_t num_cpu_buffers = + all_shards_on_gpu ? 1 : std::min(max_cpu_buffers, shard_config.num_shards); + VLOG_IF(4, all_shards_on_gpu) + << "GPU memory suffices for all shards. 
Allocating only a single CPU buffer."; + + allocateCPUBuffers(num_cpu_buffers); + + for (uint32_t i = 0; i < num_gpu_buffers; i++) { + d_buffers.push_back(GPUGraphBuffer{ + .graph = Graph{graph_config, Dataset::emptyOnGPU(graph_size, 1, gpu_ctx.gpu_id)}, + .global_shard_id = -1U, + .stream = gpu_ctx.createStream()}); + d_base_buffers.push_back(GPUBaseBuffer{ + .base = Dataset::emptyOnGPU(graph_config.N, graph_config.D, gpu_ctx.gpu_id), + .global_shard_id = -1U, + }); + } + + io_threads.resize(std::min(num_cpu_buffers, num_gpu_buffers)); +} + +template +void GPUInstance::allocateCPUBuffers(const uint32_t num_cpu_buffers) +{ + CHECK(h_buffers.empty()); + + using GraphPartSizes = typename ggnn::Graph::PartSizes; + const size_t graph_size = GraphPartSizes{graph_config}.getGraphSize(); + + h_buffers.reserve(num_cpu_buffers); + for (uint32_t i = 0; i < num_cpu_buffers; i++) { + h_buffers.push_back(CPUGraphBuffer{ + .graph = Graph{graph_config, Dataset::empty(graph_size, 1)}, + .global_shard_id = -1U, + }); + } +} + +template +void GPUInstance::prefetchBase() +{ + CHECK_NOTNULL(h_base_ref.data()); + + for (uint32_t i = 0; i < d_base_buffers.size(); i++) { + const uint32_t global_shard_id = shard_config.num_shards * shard_config.device_index + i; + loadBasePart(global_shard_id); + } +} + +template +const GPUInstance::CPUGraphBuffer& +GPUInstance::getCPUGraphShard(const std::filesystem::path& graph_dir, + const uint32_t global_shard_id) +{ + const uint32_t num_previous_shards = shard_config.num_shards * shard_config.device_index; + CHECK_GE(global_shard_id, num_previous_shards); + const uint32_t on_gpu_shard_id = global_shard_id - num_previous_shards; + CHECK_LT(on_gpu_shard_id, shard_config.num_shards); + + const CPUGraphBuffer& cpu_buffer = getCPUGraphBuffer(on_gpu_shard_id); + if (cpu_buffer.global_shard_id != global_shard_id) { + if (d_buffers.size() < shard_config.num_shards) { + // need to load from file + swapInPart(graph_dir, global_shard_id); + } + else { + // need to download from GPU + CHECK_EQ(d_buffers.size(), shard_config.num_shards); + swapOutPart(graph_dir, global_shard_id, true); + } + waitForPart(global_shard_id); + } + CHECK_EQ(cpu_buffer.global_shard_id, global_shard_id); + return cpu_buffer; +} + +template +const GPUInstance::GPUGraphBuffer& +GPUInstance::getGPUGraphShard(const std::filesystem::path& graph_dir, + const uint32_t global_shard_id, + const bool sync_stream) +{ + const uint32_t num_previous_shards = shard_config.num_shards * shard_config.device_index; + CHECK_GE(global_shard_id, num_previous_shards); + const uint32_t on_gpu_shard_id = global_shard_id - num_previous_shards; + CHECK_LT(on_gpu_shard_id, shard_config.num_shards); + + const GPUGraphBuffer& gpu_buffer = getGPUGraphBuffer(on_gpu_shard_id); + if (gpu_buffer.global_shard_id != global_shard_id) { + swapInPart(graph_dir, global_shard_id); + if (sync_stream) + waitForPart(global_shard_id); + } + CHECK_EQ(gpu_buffer.global_shard_id, global_shard_id); + return gpu_buffer; +} + +template +const GPUInstance::GPUBaseBuffer& +GPUInstance::getGPUBaseShard(const uint32_t global_shard_id, + const bool sync_stream) +{ + const uint32_t num_previous_shards = shard_config.num_shards * shard_config.device_index; + CHECK_GE(global_shard_id, num_previous_shards); + const uint32_t on_gpu_shard_id = global_shard_id - num_previous_shards; + CHECK_LT(on_gpu_shard_id, shard_config.num_shards); + CHECK(hasPart(global_shard_id)) << "part " << global_shard_id + << " does not belong to GPU instance " + << shard_config.device_index; + + 
const GPUBaseBuffer& gpu_base_buffer = getGPUBaseBuffer(on_gpu_shard_id); + if (gpu_base_buffer.global_shard_id != global_shard_id) { + loadBasePart(global_shard_id); + if (sync_stream) + cudaStreamSynchronize(getStreamForPart(global_shard_id)); + } + CHECK_EQ(gpu_base_buffer.global_shard_id, global_shard_id); + return gpu_base_buffer; +} + +template +bool GPUInstance::hasPart(const uint32_t global_shard_id) const +{ + const uint32_t num_previous_shards = shard_config.num_shards * shard_config.device_index; + return global_shard_id >= num_previous_shards && + global_shard_id < num_previous_shards + shard_config.num_shards; +} + +template +cudaStream_t GPUInstance::getStreamForPart( + const uint32_t global_shard_id) const +{ + const uint32_t num_previous_shards = shard_config.num_shards * shard_config.device_index; + CHECK_GE(global_shard_id, num_previous_shards); + const uint32_t on_gpu_shard_id = global_shard_id - num_previous_shards; + CHECK_LT(on_gpu_shard_id, shard_config.num_shards); + + const GPUGraphBuffer& gpu_buffer = getGPUGraphBuffer(on_gpu_shard_id); + return gpu_buffer.stream.get(); +} + +// io + +template +std::thread& GPUInstance::getThreadForPart(const uint32_t global_shard_id) +{ + const uint32_t num_previous_shards = shard_config.num_shards * shard_config.device_index; + CHECK_GE(global_shard_id, num_previous_shards); + const uint32_t on_gpu_shard_id = global_shard_id - num_previous_shards; + CHECK_LT(on_gpu_shard_id, shard_config.num_shards); + + return io_threads.at(on_gpu_shard_id % io_threads.size()); +} + +template +void GPUInstance::waitForPart(const uint32_t global_shard_id) +{ + std::thread& io_thread = getThreadForPart(global_shard_id); + if (io_thread.joinable()) + io_thread.join(); +} + +template +void GPUInstance::swapOutPart(const std::filesystem::path& graph_dir, + const uint32_t global_shard_id, + bool force_to_ram, bool force_to_file) +{ + const uint32_t num_gpu_buffers = static_cast(d_buffers.size()); + const uint32_t num_cpu_buffers = static_cast(h_buffers.size()); + const bool swap_to_ram = num_gpu_buffers < shard_config.num_shards; + const bool swap_to_disk = swap_to_ram && num_cpu_buffers < shard_config.num_shards; + + const uint32_t num_previous_shards = shard_config.num_shards * shard_config.device_index; + CHECK_GE(global_shard_id, num_previous_shards); + const uint32_t on_gpu_shard_id = global_shard_id - num_previous_shards; + CHECK_LT(on_gpu_shard_id, shard_config.num_shards); + + std::thread& io_thread = getThreadForPart(global_shard_id); + if (io_thread.joinable()) + io_thread.join(); + + io_thread = std::thread([=, this]() -> void { + CPUGraphBuffer& cpu_buffer = getCPUGraphBuffer(on_gpu_shard_id); + + if (cpu_buffer.global_shard_id == global_shard_id) { + VLOG(4) << "[GPU: " << shard_config.device_index << "] part " << global_shard_id + << " is already downloaded"; + } + else if (swap_to_ram || force_to_ram || force_to_file) { + GPUGraphBuffer& gpu_buffer = getGPUGraphBuffer(on_gpu_shard_id); + gpu_ctx.activate(); + + CHECK_EQ(gpu_buffer.global_shard_id, global_shard_id); + cpu_buffer.download(gpu_buffer); + VLOG(3) << "[GPU: " << shard_config.device_index << "] downloaded part " << global_shard_id; + } + else { + // TODO: is this a good idea? 
(in this case, there is enough space on the GPU, so we don't + // need to copy back to CPU) + VLOG(4) << "[GPU: " << shard_config.device_index << "] skipped downloading part " + << global_shard_id; + } + + if (swap_to_disk || force_to_file) { + CHECK_EQ(cpu_buffer.global_shard_id, global_shard_id); + const std::filesystem::path part_filename = + graph_dir / std::string{"part_" + std::to_string(global_shard_id) + ".ggnn"}; + cpu_buffer.store(part_filename); + VLOG(2) << "[GPU: " << shard_config.device_index << "] stored part " << global_shard_id + << " to " << part_filename.c_str(); + } + }); +} + +template +void GPUInstance::swapInPart(const std::filesystem::path& graph_dir, + const uint32_t global_shard_id, + bool force_load_from_file) +{ + const uint32_t num_previous_shards = shard_config.num_shards * shard_config.device_index; + CHECK_GE(global_shard_id, num_previous_shards); + const uint32_t on_gpu_shard_id = global_shard_id - num_previous_shards; + CHECK_LT(on_gpu_shard_id, shard_config.num_shards); + + GPUGraphBuffer& gpu_buffer = getGPUGraphBuffer(on_gpu_shard_id); + if (!force_load_from_file && gpu_buffer.global_shard_id == global_shard_id) { + VLOG(4) << "[GPU: " << shard_config.device_index << "] part " << global_shard_id + << " is already loaded on GPU buffer " << on_gpu_shard_id % d_buffers.size(); + return; + } + + std::thread& io_thread = getThreadForPart(global_shard_id); + if (io_thread.joinable()) + io_thread.join(); + + io_thread = std::thread([=, this]() -> void { + gpu_ctx.activate(); + + CPUGraphBuffer& cpu_buffer = getCPUGraphBuffer(on_gpu_shard_id); + GPUGraphBuffer& gpu_buffer = getGPUGraphBuffer(on_gpu_shard_id); + + if (!force_load_from_file && cpu_buffer.global_shard_id == global_shard_id) { + VLOG(4) << "[GPU: " << shard_config.device_index << "] part " << global_shard_id + << " is already loaded on CPU buffer " << on_gpu_shard_id % h_buffers.size(); + } + else { + const std::filesystem::path part_filename = + graph_dir / std::string{"part_" + std::to_string(global_shard_id) + ".ggnn"}; + VLOG(4) << "[GPU: " << shard_config.device_index << "] loading part " << global_shard_id + << " from " << part_filename.c_str(); + cpu_buffer.load(part_filename, global_shard_id); + VLOG(3) << "[GPU: " << shard_config.device_index << "] loaded part " << global_shard_id + << " from " << part_filename.c_str(); + } + CHECK_EQ(cpu_buffer.global_shard_id, global_shard_id); + cpu_buffer.upload(gpu_buffer); + CHECK_EQ(gpu_buffer.global_shard_id, global_shard_id); + VLOG(3) << "[GPU: " << shard_config.device_index << "] uploaded part " << global_shard_id; + }); +} + +template +void GPUInstance::loadBasePart(const uint32_t global_shard_id) +{ + const uint32_t num_previous_shards = shard_config.num_shards * shard_config.device_index; + CHECK_GE(global_shard_id, num_previous_shards); + const uint32_t on_gpu_shard_id = global_shard_id - num_previous_shards; + CHECK_LT(on_gpu_shard_id, shard_config.num_shards); + + CHECK_NOTNULL(h_base_ref.data()); + + GPUBaseBuffer& gpu_base_buffer = getGPUBaseBuffer(on_gpu_shard_id); + if (gpu_base_buffer.global_shard_id == global_shard_id) { + VLOG(4) << "[GPU: " << shard_config.device_index << "] base part " << global_shard_id + << " is already loaded on shard " << on_gpu_shard_id; + return; + } + const size_t N_shard = graph_config.N; + + if (h_base_ref.isGPUAccessible() && h_base_ref.gpu_id == gpu_ctx.gpu_id) { + gpu_base_buffer.base = h_base_ref.referenceRange(N_shard * global_shard_id, N_shard); + } + else { + h_base_ref.copyRangeTo(N_shard * 
global_shard_id, N_shard, gpu_base_buffer.base, + getStreamForPart(global_shard_id)); + } + gpu_base_buffer.global_shard_id = global_shard_id; + VLOG(3) << "[GPU: " << shard_config.device_index << "] base part " << global_shard_id + << " loaded on shard " << on_gpu_shard_id; +} + +template +float GPUInstance::build(const Dataset& base, + const std::filesystem::path& graph_dir, + const GraphConfig& config, const float tau_build, + const uint32_t refinement_iterations, + const DistanceMeasure measure) +{ + h_base_ref = base.reference(); + allocateGraph(config, true); + prefetchBase(); + + using GraphConstruction = ggnn::GraphConstruction; + + GraphConstruction construction{*this, tau_build, measure}; + float total_build_time = 0.0f; + + gpu_ctx.activate(); + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + VLOG(1) << "[GPU: " << shard_config.device_index << "] build(): N: " << graph_config.N; + + for (uint32_t i = 0; i < shard_config.num_shards; i++) { + const uint32_t global_shard_id = shard_config.device_index * shard_config.num_shards + i; + GPUGraphBuffer& gpu_buffer = getGPUGraphBuffer(i); + const GPUBaseBuffer& gpu_base_buffer = getGPUBaseBuffer(i); + const cudaStream_t stream = gpu_buffer.stream.get(); + + uint32_t refinement_step = 0; + float milliseconds = 0; + + auto sync_and_report_progress = [&]() { + cudaEventRecord(stop, stream); + cudaEventSynchronize(stop); + + cudaEventElapsedTime(&milliseconds, start, stop); + VLOG(0) << "[GPU: " << shard_config.device_index << "] build(): part: " << global_shard_id + << " refinement step: " << refinement_step << " => seconds: " << milliseconds / 1000.f + << " [" << graph_config.N << " points build -> " + << milliseconds * 1000.0f / static_cast(graph_config.N) << " us/point] \n"; + }; + + cudaStreamSynchronize(stream); + + cudaEventRecord(start, stream); + construction.build(gpu_buffer.graph, gpu_base_buffer.base, stream); + + for (; refinement_step < refinement_iterations; ++refinement_step) { + sync_and_report_progress(); + construction.refine(gpu_buffer.graph, gpu_base_buffer.base, stream); + } + + sync_and_report_progress(); + total_build_time += milliseconds; + + getGPUGraphBuffer(i).global_shard_id = global_shard_id; + + swapOutPart(graph_dir, global_shard_id); + + // prefetch base for following in-memory shards + if (i + d_base_buffers.size() < shard_config.num_shards) + loadBasePart(global_shard_id + d_base_buffers.size()); + } + + // wait for all parts to be swapped out + for (uint32_t i = 0; i < shard_config.num_shards; i++) { + const uint32_t global_shard_id = shard_config.device_index * shard_config.num_shards + i; + waitForPart(global_shard_id); + } + + cudaEventDestroy(start); + cudaEventDestroy(stop); + + VLOG(0) << "[GPU: " << shard_config.device_index << "] build() done."; + + // process the shards in reverse order during the next query for improved cache utilization + process_shards_back_to_front = true; + + return total_build_time; +} + +template +void GPUInstance::load(const Dataset& base, + const std::filesystem::path& graph_dir, + const GraphConfig& config) +{ + h_base_ref = base.reference(); + allocateGraph(config, false); + prefetchBase(); + + for (uint32_t i = 0; i < shard_config.num_shards; i++) { + const uint32_t global_shard_id = shard_config.device_index * shard_config.num_shards + i; + swapInPart(graph_dir, global_shard_id, true); + } + for (uint32_t i = 0; i < shard_config.num_shards; i++) { + const uint32_t global_shard_id = shard_config.device_index * 
shard_config.num_shards + i; + waitForPart(global_shard_id); + } + + // process the shards in reverse order during the next query for improved cache utilization + process_shards_back_to_front = true; + + VLOG(0) << "[GPU: " << shard_config.device_index << "] load() done."; +} + +template +void GPUInstance::store(const std::filesystem::path& graph_dir) +{ + for (uint32_t i = 0; i < shard_config.num_shards; i++) { + const uint32_t global_shard_id = shard_config.device_index * shard_config.num_shards + i; + swapOutPart(graph_dir, global_shard_id, true, true); + } + for (uint32_t i = 0; i < shard_config.num_shards; i++) { + const uint32_t global_shard_id = shard_config.device_index * shard_config.num_shards + i; + waitForPart(global_shard_id); + } + + VLOG(0) << "[GPU: " << shard_config.device_index << "] store() done."; +} + +template +typename GPUInstance::Results GPUInstance::query( + const Dataset& query, const std::filesystem::path& graph_dir, const uint32_t KQuery, + const uint32_t max_iterations, const float tau_query, const DistanceMeasure measure) +{ + if (d_buffers.empty() || getGPUGraphBuffer(0).global_shard_id == -1U) { + LOG(ERROR) << "no graph available for query. did you forget to build one?"; + return {}; + } + + const uint32_t num_previous_shards = shard_config.device_index * shard_config.num_shards; + + Dataset d_query = query.referenceOnGPU( + gpu_ctx.gpu_id, + getStreamForPart(num_previous_shards + + (process_shards_back_to_front ? shard_config.num_shards - 1 : 0))); + + Results d_results = { + Dataset::emptyOnGPU(d_query.N, KQuery * shard_config.num_shards, gpu_ctx.gpu_id), + Dataset::emptyOnGPU(d_query.N, KQuery * shard_config.num_shards, gpu_ctx.gpu_id)}; + + QueryKernels query_kernels{measure}; + const uint32_t N_query = d_query.N; + const size_t num_gpu_buffers = d_buffers.size(); + const size_t num_cpu_buffers = h_buffers.size(); + const size_t prefetch_amount = std::min(num_cpu_buffers, num_gpu_buffers); + + gpu_ctx.activate(); + + cudaEvent_t start, stop; + CHECK_CUDA(cudaEventCreate(&start)); + CHECK_CUDA(cudaEventCreate(&stop)); + float milliseconds = 0; + + // prefetch as many shards onto the GPU as possible + for (uint32_t i = 0; i < num_gpu_buffers; i++) { + const uint32_t j = process_shards_back_to_front ? shard_config.num_shards - i - 1 : i; + const uint32_t global_shard_id = num_previous_shards + j; + loadBasePart(global_shard_id); + swapInPart(graph_dir, global_shard_id); + } + + // query all shards + for (uint32_t i = 0; i < shard_config.num_shards; i++) { + const uint32_t j = process_shards_back_to_front ? 
shard_config.num_shards - i - 1 : i; + const uint32_t global_shard_id = num_previous_shards + j; + + const cudaStream_t stream = getStreamForPart(global_shard_id); + + { + const auto begin = std::chrono::high_resolution_clock::now(); + waitForPart(global_shard_id); + const auto end = std::chrono::high_resolution_clock::now(); + const auto cpu_us = + std::chrono::duration_cast(end - begin).count(); + VLOG(2) << "[GPU: " << shard_config.device_index + << "] shard-swap delay: " << static_cast(cpu_us) * 0.001f << " ms."; + } + + // CHECK_CUDA(cudaStreamSynchronize(stream)); + + CHECK_CUDA(cudaEventRecord(start, stream)); + query_kernels.query(*this, global_shard_id, d_query, KQuery, max_iterations, tau_query, + d_results); + CHECK_CUDA(cudaEventRecord(stop, stream)); + + // start the upload for the next shard after starting the current query + // then, it should be able to overlap + // prefetch only as much in parallel as there are cpu buffers + if (process_shards_back_to_front) { + if (j >= prefetch_amount && j - prefetch_amount < shard_config.num_shards - num_gpu_buffers) { + loadBasePart(global_shard_id - prefetch_amount); + swapInPart(graph_dir, global_shard_id - prefetch_amount); + } + } + else if (j + prefetch_amount < shard_config.num_shards && + j + prefetch_amount >= num_gpu_buffers) { + loadBasePart(global_shard_id + prefetch_amount); + swapInPart(graph_dir, global_shard_id + prefetch_amount); + } + + CHECK_CUDA(cudaEventSynchronize(stop)); + + CHECK_CUDA(cudaEventElapsedTime(&milliseconds, start, stop)); + VLOG(0) << "[GPU: " << shard_config.device_index << "] query part: " << global_shard_id + << " => ms: " << milliseconds << " [" << N_query << " points query -> " + << milliseconds * 1000.0f / static_cast(N_query) << " us/point] \n"; + } + + // sort results from multiple parts + if (shard_config.num_shards > 1) { + cudaStream_t lastShardStream = getStreamForPart( + num_previous_shards + (process_shards_back_to_front ? 0 : shard_config.num_shards - 1)); + + CHECK_CUDA(cudaEventRecord(start, lastShardStream)); + + sortQueryResults(d_results, lastShardStream); + + CHECK_CUDA(cudaEventRecord(stop, lastShardStream)); + CHECK_CUDA(cudaEventSynchronize(stop)); + + CHECK_CUDA(cudaEventElapsedTime(&milliseconds, start, stop)); + VLOG(0) << "[GPU: " << shard_config.device_index + << "] query sort: " << " => ms: " << milliseconds << " [" << N_query + << " points query -> " << milliseconds * 1000.0f / static_cast(N_query) + << " us/point] \n"; + + VLOG(0) << "[GPU: " << shard_config.device_index << "] query() done."; + } + + CHECK_CUDA(cudaEventDestroy(start)); + CHECK_CUDA(cudaEventDestroy(stop)); + + // process the shards in reverse order during the next query for improved cache utilization + process_shards_back_to_front = !process_shards_back_to_front; + + return d_results; +} + +template +void GPUInstance::sortQueryResults(Results& d_results, cudaStream_t stream) +{ + if (shard_config.num_shards <= 1) + return; + + CHECK_NOTNULL(d_results.ids.data()); + + Results d_results_sorted = { + Dataset::emptyOnGPU(d_results.ids.N, d_results.ids.D, gpu_ctx.gpu_id), + Dataset::emptyOnGPU(d_results.dists.N, d_results.dists.D, gpu_ctx.gpu_id)}; + + Dataset d_offsets = + Dataset::emptyOnGPU(d_results.ids.N + 1, 1, gpu_ctx.gpu_id); + + // The results are stored sequentially for all parts per query. + // CUB needs to know where these sequences begin and end. + // The previous end always serves as the next beginning. 
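+  // Illustration (assumed sizes, not from this code): with 3 shards and
+  // KQuery = 10, each query owns ids.D == 30 consecutive result slots, so the
+  // segment offsets handed to CUB are 0, 30, 60, ..., N * 30. CUB then sorts
+  // each [offsets[q], offsets[q + 1]) range by distance independently, which
+  // merges the per-shard top-K lists of each query into one sorted list.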
+  Dataset<KeyT> h_offsets{Dataset<KeyT>::empty(d_results.ids.N + 1, 1, true)};
+  for (uint32_t i = 0; i < (d_results.ids.N + 1); i++) {
+    h_offsets[i] = i * d_results.ids.D;
+  }
+  h_offsets.copyTo(d_offsets, stream);
+
+  size_t temp_storage_bytes = 0;
+
+  cub::DeviceSegmentedRadixSort::SortPairs(
+      nullptr, temp_storage_bytes, d_results.dists.data(), d_results_sorted.dists.data(),
+      d_results.ids.data(), d_results_sorted.ids.data(),
+      static_cast<int>(d_results.ids.numel()), static_cast<int>(d_results.ids.N),
+      d_offsets.data(), d_offsets.data() + 1, 0, sizeof(ValueT) * 8, stream);
+
+  Dataset<std::byte> d_temp_storage =
+      Dataset<std::byte>::emptyOnGPU(temp_storage_bytes, 1, gpu_ctx.gpu_id);
+
+  cub::DeviceSegmentedRadixSort::SortPairs(
+      d_temp_storage.data(), temp_storage_bytes, d_results.dists.data(),
+      d_results_sorted.dists.data(), d_results.ids.data(), d_results_sorted.ids.data(),
+      static_cast<int>(d_results.ids.numel()), static_cast<int>(d_results.ids.N),
+      d_offsets.data(), d_offsets.data() + 1, 0, sizeof(ValueT) * 8, stream);
+
+  // wait for CUB to finish using d_temp_storage before deleting
+  CHECK_CUDA(cudaStreamSynchronize(stream));
+
+  std::swap(d_results, d_results_sorted);
+}
+
+#define GGNN_GPU_INSTANCE(KeyT, ValueT, BaseT)                  \
+  extern template struct Dataset<BaseT>;                        \
+  extern template struct Graph<KeyT, ValueT>;                   \
+  extern template struct Results<KeyT, ValueT>;                 \
+  extern template class GraphConstruction<KeyT, ValueT, BaseT>; \
+  extern template class QueryKernels<KeyT, ValueT, BaseT>;      \
+                                                                \
+  template class GPUInstance<KeyT, ValueT, BaseT>;
+
+GGNN_EVAL(GGNN_KEYS, GGNN_VALUES, GGNN_BASES, GGNN_GPU_INSTANCE);
+
+};  // namespace ggnn
diff --git a/src/ggnn/base/graph.cpp b/src/ggnn/base/graph.cpp
new file mode 100644
index 0000000..efe9be5
--- /dev/null
+++ b/src/ggnn/base/graph.cpp
@@ -0,0 +1,96 @@
+/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch
+// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf
+
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+
+namespace ggnn {
+
+template <typename KeyT, typename ValueT>
+Graph<KeyT, ValueT>::Graph(const GraphConfig& graph_config, Dataset<std::byte>&& memory)
+    : memory{std::move(memory)}
+{
+  const PartSizes graph_part_sizes{graph_config};
+  const size_t total_graph_size{graph_part_sizes.getGraphSize()};
+
+  VLOG(2) << "Graph(): N: " << graph_config.N << ", K: " << graph_config.KBuild
+          << ", N_all: " << graph_config.N_all << ", ST_all: " << graph_config.ST_all << " ("
+          << sizeInGB(total_graph_size) << " GB total, " << this->memory.location << ")\n";
+
+  CHECK_GE(this->memory.size_bytes(), total_graph_size);
+
+  const bool on_gpu = this->memory.isGPUAccessible();
+
+  Dataset<KeyT> neighborhood_data =
+      on_gpu ?
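+          // The graph occupies a single flat allocation; the Datasets built
+          // below are views into it, laid out as
+          //   [ neighborhood: N_all x KBuild keys ]
+          //   [ selection + translation: 2 x ST_all keys ]
+          //   [ nn1_stats: 2 values ]
+          // (the CHECK_EQ at the end verifies this against total_graph_size).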
Dataset::referenceGPUData(reinterpret_cast(this->memory.data()), + graph_config.N_all, graph_config.KBuild, + this->memory.gpu_id) + : Dataset::referenceCPUData(reinterpret_cast(this->memory.data()), + graph_config.N_all, graph_config.KBuild); + Dataset selection_translation_data = + on_gpu ? Dataset::referenceGPUData( + reinterpret_cast(this->memory.data() + neighborhood_data.size_bytes()), + graph_config.ST_all * 2, 1, this->memory.gpu_id) + : Dataset::referenceCPUData( + reinterpret_cast(this->memory.data() + neighborhood_data.size_bytes()), + graph_config.ST_all * 2, 1); + + Dataset graph_layers = neighborhood_data.referenceRange(0, graph_config.N_all); + Dataset translation_layers = + selection_translation_data.referenceRange(0, graph_config.ST_all); + Dataset selection_layers = + selection_translation_data.referenceRange(graph_config.ST_all, graph_config.ST_all); + + for (uint32_t layer = 0; layer < GraphConfig::L; ++layer) { + graph[layer] = + graph_layers.referenceRange(graph_config.Ns_offsets[layer], graph_config.Ns[layer]); + if (layer) { + selection[layer] = + selection_layers.referenceRange(graph_config.STs_offsets[layer], graph_config.Ns[layer]); + translation[layer] = translation_layers.referenceRange(graph_config.STs_offsets[layer], + graph_config.Ns[layer]); + } + } + + nn1_stats = + on_gpu ? Dataset::referenceGPUData( + reinterpret_cast(this->memory.data() + neighborhood_data.size_bytes() + + selection_translation_data.size_bytes()), + 2, 1, this->memory.gpu_id) + : Dataset::referenceCPUData( + reinterpret_cast(this->memory.data() + neighborhood_data.size_bytes() + + selection_translation_data.size_bytes()), + 2, 1); + + CHECK_EQ(neighborhood_data.size_bytes() + selection_translation_data.size_bytes() + + nn1_stats.size_bytes(), + total_graph_size); +} + +GGNN_EVAL(GGNN_KEYS, GGNN_VALUES, GGNN_INSTANTIATE_STRUCT, Graph); + +}; // namespace ggnn diff --git a/src/ggnn/base/graph_config.cpp b/src/ggnn/base/graph_config.cpp new file mode 100644 index 0000000..b59f907 --- /dev/null +++ b/src/ggnn/base/graph_config.cpp @@ -0,0 +1,113 @@ +/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
Lensch +// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf + +#include +#include + +#include +#include +#include +#include + +#include + +namespace ggnn { + +constexpr uint32_t powInt(const uint32_t base, const uint32_t power) +{ + if (!power) + return 1; + else if (power == 1) + return base; + return base * powInt(base, power - 1); +} + +GraphDimensions::GraphDimensions(uint32_t N, uint32_t S, uint32_t G) +{ + // fixed block hierarchy + for (uint32_t l = L - 1, B = 1; l != -1U; --l, B *= G) { + Bs[l] = B; + Ns[l] = B * S; + } + // bottom layer has all points (block sizes adjust accordingly) + Ns[0] = N; + // no offsets in layer 0 + Ns_offsets[0] = 0; + STs_offsets[0] = 0; + // no selection/translation in layer 0 + STs_offsets[1] = 0; + Ns_offsets[1] = N; + + for (uint32_t l = 2; l < L; ++l) { + Ns_offsets[l] = Ns_offsets[l - 1] + Ns[l - 1]; + STs_offsets[l] = STs_offsets[l - 1] + Ns[l - 1]; + } + N_all = Ns_offsets[L - 1] + Ns[L - 1]; + ST_all = STs_offsets[L - 1] + Ns[L - 1]; +} + +GraphDerivedParameters::GraphDerivedParameters(const GraphParameters& params) + : GraphParameters{params} +{ + /// theoretical growth factor (number of sub-graphs merged together per layer) + /// graph grows top down: 1*S, G*S, G*G*S, G*G*G*S0+S0_off == N + const float growth = std::pow(static_cast(N) / static_cast(S), 1.f / (L - 1)); + + // pick between the closest integers + const uint32_t Gf = static_cast(growth); + const uint32_t Gc = Gf + 1; + + // resulting level 0 (base level) segment sizes + const float S0f = static_cast(N) / (std::pow(static_cast(Gf), (L - 1.0f))); + const float S0c = static_cast(N) / (std::pow(static_cast(Gc), (L - 1.0f))); + + // use the larger layer 0 segment size (S0f) + // if the smaller one (S0c) becomes too small to establish meaningful neighborhoods within it + // or if it (S0f) is closer to S than the smaller option (S0c) + const bool is_floor = + (static_cast(S0c) < KBuild) || + (std::abs(S0f - static_cast(S)) < std::abs(S0c - static_cast(S))); + + G = (is_floor) ? Gf : Gc; + S0 = (is_floor) ? static_cast(S0f) : static_cast(S0c); + S0_off = N - powInt(G, L - 1) * S0; + + // parameters for selection + SG = S / G; + SG_off = S - SG * G; + + // TODO: can we fix that? ==> S needs to be a multiple of G + DLOG_IF(WARNING, SG == 0) << "less than one point per segment contributes to upper level " + "segments. this may negatively impact search performance."; + DLOG_IF(WARNING, SG_off > 0) << "segment's contributions to upper level segments are imbalanced. " + "this may negatively impact search performance."; +} + +GraphConfig::GraphConfig(const GraphParameters& params) + : GraphDerivedParameters{params}, GraphDimensions{N, S, G} +{ + VLOG(1) << "GraphConfig(): N: " << N << ", K: " << KBuild << ", KF: " << KF << ", L: " << L + << ", G: " << G << ", S: " << S << ", S0: " << S0 << ", S0_off: " << S0_off + << ", SG: " << SG << ", SG_off: " << SG_off; +} + +size_t GraphConfig::maxBaseAddr() const +{ + return std::max(static_cast(N) * D, static_cast(N_all) * KBuild); +} + +}; // namespace ggnn diff --git a/src/ggnn/base/result_merger.cpp b/src/ggnn/base/result_merger.cpp new file mode 100644 index 0000000..05c2ac1 --- /dev/null +++ b/src/ggnn/base/result_merger.cpp @@ -0,0 +1,153 @@ +/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch +// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf + +#include +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace ggnn { + +template +ResultMerger::ResultMerger(const uint32_t N_query, const uint32_t KQuery, + const uint32_t num_gpus, const uint32_t num_shards_per_gpu) + : N_query{N_query}, KQuery{KQuery}, num_gpus{num_gpus}, num_shards_per_gpu{num_shards_per_gpu} +{ + CHECK_GE(num_gpus, 1); + CHECK_GE(num_shards_per_gpu, 1); + + partial_results_per_gpu.reserve(num_gpus); + for (uint32_t i = 0; i < num_gpus; ++i) { + partial_results_per_gpu.emplace_back( + Results{Dataset::empty(N_query, KQuery * num_shards_per_gpu, true), + Dataset::empty(N_query, KQuery * num_shards_per_gpu, true)}); + } +} + +template +typename ResultMerger::Results ResultMerger::merge(uint32_t N_shard) && +{ + // for one part on one GPU, the results are directly passed through. + if (num_gpus == 1 && num_shards_per_gpu == 1) { + return std::move(partial_results_per_gpu.at(0)); + } + + Results merged_results = {Dataset::empty(N_query, KQuery), + Dataset::empty(N_query, KQuery)}; + + if (num_gpus == 1) { + // results have already been pre-sorted per GPU, so we can just copy this over + for (uint32_t n = 0; n < N_query; n++) { + std::copy_n(partial_results_per_gpu.at(0).ids.data() + + static_cast(n) * KQuery * num_shards_per_gpu, + KQuery, merged_results.ids.data() + static_cast(n) * KQuery); + std::copy_n(partial_results_per_gpu.at(0).dists.data() + + static_cast(n) * KQuery * num_shards_per_gpu, + KQuery, merged_results.dists.data() + static_cast(n) * KQuery); + } + return merged_results; + } + + const uint32_t stride = KQuery * num_shards_per_gpu; + + auto start = std::chrono::steady_clock::now(); + + auto mergeResultPart = [&](uint32_t begin, uint32_t end) -> void { + struct KeyDistPartition { + KeyT key; + ValueT dist; + uint32_t partition; + + KeyDistPartition(KeyT key, ValueT dist, uint32_t partition) + : key(key), dist(dist), partition(partition) + { + } + }; + auto compare_heap = [](const KeyDistPartition& a, const KeyDistPartition& b) -> bool { + return a.dist >= b.dist; + }; + + std::vector part_offsets(num_gpus, 1); + + std::vector heap; + heap.reserve(num_gpus); + for (uint32_t n = begin; n < end; ++n) { + heap.clear(); + std::fill(part_offsets.begin(), part_offsets.end(), 1); + // fill heap with min per partition + for (uint32_t device_i = 0; device_i < num_gpus; ++device_i) { + const size_t pos = static_cast(n) * stride; + heap.emplace_back(partial_results_per_gpu.at(device_i).ids[pos], + partial_results_per_gpu.at(device_i).dists[pos], device_i); + } + std::make_heap(heap.begin(), heap.end(), compare_heap); + // Pop min and insert from popped partition until full. + // We can safely assume not to run out of bounds within each partition, + // since there are as many results per part as total results requested. 
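+      // Illustrative example with num_gpus = 2, KQuery = 2 and per-GPU
+      // pre-sorted partial results
+      //   GPU 0: (id 4, dist 0.1), (id 7, dist 0.3)
+      //   GPU 1: (id 2, dist 0.2), (id 9, dist 0.4)
+      // the heap first pops (4, 0.1) from partition 0, then (2, 0.2) from
+      // partition 1, so the merged top-2 ids are 4 and 2 (the base-index
+      // offset of partition 1 is applied below).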
+ for (uint32_t k = 0; k < KQuery; ++k) { + const KeyDistPartition top = heap.front(); + // each GPU only knows about its part of the base + // increase the base index by the number of base points assigned to previous devices + merged_results.ids[n * KQuery + k] = + static_cast(top.partition * num_shards_per_gpu * N_shard) + top.key; + merged_results.dists[n * KQuery + k] = top.dist; + if (k == KQuery - 1) + break; + + std::pop_heap(heap.begin(), heap.end(), compare_heap); + heap.pop_back(); + const size_t pos = static_cast(n) * stride + part_offsets[top.partition]; + ++part_offsets[top.partition]; + heap.emplace_back(partial_results_per_gpu.at(top.partition).ids[pos], + partial_results_per_gpu.at(top.partition).dists[pos], top.partition); + std::push_heap(heap.begin(), heap.end(), compare_heap); + } + } + }; + std::vector mergeThreads; + + uint32_t num_threads = std::min(N_query, std::thread::hardware_concurrency()); + uint32_t elements_per_bin = (N_query + num_threads - 1) / num_threads; + mergeThreads.reserve(num_threads); + for (uint32_t i = 0; i < num_threads; ++i) { + mergeThreads.emplace_back(mergeResultPart, i * elements_per_bin, + std::min(N_query, (i + 1) * elements_per_bin)); + } + for (auto&& t : mergeThreads) { + t.join(); + } + + auto end = std::chrono::steady_clock::now(); + auto cpu_ms = std::chrono::duration_cast(end - start); + VLOG(0) << "[CPU] partial merge completed. " << cpu_ms.count() << " ms."; + + return merged_results; +} + +GGNN_EVAL(GGNN_KEYS, GGNN_VALUES, GGNN_INSTANTIATE_STRUCT, ResultMerger); + +}; // namespace ggnn diff --git a/src/ggnn/construction/graph_buffer.cu b/src/ggnn/construction/graph_buffer.cu new file mode 100644 index 0000000..3e1b615 --- /dev/null +++ b/src/ggnn/construction/graph_buffer.cu @@ -0,0 +1,85 @@ +/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
Lensch +// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf + +#include + +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include + +#include +#include + +namespace ggnn { + +template +GraphBuffer::GraphBuffer(const GraphConfig& graph_config, Dataset&& memory) + : memory{std::move(memory)} +{ + const PartSizes buffer_part_sizes{graph_config}; + + // stats + { + ValueT* unused{nullptr}; + size_t temp_storage_bytes_sum{0}; + size_t temp_storage_bytes_max{0}; + + cub::DeviceReduce::Sum(nullptr, temp_storage_bytes_sum, unused, unused, + static_cast(graph_config.N)); + cub::DeviceReduce::Max(nullptr, temp_storage_bytes_max, unused, unused, + static_cast(graph_config.N)); + temp_storage_bytes_cub = align8(std::max(temp_storage_bytes_sum, temp_storage_bytes_max)); + } + + // const size_t total_size = graph_buffer_size + sym_buffer_size + rng_size + sym_atomic_size + + // sym_statistics_size + nn1_dist_buffer_size + temp_storage_bytes_sum + temp_storage_bytes_max; + + // this will work as long as the construction code remains as is + const size_t merge_size = + buffer_part_sizes.nn1_dist_buffer_size + buffer_part_sizes.graph_buffer_size; + const size_t select_size = buffer_part_sizes.nn1_dist_buffer_size + buffer_part_sizes.rng_size; + const size_t stats_size = buffer_part_sizes.nn1_dist_buffer_size + temp_storage_bytes_cub; + const size_t sym_size = buffer_part_sizes.sym_buffer_size + buffer_part_sizes.sym_atomic_size; + + const size_t overlapped_size = std::max({merge_size, select_size, stats_size, sym_size}); + + VLOG(2) << "GraphBuffer(): allocating GPU memory... (" << sizeInGB(overlapped_size) + << " GB total).\n"; + + CHECK_GE(this->memory.size_bytes(), overlapped_size); + + nn1_dist_buffer = reinterpret_cast(this->memory.data()); + graph_buffer = + reinterpret_cast(this->memory.data() + buffer_part_sizes.nn1_dist_buffer_size); + rng = reinterpret_cast(this->memory.data() + buffer_part_sizes.nn1_dist_buffer_size); + temp_storage_cub = this->memory.data() + buffer_part_sizes.nn1_dist_buffer_size; + sym_buffer = reinterpret_cast(this->memory.data()); + sym_atomic = reinterpret_cast(this->memory.data() + buffer_part_sizes.sym_buffer_size); +} + +GGNN_EVAL(GGNN_KEYS, GGNN_VALUES, GGNN_INSTANTIATE_STRUCT, GraphBuffer); + +}; // namespace ggnn diff --git a/src/ggnn/construction/graph_construction.cu b/src/ggnn/construction/graph_construction.cu new file mode 100644 index 0000000..c299c08 --- /dev/null +++ b/src/ggnn/construction/graph_construction.cu @@ -0,0 +1,416 @@ +/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
Lensch +// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include + +namespace ggnn { + +template +void time_launcher(const int log_level, T& kernel, uint32_t N, cudaStream_t stream = 0) +{ + cudaEvent_t start, stop; + if (VLOG_IS_ON(log_level)) { + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start, stream); + } + kernel.launch(N, stream); + if (VLOG_IS_ON(log_level)) { + cudaEventRecord(stop, stream); + CHECK_CUDA(cudaEventSynchronize(stop)); + + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + + VLOG(log_level) << milliseconds << " ms for " << N << " queries -> " + << milliseconds * 1000.0f / static_cast(N) << " us/query \n"; + cudaEventDestroy(start); + cudaEventDestroy(stop); + } +} + +template +__global__ void divide(ValueT* res, ValueT* input, ValueT N) +{ + res[threadIdx.x] = input[threadIdx.x] / N; +} + +struct CurandGeneratorDeleter { + void operator()(curandGenerator_t gen) + { + if (gen) + curandDestroyGenerator(gen); + } +}; + +using CurandGenerator = + std::unique_ptr, CurandGeneratorDeleter>; + +CurandGenerator createPRNG() +{ + curandGenerator_t gen_tmp; + curandCreateGenerator(&gen_tmp, CURAND_RNG_PSEUDO_DEFAULT); + curandSetPseudoRandomGeneratorSeed(gen_tmp, 1234ULL); + return CurandGenerator{gen_tmp}; +} + +template +class GraphConstructionImpl : public GraphConstruction { + public: + using Graph = ggnn::Graph; + using GraphBuffer = ggnn::GraphBuffer; + using GPUInstance = ggnn::GPUInstance; + + GraphConstructionImpl(GPUInstance& gpu_instance, const float tau_build, + const DistanceMeasure measure) + : gpu_instance{gpu_instance}, tau_build{tau_build}, measure{measure} + { + } + + private: + GPUInstance& gpu_instance; + const GraphConfig& graph_config{gpu_instance.graph_config}; + float tau_build{}; + DistanceMeasure measure; + + const size_t buffer_size = typename GraphBuffer::PartSizes{graph_config}.getBufferSize(); + GraphBuffer buffer{graph_config, + Dataset::emptyOnGPU(buffer_size, 1, gpu_instance.gpu_ctx.gpu_id)}; + CurandGenerator gen{createPRNG()}; + + void build(Graph& graph, const Dataset& base, const cudaStream_t stream) override + { + for (uint32_t layer_top = 0; layer_top < GraphConfig::L; layer_top++) { + for (uint32_t layer_btm = layer_top; layer_btm != -1U; layer_btm--) { + merge(layer_top, layer_btm, graph, base, stream); + + if (layer_top < (GraphConfig::L - 1) && layer_top == layer_btm) + select(layer_top, graph, stream); + + sym(layer_btm, graph, base, stream); + } + } + } + void refine(Graph& graph, const Dataset& base, const cudaStream_t stream) override + { + for (uint32_t layer = GraphConfig::L - 2; layer != -1U; layer--) { + merge(GraphConfig::L - 1, layer, graph, base, stream); + sym(layer, graph, base, stream); + } + } + + struct ConstructionKernelConfig { + uint32_t block_dim_x; + uint32_t dist_items_per_thread; + }; + + ConstructionKernelConfig selectKernelConfig(uint32_t D, uint32_t min_block_dim_x) + { + const uint32_t dist_items_per_thread = D <= 1024 ? 
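+    // e.g., D = 128: 4 items/thread -> ceil(128 / 4) = 32 -> bit_ceil(32) = 32,
+    // so block_dim_x = max(min_block_dim_x, 32); for D = 2048: 8 items/thread
+    // -> bit_ceil(256) = 256 threads (illustrative values).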
4U : 8U; + const uint32_t block_dim_x = std::max( + min_block_dim_x, ggnn::bit_ceil((D + dist_items_per_thread - 1) / dist_items_per_thread)); + + return {block_dim_x, dist_items_per_thread}; + } + + void select(const uint32_t layer, Graph& graph, cudaStream_t stream) + { + gpu_instance.gpu_ctx.activate(); + + /* Generate n floats on device */ + curandSetStream(gen.get(), stream); + curandGenerateUniform(gen.get(), buffer.rng, graph_config.Ns[layer]); + + using SelectionKernel = ggnn::WRSSelectionKernel; + + SelectionKernel select_kernel{.d_selection = graph.selection[layer + 1].data(), + .d_translation = graph.translation[layer + 1].data(), + .d_translation_layer = graph.translation[layer].data(), + .d_nn1_dist_buffer = buffer.nn1_dist_buffer, + .d_rng = buffer.rng, + .Sglob = graph_config.S, + .S = layer ? graph_config.S : graph_config.S0, + .S_offset = layer ? 0 : graph_config.S0_off, + .G = graph_config.G, + .SG = graph_config.SG, + .SG_offset = graph_config.SG_off, + .layer = layer}; + + time_launcher(2, select_kernel, graph_config.Bs[layer], stream); + } + + void merge(const uint32_t layer_top, const uint32_t layer_btm, Graph& graph, + const Dataset& base, const cudaStream_t stream) + { + if (layer_top == layer_btm) + top(layer_btm, graph, base, stream); + else + mergeLayer(layer_top, layer_btm, graph, base, stream); + + if (!layer_btm) + computeNN1Stats(graph, stream); + } + + void top(const uint32_t layer, Graph& graph, const Dataset& base, + const cudaStream_t stream) + { + gpu_instance.gpu_ctx.activate(); + + auto run_top_merge = [&]() -> void { + using TopMergeKernel = + ggnn::TopMergeKernel; + + TopMergeKernel top_kernel{.D = base.D, + .measure = measure, + .KBuild = graph_config.KBuild, + .d_base = base.data(), + .d_translation = graph.translation[layer].data(), + .d_graph = graph.graph[layer].data(), + .d_nn1_dist_buffer = buffer.nn1_dist_buffer, + .S = layer ? graph_config.S : graph_config.S0, + .S_offset = layer ? 
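+                                    // layer 0 is segmented with the adjusted
+                                    // size S0; the first S0_off segments hold
+                                    // one extra point so that all N points are
+                                    // covered. upper layers use S directly.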
0 : graph_config.S0_off, + .layer = layer}; + + time_launcher(2, top_kernel, graph_config.Ns[layer], stream); + }; + + static constexpr uint32_t MIN_BLOCK_DIM_X = 128; + auto [block_dim_x, dist_items_per_thread] = selectKernelConfig(base.D, MIN_BLOCK_DIM_X); + + if (block_dim_x == 128 && dist_items_per_thread == 4) + run_top_merge.template operator()<128, 4>(); + else if (block_dim_x == 256 && dist_items_per_thread == 4) + run_top_merge.template operator()<256, 4>(); + else if (block_dim_x == 256 && dist_items_per_thread == 8) + run_top_merge.template operator()<256, 8>(); + else if (block_dim_x == 512 && dist_items_per_thread == 8) + run_top_merge.template operator()<512, 8>(); + else + CHECK(false) << "configuration " << block_dim_x << " " << dist_items_per_thread + << " not supported for top merge kernel."; + } + + void mergeLayer(const uint32_t layer_top, const uint32_t layer_btm, Graph& graph, + const Dataset& base, const cudaStream_t stream) + { + gpu_instance.gpu_ctx.activate(); + + auto run_merge = [&]() -> void { + using MergeKernel = + ggnn::MergeKernel; + + MergeKernel merge_kernel{ + .D = base.D, + .measure = measure, + .KBuild = graph_config.KBuild, + .S = graph_config.S, + .d_base = base.data(), + .d_selection = graph.selection[1].data(), // the entire selection starts at layer 1 + .d_translation = graph.translation[1].data(), // the entire translation starts at layer 1 + .d_graph = graph.graph[0].data(), + .d_graph_buffer = buffer.graph_buffer, + .d_nn1_stats = graph.nn1_stats.data(), + .d_nn1_dist_buffer = buffer.nn1_dist_buffer, + .layer_top = layer_top, + .layer_btm = layer_btm, + .G = graph_config.G, + .S0 = graph_config.S0, + .S0_offset = graph_config.S0_off, + .Ns_offsets = graph_config.Ns_offsets, + .STs_offsets = graph_config.STs_offsets, + .tau_build = tau_build}; + + time_launcher(2, merge_kernel, graph_config.Ns[layer_btm], stream); + }; + + static constexpr uint32_t MIN_BLOCK_DIM_X = 32; + auto [block_dim_x, dist_items_per_thread] = selectKernelConfig(base.D, MIN_BLOCK_DIM_X); + + if (block_dim_x == 32 && dist_items_per_thread == 4) + run_merge.template operator()<32, 4>(); + else if (block_dim_x == 64 && dist_items_per_thread == 4) + run_merge.template operator()<64, 4>(); + else if (block_dim_x == 128 && dist_items_per_thread == 4) + run_merge.template operator()<128, 4>(); + else if (block_dim_x == 256 && dist_items_per_thread == 4) + run_merge.template operator()<256, 4>(); + else if (block_dim_x == 256 && dist_items_per_thread == 8) + run_merge.template operator()<256, 8>(); + else if (block_dim_x == 512 && dist_items_per_thread == 8) + run_merge.template operator()<512, 8>(); + else + CHECK(false) << "configuration " << block_dim_x << " " << dist_items_per_thread + << " not supported for merge kernel."; + + const size_t graph_buffer_size = + static_cast(graph_config.Ns[layer_btm]) * graph_config.KBuild * sizeof(KeyT); + CHECK_CUDA(cudaMemcpyAsync(graph.graph[layer_btm].data(), buffer.graph_buffer, + graph_buffer_size, cudaMemcpyDeviceToDevice, stream)); + } + + void sym(const uint32_t layer, Graph& graph, const Dataset& base, + const cudaStream_t stream) + { + gpu_instance.gpu_ctx.activate(); + + cudaMemsetAsync(buffer.sym_buffer, -1, + static_cast(graph_config.Ns[layer]) * graph_config.KF * sizeof(KeyT), + stream); + + cudaMemsetAsync(buffer.sym_atomic, 0, graph_config.Ns[layer] * sizeof(uint32_t), stream); + auto run_sym = [&]() -> void { + using SymQueryKernel = + ggnn::SymQueryKernel; + + SymQueryKernel sym_kernel{ + .D = base.D, + .measure = measure, + 
.KBuild = graph_config.KBuild, + .d_base = base.data(), + .d_graph = graph.graph[layer].data(), + .d_translation = graph.translation[layer].data(), + .d_nn1_stats = graph.nn1_stats.data(), + .tau_build = tau_build, + .d_sym_buffer = buffer.sym_buffer, + .d_sym_atomic = buffer.sym_atomic, + }; + + time_launcher(2, sym_kernel, graph_config.Ns[layer], stream); + }; + + static constexpr uint32_t MIN_BLOCK_DIM_X = 64; + auto [block_dim_x, dist_items_per_thread] = selectKernelConfig(base.D, MIN_BLOCK_DIM_X); + + if (block_dim_x == 64 && dist_items_per_thread == 4) + run_sym.template operator()<64, 4>(); + else if (block_dim_x == 128 && dist_items_per_thread == 4) + run_sym.template operator()<128, 4>(); + else if (block_dim_x == 256 && dist_items_per_thread == 4) + run_sym.template operator()<256, 4>(); + else if (block_dim_x == 256 && dist_items_per_thread == 8) + run_sym.template operator()<256, 8>(); + else if (block_dim_x == 512 && dist_items_per_thread == 8) + run_sym.template operator()<512, 8>(); + else + CHECK(false) << "configuration " << block_dim_x << " " << dist_items_per_thread + << " not supported for sym kernel."; + + using SymBufferMergeKernel = ggnn::SymBufferMergeKernel; + + SymBufferMergeKernel sym_buffer_merge_kernel{.KBuild = graph_config.KBuild, + .d_sym_buffer = buffer.sym_buffer, + .d_sym_atomic = buffer.sym_atomic, + .d_graph = graph.graph[layer].data()}; + + time_launcher(3, sym_buffer_merge_kernel, graph_config.Ns[layer], stream); + + if (VLOG_IS_ON(2)) { + Dataset h_sym_atomic = Dataset::empty(graph_config.Ns[layer], 1, true); + // Dataset h_statistics = Dataset::empty(graph_config.Ns[layer], 1, true); + + CHECK_CUDA(cudaMemcpyAsync(h_sym_atomic.data(), buffer.sym_atomic, h_sym_atomic.size_bytes(), + cudaMemcpyDeviceToHost, stream)); + // cudaMemcpyAsync(h_statistics.data(), buffer.statistics, h_statistics.size_bytes(), + // cudaMemcpyDeviceToHost, stream); + + CHECK_CUDA(cudaStreamSynchronize(stream)); + + uint32_t c = 0; + uint32_t m = 0; + // int unconnected = 0; + for (uint32_t i = 0; i < graph_config.Ns[layer]; i++) { + if (h_sym_atomic[i] > graph_config.KF) + c++; + m += (h_sym_atomic[i] > graph_config.KF) ? 
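+          // at most KF inverse links fit per point, so clamp here:
+          // c counts points whose link requests overflowed, m sums the
+          // inverse links that were actually added.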
graph_config.KF : h_sym_atomic[i]; + // unconnected += h_statistics[i]; + } + VLOG(2) << "Layer " << layer << " [N: " << graph_config.Ns[layer] << "] | overflow: " << c + << " (" << static_cast(c) / static_cast(graph_config.Ns[layer]) + << ") | added_links: " << m << " (" + << static_cast(m) / static_cast(graph_config.Ns[layer]) << ")\n"; + } + } + + void computeNN1Stats(Graph& graph, const cudaStream_t stream) + { + gpu_instance.gpu_ctx.activate(); + + CHECK_CUDA(cub::DeviceReduce::Sum(buffer.temp_storage_cub, buffer.temp_storage_bytes_cub, + buffer.nn1_dist_buffer, &graph.nn1_stats[0], + static_cast(graph_config.N), stream)); + CHECK_CUDA(cub::DeviceReduce::Max(buffer.temp_storage_cub, buffer.temp_storage_bytes_cub, + buffer.nn1_dist_buffer, &graph.nn1_stats[1], + static_cast(graph_config.N), stream)); + + divide<<<1, 1, 0, stream>>>(graph.nn1_stats.data(), graph.nn1_stats.data(), + ValueT(graph_config.N)); + + if (VLOG_IS_ON(2)) { + ValueT h_nn1_stats[2]; + CHECK_CUDA(cudaMemcpyAsync(h_nn1_stats, graph.nn1_stats.data(), 2 * sizeof(ValueT), + cudaMemcpyDeviceToHost, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + VLOG(2) << "mean: " << h_nn1_stats[0] << " | max: " << h_nn1_stats[1] << std::endl; + } + } +}; + +template +GraphConstruction::GraphConstruction(GPUInstance& gpu_instance, + const float tau_build, + const DistanceMeasure measure) +{ + pimpl.reset(new GraphConstructionImpl{gpu_instance, tau_build, measure}); +} + +GGNN_EVAL(GGNN_KEYS, GGNN_VALUES, GGNN_BASES, GGNN_INSTANTIATE_CLASS, GraphConstruction); +GGNN_EVAL(GGNN_KEYS, GGNN_VALUES, GGNN_BASES, GGNN_INSTANTIATE_CLASS, GraphConstructionImpl); + +}; // namespace ggnn diff --git a/src/ggnn/construction/merge_layer.cu b/src/ggnn/construction/merge_layer.cu new file mode 100644 index 0000000..fffb6fd --- /dev/null +++ b/src/ggnn/construction/merge_layer.cu @@ -0,0 +1,167 @@ +/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch +// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf + +#include + +#include +#include + +#include + +#include +#include + +#include +#include + +namespace ggnn { + +template +__global__ void __launch_bounds__(T::BLOCK_DIM_X) merge(const T kernel) +{ + kernel(); +} + +// determine the start of the top-layer segment (always 0 for layer_top = L-1) +template +__device__ __forceinline__ uint32_t +MergeKernel::get_top_seg_offset( + const KeyT n) const +{ + // first, determine the bottom-level segment + uint32_t seg_btm = n / S; + if (!layer_btm) { + const KeyT offset_points = S0_offset * (S0 + 1); + seg_btm = (n < offset_points) ? 
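+    // the first S0_offset bottom-layer segments have S0 + 1 points each.
+    // e.g., S0 = 3, S0_offset = 2: offset_points = 8, so points 0..7 fall
+    // into two segments of size 4 and point 8 starts the first segment of
+    // size 3 (illustrative values).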
n / (S0 + 1) : S0_offset + (n - offset_points) / S0; + } + + // then divide by G once per layer to step up the tree + // and finally multiply by S to get the start of the segment + + uint32_t powG = G; // assuming layer_top > layer_btm (which should always be the case) + for (uint32_t i = 1; i < layer_top - layer_btm; ++i) + powG *= G; + + return (seg_btm / powG) * S; +} + +template +__device__ __forceinline__ void +MergeKernel::operator()() const +{ + static constexpr uint32_t K_BLOCK = 32; + static_assert(K_BLOCK <= BLOCK_DIM_X); + static constexpr bool DIST_STATS = false; + + using Cache = SimpleKNNCache; + + const float xi = (measure == DistanceMeasure::Euclidean) + ? (d_nn1_stats[0] * d_nn1_stats[0]) * tau_build * tau_build + : d_nn1_stats[0] * tau_build; + + const KeyT n = static_cast(blockIdx.x); + + const KeyT m = (!layer_btm) ? n : d_translation[STs_offsets[layer_btm] + n]; + + Cache cache(D, measure, KBuild + 1, SORTED_SIZE, CACHE_SIZE, d_base, m, xi); + + __shared__ KeyT s_knn[K_BLOCK]; + + { + const uint32_t s_offset = get_top_seg_offset(n); + + // fetch starting points + for (uint32_t i = 0; i < S; i += K_BLOCK) { + if (threadIdx.x < K_BLOCK) { + const uint32_t s = i + threadIdx.x; + s_knn[threadIdx.x] = (s < S) ? static_cast(s_offset + s) : Cache::EMPTY_KEY; + } + cache.fetch_unfiltered(s_knn, &d_translation[STs_offsets[layer_top]], K_BLOCK); + } + } + + // hierarchic kNN search + for (uint32_t layer = layer_top - 1; layer >= layer_btm && layer != -1U; layer--) { + cache.transform(&d_selection[STs_offsets[layer + 1]]); + + if (layer == layer_btm) + cache.fetch_unfiltered(&n, (!layer) ? nullptr : &d_translation[STs_offsets[layer]], 1); + + for (uint32_t ite = 0; ite < MAX_ITERATIONS; ++ite) { + const KeyT anchor = cache.pop(); + if (anchor == Cache::EMPTY_KEY) + break; + + for (uint32_t j = 0; j < KBuild; j += K_BLOCK) { + if (threadIdx.x < K_BLOCK) { + const uint32_t k = j + threadIdx.x; + s_knn[threadIdx.x] = + (k < KBuild) ? d_graph[(static_cast(Ns_offsets[layer]) + anchor) * KBuild + k] + : Cache::EMPTY_KEY; + } + cache.fetch(s_knn, (!layer) ? nullptr : &d_translation[STs_offsets[layer]], K_BLOCK); + } + } + } + + KeyT& s_own_idx{s_knn[0]}; + if (!threadIdx.x) + s_own_idx = static_cast(-1); + __syncthreads(); + + // check if own index is part of cache and mark its index to skip it + // we cannot rely on it being at index 0 (in case of duplicates) or in the cache at all + for (uint32_t j = 0; j < KBuild; j += BLOCK_DIM_X) { + const uint32_t k = j + threadIdx.x; + if (k < KBuild) { + if (cache.s_cache[k] == n) + s_own_idx = static_cast(k); + } + } + __syncthreads(); + for (uint32_t j = 0; j < KBuild; j += BLOCK_DIM_X) { + const uint32_t k = j + threadIdx.x; + if (k < KBuild) { + // skip self-referential link (if any) + const KeyT idx = cache.s_cache[k + (static_cast(k) >= s_own_idx)]; + d_graph_buffer[static_cast(n) * KBuild + k] = (idx != Cache::EMPTY_KEY) ? 
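+      // fall back to a self-link for empty slots to avoid -1 entries in the
+      // graph (same rationale as in the sym buffer merge, which also replaces
+      // unused slots with the point's own index).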
idx : n; + } + } + + if (!layer_btm && !threadIdx.x) { + uint32_t i = s_own_idx + 1; + ValueT dist; + do { + dist = cache.s_dists[i]; + ++i; + } while (dist == 0.0f && i < cache.BEST_SIZE); + if (measure == DistanceMeasure::Euclidean) + dist = sqrtf(dist); + d_nn1_dist_buffer[n] = dist; + } +} + +#define GGNN_MERGE(KeyT, ValueT, BaseT, BLOCK_SIZE, DIST_ITEMS_PER_THREAD) \ + template __global__ void \ + merge>( \ + const MergeKernel); + +GGNN_EVAL(GGNN_KEYS, GGNN_VALUES, GGNN_BASES, GGNN_MERGES, GGNN_MERGE); + +}; // namespace ggnn diff --git a/src/ggnn/construction/sym_buffer_merge_layer.cu b/src/ggnn/construction/sym_buffer_merge_layer.cu new file mode 100644 index 0000000..679dd08 --- /dev/null +++ b/src/ggnn/construction/sym_buffer_merge_layer.cu @@ -0,0 +1,107 @@ +/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch +// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf + +#include + +#include + +#include +#include + +#include +#include + +namespace ggnn { + +template +__global__ void __launch_bounds__(T::BLOCK_DIM_X) sym_buffer_merge(const T kernel, const uint32_t N) +{ + kernel(N); +} + +template +__device__ __forceinline__ void SymBufferMergeKernel::operator()(uint32_t N) const +{ + const uint32_t n = blockIdx.x * POINTS_PER_BLOCK + threadIdx.y; + const uint32_t kf = threadIdx.x; + + if (n >= N) + return; + + /// inverse links which need to be added to the graph + extern __shared__ KeyT s_sym_buffer[]; // [POINTS_PER_BLOCK * KF]; + /// current contents of the graph's foreign/inverse link storage + KeyT* s_graph_buffer{&s_sym_buffer[POINTS_PER_BLOCK * KF]}; // [POINTS_PER_BLOCK * KF]; + /// whether the foreign link in the graph exists in the list of inverse links to be added + bool* s_found{ + reinterpret_cast(&s_sym_buffer[2 * POINTS_PER_BLOCK * KF])}; // [POINTS_PER_BLOCK]; + + // number of inverse links to be entered per point (only valid for threadIdx.x == 0) + uint32_t r_num_links; + if (!threadIdx.x) { + r_num_links = d_sym_atomic[n]; + } + + const uint32_t tid = threadIdx.y * KF + threadIdx.x; + // # load buffer + s_sym_buffer[tid] = d_sym_buffer[static_cast(n) * KF + kf]; + s_graph_buffer[tid] = d_graph[static_cast(n) * KBuild + KL + kf]; + + // add existing foreign links to the inverse link list if there is still room + for (uint32_t i = 0; i < KF; i++) { + if (!threadIdx.x) { + // only search if there is a spot where we could add another link + s_found[threadIdx.y] = r_num_links >= KF; + } + __syncthreads(); + + KeyT r_graph; + + if (!s_found[threadIdx.y]) { + // read all requested inverse links per point + const KeyT r_sym_buffer = s_sym_buffer[tid]; + // read existing foreign link i per point from graph + r_graph = s_graph_buffer[threadIdx.y * KF + i]; + // existing foreign link exists in requested inverse link list? 
==> found + if (r_graph == r_sym_buffer) + s_found[threadIdx.y] = true; + } + __syncthreads(); + + // if there is still room and the existing foreign link is not part of the requested inverse + // links, add it + if (!threadIdx.x && !s_found[threadIdx.y]) { + s_sym_buffer[threadIdx.y * KF + r_num_links] = r_graph; + ++r_num_links; + } + } + + __syncthreads(); + + // store requested inverse links and added previous foreign links in the graph's foreign link + // list. if there aren't enough links, store the points own index (to avoid entries with -1) + const KeyT res = s_sym_buffer[tid]; + d_graph[static_cast(n) * KBuild + KL + kf] = (res >= 0) ? res : n; +} + +#define GGNN_SYM_BUFFER_MERGE(KeyT, ValueT) \ + template __global__ void sym_buffer_merge>( \ + const SymBufferMergeKernel, const uint32_t); + +GGNN_EVAL(GGNN_KEYS, GGNN_VALUES, GGNN_SYM_BUFFER_MERGE); + +}; // namespace ggnn diff --git a/src/ggnn/construction/sym_query_layer.cu b/src/ggnn/construction/sym_query_layer.cu new file mode 100644 index 0000000..b8aeb61 --- /dev/null +++ b/src/ggnn/construction/sym_query_layer.cu @@ -0,0 +1,154 @@ +/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch +// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf + +#include + +#include +#include + +#include + +#include +#include + +#include +#include + +namespace ggnn { + +template +__global__ void __launch_bounds__(T::BLOCK_DIM_X) sym(const T kernel) +{ + kernel(); +} + +template +__device__ __forceinline__ void +SymQueryKernel::operator()() const +{ + static constexpr uint32_t K_BLOCK = 32; + static_assert(K_BLOCK <= BLOCK_DIM_X); + + static constexpr bool DIST_STATS = false; + + const uint32_t KF{KBuild / 2}; + const uint32_t KL = KBuild - KF; + + using Cache = + SimpleKNNSymCache; + + const float xi = (measure == DistanceMeasure::Euclidean) + ? (d_nn1_stats[0] * d_nn1_stats[0]) * tau_build * tau_build + : d_nn1_stats[0] * tau_build; + + const KeyT n = static_cast(blockIdx.x); + + Cache cache(D, measure, KF, sorted_size, CACHE_SIZE, d_base, d_translation ? 
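+      // on upper layers, d_translation maps the layer-local index n to its
+      // base-point index; on the bottom layer (d_translation == nullptr),
+      // n already is a base index.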
d_translation[n] : n, + xi); + + __shared__ bool s_connected; + + // fetch neighbors in local neighbor list + for (uint32_t i = 0; i < KL; i += K_BLOCK) { + __shared__ KeyT s_sym_ids[K_BLOCK]; + { + const uint32_t kl = i + threadIdx.x; + if (threadIdx.x < K_BLOCK && kl < KL) { + const KeyT sym_n = d_graph[static_cast(n) * KBuild + kl]; + s_sym_ids[threadIdx.x] = sym_n; + } + } + + for (uint32_t k = 0; i + k < KL && k < K_BLOCK; k++) { + __syncthreads(); + if (!threadIdx.x) + s_connected = false; + + // search for k-th local neighbor + cache.init_start_point(s_sym_ids[k], d_translation); + + bool found_connection = false; + + for (uint32_t ite = 0; ite < MAX_PER_PATH_ITERATIONS && !found_connection; ++ite) { + __syncthreads(); + + const KeyT anchor = cache.pop(); + + if (anchor == Cache::EMPTY_KEY) { + break; + } + + // fetch neighbors at anchor point + points in sym buffer + __shared__ KeyT s_knn[K_BLOCK]; + for (uint32_t i = 0; i < KBuild; i += K_BLOCK) { + if (threadIdx.x < K_BLOCK) { + const uint32_t k = i + threadIdx.x; + if (k < KBuild) { + const KeyT other_id = (k < KL) + ? d_graph[static_cast(anchor) * KBuild + k] + : d_sym_buffer[static_cast(anchor) * KF + k - KL]; + if (other_id == n) { + s_connected = true; + } + s_knn[threadIdx.x] = other_id; + } + else { + s_knn[threadIdx.x] = Cache::EMPTY_KEY; + } + } + __syncthreads(); + if (s_connected) { + // stop if the original index n has been found as a neighbor + found_connection = true; + break; + } + cache.fetch(s_knn, d_translation, K_BLOCK); + } + } // end per k iteration + + if (!found_connection) { + // we need to add a symmetric link to the original index n + if (!threadIdx.x) { + for (uint32_t i = 0; i < KF; i++) { + // try to enter the symmetric link at the i-th nearest neighbor + // found on the path + const KeyT other_n = cache.s_cache[i]; + if (other_n == Cache::EMPTY_KEY) + break; + const uint32_t pos = atomicAdd(&d_sym_atomic[other_n], 1U); + if (pos < KF) { + d_sym_buffer[static_cast(other_n) * KF + pos] = n; + break; + } + } + // could not add a link + } + } + } // end k neighbors + __syncthreads(); + } +} + +#define GGNN_SYM(KeyT, ValueT, BaseT, BLOCK_SIZE, DIST_ITEMS_PER_THREAD) \ + template __global__ void \ + sym>( \ + const SymQueryKernel); + +GGNN_EVAL(GGNN_KEYS, GGNN_VALUES, GGNN_BASES, GGNN_SYMS, GGNN_SYM); + +}; // namespace ggnn diff --git a/src/ggnn/construction/top_merge_layer.cu b/src/ggnn/construction/top_merge_layer.cu new file mode 100644 index 0000000..3db7967 --- /dev/null +++ b/src/ggnn/construction/top_merge_layer.cu @@ -0,0 +1,91 @@ +/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
Lensch +// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf + +#include + +#include +#include + +#include +#include + +#include +#include + +#include +#include + +namespace ggnn { + +template +__global__ void __launch_bounds__(T::BLOCK_DIM_X) top(const T kernel) +{ + kernel(); +} + +template +__device__ __forceinline__ void +TopMergeKernel::operator()() const +{ + using Distance = ggnn::Distance; + using KBestList = ggnn::KBestList; + + const uint32_t n = blockIdx.x; + const KeyT m = (!layer) ? n : d_translation[n]; + + Distance distCalc(D, measure, d_base, m); + KBestList best(KBuild); + + const uint32_t S_plus_offset = S_offset * (S + 1); + const uint32_t S_actual = (!layer && n < S_plus_offset) ? S + 1 : S; + + const KeyT start = (layer || n < S_plus_offset) + ? (n / S_actual) * S_actual + : S_plus_offset + ((n - S_plus_offset) / S_actual) * S_actual; + const KeyT end = start + S_actual; + + for (KeyT other_n = start; other_n < end; other_n++) { + __syncthreads(); + const KeyT other_m = (layer) ? d_translation[other_n] : other_n; + + if (m == other_m) + continue; + ValueT dist = distCalc.distance_synced(other_m); + + best.add_unique(dist, other_n); + } + + for (uint32_t k = threadIdx.x; k < KBuild; k += BLOCK_DIM_X) { + d_graph[static_cast(n) * KBuild + k] = best.s_ids[k]; + } + if (!threadIdx.x) { + ValueT nn1_dist = best.s_dists[1]; + if (measure == DistanceMeasure::Euclidean) + nn1_dist = sqrtf(nn1_dist); + d_nn1_dist_buffer[n] = nn1_dist; + } +} + +#define GGNN_TOP_MERGE(KeyT, ValueT, BaseT, BLOCK_SIZE, DIST_ITEMS_PER_THREAD) \ + template __global__ void \ + top>( \ + const TopMergeKernel); + +GGNN_EVAL(GGNN_KEYS, GGNN_VALUES, GGNN_BASES, GGNN_TOPS, GGNN_TOP_MERGE); + +}; // namespace ggnn diff --git a/src/ggnn/construction/wrs_select_layer.cu b/src/ggnn/construction/wrs_select_layer.cu new file mode 100644 index 0000000..4b01fd2 --- /dev/null +++ b/src/ggnn/construction/wrs_select_layer.cu @@ -0,0 +1,110 @@ +/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch +// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf + +#include + +#include + +#include +#include + +#include +#include + +#include + +namespace ggnn { + +template +__global__ void __launch_bounds__(T::BLOCK_DIM_X) select(const T kernel) +{ + kernel(); +} + +/* + * Selection of K Points per B for Layers. 
+ */ +template +__device__ __forceinline__ void WRSSelectionKernel::operator()() const +{ + using BlockRadixSort = cub::BlockRadixSort; + __shared__ typename BlockRadixSort::TempStorage temp_storage; + + const uint32_t b = blockIdx.x; + + const uint32_t S_current = S + (b < S_offset); + const uint32_t start = b * S + min(b, S_offset); + + ValueT keys[ITEMS_PER_THREAD]; + KeyT values[ITEMS_PER_THREAD]; + + for (uint32_t item = 0; item < ITEMS_PER_THREAD; item++) { + const uint32_t i = item * BLOCK_DIM_X + threadIdx.x; + if (i < S_current) { + const KeyT n = start + i; + const float e = (-1 * logf(d_rng[n])) / + // the top merge kernel is configured to output the matching values for the + // current layer otherwise, we would need to translate n to the bottom layer + (d_nn1_dist_buffer[n] + std::numeric_limits::epsilon()); + keys[item] = e; + values[item] = n; + } + else { + // NOTE: if this happens, the following sym query will fail + // this is prevented by keeping the segment size (S) > the number of foreign edges (KF) + keys[item] = -1.f; + values[item] = -1; + } + } + + BlockRadixSort(temp_storage).SortDescendingBlockedToStriped(keys, values); + + // block index / growth ==> index of the upper segment + const uint32_t upper_segment = b / G; + // n-th segment contributing to the upper segment + const uint32_t nth_lower_segment = b - upper_segment * G; + + // number of points contributed by the current block + // evenly distributed between blocks as SG=S/G + the first SG_offset many blocks contribute one + // more + const uint32_t num_selected_points = SG + (nth_lower_segment < SG_offset); + + // destination for selected points + // start of upper segment + point contributed by previous blocks to this segment + const uint32_t dest = + upper_segment * Sglob + nth_lower_segment * SG + min(nth_lower_segment, SG_offset); + + __syncthreads(); + + for (uint32_t item = 0; item < ITEMS_PER_THREAD; item++) { + const uint32_t s = threadIdx.x + item * BLOCK_DIM_X; + if (s < num_selected_points) { + const KeyT n = values[item]; + + d_selection[dest + s] = n; + d_translation[dest + s] = (!layer) ? n : d_translation_layer[n]; + } + } +} + +#define GGNN_SELECT(KeyT, ValueT) \ + template __global__ void select>( \ + const WRSSelectionKernel); + +GGNN_EVAL(GGNN_KEYS, GGNN_VALUES, GGNN_SELECT); + +}; // namespace ggnn diff --git a/src/ggnn/python/nanobind.cu b/src/ggnn/python/nanobind.cu new file mode 100644 index 0000000..121d306 --- /dev/null +++ b/src/ggnn/python/nanobind.cu @@ -0,0 +1,300 @@ +/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
Lensch +// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace ggnn; + +namespace nb = nanobind; + +using namespace nb::literals; + +#if NB_VERSION_MAJOR > 1 +// shapes are signed starting with NB 2, nb::any now stands for any type, not any shape +constexpr auto any_size = -1; +#else +constexpr auto any_size = nb::any; +#endif + +using KeyT = int32_t; +using ValueT = float; + +template +using NB2DArrayTorch = nb::ndarray, nb::c_contig, nb::pytorch>; +template +using NB2DArrayCPU = nb::ndarray, nb::c_contig, nb::device::cpu>; +template +using NB2DArrayGPU = nb::ndarray, nb::c_contig, nb::device::cuda>; + +struct GlobalInit { + GlobalInit() + { + google::InitGoogleLogging("GGNN"); + google::LogToStderr(); + // google::SetVLOGLevel("*", 4); + } +}; + +static GlobalInit init{}; + +auto dataset_to_ndarray_view = + [](const Dataset& dataset) -> NB2DArrayTorch { + return NB2DArrayTorch( + dataset.data(), {dataset.N, dataset.D}, nb::handle(), {}, nb::dtype(), + dataset.isGPUAccessible() ? nb::device::cuda::value : nb::device::cpu::value, dataset.gpu_id); +}; + +auto dataset_to_ndarray = [](Dataset&& dataset) -> NB2DArrayTorch { + Dataset* reowned_data = new Dataset{std::move(dataset)}; + + const int32_t device_type = + reowned_data->isGPUAccessible() ? nb::device::cuda::value : nb::device::cpu::value; + + return NB2DArrayTorch(reowned_data->data(), {reowned_data->N, reowned_data->D}, + nb::capsule(reowned_data, + [](void* data) noexcept -> void { + Dataset* reowned_data = + reinterpret_cast*>(data); + delete reowned_data; + }), + {}, nb::dtype(), device_type, dataset.gpu_id); +}; + +auto ndarray_to_dataset = [](const NB2DArrayTorch& data) -> Dataset { + if (data.device_type() == nb::device::cpu::value) + return Dataset::referenceCPUData(data.data(), data.shape(0), data.shape(1)).clone(); + else if (data.device_type() == nb::device::cuda::value) + return Dataset::referenceGPUData(data.data(), data.shape(0), data.shape(1), data.device_id()) + .clone(); + + throw std::runtime_error("tensor given on unsupported device type."); +}; + +template +consteval const char* get_name_for_dataset_type(); + +template <> +consteval const char* get_name_for_dataset_type() +{ + return "FloatDataset"; +}; +template <> +consteval const char* get_name_for_dataset_type() +{ + return "UCharDataset"; +}; +template <> +consteval const char* get_name_for_dataset_type() +{ + return "IntDataset"; +}; + +NB_MODULE(GGNN, m) +{ + m.doc() = R"(GGNN: Graph-Based GPU Nearest Neighbor Search, +by Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch, +Computer Graphics Group University of Tübingen, +published in IEEE Transactions on Big Data, +vol. 9, no. 1, pp. 267-279, 1 Feb. 2023, +doi: 10.1109/TBDATA.2022.3161156. + +Refactored into a Python library by Lukas Ruppert, Deborah Kornwolf, +Computer Graphics Group University of Tübingen, 2025. + +https://github.com/cgtuebingen/ggnn + +GGNN performs nearest-neighbor computations on CUDA-capable GPUs. +It supports billion-scale datasets and can execute on multiple GPUs through sharding. +When using just a single GPU, data can be exchanged directly with other code +without copying through CPU memory (e.g., torch tensors). 
+)"; + + m.def("set_log_level", [](int log_level) -> void { google::SetVLOGLevel("*", log_level); }); + + nb::enum_(m, "DistanceMeasure") + .value("Euclidean", DistanceMeasure::Euclidean) + .value("Cosine", DistanceMeasure::Cosine); + +#define DATASET_CLASS(T) \ + nb::class_>(m, get_name_for_dataset_type()) \ + .def("__init__", \ + [](Dataset* new_dataset, const NB2DArrayTorch& data) { \ + new (new_dataset) Dataset{ndarray_to_dataset(data)}; \ + }) \ + .def_static("load", &Dataset::load, "file"_a, "from"_a = 0, \ + "num"_a = std::numeric_limits::max(), "pin_memory"_a = false) \ + .def("store", &Dataset::store, "file"_a) \ + .def_prop_ro("N", [](const Dataset& data) -> uint64_t { return data.N; }) \ + .def_prop_ro("D", [](const Dataset& data) -> uint64_t { return data.D; }) \ + .def("numel", [](const Dataset& data) -> size_t { return data.numel(); }) \ + .def("clone", \ + [](const Dataset& data) -> NB2DArrayTorch { \ + return dataset_to_ndarray(data.clone()); \ + }) \ + .def_prop_ro("view", [](const Dataset& data) { return dataset_to_ndarray_view(data); }) \ + .def_prop_ro("device", [](const Dataset& data) -> std::string { \ + return data.isGPUAccessible() ? "cuda:" + std::to_string(data.gpu_id) : "cpu"; \ + }); \ + \ + nb::implicitly_convertible, Dataset>(); \ + nb::implicitly_convertible, Dataset>(); + + GGNN_EVAL(GGNN_BASES, DATASET_CLASS); + GGNN_EVAL(GGNN_KEYS, DATASET_CLASS); + + nb::class_>(m, "GGNN") + .def(nb::init<>()) + // set base + // .def("set_base", &GGNN::setBase, "base"_a) + .def( + "set_base", + [](GGNN& ggnn, Dataset&& base) { ggnn.setBase(std::move(base)); }, + "base"_a) + .def( + "set_base", + [](GGNN& ggnn, Dataset&& base) { ggnn.setBase(std::move(base)); }, + "base"_a) + .def("set_working_directory", &GGNN::setWorkingDirectory, "dir"_a) + .def("set_cpu_memory_limit", &GGNN::setCPUMemoryLimit, "memory_limit"_a) + .def( + "set_gpus", + [](GGNN& ggnn, const std::vector& gpu_ids) -> void { + ggnn.setGPUs(gpu_ids); + }, + "gpu_ids"_a) + .def("set_shard_size", &GGNN::setShardSize, "n_shard"_a) + .def("set_return_results_on_gpu", &GGNN::setReturnResultsOnGPU, + "return_results_on_gpu"_a = true) + + // graph construction + .def("build", &GGNN::build, "k_build"_a, "tau_build"_a, + "refinement_iterations"_a = 2, "measure"_a = DistanceMeasure::Euclidean, + "Build a GGNN graph.") + .def("load", &GGNN::load, "k_build"_a, "Load a GGNN graph.") + .def("store", &GGNN::store, "Store a GGNN graph.") + + // run queries + .def( + "query", + [](GGNN& ggnn, const Dataset& query, const uint32_t KQuery, + const float tau_query, const uint32_t max_iterations, const DistanceMeasure measure) { + Results results = ggnn.query(query, KQuery, tau_query, max_iterations, measure); + + return std::make_tuple<>(dataset_to_ndarray(std::move(results.ids)), + dataset_to_ndarray(std::move(results.dists))); + }, + "query"_a, "k_query"_a, "tau_query"_a, "max_iterations"_a = 400, + "measure"_a = DistanceMeasure::Euclidean, "Run a query and return indices and distances.") + .def( + "query", + [](GGNN& ggnn, const Dataset& query, const uint32_t KQuery, + const float tau_query, const uint32_t max_iterations, const DistanceMeasure measure) { + Results results = ggnn.query(query, KQuery, tau_query, max_iterations, measure); + + return std::make_tuple<>(dataset_to_ndarray(std::move(results.ids)), + dataset_to_ndarray(std::move(results.dists))); + }, + "query"_a, "k_query"_a, "tau_query"_a, "max_iterations"_a = 400, + "measure"_a = DistanceMeasure::Euclidean, "Run a query and return indices and distances.") + .def( 
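+          // Usage sketch of the C++ API these bindings forward to (call order
+          // as in the lambdas above; file name and parameter values are
+          // hypothetical):
+          //   GGNN ggnn;
+          //   ggnn.setBase(Dataset<float>::load("base.fvecs"));
+          //   ggnn.build(/*KBuild=*/24, /*tau_build=*/0.5f);
+          //   Results res = ggnn.query(query, /*KQuery=*/10, /*tau_query=*/0.5f,
+          //                            /*max_iterations=*/400,
+          //                            DistanceMeasure::Euclidean);
+          //   // res.ids / res.dists hold the top-10 neighbors per query point.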
+      .def(
+          "bf_query",
+          [](GGNN& ggnn, const Dataset<float>& query, const uint32_t KGT,
+             const DistanceMeasure measure) {
+            Results results = ggnn.bfQuery(query, KGT, measure);
+
+            return std::make_tuple<>(dataset_to_ndarray(std::move(results.ids)),
+                                     dataset_to_ndarray(std::move(results.dists)));
+          },
+          "query"_a, "k_gt"_a = 100, "measure"_a = DistanceMeasure::Euclidean,
+          "Run a brute-force query and return indices and distances.")
+      .def(
+          "bf_query",
+          [](GGNN& ggnn, const Dataset<uint8_t>& query, const uint32_t KGT,
+             const DistanceMeasure measure) {
+            Results results = ggnn.bfQuery(query, KGT, measure);
+
+            return std::make_tuple<>(dataset_to_ndarray(std::move(results.ids)),
+                                     dataset_to_ndarray(std::move(results.dists)));
+          },
+          "query"_a, "k_gt"_a = 100, "measure"_a = DistanceMeasure::Euclidean,
+          "Run a brute-force query and return indices and distances.")
+
+      // access the graph
+      .def("get_graph", &GGNN::getGraph, "on_gpu_shard_id"_a = 0,
+           "Access the GGNN graph.", nb::rv_policy::reference_internal)
+      .doc() =
+      "GGNN main class. Provides functionality for building, loading, storing, and querying "
+      "nearest neighbor graphs on the GPU.";
+
+  nb::class_<Evaluator>(m, "Evaluator")
+      .def(nb::init<const Dataset<float>&, const Dataset<float>&, const Dataset<int32_t>&,
+                    const uint32_t, const DistanceMeasure>(),
+           "base"_a, "query"_a, "gt"_a, "k_query"_a, "measure"_a = DistanceMeasure::Euclidean)
+      .def(nb::init<const Dataset<uint8_t>&, const Dataset<uint8_t>&, const Dataset<int32_t>&,
+                    const uint32_t, const DistanceMeasure>(),
+           "base"_a, "query"_a, "gt"_a, "k_query"_a, "measure"_a = DistanceMeasure::Euclidean)
+      .def("evaluate_results", &Evaluator::evaluateResults, "results"_a,
+           "Evaluate the accuracy of a query result.");
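+  // Note: assuming the usual GGNN notation, c1/c_k_query below map to the C@1 and
+  // C@KQuery completeness metrics and r_k_query to the R@KQuery recall reported by
+  // the original GGNN examples; the *_dup variants additionally count duplicates.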
+  nb::class_<Evaluation>(m, "Evaluation")
+      //.def(nb::init<>())
+      .def_rw("k_query", &Evaluation::KQuery)
+      .def_rw("c1", &Evaluation::c1)
+      .def_rw("c1_dup", &Evaluation::c1_dup)
+      .def_rw("c_k_query", &Evaluation::cKQuery)
+      .def_rw("c_k_query_dup", &Evaluation::cKQuery_dup)
+      .def_rw("r_k_query", &Evaluation::rKQuery)
+      .def_rw("r_k_query_dup", &Evaluation::rKQuery_dup)
+      .def("__repr__", [](const Evaluation& eval) -> std::string {
+        std::stringstream ss;
+        ss << eval;
+        return ss.str();
+      });
+
+  nb::class_<Graph>(m, "Graph")
+      //.def(nb::init<>())
+      .def_ro("graph", &Graph::graph)
+      .def_ro("selection", &Graph::selection)
+      .def_ro("translation", &Graph::translation)
+      .def_ro("nn1_stats", &Graph::nn1_stats);
+}
diff --git a/src/ggnn/query/bf_query_layer.cu b/src/ggnn/query/bf_query_layer.cu
new file mode 100644
index 0000000..9ca6610
--- /dev/null
+++ b/src/ggnn/query/bf_query_layer.cu
@@ -0,0 +1,79 @@
+/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch
+// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf
+
+#include
+
+#include
+
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+
+namespace ggnn {
+
+template <typename T>
+__global__ void __launch_bounds__(T::BLOCK_DIM_X) bf_query(const T kernel)
+{
+  kernel();
+}
+
+template <typename KeyT, typename ValueT, typename BaseT, uint32_t BLOCK_DIM_X, bool WRITE_DISTS>
+__device__ __forceinline__ void
+BruteForceQueryKernel<KeyT, ValueT, BaseT, BLOCK_DIM_X, WRITE_DISTS>::operator()() const
+{
+  static constexpr uint32_t K_BLOCK = 32;
+
+  using Distance = Distance<KeyT, ValueT, BaseT, BLOCK_DIM_X>;
+  using KBestList = KBestList<KeyT, ValueT, BLOCK_DIM_X>;
+
+  const KeyT n = static_cast<KeyT>(blockIdx.x);
+
+  Distance distCalc(D, measure, d_base, d_query, n);
+  KBestList best(KQuery);
+  __syncthreads();
+
+  for (KeyT i = 0; i < N_base; ++i) {
+    // fetch the entire base, one by one
+    ValueT dist = distCalc.distance_synced(i);
+    if (dist < best.worst())  // should be faster than checking all elements
+      best.add_unique(dist, i);
+  }
+  __syncthreads();
+
+  for (uint32_t i = 0; i < KQuery; i += K_BLOCK) {
+    const uint32_t k = i + threadIdx.x;
+    if (k < KQuery) {
+      d_query_results[static_cast<size_t>(n) * KQuery + k] = best.s_ids[k];
+      if constexpr (WRITE_DISTS)
+        d_query_results_dists[static_cast<size_t>(n) * KQuery + k] = best.s_dists[k];
+    }
+  }
+}
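+// The GGNN_BF_QUERY macro below emits one explicit instantiation of the bf_query
+// kernel wrapper per (KeyT, ValueT, BaseT, BLOCK_DIM_X, WRITE_DISTS) combination
+// enumerated by the GGNN_* lists, so all supported variants are compiled ahead of time.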
+#define GGNN_BF_QUERY(KeyT, ValueT, BaseT, BLOCK_DIM_X, WRITE_DISTS)              \
+  template __global__ void                                                        \
+  bf_query<BruteForceQueryKernel<KeyT, ValueT, BaseT, BLOCK_DIM_X, WRITE_DISTS>>( \
+      const BruteForceQueryKernel<KeyT, ValueT, BaseT, BLOCK_DIM_X, WRITE_DISTS>);
+
+GGNN_EVAL(GGNN_KEYS, GGNN_VALUES, GGNN_BASES, GGNN_QUERYS, GGNN_WRITE_DISTS, GGNN_BF_QUERY);
+
+};  // namespace ggnn
diff --git a/src/ggnn/query/query_kernels.cu b/src/ggnn/query/query_kernels.cu
new file mode 100644
index 0000000..eaaf9f7
--- /dev/null
+++ b/src/ggnn/query/query_kernels.cu
@@ -0,0 +1,275 @@
+/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch
+// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+
+#include
+
+#include
+#include
+#include
+
+namespace ggnn {
+
+template <typename KeyT, typename ValueT, typename BaseT>
+class QueryKernelsImpl : public QueryKernels<KeyT, ValueT, BaseT> {
+ public:
+  using GPUInstance = ggnn::GPUInstance<KeyT, ValueT, BaseT>;
+  using Results = ggnn::Results<KeyT, ValueT>;
+
+  QueryKernelsImpl(const DistanceMeasure measure) : measure{measure} {}
+
+ private:
+  const DistanceMeasure measure;
+
+  virtual void query(const GPUInstance& gpu_instance, const uint32_t shard_id,
+                     const Dataset<BaseT>& query, const uint32_t KQuery, const uint32_t max_iters,
+                     const float tau_query, Results& results) override
+  {
+    // number of cache entries per thread
+    static constexpr uint32_t CACHE_ITEMS_PER_THREAD = 16;
+    // number of query dimensions cached per thread in the distance computation
+    static constexpr uint32_t DIST_ITEMS_PER_THREAD = 4;
+    // CUDA warp size
+    static constexpr uint32_t WARP_SIZE = 32;
+    // CUDA block size limit
+    static constexpr uint32_t MAX_BLOCK_DIM_X = 1024;
+
+    static constexpr uint32_t MIN_PRIOQ_SIZE = 16;
+    static constexpr uint32_t MIN_VISITED_SIZE = 32;
+    static constexpr uint32_t MIN_CACHE_SIZE = 256;
+    // making this larger would exceed the 48 KiB shared memory limit
+    static constexpr uint32_t MAX_K_QUERY = 6000;
+    // the next larger power of two would exceed the 48 KiB shared memory limit
+    static constexpr uint32_t MAX_CACHE_SIZE = 8192;
+
+    static constexpr uint32_t MAX_ITERS =
+        std::min(MAX_BLOCK_DIM_X * CACHE_ITEMS_PER_THREAD, MAX_CACHE_SIZE);
+    static constexpr uint32_t MAX_DIM = MAX_BLOCK_DIM_X * DIST_ITEMS_PER_THREAD;
+
+    CHECK_LE(KQuery, MAX_K_QUERY);
+
+    const uint32_t required_sorted_size = next_multiple(KQuery + 1 + MIN_PRIOQ_SIZE);
+
+    const uint32_t cache_size =
+        std::max({MIN_CACHE_SIZE, required_sorted_size + MIN_VISITED_SIZE, bit_ceil(max_iters)});
+    /** number of threads required for tracking visited points
+     * (ignoring cache entries used for best list and sorted size)
+     * (iterations beyond that length typically result in cycles)
+     *  512 ==>  32 threads
+     * 1024 ==>  64 threads
+     * 2048 ==> 128 threads
+     * 4096 ==> 256 threads
+     * 8192 ==> 512 threads
+     */
+    const uint32_t cache_size_block_dim_x =
+        bit_ceil((cache_size + CACHE_ITEMS_PER_THREAD - 1) / CACHE_ITEMS_PER_THREAD);
+    /** number of threads required for processing data with a certain dimension:
+     *  128D ==>   32 threads
+     *  256D ==>   64 threads
+     *  512D ==>  128 threads
+     * 1024D ==>  256 threads
+     * 2048D ==>  512 threads
+     * 4096D ==> 1024 threads
+     */
+    const uint32_t dimension_block_dim_x =
+        bit_ceil((query.D + DIST_ITEMS_PER_THREAD - 1) / DIST_ITEMS_PER_THREAD);
+    const uint32_t block_dim_x =
+        std::max({WARP_SIZE, cache_size_block_dim_x, dimension_block_dim_x});
+
+    CHECK_LE(max_iters, MAX_ITERS);
+    CHECK_LE(query.D, MAX_DIM);
+    CHECK_LE(cache_size, MAX_CACHE_SIZE);
+    CHECK_LE(block_dim_x, MAX_BLOCK_DIM_X);
+
+    const uint32_t sorted_size = std::max(cache_size < 512U ? 64U : 32U, required_sorted_size);
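+    // Illustrative sizing (assuming next_multiple() rounds up to a multiple of 32):
+    // KQuery = 10, max_iters = 400, D = 128  =>  required_sorted_size = 32,
+    // cache_size = max(256, 64, 512) = 512, block_dim_x = max(32, 32, 32) = 32,
+    // and sorted_size = 32.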
+    gpu_instance.gpu_ctx.activate();
+    const uint32_t on_gpu_shard_id =
+        shard_id - gpu_instance.shard_config.num_shards * gpu_instance.shard_config.device_index;
+    const auto& gpu_buffer = gpu_instance.getGPUGraphBuffer(on_gpu_shard_id);
+    const auto& gpu_base_buffer = gpu_instance.getGPUBaseBuffer(on_gpu_shard_id);
+    const auto& graph = gpu_buffer.graph;
+    const Dataset<BaseT>& base = gpu_base_buffer.base;
+    const cudaStream_t stream = gpu_buffer.stream.get();
+
+    static constexpr bool WRITE_DISTS = true;
+    static constexpr bool DIST_STATS = false;
+
+    uint32_t* m_dist_statistics = nullptr;
+
+    if constexpr (DIST_STATS)
+      cudaMallocAsync(&m_dist_statistics, query.N * sizeof(uint32_t), stream);
+
+    CHECK_LT(base.N, std::numeric_limits<KeyT>::max());
+
+    auto run_query = [&]<uint32_t BLOCK_DIM_X>() {
+      using QueryKernel =
+          ggnn::QueryKernel<KeyT, ValueT, BaseT, BLOCK_DIM_X, WRITE_DISTS, DIST_STATS>;
+
+      QueryKernel query_kernel{
+          .D = query.D,
+          .measure = measure,
+          .KQuery = KQuery,
+          .sorted_size = sorted_size,
+          .cache_size = cache_size,
+          .tau_query = tau_query,
+          .max_iterations = max_iters,
+          .N_base = static_cast<KeyT>(base.N),
+          .KBuild = gpu_instance.graph_config.KBuild,
+          .num_starting_points = gpu_instance.graph_config.S,
+          .d_base = base.data(),
+          .d_query = query.data(),
+          .d_graph = graph.graph[0].data(),
+          .d_starting_points = graph.translation[GraphConfig::L - 1].data(),
+          .d_nn1_stats = graph.nn1_stats.data(),
+          .d_query_results = results.ids.data(),
+          .d_query_results_dists = results.dists.data(),
+          .d_dist_stats = m_dist_statistics,
+          .shards_per_gpu = gpu_instance.shard_config.num_shards,
+          .on_gpu_shard_id = on_gpu_shard_id,
+      };
+
+      query_kernel.launch(query.N, stream);
+    };
+
+    switch (block_dim_x) {
+      case 32:
+        run_query.template operator()<32>();
+        break;
+      case 64:
+        run_query.template operator()<64>();
+        break;
+      case 128:
+        run_query.template operator()<128>();
+        break;
+      case 256:
+        run_query.template operator()<256>();
+        break;
+      case 512:
+        run_query.template operator()<512>();
+        break;
+      case 1024:
+        run_query.template operator()<1024>();
+        break;
+      default:
+        LOG(DFATAL) << "The query has not been compiled for BLOCK_DIM_X == " << block_dim_x << ".";
+    }
+
+    // free the (optional) distance statistics only after the kernel has been enqueued,
+    // so that the stream-ordered free cannot precede the kernel launch
+    if constexpr (DIST_STATS)
+      CHECK_CUDA(cudaFreeAsync(m_dist_statistics, stream));
+  }
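+  // The kernels are compiled for a fixed set of BLOCK_DIM_X values; the switch above
+  // (and the one in bruteForceQuery() below) selects the matching pre-compiled
+  // instantiation for the runtime block size via the templated lambda.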
+  virtual void bruteForceQuery(const Dataset<BaseT>& base, const Dataset<BaseT>& query,
+                               const uint32_t KQuery, Results& results,
+                               cudaStream_t stream) override
+  {
+    // number of query dimensions cached per thread in the distance computation
+    static constexpr uint32_t DIST_ITEMS_PER_THREAD = 4;
+    // CUDA warp size
+    static constexpr uint32_t WARP_SIZE = 32;
+    // CUDA block size limit
+    static constexpr uint32_t MAX_BLOCK_DIM_X = 1024;
+    static constexpr uint32_t MAX_DIM = MAX_BLOCK_DIM_X * DIST_ITEMS_PER_THREAD;
+    // making this larger would exceed the 48 KiB shared memory limit
+    static constexpr uint32_t MAX_K_QUERY = 6000;
+
+    CHECK_LE(KQuery, MAX_K_QUERY);
+
+    /** number of threads required for processing data with a certain dimension:
+     *  128D ==>   32 threads
+     *  256D ==>   64 threads
+     *  512D ==>  128 threads
+     * 1024D ==>  256 threads
+     * 2048D ==>  512 threads
+     * 4096D ==> 1024 threads
+     */
+    const uint32_t dimension_block_dim_x =
+        bit_ceil((query.D + DIST_ITEMS_PER_THREAD - 1) / DIST_ITEMS_PER_THREAD);
+    const uint32_t block_dim_x = std::max({WARP_SIZE, dimension_block_dim_x});
+
+    CHECK_LE(query.D, MAX_DIM);
+    CHECK_LE(block_dim_x, MAX_BLOCK_DIM_X);
+
+    static constexpr bool WRITE_DISTS = true;
+
+    CHECK_LT(base.N, std::numeric_limits<KeyT>::max());
+
+    auto run_bf_query = [&]<uint32_t BLOCK_DIM_X>() {
+      using BFQueryKernel =
+          ggnn::BruteForceQueryKernel<KeyT, ValueT, BaseT, BLOCK_DIM_X, WRITE_DISTS>;
+
+      BFQueryKernel query_kernel{
+          .D = query.D,
+          .measure = measure,
+          .KQuery = KQuery,
+          .N_base = static_cast<KeyT>(base.N),  // this applies to potential subsets
+          .d_base = base.data(),
+          .d_query = query.data(),
+          .d_query_results = results.ids.data(),
+          .d_query_results_dists = results.dists.data(),
+      };
+
+      query_kernel.launch(query.N, stream);
+    };
+
+    switch (block_dim_x) {
+      case 32:
+        run_bf_query.template operator()<32>();
+        break;
+      case 64:
+        run_bf_query.template operator()<64>();
+        break;
+      case 128:
+        run_bf_query.template operator()<128>();
+        break;
+      case 256:
+        run_bf_query.template operator()<256>();
+        break;
+      case 512:
+        run_bf_query.template operator()<512>();
+        break;
+      case 1024:
+        run_bf_query.template operator()<1024>();
+        break;
+      default:
+        LOG(DFATAL) << "The brute-force query has not been compiled for BLOCK_DIM_X == "
+                    << block_dim_x << ".";
+    }
+  }
+};
+
+template <typename KeyT, typename ValueT, typename BaseT>
+QueryKernels<KeyT, ValueT, BaseT>::QueryKernels(const DistanceMeasure measure)
+{
+  pimpl.reset(new QueryKernelsImpl<KeyT, ValueT, BaseT>{measure});
+}
+
+GGNN_EVAL(GGNN_KEYS, GGNN_VALUES, GGNN_BASES, GGNN_INSTANTIATE_CLASS, QueryKernels);
+GGNN_EVAL(GGNN_KEYS, GGNN_VALUES, GGNN_BASES, GGNN_INSTANTIATE_CLASS, QueryKernelsImpl);
+};  // namespace ggnn
diff --git a/src/ggnn/query/query_layer.cu b/src/ggnn/query/query_layer.cu
new file mode 100644
index 0000000..8eb2dd8
--- /dev/null
+++ b/src/ggnn/query/query_layer.cu
@@ -0,0 +1,107 @@
+/* Copyright 2025 ComputerGraphics Tuebingen. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch
+// converted to GGNN library by: Lukas Ruppert, Deborah Kornwolf
+
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+
+namespace ggnn {
+
+template <typename T>
+__global__ void __launch_bounds__(T::BLOCK_DIM_X) query(const T kernel)
+{
+  kernel();
+}
+
+template <typename KeyT, typename ValueT, typename BaseT, uint32_t BLOCK_DIM_X,
+          bool WRITE_DISTS, bool DIST_STATS>
+__device__ __forceinline__ void
+QueryKernel<KeyT, ValueT, BaseT, BLOCK_DIM_X, WRITE_DISTS, DIST_STATS>::operator()() const
+{
+  static constexpr uint32_t K_BLOCK = 32;
+
+  using Cache = SimpleKNNCache<KeyT, ValueT, BaseT, BLOCK_DIM_X, DIST_STATS>;
+
+  const float xi = (measure == DistanceMeasure::Euclidean)
+                       ? (d_nn1_stats[1] * d_nn1_stats[1]) * tau_query * tau_query
+                       : d_nn1_stats[1] * tau_query;
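+  // tau_query scales the search/stopping radius xi relative to the graph's 1-NN
+  // distance statistic d_nn1_stats[1]; for the Euclidean measure both factors enter
+  // squared, matching the squared distances used during the search.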
+  const KeyT n = static_cast<KeyT>(blockIdx.x);
+
+  Cache cache(D, measure, KQuery, sorted_size, cache_size, d_base, d_query, n, xi);
+  cache.fetch_unfiltered(d_starting_points, nullptr, num_starting_points);
+
+  for (uint32_t ite = 0; ite < max_iterations; ++ite) {
+    if (measure == DistanceMeasure::Euclidean) {
+      cache.r_xi = min(xi, cache.s_dists[0] * tau_query * tau_query);
+    }
+    else if (measure == DistanceMeasure::Cosine) {
+      cache.r_xi = min(xi, cache.s_dists[0] * tau_query);
+    }
+
+    const KeyT anchor = cache.pop();
+    if (anchor == Cache::EMPTY_KEY)
+      break;
+
+    __shared__ KeyT s_knn[K_BLOCK];
+    for (uint32_t i = 0; i < KBuild; i += K_BLOCK) {
+      if (threadIdx.x < K_BLOCK) {
+        s_knn[threadIdx.x] = (i + threadIdx.x < KBuild)
+                                 ? d_graph[static_cast<size_t>(anchor) * KBuild + i + threadIdx.x]
+                                 : Cache::EMPTY_KEY;
+      }
+      cache.fetch(s_knn, nullptr, K_BLOCK);
+    }
+  }  // end iterations
+
+  __syncthreads();
+  cache.write_best(d_query_results, n * shards_per_gpu + on_gpu_shard_id, KQuery,
+                   on_gpu_shard_id * N_base);
+
+  if constexpr (WRITE_DISTS) {
+#pragma unroll
+    for (uint32_t k = threadIdx.x; k < KQuery; k += BLOCK_DIM_X) {
+      d_query_results_dists[(n * shards_per_gpu + on_gpu_shard_id) * KQuery + k] = cache.s_dists[k];
+    }
+  }
+
+  if constexpr (DIST_STATS) {
+    if (!threadIdx.x) {
+      d_dist_stats[n] = cache.get_dist_stats();
+    }
+  }
+}
+
+#define GGNN_QUERY(KeyT, ValueT, BaseT, BLOCK_DIM_X, WRITE_DISTS, DIST_STATS)           \
+  template __global__ void                                                              \
+  query<QueryKernel<KeyT, ValueT, BaseT, BLOCK_DIM_X, WRITE_DISTS, DIST_STATS>>(        \
+      const QueryKernel<KeyT, ValueT, BaseT, BLOCK_DIM_X, WRITE_DISTS, DIST_STATS>);
+
+GGNN_EVAL(GGNN_KEYS, GGNN_VALUES, GGNN_BASES, GGNN_QUERYS, GGNN_WRITE_DISTS, GGNN_DIST_STATS,
+          GGNN_QUERY);
+
+};  // namespace ggnn
diff --git a/src/gist.cu b/src/gist.cu
deleted file mode 100644
index 2cd1aa6..0000000
--- a/src/gist.cu
+++ /dev/null
@@ -1,156 +0,0 @@
-/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A.
Lensch -// -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "ggnn/cuda_knn_ggnn.cuh" -#include "ggnn/utils/cuda_knn_constants.cuh" -#include "ggnn/utils/cuda_knn_dataset.cuh" -#include "ggnn/utils/cuda_knn_utils.cuh" - -DEFINE_string(base_filename, "", "path to file with base vectors"); -DEFINE_string(query_filename, "", "path to file with perform_query vectors"); -DEFINE_string(groundtruth_filename, "", - "path to file with groundtruth vectors"); -DEFINE_string(graph_filename, "", - "path to file that contains the serialized graph"); -DEFINE_double(tau, 0.5, "Parameter tau"); -DEFINE_int32(refinement_iterations, 2, "Number of refinement iterations"); -DEFINE_int32(gpu_id, 0, "GPU id"); -DEFINE_bool(grid_search, false, - "Perform queries for a wide range of parameters."); - -int main(int argc, char* argv[]) { - google::InitGoogleLogging(argv[0]); - google::LogToStderr(); - - gflags::SetUsageMessage( - "GGNN: Graph-based GPU Nearest Neighbor Search\n" - "by Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. " - "Lensch\n" - "(c) 2020 Computer Graphics University of Tuebingen"); - gflags::SetVersionString("1.0.0"); - google::ParseCommandLineFlags(&argc, &argv, true); - - LOG(INFO) << "Reading files"; - CHECK(file_exists(FLAGS_base_filename)) - << "File for base vectors has to exist"; - CHECK(file_exists(FLAGS_query_filename)) - << "File for perform_query vectors has to exist"; - - CHECK_GE(FLAGS_tau, 0) << "Tau has to be bigger or equal 0."; - CHECK_GE(FLAGS_refinement_iterations, 0) - << "The number of refinement iterations has to be non-negative."; - - // #################################################################### - // compile-time configuration - // - // data types - // - /// data type for addressing points (needs to be able to represent N) - using KeyT = int32_t; - /// data type of the dataset (e.g., char, int, float) - using BaseT = float; - /// data type of computed distances - using ValueT = float; - /// data type for addressing base-vectors (needs to be able to represent N*D) - using BAddrT = uint32_t; - /// data type for addressing the graph (needs to be able to represent - /// N*KBuild) - using GAddrT = uint32_t; - // - // dataset configuration (here: GIST) - // - /// dimension of the dataset - const int D = 960; - /// distance measure (Euclidean or Cosine) - const DistanceMeasure measure = Euclidean; - // - // search-graph configuration - // - /// number of neighbors per point in the graph - const int KBuild = 96; - /// maximum number of inverse/symmetric links (KBuild / 2 usually works best) - const int KF = KBuild / 2; - /// segment/batch size (needs to be > KBuild-KF) - const int S = 64; - /// graph height / number of layers (4 usually performs best) - const int L = 4; - // - // query configuration - // - /// number of neighbors to search for - const int KQuery = 10; - - static_assert(KBuild - KF < S, - "there are not enough points to fill the local neighbor list!"); - - LOG(INFO) << "Using the following parameters " << KBuild << " (KBuild) " << KF - << " (KF) " << S << " (S) " << L << " (L) " << D << " (D) "; - - // Set the requested GPU id, if possible. 
- { - int numGpus; - cudaGetDeviceCount(&numGpus); - CHECK_GE(FLAGS_gpu_id, 0) << "This GPU does not exist"; - CHECK_LT(FLAGS_gpu_id, numGpus) << "This GPU does not exist"; - - cudaDeviceProp prop; - cudaGetDeviceProperties(&prop, FLAGS_gpu_id); - LOG(INFO) << "device name: " << prop.name; - } - cudaSetDevice(FLAGS_gpu_id); - - typedef GGNN - GGNN; - GGNN m_ggnn{FLAGS_base_filename, FLAGS_query_filename, - FLAGS_groundtruth_filename, L, static_cast(FLAGS_tau)}; - - m_ggnn.ggnnMain(FLAGS_graph_filename, FLAGS_refinement_iterations); - - auto query_function = [&m_ggnn](const float tau_query) { - cudaMemcpyToSymbol(c_tau_query, &tau_query, sizeof(float)); - LOG(INFO) << "--"; - LOG(INFO) << "Query with tau_query " << tau_query; - - m_ggnn.queryLayer<128, 2000, 2048, 32>(); - // m_ggnn.queryLayer<32, 400, 448, 64>(); - }; - - if (FLAGS_grid_search) { - LOG(INFO) << "--"; - LOG(INFO) << "grid-search:"; - for (int i = 1; i <= 100; ++i) query_function(i * 0.01f); - } else { // by default, just execute a few queries - LOG(INFO) << "--"; - LOG(INFO) << "90, 95, 99% R@1 (using -tau 0.5 -refinement_iterations 2):"; - query_function(0.09f); - query_function(0.12f); - query_function(0.18f); - } - - printf("done! \n"); - gflags::ShutDownCommandLineFlags(); - return 0; -} diff --git a/src/glove200.cu b/src/glove200.cu deleted file mode 100644 index 221c3f3..0000000 --- a/src/glove200.cu +++ /dev/null @@ -1,155 +0,0 @@ -/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch -// -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "ggnn/cuda_knn_ggnn.cuh" -#include "ggnn/utils/cuda_knn_constants.cuh" -#include "ggnn/utils/cuda_knn_dataset.cuh" -#include "ggnn/utils/cuda_knn_utils.cuh" - -DEFINE_string(base_filename, "", "path to file with base vectors"); -DEFINE_string(query_filename, "", "path to file with perform_query vectors"); -DEFINE_string(groundtruth_filename, "", - "path to file with groundtruth vectors"); -DEFINE_string(graph_filename, "", - "path to file that contains the serialized graph"); -DEFINE_double(tau, 0.5, "Parameter tau"); -DEFINE_int32(refinement_iterations, 2, "Number of refinement iterations"); -DEFINE_int32(gpu_id, 0, "GPU id"); -DEFINE_bool(grid_search, false, - "Perform queries for a wide range of parameters."); - -int main(int argc, char* argv[]) { - google::InitGoogleLogging(argv[0]); - google::LogToStderr(); - - gflags::SetUsageMessage( - "GGNN: Graph-based GPU Nearest Neighbor Search\n" - "by Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
" - "Lensch\n" - "(c) 2020 Computer Graphics University of Tuebingen"); - gflags::SetVersionString("1.0.0"); - google::ParseCommandLineFlags(&argc, &argv, true); - - LOG(INFO) << "Reading files"; - CHECK(file_exists(FLAGS_base_filename)) - << "File for base vectors has to exist"; - CHECK(file_exists(FLAGS_query_filename)) - << "File for perform_query vectors has to exist"; - - CHECK_GE(FLAGS_tau, 0) << "Tau has to be bigger or equal 0."; - CHECK_GE(FLAGS_refinement_iterations, 0) - << "The number of refinement iterations has to be non-negative."; - - // #################################################################### - // compile-time configuration - // - // data types - // - /// data type for addressing points (needs to be able to represent N) - using KeyT = int32_t; - /// data type of the dataset (e.g., char, int, float) - using BaseT = float; - /// data type of computed distances - using ValueT = float; - /// data type for addressing base-vectors (needs to be able to represent N*D) - using BAddrT = uint32_t; - /// data type for addressing the graph (needs to be able to represent - /// N*KBuild) - using GAddrT = uint32_t; - // - // dataset configuration (here: GloVe 200) - // - /// dimension of the dataset - const int D = 200; - /// distance measure (Euclidean or Cosine) - const DistanceMeasure measure = Cosine; - // - // search-graph configuration - // - /// number of neighbors per point in the graph - const int KBuild = 96; - /// maximum number of inverse/symmetric links (KBuild / 2 usually works best) - const int KF = KBuild / 2; - /// segment/batch size (needs to be > KBuild-KF) - const int S = 64; - /// graph height / number of layers (4 usually performs best) - const int L = 4; - // - // query configuration - // - /// number of neighbors to search for - const int KQuery = 10; - - static_assert(KBuild - KF < S, - "there are not enough points to fill the local neighbor list!"); - - LOG(INFO) << "Using the following parameters " << KBuild << " (KBuild) " << KF - << " (KF) " << S << " (S) " << L << " (L) " << D << " (D) "; - - // Set the requested GPU id, if possible. - { - int numGpus; - cudaGetDeviceCount(&numGpus); - CHECK_GE(FLAGS_gpu_id, 0) << "This GPU does not exist"; - CHECK_LT(FLAGS_gpu_id, numGpus) << "This GPU does not exist"; - - cudaDeviceProp prop; - cudaGetDeviceProperties(&prop, FLAGS_gpu_id); - LOG(INFO) << "device name: " << prop.name; - } - cudaSetDevice(FLAGS_gpu_id); - - typedef GGNN - GGNN; - GGNN m_ggnn{FLAGS_base_filename, FLAGS_query_filename, - FLAGS_groundtruth_filename, L, static_cast(FLAGS_tau)}; - - m_ggnn.ggnnMain(FLAGS_graph_filename, FLAGS_refinement_iterations); - - auto query_function = [&m_ggnn](const float tau_query) { - cudaMemcpyToSymbol(c_tau_query, &tau_query, sizeof(float)); - LOG(INFO) << "--"; - LOG(INFO) << "Query with tau_query " << tau_query; - - m_ggnn.queryLayer<128, 2000, 2048, 32>(); - }; - - if (FLAGS_grid_search) { - LOG(INFO) << "--"; - LOG(INFO) << "grid-search:"; - for (int i = 1; i <= 100; ++i) query_function(i * 0.01f); - } else { // by default, just execute a few queries - LOG(INFO) << "--"; - LOG(INFO) << "90, 95, 99% R@1 (using -tau 0.5 -refinement_iterations 2):"; - query_function(0.09f); - query_function(0.12f); - query_function(0.18f); - } - - printf("done! \n"); - gflags::ShutDownCommandLineFlags(); - return 0; -} diff --git a/src/nytimes.cu b/src/nytimes.cu deleted file mode 100644 index efca798..0000000 --- a/src/nytimes.cu +++ /dev/null @@ -1,161 +0,0 @@ -/* Copyright 2019 ComputerGraphics Tuebingen. 
All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch -// -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "ggnn/cuda_knn_ggnn.cuh" -#include "ggnn/utils/cuda_knn_constants.cuh" -#include "ggnn/utils/cuda_knn_dataset.cuh" -#include "ggnn/utils/cuda_knn_utils.cuh" - -DEFINE_string(base_filename, "", "path to file with base vectors"); -DEFINE_string(query_filename, "", "path to file with perform_query vectors"); -DEFINE_string(groundtruth_filename, "", - "path to file with groundtruth vectors"); -DEFINE_string(graph_filename, "", - "path to file that contains the serialized graph"); -DEFINE_double(tau, 0.5, "Parameter tau"); -DEFINE_int32(refinement_iterations, 2, "Number of refinement iterations"); -DEFINE_int32(gpu_id, 0, "GPU id"); -DEFINE_bool(grid_search, false, - "Perform queries for a wide range of parameters."); - -using KeyT = int32_t; -using BaseT = float; -using ValueT = float; -using BAddrT = uint32_t; -using GAddrT = uint32_t; - -int main(int argc, char* argv[]) { - google::InitGoogleLogging(argv[0]); - google::LogToStderr(); - - gflags::SetUsageMessage( - "GGNN: Graph-based GPU Nearest Neighbor Search\n" - "by Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
" - "Lensch\n" - "(c) 2020 Computer Graphics University of Tuebingen"); - gflags::SetVersionString("1.0.0"); - google::ParseCommandLineFlags(&argc, &argv, true); - - LOG(INFO) << "Reading files"; - CHECK(file_exists(FLAGS_base_filename)) - << "File for base vectors has to exist"; - CHECK(file_exists(FLAGS_query_filename)) - << "File for perform_query vectors has to exist"; - - CHECK_GE(FLAGS_tau, 0) << "Tau has to be bigger or equal 0."; - CHECK_GE(FLAGS_refinement_iterations, 0) - << "The number of refinement iterations has to be non-negative."; - - // #################################################################### - // compile-time configuration - // - // data types - // - /// data type for addressing points (needs to be able to represent N) - using KeyT = int32_t; - /// data type of the dataset (e.g., char, int, float) - using BaseT = float; - /// data type of computed distances - using ValueT = float; - /// data type for addressing base-vectors (needs to be able to represent N*D) - using BAddrT = uint32_t; - /// data type for addressing the graph (needs to be able to represent - /// N*KBuild) - using GAddrT = uint32_t; - // - // dataset configuration (here: NyTimes) - // - /// dimension of the dataset - const int D = 256; - /// distance measure (Euclidean or Cosine) - const DistanceMeasure measure = Cosine; - // - // search-graph configuration - // - /// number of neighbors per point in the graph - const int KBuild = 40; - /// maximum number of inverse/symmetric links (KBuild / 2 usually works best) - const int KF = KBuild / 2; - /// segment/batch size (needs to be > KBuild-KF) - const int S = 32; - /// graph height / number of layers (4 usually performs best) - const int L = 4; - // - // query configuration - // - /// number of neighbors to search for - const int KQuery = 10; - - static_assert(KBuild - KF < S, - "there are not enough points to fill the local neighbor list!"); - - LOG(INFO) << "Using the following parameters " << KBuild << " (KBuild) " << KF - << " (KF) " << S << " (S) " << L << " (L) " << D << " (D) "; - - // Set the requested GPU id, if possible. - { - int numGpus; - cudaGetDeviceCount(&numGpus); - CHECK_GE(FLAGS_gpu_id, 0) << "This GPU does not exist"; - CHECK_LT(FLAGS_gpu_id, numGpus) << "This GPU does not exist"; - - cudaDeviceProp prop; - cudaGetDeviceProperties(&prop, FLAGS_gpu_id); - LOG(INFO) << "device name: " << prop.name; - } - cudaSetDevice(FLAGS_gpu_id); - - typedef GGNN - GGNN; - GGNN m_ggnn{FLAGS_base_filename, FLAGS_query_filename, - FLAGS_groundtruth_filename, L, static_cast(FLAGS_tau)}; - - m_ggnn.ggnnMain(FLAGS_graph_filename, FLAGS_refinement_iterations); - - auto query_function = [&m_ggnn](const float tau_query) { - cudaMemcpyToSymbol(c_tau_query, &tau_query, sizeof(float)); - LOG(INFO) << "--"; - LOG(INFO) << "Query with tau_query " << tau_query; - - m_ggnn.queryLayer<128, 2000, 2048, 32>(); - }; - - if (FLAGS_grid_search) { - LOG(INFO) << "--"; - LOG(INFO) << "grid-search:"; - for (int i = 1; i <= 100; ++i) query_function(i * 0.01f); - } else { // by default, just execute a few queries - LOG(INFO) << "--"; - LOG(INFO) << "90, 95, 99% R@1 (using -tau 0.3 -refinement_iterations 0):"; - query_function(0.08f); - query_function(0.12f); - query_function(0.25f); - } - - printf("done! 
\n"); - gflags::ShutDownCommandLineFlags(); - return 0; -} diff --git a/src/nytimes_top10.cu b/src/nytimes_top10.cu deleted file mode 100644 index d25c65c..0000000 --- a/src/nytimes_top10.cu +++ /dev/null @@ -1,165 +0,0 @@ -/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch -// -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "ggnn/cuda_knn_ggnn.cuh" -#include "ggnn/utils/cuda_knn_constants.cuh" -#include "ggnn/utils/cuda_knn_dataset.cuh" -#include "ggnn/utils/cuda_knn_utils.cuh" - -DEFINE_string(base_filename, "", "path to file with base vectors"); -DEFINE_string(query_filename, "", "path to file with perform_query vectors"); -DEFINE_string(groundtruth_filename, "", - "path to file with groundtruth vectors"); -DEFINE_string(graph_filename, "", - "path to file that contains the serialized graph"); -DEFINE_double(tau, 0.5, "Parameter tau"); -DEFINE_int32(refinement_iterations, 2, "Number of refinement iterations"); -DEFINE_int32(gpu_id, 0, "GPU id"); -DEFINE_bool(grid_search, false, - "Perform queries for a wide range of parameters."); - -using KeyT = int32_t; -using BaseT = float; -using ValueT = float; -using BAddrT = uint32_t; -using GAddrT = uint32_t; - -int main(int argc, char* argv[]) { - google::InitGoogleLogging(argv[0]); - google::LogToStderr(); - - gflags::SetUsageMessage( - "GGNN: Graph-based GPU Nearest Neighbor Search\n" - "by Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
" - "Lensch\n" - "(c) 2020 Computer Graphics University of Tuebingen"); - gflags::SetVersionString("1.0.0"); - google::ParseCommandLineFlags(&argc, &argv, true); - - LOG(INFO) << "Reading files"; - CHECK(file_exists(FLAGS_base_filename)) - << "File for base vectors has to exist"; - CHECK(file_exists(FLAGS_query_filename)) - << "File for perform_query vectors has to exist"; - - CHECK_GE(FLAGS_tau, 0) << "Tau has to be bigger or equal 0."; - CHECK_GE(FLAGS_refinement_iterations, 0) - << "The number of refinement iterations has to be non-negative."; - - // #################################################################### - // compile-time configuration - // - // data types - // - /// data type for addressing points (needs to be able to represent N) - using KeyT = int32_t; - /// data type of the dataset (e.g., char, int, float) - using BaseT = float; - /// data type of computed distances - using ValueT = float; - /// data type for addressing base-vectors (needs to be able to represent N*D) - using BAddrT = uint32_t; - /// data type for addressing the graph (needs to be able to represent - /// N*KBuild) - using GAddrT = uint32_t; - // - // dataset configuration (here: NyTimes) - // - /// dimension of the dataset - const int D = 256; - /// distance measure (Euclidean or Cosine) - const DistanceMeasure measure = Cosine; - // - // search-graph configuration - // - /// number of neighbors per point in the graph - const int KBuild = 96; - /// maximum number of inverse/symmetric links (KBuild / 2 usually works best) - const int KF = KBuild / 2; - /// segment/batch size (needs to be > KBuild-KF) - const int S = 64; - /// graph height / number of layers (4 usually performs best) - const int L = 4; - // - // query configuration - // - /// number of neighbors to search for - const int KQuery = 10; - - static_assert(KBuild - KF < S, - "there are not enough points to fill the local neighbor list!"); - - LOG(INFO) << "Using the following parameters " << KBuild << " (KBuild) " << KF - << " (KF) " << S << " (S) " << L << " (L) " << D << " (D) "; - - // Set the requested GPU id, if possible. - { - int numGpus; - cudaGetDeviceCount(&numGpus); - CHECK_GE(FLAGS_gpu_id, 0) << "This GPU does not exist"; - CHECK_LT(FLAGS_gpu_id, numGpus) << "This GPU does not exist"; - - cudaDeviceProp prop; - cudaGetDeviceProperties(&prop, FLAGS_gpu_id); - LOG(INFO) << "device name: " << prop.name; - } - cudaSetDevice(FLAGS_gpu_id); - - typedef GGNN - GGNN; - GGNN m_ggnn{FLAGS_base_filename, FLAGS_query_filename, - FLAGS_groundtruth_filename, L, static_cast(FLAGS_tau)}; - - m_ggnn.ggnnMain(FLAGS_graph_filename, FLAGS_refinement_iterations); - - CHECK_CUDA(cudaPeekAtLastError()); - CHECK_CUDA(cudaDeviceSynchronize()); - CHECK_CUDA(cudaPeekAtLastError()); - - auto query_function = [&m_ggnn](const float tau_query) { - cudaMemcpyToSymbol(c_tau_query, &tau_query, sizeof(float)); - LOG(INFO) << "--"; - LOG(INFO) << "Query with tau_query " << tau_query; - - m_ggnn.queryLayer<128, 2000, 2048, 32>(); - }; - - if (FLAGS_grid_search) { - LOG(INFO) << "--"; - LOG(INFO) << "grid-search:"; - for (int i = 1; i <= 100; ++i) query_function(i * 0.01f); - } else { // by default, just execute a few queries - LOG(INFO) << "--"; - LOG(INFO) << "90, 95, 99% R@1 (using -tau 0.3 -refinement_iterations 0):"; - query_function(0.08f); - query_function(0.12f); - query_function(0.25f); - } - - printf("done! 
\n"); - gflags::ShutDownCommandLineFlags(); - return 0; -} diff --git a/src/sift1b_multi_gpu.cu b/src/sift1b_multi_gpu.cu deleted file mode 100644 index 32ac524..0000000 --- a/src/sift1b_multi_gpu.cu +++ /dev/null @@ -1,182 +0,0 @@ -/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch -// - -#ifndef CUDA_API_PER_THREAD_DEFAULT_STREAM -#define CUDA_API_PER_THREAD_DEFAULT_STREAM -#endif - -#include -#include -#include -#include -#include -#include -#include - -#include - -// only needed for file_exists check -#include - -inline bool file_exists(const std::string& name) { - struct stat buffer; - return (stat(name.c_str(), &buffer) == 0); -} - -#include -#include - -#include "ggnn/cuda_knn_ggnn_multi_gpu.cuh" -#include "ggnn/utils/cuda_knn_constants.cuh" - -DEFINE_string( - mode, "bq", - "Mode: bq -> build_and_query, bs -> build_and_store, lq -> load_and_query"); -DEFINE_string(base_filename, "", "path to file with base vectors"); -DEFINE_string(query_filename, "", "path to file with perform_query vectors"); -DEFINE_string( - groundtruth_dir, "", - "path to directory with groundtruth vectors of form idx_{B}M.ivecs"); -DEFINE_string(graph_dir, "./", "directory to store and load ggnn graph files."); -DEFINE_double(tau, 0.5, "Parameter tau"); -DEFINE_int32(factor, 1000000, "Factor"); -DEFINE_int32(base, 1, "N_base: base x factor"); -DEFINE_int32(shard, 1, "N_shard: shard x factor"); -DEFINE_int32(refinement_iterations, 2, "Number of refinement iterations"); -DEFINE_string(gpu_ids, "0", "GPU id"); -DEFINE_bool(grid_search, false, - "Perform queries for a wide range of parameters."); - -int main(int argc, char* argv[]) { - google::InitGoogleLogging(argv[0]); - google::LogToStderr(); - - gflags::SetUsageMessage( - "GGNN: Graph-based GPU Nearest Neighbor Search\n" - "by Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
" - "Lensch\n" - "(c) 2020 Computer Graphics University of Tuebingen"); - gflags::SetVersionString("1.0.0"); - google::ParseCommandLineFlags(&argc, &argv, true); - - LOG(INFO) << "Reading files"; - CHECK(file_exists(FLAGS_base_filename)) - << "File for base vectors has to exist"; - CHECK(file_exists(FLAGS_query_filename)) - << "File for perform_query vectors has to exist"; - - CHECK_GE(FLAGS_tau, 0) << "Tau has to be bigger or equal 0."; - CHECK_GE(FLAGS_refinement_iterations, 0) - << "The number of refinement iterations has to be non-negative."; - - // #################################################################### - // compile-time configuration - // - // data types - // - /// data type for addressing points (needs to be able to represent N) - using KeyT = int32_t; - /// data type of the dataset (e.g., char, int, float) - using BaseT = uint8_t; - /// data type of computed distances - using ValueT = float; - /// data type for addressing base-vectors (needs to be able to represent N*D) - using BAddrT = uint64_t; - /// data type for addressing the graph (needs to be able to represent - /// N*KBuild) - using GAddrT = uint64_t; - // - // dataset configuration (here: SIFT1B) - // - /// dimension of the dataset - const int D = 128; - /// distance measure (Euclidean or Cosine) - const DistanceMeasure measure = Euclidean; - // - // search-graph configuration - // - /// number of neighbors per point in the graph - const int KBuild = 20; - /// maximum number of inverse/symmetric links (KBuild / 2 usually works best) - const int KF = KBuild / 2; - /// segment/batch size (needs to be > KBuild-KF) - const int S = 32; - /// graph height / number of layers (4 usually performs best) - const int L = 4; - // - // query configuration - // - /// number of neighbors to search for - const int KQuery = 10; - - static_assert(KBuild - KF < S, - "there are not enough points to fill the local neighbor list!"); - - LOG(INFO) << "Using the following parameters " << KBuild << " (KBuild) " << KF - << " (KF) " << S << " (S) " << L << " (L) " << D << " (D) "; - - std::istringstream iss(FLAGS_gpu_ids); - std::vector results(std::istream_iterator{iss}, - std::istream_iterator()); - - int numGpus; - cudaGetDeviceCount(&numGpus); - - std::vector gpus; - for (auto&& r : results) { - int gpu_id = atoi(r.c_str()); - printf("GPU %d: ", gpu_id); - { - CHECK_GE(gpu_id, 0) << "This GPU does not exist"; - CHECK_LT(gpu_id, numGpus) << "This GPU does not exist"; - - cudaDeviceProp prop; - cudaGetDeviceProperties(&prop, gpu_id); - printf("Found device name: %s\n", prop.name); - - gpus.push_back(gpu_id); - } - } - - const size_t N_base = FLAGS_base * FLAGS_factor; - const int N_shard = FLAGS_shard * FLAGS_factor; - - std::cout << "FLAGS_groundtruth_dir: " << FLAGS_groundtruth_dir << "\n"; - - char groundtruth_filename_buffer[64]; - snprintf(groundtruth_filename_buffer, 64, "/idx_%dM.ivecs", - static_cast(N_base / 1000000)); - - std::string groundtruth_filename = - FLAGS_groundtruth_dir + groundtruth_filename_buffer; - - std::cout << "groundtruth_filename: " << groundtruth_filename << "\n"; - - typedef GGNNMultiGPU - GGNN; - GGNN ggnn{FLAGS_base_filename, - FLAGS_query_filename, - file_exists(groundtruth_filename) ? groundtruth_filename : "", - L, - static_cast(FLAGS_tau), - N_base}; - - ggnn.ggnnMain(gpus, FLAGS_mode, N_shard, FLAGS_graph_dir, - FLAGS_refinement_iterations, FLAGS_grid_search); - - printf("done! 
\n"); - gflags::ShutDownCommandLineFlags(); - return 0; -} diff --git a/src/sift1b_multi_gpu_top10.cu b/src/sift1b_multi_gpu_top10.cu deleted file mode 100644 index 10e5d65..0000000 --- a/src/sift1b_multi_gpu_top10.cu +++ /dev/null @@ -1,182 +0,0 @@ -/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch -// - -#ifndef CUDA_API_PER_THREAD_DEFAULT_STREAM -#define CUDA_API_PER_THREAD_DEFAULT_STREAM -#endif - -#include -#include -#include -#include -#include -#include -#include - -#include - -// only needed for file_exists check -#include - -inline bool file_exists(const std::string& name) { - struct stat buffer; - return (stat(name.c_str(), &buffer) == 0); -} - -#include -#include - -#include "ggnn/cuda_knn_ggnn_multi_gpu.cuh" -#include "ggnn/utils/cuda_knn_constants.cuh" - -DEFINE_string( - mode, "bq", - "Mode: bq -> build_and_query, bs -> build_and_store, lq -> load_and_query"); -DEFINE_string(base_filename, "", "path to file with base vectors"); -DEFINE_string(query_filename, "", "path to file with perform_query vectors"); -DEFINE_string( - groundtruth_dir, "", - "path to directory with groundtruth vectors of form idx_{B}M.ivecs"); -DEFINE_string(graph_dir, "./", "directory to store and load ggnn graph files."); -DEFINE_double(tau, 0.5, "Parameter tau"); -DEFINE_int32(factor, 1000000, "Factor"); -DEFINE_int32(base, 1, "N_base: base x factor"); -DEFINE_int32(shard, 1, "N_shard: shard x factor"); -DEFINE_int32(refinement_iterations, 2, "Number of refinement iterations"); -DEFINE_string(gpu_ids, "0", "GPU id"); -DEFINE_bool(grid_search, false, - "Perform queries for a wide range of parameters."); - -int main(int argc, char* argv[]) { - google::InitGoogleLogging(argv[0]); - google::LogToStderr(); - - gflags::SetUsageMessage( - "GGNN: Graph-based GPU Nearest Neighbor Search\n" - "by Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
" - "Lensch\n" - "(c) 2020 Computer Graphics University of Tuebingen"); - gflags::SetVersionString("1.0.0"); - google::ParseCommandLineFlags(&argc, &argv, true); - - LOG(INFO) << "Reading files"; - CHECK(file_exists(FLAGS_base_filename)) - << "File for base vectors has to exist"; - CHECK(file_exists(FLAGS_query_filename)) - << "File for perform_query vectors has to exist"; - - CHECK_GE(FLAGS_tau, 0) << "Tau has to be bigger or equal 0."; - CHECK_GE(FLAGS_refinement_iterations, 0) - << "The number of refinement iterations has to be non-negative."; - - // #################################################################### - // compile-time configuration - // - // data types - // - /// data type for addressing points (needs to be able to represent N) - using KeyT = int32_t; - /// data type of the dataset (e.g., char, int, float) - using BaseT = uint8_t; - /// data type of computed distances - using ValueT = float; - /// data type for addressing base-vectors (needs to be able to represent N*D) - using BAddrT = uint64_t; - /// data type for addressing the graph (needs to be able to represent - /// N*KBuild) - using GAddrT = uint64_t; - // - // dataset configuration (here: SIFT1B) - // - /// dimension of the dataset - const int D = 128; - /// distance measure (Euclidean or Cosine) - const DistanceMeasure measure = Euclidean; - // - // search-graph configuration - // - /// number of neighbors per point in the graph - const int KBuild = 40; - /// maximum number of inverse/symmetric links (KBuild / 2 usually works best) - const int KF = KBuild / 2; - /// segment/batch size (needs to be > KBuild-KF) - const int S = 32; - /// graph height / number of layers (4 usually performs best) - const int L = 4; - // - // query configuration - // - /// number of neighbors to search for - const int KQuery = 10; - - static_assert(KBuild - KF < S, - "there are not enough points to fill the local neighbor list!"); - - LOG(INFO) << "Using the following parameters " << KBuild << " (KBuild) " << KF - << " (KF) " << S << " (S) " << L << " (L) " << D << " (D) "; - - std::istringstream iss(FLAGS_gpu_ids); - std::vector results(std::istream_iterator{iss}, - std::istream_iterator()); - - int numGpus; - cudaGetDeviceCount(&numGpus); - - std::vector gpus; - for (auto&& r : results) { - int gpu_id = atoi(r.c_str()); - printf("GPU %d: ", gpu_id); - { - CHECK_GE(gpu_id, 0) << "This GPU does not exist"; - CHECK_LT(gpu_id, numGpus) << "This GPU does not exist"; - - cudaDeviceProp prop; - cudaGetDeviceProperties(&prop, gpu_id); - printf("Found device name: %s\n", prop.name); - - gpus.push_back(gpu_id); - } - } - - const size_t N_base = FLAGS_base * FLAGS_factor; - const int N_shard = FLAGS_shard * FLAGS_factor; - - std::cout << "FLAGS_groundtruth_dir: " << FLAGS_groundtruth_dir << "\n"; - - char groundtruth_filename_buffer[64]; - snprintf(groundtruth_filename_buffer, 64, "/idx_%dM.ivecs", - static_cast(N_base / 1000000)); - - std::string groundtruth_filename = - FLAGS_groundtruth_dir + groundtruth_filename_buffer; - - std::cout << "groundtruth_filename: " << groundtruth_filename << "\n"; - - typedef GGNNMultiGPU - GGNN; - GGNN ggnn{FLAGS_base_filename, - FLAGS_query_filename, - file_exists(groundtruth_filename) ? groundtruth_filename : "", - L, - static_cast(FLAGS_tau), - N_base}; - - ggnn.ggnnMain(gpus, FLAGS_mode, N_shard, FLAGS_graph_dir, - FLAGS_refinement_iterations, FLAGS_grid_search); - - printf("done! 
\n"); - gflags::ShutDownCommandLineFlags(); - return 0; -} diff --git a/src/sift1b_subsets.cu b/src/sift1b_subsets.cu deleted file mode 100644 index e549336..0000000 --- a/src/sift1b_subsets.cu +++ /dev/null @@ -1,222 +0,0 @@ -/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch -// -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "ggnn/cuda_knn_ggnn.cuh" -#include "ggnn/utils/cuda_knn_constants.cuh" -#include "ggnn/utils/cuda_knn_dataset.cuh" -#include "ggnn/utils/cuda_knn_utils.cuh" - -DEFINE_string(base_filename, "", "path to file with base vectors"); -DEFINE_string(query_filename, "", "path to file with perform_query vectors"); -DEFINE_double(tau, 0.5, "Parameter tau"); -DEFINE_int32(refinement_iterations, 2, "Number of refinement iterations"); -DEFINE_int32(gpu_id, 0, "GPU id"); - -int main(int argc, char* argv[]) { - google::InitGoogleLogging(argv[0]); - google::LogToStderr(); - - gflags::SetUsageMessage( - "GGNN: Graph-based GPU Nearest Neighbor Search\n" - "by Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
" - "Lensch\n" - "(c) 2020 Computer Graphics University of Tuebingen"); - gflags::SetVersionString("1.0.0"); - google::ParseCommandLineFlags(&argc, &argv, true); - - LOG(INFO) << "Reading files"; - CHECK(file_exists(FLAGS_base_filename)) - << "File for base vectors has to exist"; - CHECK(file_exists(FLAGS_query_filename)) - << "File for perform_query vectors has to exist"; - - CHECK_GE(FLAGS_tau, 0) << "Tau has to be bigger or equal 0."; - CHECK_GE(FLAGS_refinement_iterations, 0) - << "The number of refinement iterations has to be non-negative."; - - // #################################################################### - // compile-time configuration - // - // data types - // - /// data type for addressing points (needs to be able to represent N) - using KeyT = int32_t; - /// data type of the dataset (e.g., char, int, float) - using BaseT = uint8_t; - /// data type of computed distances - using ValueT = float; - /// data type for addressing base-vectors (needs to be able to represent N*D) - using BAddrT = uint64_t; - /// data type for addressing the graph (needs to be able to represent - /// N*KBuild) - using GAddrT = uint64_t; - // - // dataset configuration (here: SIFT1B) - // - /// dimension of the dataset - const int D = 128; - /// distance measure (Euclidean or Cosine) - const DistanceMeasure measure = Euclidean; - // - // search-graph configuration - // - /// number of neighbors per point in the graph - const int KBuild = 20; - /// maximum number of inverse/symmetric links (KBuild / 2 usually works best) - const int KF = KBuild / 2; - /// segment/batch size (needs to be > KBuild-KF) - const int S = 32; - /// graph height / number of layers (4 usually performs best) - const int L = 4; - // - // query configuration - // - /// number of neighbors to search for - const int KQuery = 10; - - static_assert(KBuild - KF < S, - "there are not enough points to fill the local neighbor list!"); - - LOG(INFO) << "Using the following parameters " << KBuild << " (KBuild) " << KF - << " (KF) " << S << " (S) " << L << " (L) " << D << " (D) "; - - // Set the requested GPU id, if possible. - { - int numGpus; - cudaGetDeviceCount(&numGpus); - CHECK_GE(FLAGS_gpu_id, 0) << "This GPU does not exist"; - CHECK_LT(FLAGS_gpu_id, numGpus) << "This GPU does not exist"; - - cudaDeviceProp prop; - cudaGetDeviceProperties(&prop, FLAGS_gpu_id); - LOG(INFO) << "device name: " << prop.name; - } - cudaSetDevice(FLAGS_gpu_id); - - typedef GGNN - GGNN; - GGNN m_ggnn{FLAGS_base_filename, - FLAGS_query_filename, - "", - L, - static_cast(FLAGS_tau), - 100000000}; - - for (KeyT n = 10000000; n <= m_ggnn.dataset.N_base; n += 10000000) { - LOG(INFO) << "Constructing graph for " << n << " points."; - m_ggnn.reinit_graph_for_subset(n); - m_ggnn.generateGTUsingBF(); - - { - std::vector construction_times; - - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - - LOG(INFO) << "Starting Graph construction... 
(tau=" << FLAGS_tau << ")"; - - cudaEventRecord(start); - m_ggnn.build(); - cudaEventRecord(stop); - - CHECK_CUDA(cudaPeekAtLastError()); - CHECK_CUDA(cudaDeviceSynchronize()); - CHECK_CUDA(cudaPeekAtLastError()); - - cudaEventSynchronize(stop); - float milliseconds = 0; - cudaEventElapsedTime(&milliseconds, start, stop); - construction_times.push_back(milliseconds); - - for (int refinement_step = 0; - refinement_step < FLAGS_refinement_iterations; ++refinement_step) { - DLOG(INFO) << "Refinement step " << refinement_step; - m_ggnn.refine(); - - cudaEventRecord(stop); - CHECK_CUDA(cudaPeekAtLastError()); - CHECK_CUDA(cudaDeviceSynchronize()); - CHECK_CUDA(cudaPeekAtLastError()); - cudaEventSynchronize(stop); - - float elapsed_milliseconds = 0; - cudaEventElapsedTime(&elapsed_milliseconds, start, stop); - construction_times.push_back(elapsed_milliseconds); - } - cudaEventDestroy(start); - cudaEventDestroy(stop); - - for (int refinement_step = 0; refinement_step < construction_times.size(); - refinement_step++) { - const float elapsed_milliseconds = construction_times[refinement_step]; - const float elapsed_seconds = elapsed_milliseconds / 1000.0f; - const int number_of_points = m_ggnn.ggnn_gpu_instance.N_shard; - - LOG(INFO) << "Graph construction + " << refinement_step - << " refinement step(s)"; - LOG(INFO) << " -- secs: " << elapsed_seconds; - LOG(INFO) << " -- points: " << number_of_points; - LOG(INFO) << " -- ms/point: " - << elapsed_milliseconds / number_of_points; - } - } - - { - CHECK_CUDA(cudaPeekAtLastError()); - CHECK_CUDA(cudaDeviceSynchronize()); - CHECK_CUDA(cudaPeekAtLastError()); - - auto query_function = [&m_ggnn](const float tau_query) { - cudaMemcpyToSymbol(c_tau_query, &tau_query, sizeof(float)); - LOG(INFO) << "--"; - LOG(INFO) << "Query with tau_query " << tau_query; - // faster for C@1 = 99% - // LOG(INFO) << "fast query (good for C@1)"; - // m_ggnn.queryLayer<32, 200, 256, 64>(); - // better for C@10 > 99% - LOG(INFO) << "regular query (good for C@10)"; - m_ggnn.queryLayer<32, 400, 448, 64>(); - // expensive, can get to 99.99% C@10 - // m_ggnn.queryLayer<128, 2000, 2048, 256>(); - }; - - { // by default, just execute a few queries - LOG(INFO) << "--"; - LOG(INFO) << "90, 95, 99% R@1, 99% C@10 (using -tau 0.5 " - "-refinement_iterations 2):"; - query_function(0.34f); - query_function(0.41f); - query_function(0.51f); - query_function(0.64f); - } - } - } - - printf("done! \n"); - gflags::ShutDownCommandLineFlags(); - return 0; -} diff --git a/src/sift1m.cu b/src/sift1m.cu deleted file mode 100644 index d1e9774..0000000 --- a/src/sift1m.cu +++ /dev/null @@ -1,163 +0,0 @@ -/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
-//
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include
-#include
-#include
-
-#include "ggnn/cuda_knn_ggnn.cuh"
-#include "ggnn/utils/cuda_knn_constants.cuh"
-#include "ggnn/utils/cuda_knn_dataset.cuh"
-#include "ggnn/utils/cuda_knn_utils.cuh"
-
-DEFINE_string(base_filename, "", "path to file with base vectors");
-DEFINE_string(query_filename, "", "path to file with perform_query vectors");
-DEFINE_string(groundtruth_filename, "",
-              "path to file with groundtruth vectors");
-DEFINE_string(graph_filename, "",
-              "path to file that contains the serialized graph");
-DEFINE_double(tau, 0.5, "Parameter tau");
-DEFINE_int32(refinement_iterations, 2, "Number of refinement iterations");
-DEFINE_int32(gpu_id, 0, "GPU id");
-DEFINE_bool(grid_search, false,
-            "Perform queries for a wide range of parameters.");
-
-int main(int argc, char* argv[]) {
-  google::InitGoogleLogging(argv[0]);
-  google::LogToStderr();
-
-  gflags::SetUsageMessage(
-      "GGNN: Graph-based GPU Nearest Neighbor Search\n"
-      "by Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. "
-      "Lensch\n"
-      "(c) 2020 Computer Graphics University of Tuebingen");
-  gflags::SetVersionString("1.0.0");
-  google::ParseCommandLineFlags(&argc, &argv, true);
-
-  CHECK(file_exists(FLAGS_base_filename))
-      << "File for base vectors has to exist";
-  CHECK(file_exists(FLAGS_query_filename))
-      << "File for perform_query vectors has to exist";
-
-  CHECK_GE(FLAGS_tau, 0) << "Tau has to be bigger or equal 0.";
-  CHECK_GE(FLAGS_refinement_iterations, 0)
-      << "The number of refinement iterations has to be non-negative.";
-
-  // ####################################################################
-  // compile-time configuration
-  //
-  // data types
-  //
-  /// data type for addressing points (needs to be able to represent N)
-  using KeyT = int32_t;
-  /// data type of the dataset (e.g., char, int, float)
-  using BaseT = float;
-  /// data type of computed distances
-  using ValueT = float;
-  /// data type for addressing base-vectors (needs to be able to represent N*D)
-  using BAddrT = uint32_t;
-  /// data type for addressing the graph (needs to be able to represent
-  /// N*KBuild)
-  using GAddrT = uint32_t;
-  //
-  // dataset configuration (here: SIFT1M)
-  //
-  /// dimension of the dataset
-  const int D = 128;
-  /// distance measure (Euclidean or Cosine)
-  const DistanceMeasure measure = Euclidean;
-  //
-  // search-graph configuration
-  //
-  /// number of neighbors per point in the graph
-  const int KBuild = 24;
-  /// maximum number of inverse/symmetric links (KBuild / 2 usually works best)
-  const int KF = KBuild / 2;
-  /// segment/batch size (needs to be > KBuild-KF)
-  const int S = 32;
-  /// graph height / number of layers (4 usually performs best)
-  const int L = 4;
-  //
-  // query configuration
-  //
-  /// number of neighbors to search for
-  const int KQuery = 10;
-
-  static_assert(KBuild - KF < S,
-                "there are not enough points to fill the local neighbor list!");
-
-  LOG(INFO) << "Using the following parameters " << KBuild << " (KBuild) " << KF
-            << " (KF) " << S << " (S) " << L << " (L) " << D << " (D) ";
-
-  // Set the requested GPU id, if possible.
-  {
-    int numGpus;
-    cudaGetDeviceCount(&numGpus);
-    CHECK_GE(FLAGS_gpu_id, 0) << "This GPU does not exist";
-    CHECK_LT(FLAGS_gpu_id, numGpus) << "This GPU does not exist";
-
-    cudaDeviceProp prop;
-    cudaGetDeviceProperties(&prop, FLAGS_gpu_id);
-    LOG(INFO) << "device name: " << prop.name;
-  }
-  cudaSetDevice(FLAGS_gpu_id);
-
-  typedef GGNN<measure, KeyT, ValueT, GAddrT, BaseT, BAddrT, D, KBuild, KF,
-               KQuery, S>
-      GGNN;
-  GGNN m_ggnn{FLAGS_base_filename, FLAGS_query_filename,
-              FLAGS_groundtruth_filename, L, static_cast<ValueT>(FLAGS_tau)};
-
-  m_ggnn.ggnnMain(FLAGS_graph_filename, FLAGS_refinement_iterations);
-
-  auto query_function = [&m_ggnn](const float tau_query) {
-    cudaMemcpyToSymbol(c_tau_query, &tau_query, sizeof(float));
-    LOG(INFO) << "--";
-    LOG(INFO) << "Query with tau_query " << tau_query;
-    // faster for C@1 = 99%
-    LOG(INFO) << "fast query (good for C@1)";
-    m_ggnn.queryLayer<32, 200, 256, 64>();
-    // better for C@10 > 99%
-    LOG(INFO) << "regular query (good for C@10)";
-    m_ggnn.queryLayer<32, 400, 448, 64>();
-    // expensive, can get to 99.99% C@10
-    // m_ggnn.queryLayer<128, 2000, 2048, 256>();
-  };
-
-  if (FLAGS_grid_search) {
-    LOG(INFO) << "--";
-    LOG(INFO) << "grid-search:";
-    for (int i = 0; i < 70; ++i) query_function(i * 0.01f);
-    for (int i = 7; i <= 20; ++i) query_function(i * 0.1f);
-  } else {  // by default, just execute a few queries
-    LOG(INFO) << "--";
-    LOG(INFO) << "90, 95, 99% R@1, 99% C@10 (using -tau 0.5 "
-                 "-refinement_iterations 2):";
-    query_function(0.34f);
-    query_function(0.41f);
-    query_function(0.51f);
-    query_function(0.64f);
-  }
-
-  printf("done! \n");
-  gflags::ShutDownCommandLineFlags();
-  return 0;
-}
diff --git a/src/sift1m_base_vs_base.cu b/src/sift1m_base_vs_base.cu
deleted file mode 100644
index 1bafa78..0000000
--- a/src/sift1m_base_vs_base.cu
+++ /dev/null
@@ -1,136 +0,0 @@
-/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A.
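[editor's note: illustrative sketch, not part of this patch]
The doc comments in the deleted sift1m.cu state that BAddrT must be able to
represent N*D and GAddrT must represent N*KBuild. A compile-time guard along
these lines would catch a type mismatch early; N_MAX is a hypothetical
constant (one million for SIFT1M), and the asserts assume they are placed
where D, KBuild, BAddrT and GAddrT are visible:

    #include <cstdint>
    #include <limits>

    constexpr std::uint64_t N_MAX = 1000000;  // hypothetical bound (SIFT1M)

    // BAddrT must address all N*D vector components,
    // GAddrT all N*KBuild graph links.
    static_assert(N_MAX * D <= std::numeric_limits<BAddrT>::max(),
                  "BAddrT cannot address N*D vector components");
    static_assert(N_MAX * KBuild <= std::numeric_limits<GAddrT>::max(),
                  "GAddrT cannot address N*KBuild graph links");

[end of editor's note]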
Lensch -// -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "ggnn/cuda_knn_ggnn.cuh" -#include "ggnn/utils/cuda_knn_constants.cuh" -#include "ggnn/utils/cuda_knn_dataset.cuh" -#include "ggnn/utils/cuda_knn_utils.cuh" - -DEFINE_string(base_filename, "", "path to file with base vectors"); -DEFINE_string(query_filename, "", "path to file with perform_query vectors"); -DEFINE_string(groundtruth_filename, "", - "path to file with groundtruth vectors"); -DEFINE_string(graph_filename, "", - "path to file that contains the serialized graph"); -DEFINE_double(tau, 0.5, "Parameter tau"); -DEFINE_int32(refinement_iterations, 2, "Number of refinement iterations"); -DEFINE_int32(gpu_id, 0, "GPU id"); -DEFINE_bool(grid_search, false, - "Perform queries for a wide range of parameters."); - -int main(int argc, char* argv[]) { - google::InitGoogleLogging(argv[0]); - google::LogToStderr(); - - gflags::SetUsageMessage( - "GGNN: Graph-based GPU Nearest Neighbor Search\n" - "by Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. " - "Lensch\n" - "(c) 2020 Computer Graphics University of Tuebingen"); - gflags::SetVersionString("1.0.0"); - google::ParseCommandLineFlags(&argc, &argv, true); - - LOG(INFO) << "Reading files"; - CHECK(file_exists(FLAGS_base_filename)) - << "File for base vectors has to exist"; - CHECK(file_exists(FLAGS_query_filename)) - << "File for perform_query vectors has to exist"; - - CHECK_GE(FLAGS_tau, 0) << "Tau has to be bigger or equal 0."; - CHECK_GE(FLAGS_refinement_iterations, 0) - << "The number of refinement iterations has to be non-negative."; - - // #################################################################### - // compile-time configuration - // - // data types - // - /// data type for addressing points (needs to be able to represent N) - using KeyT = int32_t; - /// data type of the dataset (e.g., char, int, float) - using BaseT = float; - /// data type of computed distances - using ValueT = float; - /// data type for addressing base-vectors (needs to be able to represent N*D) - using BAddrT = uint32_t; - /// data type for addressing the graph (needs to be able to represent - /// N*KBuild) - using GAddrT = uint32_t; - // - // dataset configuration (here: SIFT1M) - // - /// dimension of the dataset - const int D = 128; - /// distance measure (Euclidean or Cosine) - const DistanceMeasure measure = Euclidean; - // - // search-graph configuration - // - /// number of neighbors per point in the graph - const int KBuild = 40; - /// maximum number of inverse/symmetric links (KBuild / 2 usually works best) - const int KF = KBuild / 2; - /// segment/batch size (needs to be > KBuild-KF) - const int S = 32; - /// graph height / number of layers (4 usually performs best) - const int L = 4; - // - // query configuration - // - /// number of neighbors to search for - const int KQuery = 10; - - static_assert(KBuild - KF < S, - "there are not enough points to fill the local neighbor list!"); - LOG(INFO) << "Using the following parameters " << KBuild << " (KBuild) " << KF - << " (KF) " << S << " (S) " << L << " (L) " << D << " (D) "; - - // Set the requested GPU id, if possible. 
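[editor's note: illustrative sketch, not part of this patch]
The deleted sift1m_base_vs_base.cu builds a KBuild = 40 graph over the base
vectors and calls evaluateKNNGraph() to compare the graph's edges against
brute-force ground truth. A hedged sketch of what such a recall computation
over a flat adjacency list could look like; the helper, its name, and the
row-major layout are assumptions, not the repository's API:

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Fraction of true k nearest neighbors that also appear among the
    // first k graph links of each point (assumes k <= KBuild).
    float knn_graph_recall(const std::vector<std::int32_t>& graph,  // N*KBuild
                           const std::vector<std::int32_t>& gt,     // N*k
                           int N, int KBuild, int k) {
      std::int64_t hits = 0;
      for (int n = 0; n < N; ++n) {
        const std::int32_t* links = &graph[static_cast<std::size_t>(n) * KBuild];
        for (int i = 0; i < k; ++i) {
          if (std::find(links, links + k,
                        gt[static_cast<std::size_t>(n) * k + i]) != links + k)
            ++hits;
        }
      }
      return static_cast<float>(hits) / (static_cast<float>(N) * k);
    }

[end of editor's note]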
- { - int numGpus; - cudaGetDeviceCount(&numGpus); - CHECK_GE(FLAGS_gpu_id, 0) << "This GPU does not exist"; - CHECK_LT(FLAGS_gpu_id, numGpus) << "This GPU does not exist"; - - cudaDeviceProp prop; - cudaGetDeviceProperties(&prop, FLAGS_gpu_id); - LOG(INFO) << "device name: " << prop.name; - } - cudaSetDevice(FLAGS_gpu_id); - - typedef GGNN - GGNN; - GGNN m_ggnn{FLAGS_base_filename, FLAGS_query_filename, - FLAGS_groundtruth_filename, L, static_cast(FLAGS_tau)}; - - m_ggnn.ggnnMain(FLAGS_graph_filename, FLAGS_refinement_iterations); - - m_ggnn.evaluateKNNGraph(); - - printf("done! \n"); - gflags::ShutDownCommandLineFlags(); - return 0; -} diff --git a/src/sift1m_batchsize.cu b/src/sift1m_batchsize.cu deleted file mode 100644 index 6a64652..0000000 --- a/src/sift1m_batchsize.cu +++ /dev/null @@ -1,175 +0,0 @@ -/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch -// -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "ggnn/cuda_knn_ggnn.cuh" -#include "ggnn/utils/cuda_knn_constants.cuh" -#include "ggnn/utils/cuda_knn_dataset.cuh" -#include "ggnn/utils/cuda_knn_utils.cuh" - -DEFINE_string(base_filename, "", "path to file with base vectors"); -DEFINE_string(query_filename, "", "path to file with perform_query vectors"); -DEFINE_string(groundtruth_filename, "", - "path to file with groundtruth vectors"); -DEFINE_string(graph_filename, "", - "path to file that contains the serialized graph"); -DEFINE_double(tau, 0.5, "Parameter tau"); -DEFINE_int32(refinement_iterations, 2, "Number of refinement iterations"); -DEFINE_int32(gpu_id, 0, "GPU id"); -DEFINE_bool(grid_search, false, - "Perform queries for a wide range of parameters."); - -int main(int argc, char* argv[]) { - google::InitGoogleLogging(argv[0]); - google::LogToStderr(); - - gflags::SetUsageMessage( - "GGNN: Graph-based GPU Nearest Neighbor Search\n" - "by Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
" - "Lensch\n" - "(c) 2020 Computer Graphics University of Tuebingen"); - gflags::SetVersionString("1.0.0"); - google::ParseCommandLineFlags(&argc, &argv, true); - - LOG(INFO) << "Reading files"; - CHECK(file_exists(FLAGS_base_filename)) - << "File for base vectors has to exist"; - CHECK(file_exists(FLAGS_query_filename)) - << "File for perform_query vectors has to exist"; - - CHECK_GE(FLAGS_tau, 0) << "Tau has to be bigger or equal 0."; - CHECK_GE(FLAGS_refinement_iterations, 0) - << "The number of refinement iterations has to be non-negative."; - - // #################################################################### - // compile-time configuration - // - // data types - // - /// data type for addressing points (needs to be able to represent N) - using KeyT = int32_t; - /// data type of the dataset (e.g., char, int, float) - using BaseT = float; - /// data type of computed distances - using ValueT = float; - /// data type for addressing base-vectors (needs to be able to represent N*D) - using BAddrT = uint32_t; - /// data type for addressing the graph (needs to be able to represent - /// N*KBuild) - using GAddrT = uint32_t; - // - // dataset configuration (here: SIFT1M) - // - /// dimension of the dataset - const int D = 128; - /// distance measure (Euclidean or Cosine) - const DistanceMeasure measure = Euclidean; - // - // search-graph configuration - // - /// number of neighbors per point in the graph - const int KBuild = 24; - /// maximum number of inverse/symmetric links (KBuild / 2 usually works best) - const int KF = KBuild / 2; - /// segment/batch size (needs to be > KBuild-KF) - const int S = 32; - /// graph height / number of layers (4 usually performs best) - const int L = 4; - // - // query configuration - // - /// number of neighbors to search for - const int KQuery = 10; - - static_assert(KBuild - KF < S, - "there are not enough points to fill the local neighbor list!"); - - LOG(INFO) << "Using the following parameters " << KBuild << " (KBuild) " << KF - << " (KF) " << S << " (S) " << L << " (L) " << D << " (D) "; - - // Set the requested GPU id, if possible. 
- { - int numGpus; - cudaGetDeviceCount(&numGpus); - CHECK_GE(FLAGS_gpu_id, 0) << "This GPU does not exist"; - CHECK_LT(FLAGS_gpu_id, numGpus) << "This GPU does not exist"; - - cudaDeviceProp prop; - cudaGetDeviceProperties(&prop, FLAGS_gpu_id); - LOG(INFO) << "device name: " << prop.name; - } - cudaSetDevice(FLAGS_gpu_id); - - typedef GGNN - GGNN; - GGNN m_ggnn{FLAGS_base_filename, FLAGS_query_filename, - FLAGS_groundtruth_filename, L, static_cast(FLAGS_tau)}; - - m_ggnn.ggnnMain(FLAGS_graph_filename, FLAGS_refinement_iterations); - - auto query_function = [&m_ggnn](const float tau_query) { - cudaMemcpyToSymbol(c_tau_query, &tau_query, sizeof(float)); - LOG(INFO) << "--"; - LOG(INFO) << "Query with tau_query " << tau_query; - // faster for C@1 = 99% - // LOG(INFO) << "fast query (good for C@1)"; - // m_ggnn.queryLayer<32, 200, 256, 64>(); - // better for C@10 > 99% - // LOG(INFO) << "regular query (good for C@10)"; - m_ggnn.queryLayer<32, 400, 448, 64>(); - // expensive, can get to 99.99% C@10 - // m_ggnn.queryLayer<128, 2000, 2048, 256>(); - }; - - const int N_query_total = m_ggnn.dataset.N_query; - - LOG(INFO) << "--"; - LOG(INFO) << "99% R@1 (using -tau 0.5 -refinement_iterations 2):"; - - m_ggnn.dataset.template checkForDuplicatesInGroundTruth( - KQuery); - - if (FLAGS_grid_search) { - LOG(INFO) << "--"; - LOG(INFO) << "grid-search:"; - for (int i = 1; i <= N_query_total; i += 500) { - m_ggnn.dataset.N_query = i; - query_function(0.51f); - - if (i == 1) i = 0; - } - } else { // by default, just execute a few queries - LOG(INFO) << "--"; - LOG(INFO) << "1, 10, 100, 1000, 10000:"; - for (int i = 1; i <= N_query_total; i *= 10) { - m_ggnn.dataset.N_query = i; - query_function(0.51f); - } - } - - printf("done! \n"); - gflags::ShutDownCommandLineFlags(); - return 0; -} diff --git a/src/sift1m_load_hnsw.cu b/src/sift1m_load_hnsw.cu deleted file mode 100644 index 7354369..0000000 --- a/src/sift1m_load_hnsw.cu +++ /dev/null @@ -1,207 +0,0 @@ -/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
Lensch -// -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "ggnn/cuda_knn_ggnn.cuh" -#include "ggnn/utils/cuda_knn_constants.cuh" -#include "ggnn/utils/cuda_knn_dataset.cuh" -#include "ggnn/utils/cuda_knn_utils.cuh" -#include "ggnn/utils/hnswlib_loader.hpp" - -DEFINE_string(base_filename, "", "path to file with base vectors"); -DEFINE_string(query_filename, "", "path to file with perform_query vectors"); -DEFINE_string(groundtruth_filename, "", - "path to file with groundtruth vectors"); -DEFINE_string(graph_filename, "", - "path to file that contains the serialized HNSW index (Hnswlib)"); -DEFINE_int32(gpu_id, 0, "GPU id"); -DEFINE_bool(grid_search, false, - "Perform queries for a wide range of parameters."); - -int main(int argc, char* argv[]) { - google::InitGoogleLogging(argv[0]); - google::LogToStderr(); - - gflags::SetUsageMessage( - "GGNN: Graph-based GPU Nearest Neighbor Search\n" - "by Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. " - "Lensch\n" - "(c) 2020 Computer Graphics University of Tuebingen"); - gflags::SetVersionString("1.0.0"); - google::ParseCommandLineFlags(&argc, &argv, true); - - LOG(INFO) << "Reading files"; - CHECK(file_exists(FLAGS_base_filename)) - << "File for base vectors has to exist"; - CHECK(file_exists(FLAGS_query_filename)) - << "File for perform_query vectors has to exist"; - - // #################################################################### - // compile-time configuration - // - // data types - // - /// data type for addressing points (needs to be able to represent N) - using KeyT = int32_t; - /// data type of the dataset (e.g., char, int, float) - using BaseT = float; - /// data type of computed distances - using ValueT = float; - /// data type for addressing base-vectors (needs to be able to represent N*D) - using BAddrT = uint32_t; - /// data type for addressing the graph (needs to be able to represent - /// N*KBuild) - using GAddrT = uint32_t; - // - // dataset configuration (here: SIFT1M) - // - /// dimension of the dataset - const int D = 128; - /// distance measure (Euclidean or Cosine) - const DistanceMeasure measure = Euclidean; - // - // HNSW configuration - const int M = 20; - const int KBuild = M * 2; - const int KF = KBuild / 2; - // only one entry point, otherwise no hierarchy - const int S = 1; - const int L = 2; - // - // query configuration - // - /// number of neighbors to search for - const int KQuery = 10; - - // static_assert(KBuild-KF < S, "there are not enough points to fill the local - // neighbor list!"); - - LOG(INFO) << "Using the following parameters " << KBuild << " (KBuild) " << KF - << " (KF) " << S << " (S) " << L << " (L) " << D << " (D) "; - - const bool import_graph = - !FLAGS_graph_filename.empty() && file_exists(FLAGS_graph_filename); - - CHECK(import_graph) << "A HNSW index must be provided."; - - // Set the requested GPU id, if possible. - { - int numGpus; - cudaGetDeviceCount(&numGpus); - CHECK_GE(FLAGS_gpu_id, 0) << "This GPU does not exist"; - CHECK_LT(FLAGS_gpu_id, numGpus) << "This GPU does not exist"; - - cudaDeviceProp prop; - cudaGetDeviceProperties(&prop, FLAGS_gpu_id); - LOG(INFO) << "device name: " << prop.name; - } - cudaSetDevice(FLAGS_gpu_id); - - typedef GGNN - GGNN; - GGNN m_ggnn{ - FLAGS_base_filename, FLAGS_query_filename, - file_exists(FLAGS_groundtruth_filename) ? 
FLAGS_groundtruth_filename : "", - L, 0.0f}; - - typedef HNSWLoader HNSWLoader; - { - // load HNSW - HNSWLoader m_hnsw_loader(FLAGS_graph_filename); - - const int N = m_ggnn.ggnn_gpu_instance.N_shard; - auto& graph_host = m_ggnn.ggnn_gpu_instance.ggnn_cpu_buffers.at(0); - auto& graph_device = m_ggnn.ggnn_gpu_instance.ggnn_shards.at(0); - - // transfer base-level neighborhood information - for (size_t n = 0; n < N; ++n) { - for (size_t k = 0; k < KBuild; ++k) { - if (m_hnsw_loader.data_level0_memory_.at(n).link_count > k) - graph_host.h_graph[n * KBuild + k] = - m_hnsw_loader.data_level0_memory_.at(n).links[k]; - else - graph_host.h_graph[n * KBuild + k] = -1; - } - } - - // FIXME: does HNSW have a useful neighborhood we should load on the top - // layer? - for (size_t k = 0; k < KBuild; ++k) { - graph_host.h_graph[N * KBuild + k] = -1; - } - // set starting point - graph_host.h_translation[0] = m_hnsw_loader.hnsw_header.enterpoint_node_; - graph_host.h_selection[0] = m_hnsw_loader.hnsw_header.enterpoint_node_; - - // this could be done on the GPU - float max_nn1_dist = 0.0f; - for (size_t n = 0; n < N; ++n) { - max_nn1_dist = - std::max(max_nn1_dist, - m_ggnn.dataset - .template compute_distance_base_to_base( - n, graph_host.h_graph[n * KBuild])); - } - // don't need the mean for querying - just set it to max as well - graph_host.h_nn1_stats[0] = max_nn1_dist; - graph_host.h_nn1_stats[1] = max_nn1_dist; - - graph_host.uploadAsync(graph_device); - - CHECK_CUDA(cudaStreamSynchronize(graph_device.stream)); - } - - auto query_function = [&m_ggnn](const float tau_query) { - cudaMemcpyToSymbol(c_tau_query, &tau_query, sizeof(float)); - LOG(INFO) << "--"; - LOG(INFO) << "Query with tau_query " << tau_query; - // faster for C@1 = 99% - LOG(INFO) << "fast query (good for C@1)"; - m_ggnn.queryLayer<32, 200, 256, 64>(); - // better for C@10 > 99% - LOG(INFO) << "regular query (good for C@10)"; - m_ggnn.queryLayer<32, 400, 448, 64>(); - // expensive, can get to 99.99% C@10 - // m_ggnn.queryLayer<128, 2000, 2048, 256>(); - }; - - if (FLAGS_grid_search) { - LOG(INFO) << "--"; - LOG(INFO) << "grid-search:"; - for (int i = 0; i <= 140; ++i) query_function(i * 0.01f); - } else { // by default, just execute a few queries - LOG(INFO) << "--"; - LOG(INFO) << "90, 95, 99% R@1, 99% C@10 (using -tau 0.5 " - "-refinement_iterations 2):"; - query_function(0.34f); - query_function(0.41f); - query_function(0.51f); - query_function(0.64f); - } - - printf("done! \n"); - gflags::ShutDownCommandLineFlags(); - return 0; -} diff --git a/src/sift1m_multi_gpu.cu b/src/sift1m_multi_gpu.cu deleted file mode 100644 index 136ddbf..0000000 --- a/src/sift1m_multi_gpu.cu +++ /dev/null @@ -1,173 +0,0 @@ -/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
Lensch -// - -#ifndef CUDA_API_PER_THREAD_DEFAULT_STREAM -#define CUDA_API_PER_THREAD_DEFAULT_STREAM -#endif - -#include -#include -#include -#include -#include -#include -#include - -#include - -// only needed for file_exists check -#include - -inline bool file_exists(const std::string& name) { - struct stat buffer; - return (stat(name.c_str(), &buffer) == 0); -} - -#include -#include - -#include "ggnn/cuda_knn_ggnn_multi_gpu.cuh" -#include "ggnn/utils/cuda_knn_constants.cuh" - -DEFINE_string( - mode, "bq", - "Mode: bq -> build_and_query, bs -> build_and_store, lq -> load_and_query"); -DEFINE_string(base_filename, "", "path to file with base vectors"); -DEFINE_string(query_filename, "", "path to file with perform_query vectors"); -DEFINE_string(groundtruth_filename, "", - "path to file with groundtruth vectors"); -DEFINE_string(graph_dir, "./", "directory to store and load ggnn graph files."); -DEFINE_double(tau, 0.5, "Parameter tau"); -DEFINE_int32(factor, 1000000, "Factor"); -DEFINE_int32(base, 1, "N_base: base x factor"); -DEFINE_int32(shard, 1, "N_shard: shard x factor"); -DEFINE_int32(refinement_iterations, 2, "Number of refinement iterations"); -DEFINE_string(gpu_ids, "0", "GPU id"); -DEFINE_bool(grid_search, false, - "Perform queries for a wide range of parameters."); - -int main(int argc, char* argv[]) { - google::InitGoogleLogging(argv[0]); - google::LogToStderr(); - - gflags::SetUsageMessage( - "GGNN: Graph-based GPU Nearest Neighbor Search\n" - "by Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. " - "Lensch\n" - "(c) 2020 Computer Graphics University of Tuebingen"); - gflags::SetVersionString("1.0.0"); - google::ParseCommandLineFlags(&argc, &argv, true); - - LOG(INFO) << "Reading files"; - CHECK(file_exists(FLAGS_base_filename)) - << "File for base vectors has to exist"; - CHECK(file_exists(FLAGS_query_filename)) - << "File for perform_query vectors has to exist"; - CHECK(file_exists(FLAGS_groundtruth_filename)) - << "File for groundtruth vectors has to exist"; - - CHECK_GE(FLAGS_tau, 0) << "Tau has to be bigger or equal 0."; - CHECK_GE(FLAGS_refinement_iterations, 0) - << "The number of refinement iterations has to be non-negative."; - - // #################################################################### - // compile-time configuration - // - // data types - // - /// data type for addressing points (needs to be able to represent N) - using KeyT = int32_t; - /// data type of the dataset (e.g., char, int, float) - using BaseT = float; - /// data type of computed distances - using ValueT = float; - /// data type for addressing base-vectors (needs to be able to represent N*D) - using BAddrT = uint32_t; - /// data type for addressing the graph (needs to be able to represent - /// N*KBuild) - using GAddrT = uint32_t; - // - // dataset configuration (here: SIFT1M) - // - /// dimension of the dataset - const int D = 128; - /// distance measure (Euclidean or Cosine) - const DistanceMeasure measure = Euclidean; - // - // search-graph configuration - // - /// number of neighbors per point in the graph - const int KBuild = 24; - /// maximum number of inverse/symmetric links (KBuild / 2 usually works best) - const int KF = KBuild / 2; - /// segment/batch size (needs to be > KBuild-KF) - const int S = 32; - /// graph height / number of layers (4 usually performs best) - const int L = 4; - // - // query configuration - // - /// number of neighbors to search for - const int KQuery = 10; - - static_assert(KBuild - KF < S, - "there are not enough points to fill the local 
neighbor list!"); - - LOG(INFO) << "Using the following parameters " << KBuild << " (KBuild) " << KF - << " (KF) " << S << " (S) " << L << " (L) " << D << " (D) "; - - std::istringstream iss(FLAGS_gpu_ids); - std::vector results(std::istream_iterator{iss}, - std::istream_iterator()); - - int numGpus; - cudaGetDeviceCount(&numGpus); - - std::vector gpus; - for (auto&& r : results) { - int gpu_id = atoi(r.c_str()); - printf("GPU %d: ", gpu_id); - { - CHECK_GE(gpu_id, 0) << "This GPU does not exist"; - CHECK_LT(gpu_id, numGpus) << "This GPU does not exist"; - - cudaDeviceProp prop; - cudaGetDeviceProperties(&prop, gpu_id); - printf("Found device name: %s\n", prop.name); - - gpus.push_back(gpu_id); - } - } - - const size_t N_base = FLAGS_base * FLAGS_factor; - const int N_shard = FLAGS_shard * FLAGS_factor; - - typedef GGNNMultiGPU - GGNN; - GGNN ggnn{ - FLAGS_base_filename, - FLAGS_query_filename, - file_exists(FLAGS_groundtruth_filename) ? FLAGS_groundtruth_filename : "", - L, - static_cast(FLAGS_tau), - N_base}; - - ggnn.ggnnMain(gpus, FLAGS_mode, N_shard, FLAGS_graph_dir, - FLAGS_refinement_iterations, FLAGS_grid_search); - - printf("done! \n"); - gflags::ShutDownCommandLineFlags(); - return 0; -} diff --git a/src/sift1m_no_slack_query.cu b/src/sift1m_no_slack_query.cu deleted file mode 100644 index 0e166ae..0000000 --- a/src/sift1m_no_slack_query.cu +++ /dev/null @@ -1,181 +0,0 @@ -/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch -// -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "ggnn/cuda_knn_ggnn.cuh" -#include "ggnn/utils/cuda_knn_constants.cuh" -#include "ggnn/utils/cuda_knn_dataset.cuh" -#include "ggnn/utils/cuda_knn_utils.cuh" - -DEFINE_string(base_filename, "", "path to file with base vectors"); -DEFINE_string(query_filename, "", "path to file with perform_query vectors"); -DEFINE_string(groundtruth_filename, "", - "path to file with groundtruth vectors"); -DEFINE_string(graph_filename, "", - "path to file that contains the serialized graph"); -DEFINE_double(tau, 0.5, "Parameter tau"); -DEFINE_int32(refinement_iterations, 2, "Number of refinement iterations"); -DEFINE_int32(gpu_id, 0, "GPU id"); -DEFINE_bool(grid_search, false, - "Perform queries for a wide range of parameters."); - -int main(int argc, char* argv[]) { - google::InitGoogleLogging(argv[0]); - google::LogToStderr(); - - gflags::SetUsageMessage( - "GGNN: Graph-based GPU Nearest Neighbor Search\n" - "by Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
" - "Lensch\n" - "(c) 2020 Computer Graphics University of Tuebingen"); - gflags::SetVersionString("1.0.0"); - google::ParseCommandLineFlags(&argc, &argv, true); - - LOG(INFO) << "Reading files"; - CHECK(file_exists(FLAGS_base_filename)) - << "File for base vectors has to exist"; - CHECK(file_exists(FLAGS_query_filename)) - << "File for perform_query vectors has to exist"; - - CHECK_GE(FLAGS_tau, 0) << "Tau has to be bigger or equal 0."; - CHECK_GE(FLAGS_refinement_iterations, 0) - << "The number of refinement iterations has to be non-negative."; - - // #################################################################### - // compile-time configuration - // - // data types - // - /// data type for addressing points (needs to be able to represent N) - using KeyT = int32_t; - /// data type of the dataset (e.g., char, int, float) - using BaseT = float; - /// data type of computed distances - using ValueT = float; - /// data type for addressing base-vectors (needs to be able to represent N*D) - using BAddrT = uint32_t; - /// data type for addressing the graph (needs to be able to represent - /// N*KBuild) - using GAddrT = uint32_t; - // - // dataset configuration (here: SIFT1M) - // - /// dimension of the dataset - const int D = 128; - /// distance measure (Euclidean or Cosine) - const DistanceMeasure measure = Euclidean; - // - // search-graph configuration - // - /// number of neighbors per point in the graph - const int KBuild = 24; - /// maximum number of inverse/symmetric links (KBuild / 2 usually works best) - const int KF = KBuild / 2; - /// segment/batch size (needs to be > KBuild-KF) - const int S = 32; - /// graph height / number of layers (4 usually performs best) - const int L = 4; - // - // query configuration - // - /// number of neighbors to search for - const int KQuery = 10; - - static_assert(KBuild - KF < S, - "there are not enough points to fill the local neighbor list!"); - - LOG(INFO) << "Using the following parameters " << KBuild << " (KBuild) " << KF - << " (KF) " << S << " (S) " << L << " (L) " << D << " (D) "; - - // Set the requested GPU id, if possible. 
- { - int numGpus; - cudaGetDeviceCount(&numGpus); - CHECK_GE(FLAGS_gpu_id, 0) << "This GPU does not exist"; - CHECK_LT(FLAGS_gpu_id, numGpus) << "This GPU does not exist"; - - cudaDeviceProp prop; - cudaGetDeviceProperties(&prop, FLAGS_gpu_id); - LOG(INFO) << "device name: " << prop.name; - } - cudaSetDevice(FLAGS_gpu_id); - - typedef GGNN - GGNN; - GGNN m_ggnn{FLAGS_base_filename, FLAGS_query_filename, - FLAGS_groundtruth_filename, L, static_cast(FLAGS_tau)}; - - m_ggnn.ggnnMain(FLAGS_graph_filename, FLAGS_refinement_iterations); - - const float tau_query = 0.0f; - cudaMemcpyToSymbol(c_tau_query, &tau_query, sizeof(float)); - -#define QUERY_BEST_SIZE(best_size) \ - { \ - LOG(INFO) << "Query with best size " << best_size; \ - m_ggnn.noSlackQueryLayer<32, 400, 448 + best_size - 10, \ - 64 + best_size - 10, best_size>(); \ - } - QUERY_BEST_SIZE(10); - QUERY_BEST_SIZE(20); - QUERY_BEST_SIZE(30); - QUERY_BEST_SIZE(40); - QUERY_BEST_SIZE(50); - QUERY_BEST_SIZE(60); - QUERY_BEST_SIZE(70); - QUERY_BEST_SIZE(80); - QUERY_BEST_SIZE(90); - QUERY_BEST_SIZE(100); - QUERY_BEST_SIZE(110); - QUERY_BEST_SIZE(120); - QUERY_BEST_SIZE(130); - QUERY_BEST_SIZE(140); - QUERY_BEST_SIZE(150); - QUERY_BEST_SIZE(160); - QUERY_BEST_SIZE(170); - QUERY_BEST_SIZE(180); - QUERY_BEST_SIZE(190); - QUERY_BEST_SIZE(200); - QUERY_BEST_SIZE(220); - QUERY_BEST_SIZE(240); - QUERY_BEST_SIZE(260); - QUERY_BEST_SIZE(280); - QUERY_BEST_SIZE(300); - QUERY_BEST_SIZE(320); - QUERY_BEST_SIZE(340); - QUERY_BEST_SIZE(360); - QUERY_BEST_SIZE(380); - QUERY_BEST_SIZE(400); - QUERY_BEST_SIZE(450); - QUERY_BEST_SIZE(500); - QUERY_BEST_SIZE(550); - QUERY_BEST_SIZE(600); - QUERY_BEST_SIZE(700); - QUERY_BEST_SIZE(800); - - printf("done! \n"); - gflags::ShutDownCommandLineFlags(); - return 0; -} diff --git a/src/sift1m_stats.cu b/src/sift1m_stats.cu deleted file mode 100644 index b40ad26..0000000 --- a/src/sift1m_stats.cu +++ /dev/null @@ -1,157 +0,0 @@ -/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
Lensch -// -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "ggnn/cuda_knn_ggnn.cuh" -#include "ggnn/utils/cuda_knn_constants.cuh" -#include "ggnn/utils/cuda_knn_dataset.cuh" -#include "ggnn/utils/cuda_knn_utils.cuh" - -DEFINE_string(base_filename, "", "path to file with base vectors"); -DEFINE_string(query_filename, "", "path to file with perform_query vectors"); -DEFINE_string(groundtruth_filename, "", - "path to file with groundtruth vectors"); -DEFINE_string(graph_filename, "", - "path to file that contains the serialized graph"); -DEFINE_double(tau, 0.5, "Parameter tau"); -DEFINE_int32(refinement_iterations, 2, "Number of refinement iterations"); -DEFINE_int32(gpu_id, 0, "GPU id"); -DEFINE_bool(grid_search, false, - "Perform queries for a wide range of parameters."); - -int main(int argc, char* argv[]) { - google::InitGoogleLogging(argv[0]); - google::LogToStderr(); - - gflags::SetUsageMessage( - "GGNN: Graph-based GPU Nearest Neighbor Search\n" - "by Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. " - "Lensch\n" - "(c) 2020 Computer Graphics University of Tuebingen"); - gflags::SetVersionString("1.0.0"); - google::ParseCommandLineFlags(&argc, &argv, true); - - LOG(INFO) << "Reading files"; - CHECK(file_exists(FLAGS_base_filename)) - << "File for base vectors has to exist"; - CHECK(file_exists(FLAGS_query_filename)) - << "File for perform_query vectors has to exist"; - - CHECK_GE(FLAGS_tau, 0) << "Tau has to be bigger or equal 0."; - CHECK_GE(FLAGS_refinement_iterations, 0) - << "The number of refinement iterations has to be non-negative."; - - // #################################################################### - // compile-time configuration - // - // data types - // - /// data type for addressing points (needs to be able to represent N) - using KeyT = int32_t; - /// data type of the dataset (e.g., char, int, float) - using BaseT = float; - /// data type of computed distances - using ValueT = float; - /// data type for addressing base-vectors (needs to be able to represent N*D) - using BAddrT = uint32_t; - /// data type for addressing the graph (needs to be able to represent - /// N*KBuild) - using GAddrT = uint32_t; - // - // dataset configuration (here: SIFT1M) - // - /// dimension of the dataset - const int D = 128; - /// distance measure (Euclidean or Cosine) - const DistanceMeasure measure = Euclidean; - // - // search-graph configuration - // - /// number of neighbors per point in the graph - const int KBuild = 24; - /// maximum number of inverse/symmetric links (KBuild / 2 usually works best) - const int KF = KBuild / 2; - /// segment/batch size (needs to be > KBuild-KF) - const int S = 32; - /// graph height / number of layers (4 usually performs best) - const int L = 4; - // - // query configuration - // - /// number of neighbors to search for - const int KQuery = 10; - - static_assert(KBuild - KF < S, - "there are not enough points to fill the local neighbor list!"); - - LOG(INFO) << "Using the following parameters " << KBuild << " (KBuild) " << KF - << " (KF) " << S << " (S) " << L << " (L) " << D << " (D) "; - - // Set the requested GPU id, if possible. 
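[editor's note: illustrative sketch, not part of this patch]
Like the other deleted benchmarks, sift1m_stats.cu feeds the query slack
factor to the GPU through a __constant__ symbol before launching the (here:
debug) query kernel. The underlying CUDA pattern, for reference; the real
declaration of c_tau_query lives in ggnn/utils/cuda_knn_constants.cuh:

    // Device-side constant, visible to all subsequently launched kernels.
    __constant__ float c_tau_query;

    // Host-side update; must complete before the next query kernel launch.
    float tau_query = 0.51f;
    cudaMemcpyToSymbol(c_tau_query, &tau_query, sizeof(float));

[end of editor's note]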
- { - int numGpus; - cudaGetDeviceCount(&numGpus); - CHECK_GE(FLAGS_gpu_id, 0) << "This GPU does not exist"; - CHECK_LT(FLAGS_gpu_id, numGpus) << "This GPU does not exist"; - - cudaDeviceProp prop; - cudaGetDeviceProperties(&prop, FLAGS_gpu_id); - LOG(INFO) << "device name: " << prop.name; - } - cudaSetDevice(FLAGS_gpu_id); - - typedef GGNN - GGNN; - GGNN m_ggnn{FLAGS_base_filename, FLAGS_query_filename, - FLAGS_groundtruth_filename, L, static_cast(FLAGS_tau)}; - - m_ggnn.ggnnMain(FLAGS_graph_filename, FLAGS_refinement_iterations); - - auto query_function = [&m_ggnn](const float tau_query) { - cudaMemcpyToSymbol(c_tau_query, &tau_query, sizeof(float)); - LOG(INFO) << "--"; - LOG(INFO) << "Query with tau_query " << tau_query; - m_ggnn.queryLayerDebug<32, 400, 448, 64>(); - }; - - if (FLAGS_grid_search) { - LOG(INFO) << "--"; - LOG(INFO) << "grid-search:"; - for (int i = 0; i < 70; ++i) query_function(i * 0.01f); - for (int i = 7; i <= 20; ++i) query_function(i * 0.1f); - } else { // by default, just execute a few queries - LOG(INFO) << "--"; - LOG(INFO) << "90, 95, 99% R@1, 99% C@10 (using -tau 0.5 " - "-refinement_iterations 2):"; - // query_function(0.34f); - // query_function(0.41f); - query_function(0.51f); - // query_function(0.64f); - } - - printf("done! \n"); - gflags::ShutDownCommandLineFlags(); - return 0; -} diff --git a/src/sift1m_subsets.cu b/src/sift1m_subsets.cu deleted file mode 100644 index 7120d3a..0000000 --- a/src/sift1m_subsets.cu +++ /dev/null @@ -1,214 +0,0 @@ -/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. Lensch -// -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "ggnn/cuda_knn_ggnn.cuh" -#include "ggnn/utils/cuda_knn_constants.cuh" -#include "ggnn/utils/cuda_knn_dataset.cuh" -#include "ggnn/utils/cuda_knn_utils.cuh" - -DEFINE_string(base_filename, "", "path to file with base vectors"); -DEFINE_string(query_filename, "", "path to file with perform_query vectors"); -DEFINE_double(tau, 0.5, "Parameter tau"); -DEFINE_int32(refinement_iterations, 2, "Number of refinement iterations"); -DEFINE_int32(gpu_id, 0, "GPU id"); - -int main(int argc, char* argv[]) { - google::InitGoogleLogging(argv[0]); - google::LogToStderr(); - - gflags::SetUsageMessage( - "GGNN: Graph-based GPU Nearest Neighbor Search\n" - "by Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
" - "Lensch\n" - "(c) 2020 Computer Graphics University of Tuebingen"); - gflags::SetVersionString("1.0.0"); - google::ParseCommandLineFlags(&argc, &argv, true); - - LOG(INFO) << "Reading files"; - CHECK(file_exists(FLAGS_base_filename)) - << "File for base vectors has to exist"; - CHECK(file_exists(FLAGS_query_filename)) - << "File for perform_query vectors has to exist"; - - CHECK_GE(FLAGS_tau, 0) << "Tau has to be bigger or equal 0."; - CHECK_GE(FLAGS_refinement_iterations, 0) - << "The number of refinement iterations has to be non-negative."; - - // #################################################################### - // compile-time configuration - // - // data types - // - /// data type for addressing points (needs to be able to represent N) - using KeyT = int32_t; - /// data type of the dataset (e.g., char, int, float) - using BaseT = float; - /// data type of computed distances - using ValueT = float; - /// data type for addressing base-vectors (needs to be able to represent N*D) - using BAddrT = uint32_t; - /// data type for addressing the graph (needs to be able to represent - /// N*KBuild) - using GAddrT = uint32_t; - // - // dataset configuration (here: SIFT1M) - // - /// dimension of the dataset - const int D = 128; - /// distance measure (Euclidean or Cosine) - const DistanceMeasure measure = Euclidean; - // - // search-graph configuration - // - /// number of neighbors per point in the graph - const int KBuild = 24; - /// maximum number of inverse/symmetric links (KBuild / 2 usually works best) - const int KF = KBuild / 2; - /// segment/batch size (needs to be > KBuild-KF) - const int S = 32; - /// graph height / number of layers (4 usually performs best) - const int L = 4; - // - // query configuration - // - /// number of neighbors to search for - const int KQuery = 10; - - static_assert(KBuild - KF < S, - "there are not enough points to fill the local neighbor list!"); - - LOG(INFO) << "Using the following parameters " << KBuild << " (KBuild) " << KF - << " (KF) " << S << " (S) " << L << " (L) " << D << " (D) "; - - // Set the requested GPU id, if possible. - { - int numGpus; - cudaGetDeviceCount(&numGpus); - CHECK_GE(FLAGS_gpu_id, 0) << "This GPU does not exist"; - CHECK_LT(FLAGS_gpu_id, numGpus) << "This GPU does not exist"; - } - cudaSetDevice(FLAGS_gpu_id); - - typedef GGNN - GGNN; - GGNN m_ggnn{FLAGS_base_filename, FLAGS_query_filename, "", L, - static_cast(FLAGS_tau)}; - - for (KeyT n = 50000; n <= m_ggnn.dataset.N_base; n += 50000) { - LOG(INFO) << "Constructing graph for " << n << " points."; - m_ggnn.reinit_graph_for_subset(n); - m_ggnn.generateGTUsingBF(); - - { - std::vector construction_times; - - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - - LOG(INFO) << "Starting Graph construction... 
(tau=" << FLAGS_tau << ")"; - - cudaEventRecord(start); - m_ggnn.build(); - cudaEventRecord(stop); - - CHECK_CUDA(cudaPeekAtLastError()); - CHECK_CUDA(cudaDeviceSynchronize()); - CHECK_CUDA(cudaPeekAtLastError()); - - cudaEventSynchronize(stop); - float milliseconds = 0; - cudaEventElapsedTime(&milliseconds, start, stop); - construction_times.push_back(milliseconds); - - for (int refinement_step = 0; - refinement_step < FLAGS_refinement_iterations; ++refinement_step) { - DLOG(INFO) << "Refinement step " << refinement_step; - m_ggnn.refine(); - - cudaEventRecord(stop); - CHECK_CUDA(cudaPeekAtLastError()); - CHECK_CUDA(cudaDeviceSynchronize()); - CHECK_CUDA(cudaPeekAtLastError()); - cudaEventSynchronize(stop); - - float elapsed_milliseconds = 0; - cudaEventElapsedTime(&elapsed_milliseconds, start, stop); - construction_times.push_back(elapsed_milliseconds); - } - cudaEventDestroy(start); - cudaEventDestroy(stop); - - for (int refinement_step = 0; refinement_step < construction_times.size(); - refinement_step++) { - const float elapsed_milliseconds = construction_times[refinement_step]; - const float elapsed_seconds = elapsed_milliseconds / 1000.0f; - const int number_of_points = m_ggnn.ggnn_gpu_instance.N_shard; - - LOG(INFO) << "Graph construction + " << refinement_step - << " refinement step(s)"; - LOG(INFO) << " -- secs: " << elapsed_seconds; - LOG(INFO) << " -- points: " << number_of_points; - LOG(INFO) << " -- ms/point: " - << elapsed_milliseconds / number_of_points; - } - } - - { - CHECK_CUDA(cudaPeekAtLastError()); - CHECK_CUDA(cudaDeviceSynchronize()); - CHECK_CUDA(cudaPeekAtLastError()); - - auto query_function = [&m_ggnn](const float tau_query) { - cudaMemcpyToSymbol(c_tau_query, &tau_query, sizeof(float)); - LOG(INFO) << "--"; - LOG(INFO) << "Query with tau_query " << tau_query; - // faster for C@1 = 99% - // LOG(INFO) << "fast query (good for C@1)"; - // m_ggnn.queryLayer<32, 200, 256, 64>(); - // better for C@10 > 99% - LOG(INFO) << "regular query (good for C@10)"; - m_ggnn.queryLayer<32, 400, 448, 64>(); - // expensive, can get to 99.99% C@10 - // m_ggnn.queryLayer<128, 2000, 2048, 256>(); - }; - - { // by default, just execute a few queries - LOG(INFO) << "--"; - LOG(INFO) << "90, 95, 99% R@1, 99% C@10 (using -tau 0.5 " - "-refinement_iterations 2):"; - query_function(0.34f); - query_function(0.41f); - query_function(0.51f); - query_function(0.64f); - } - } - } - - printf("done! \n"); - gflags::ShutDownCommandLineFlags(); - return 0; -} diff --git a/src/sift1m_top10.cu b/src/sift1m_top10.cu deleted file mode 100644 index e8f583d..0000000 --- a/src/sift1m_top10.cu +++ /dev/null @@ -1,165 +0,0 @@ -/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
Lensch -// -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "ggnn/cuda_knn_ggnn.cuh" -#include "ggnn/utils/cuda_knn_constants.cuh" -#include "ggnn/utils/cuda_knn_dataset.cuh" -#include "ggnn/utils/cuda_knn_utils.cuh" - -DEFINE_string(base_filename, "", "path to file with base vectors"); -DEFINE_string(query_filename, "", "path to file with perform_query vectors"); -DEFINE_string(groundtruth_filename, "", - "path to file with groundtruth vectors"); -DEFINE_string(graph_filename, "", - "path to file that contains the serialized graph"); -DEFINE_double(tau, 0.5, "Parameter tau"); -DEFINE_int32(refinement_iterations, 2, "Number of refinement iterations"); -DEFINE_int32(gpu_id, 0, "GPU id"); -DEFINE_bool(grid_search, false, - "Perform queries for a wide range of parameters."); - -int main(int argc, char* argv[]) { - google::InitGoogleLogging(argv[0]); - google::LogToStderr(); - - gflags::SetUsageMessage( - "GGNN: Graph-based GPU Nearest Neighbor Search\n" - "by Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. " - "Lensch\n" - "(c) 2020 Computer Graphics University of Tuebingen"); - gflags::SetVersionString("1.0.0"); - google::ParseCommandLineFlags(&argc, &argv, true); - - LOG(INFO) << "Reading files"; - CHECK(file_exists(FLAGS_base_filename)) - << "File for base vectors has to exist"; - CHECK(file_exists(FLAGS_query_filename)) - << "File for perform_query vectors has to exist"; - - CHECK_GE(FLAGS_tau, 0) << "Tau has to be bigger or equal 0."; - CHECK_GE(FLAGS_refinement_iterations, 0) - << "The number of refinement iterations has to be non-negative."; - - // #################################################################### - // compile-time configuration - // - // data types - // - /// data type for addressing points (needs to be able to represent N) - using KeyT = int32_t; - /// data type of the dataset (e.g., char, int, float) - using BaseT = float; - /// data type of computed distances - using ValueT = float; - /// data type for addressing base-vectors (needs to be able to represent N*D) - using BAddrT = uint32_t; - /// data type for addressing the graph (needs to be able to represent - /// N*KBuild) - using GAddrT = uint32_t; - // - // dataset configuration (here: SIFT1M) - // - /// dimension of the dataset - const int D = 128; - /// distance measure (Euclidean or Cosine) - const DistanceMeasure measure = Euclidean; - // - // search-graph configuration - // - /// number of neighbors per point in the graph - const int KBuild = 40; - /// maximum number of inverse/symmetric links (KBuild / 2 usually works best) - const int KF = KBuild / 2; - /// segment/batch size (needs to be > KBuild-KF) - const int S = 32; - /// graph height / number of layers (4 usually performs best) - const int L = 4; - // - // query configuration - // - /// number of neighbors to search for - const int KQuery = 10; - - static_assert(KBuild - KF < S, - "there are not enough points to fill the local neighbor list!"); - - LOG(INFO) << "Using the following parameters " << KBuild << " (KBuild) " << KF - << " (KF) " << S << " (S) " << L << " (L) " << D << " (D) "; - - // Set the requested GPU id, if possible. 
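[editor's note: illustrative sketch, not part of this patch]
The three queryLayer presets in the deleted sift1m_top10.cu trade speed for
accuracy. Judging from the values used across these files, the four template
arguments appear to be threads per block, maximum search iterations,
visited-cache size and sorted best-list size; this reading is an assumption,
not confirmed by the patch itself:

    // Assumed meaning:
    // queryLayer<BLOCK_DIM_X, MAX_ITERATIONS, CACHE_SIZE, SORTED_SIZE>()
    m_ggnn.queryLayer<32, 200, 256, 64>();    // fast preset, good for C@1
    m_ggnn.queryLayer<32, 400, 448, 64>();    // regular preset, good for C@10
    // m_ggnn.queryLayer<128, 2000, 2048, 256>();  // expensive, ~99.99% C@10

[end of editor's note]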
- { - int numGpus; - cudaGetDeviceCount(&numGpus); - CHECK_GE(FLAGS_gpu_id, 0) << "This GPU does not exist"; - CHECK_LT(FLAGS_gpu_id, numGpus) << "This GPU does not exist"; - - cudaDeviceProp prop; - cudaGetDeviceProperties(&prop, FLAGS_gpu_id); - LOG(INFO) << "device name: " << prop.name; - } - cudaSetDevice(FLAGS_gpu_id); - - typedef GGNN - GGNN; - GGNN m_ggnn{FLAGS_base_filename, FLAGS_query_filename, - FLAGS_groundtruth_filename, L, static_cast(FLAGS_tau)}; - - m_ggnn.ggnnMain(FLAGS_graph_filename, FLAGS_refinement_iterations); - - auto query_function = [&m_ggnn](const float tau_query) { - cudaMemcpyToSymbol(c_tau_query, &tau_query, sizeof(float)); - LOG(INFO) << "--"; - LOG(INFO) << "Query with tau_query " << tau_query; - // faster for C@1 = 99% - LOG(INFO) << "fast query (good for C@1)"; - m_ggnn.queryLayer<32, 200, 256, 64>(); - // better for C@10 > 99% - LOG(INFO) << "regular query (good for C@10)"; - m_ggnn.queryLayer<32, 400, 448, 64>(); - // m_ggnn.queryLayer<64, 400, 448, 64>(); - // expensive, can get to 99.99% C@10 - // m_ggnn.queryLayer<128, 2000, 2048, 256>(); - }; - - if (FLAGS_grid_search) { - LOG(INFO) << "--"; - LOG(INFO) << "grid-search:"; - for (int i = 0; i < 70; ++i) query_function(i * 0.01f); - for (int i = 7; i <= 20; ++i) query_function(i * 0.1f); - } else { // by default, just execute a few queries - LOG(INFO) << "--"; - LOG(INFO) << "90, 95, 99% R@1, 99% C@10 (using -tau 0.5 " - "-refinement_iterations 2):"; - query_function(0.34f); - query_function(0.41f); - query_function(0.51f); - query_function(0.64f); - } - - printf("done! \n"); - gflags::ShutDownCommandLineFlags(); - return 0; -} diff --git a/src/sift1m_top100.cu b/src/sift1m_top100.cu deleted file mode 100644 index 4825a87..0000000 --- a/src/sift1m_top100.cu +++ /dev/null @@ -1,166 +0,0 @@ -/* Copyright 2019 ComputerGraphics Tuebingen. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -// Authors: Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. 
Lensch -// -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "ggnn/cuda_knn_ggnn.cuh" -#include "ggnn/utils/cuda_knn_constants.cuh" -#include "ggnn/utils/cuda_knn_dataset.cuh" -#include "ggnn/utils/cuda_knn_utils.cuh" - -DEFINE_string(base_filename, "", "path to file with base vectors"); -DEFINE_string(query_filename, "", "path to file with perform_query vectors"); -DEFINE_string(groundtruth_filename, "", - "path to file with groundtruth vectors"); -DEFINE_string(graph_filename, "", - "path to file that contains the serialized graph"); -DEFINE_double(tau, 0.5, "Parameter tau"); -DEFINE_int32(refinement_iterations, 2, "Number of refinement iterations"); -DEFINE_int32(gpu_id, 0, "GPU id"); -DEFINE_bool(grid_search, false, - "Perform queries for a wide range of parameters."); - -int main(int argc, char* argv[]) { - google::InitGoogleLogging(argv[0]); - google::LogToStderr(); - - gflags::SetUsageMessage( - "GGNN: Graph-based GPU Nearest Neighbor Search\n" - "by Fabian Groh, Lukas Ruppert, Patrick Wieschollek, Hendrik P.A. " - "Lensch\n" - "(c) 2020 Computer Graphics University of Tuebingen"); - gflags::SetVersionString("1.0.0"); - google::ParseCommandLineFlags(&argc, &argv, true); - - LOG(INFO) << "Reading files"; - CHECK(file_exists(FLAGS_base_filename)) - << "File for base vectors has to exist"; - CHECK(file_exists(FLAGS_query_filename)) - << "File for perform_query vectors has to exist"; - - CHECK_GE(FLAGS_tau, 0) << "Tau has to be bigger or equal 0."; - CHECK_GE(FLAGS_refinement_iterations, 0) - << "The number of refinement iterations has to be non-negative."; - - // #################################################################### - // compile-time configuration - // - // data types - // - /// data type for addressing points (needs to be able to represent N) - using KeyT = int32_t; - /// data type of the dataset (e.g., char, int, float) - using BaseT = float; - /// data type of computed distances - using ValueT = float; - /// data type for addressing base-vectors (needs to be able to represent N*D) - using BAddrT = uint32_t; - /// data type for addressing the graph (needs to be able to represent - /// N*KBuild) - using GAddrT = uint32_t; - // - // dataset configuration (here: SIFT1M) - // - /// dimension of the dataset - const int D = 128; - /// distance measure (Euclidean or Cosine) - const DistanceMeasure measure = Euclidean; - // - // search-graph configuration - // - /// number of neighbors per point in the graph - const int KBuild = 40; - /// maximum number of inverse/symmetric links (KBuild / 2 usually works best) - const int KF = KBuild / 2; - /// segment/batch size (needs to be > KBuild-KF) - const int S = 32; - /// graph height / number of layers (4 usually performs best) - const int L = 4; - // - // query configuration - // - /// number of neighbors to search for - const int KQuery = 100; - - static_assert(KBuild - KF < S, - "there are not enough points to fill the local neighbor list!"); - - LOG(INFO) << "Using the following parameters " << KBuild << " (KBuild) " << KF - << " (KF) " << S << " (S) " << L << " (L) " << D << " (D) "; - - // Set the requested GPU id, if possible. 
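[editor's note: illustrative sketch, not part of this patch]
The deleted sift1m_top100.cu raises KQuery to 100 and correspondingly uses 128
rather than 64 as the last queryLayer template argument, presumably because
the sorted best-list must hold at least KQuery results. A guard expressing
that assumed invariant:

    // Assumed invariant: the sorted list must fit the requested neighbors.
    constexpr int SORTED_SIZE = 128;  // last template argument of queryLayer
    static_assert(SORTED_SIZE >= KQuery,
                  "sorted best-list too small to return KQuery neighbors");

[end of editor's note]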
- { - int numGpus; - cudaGetDeviceCount(&numGpus); - CHECK_GE(FLAGS_gpu_id, 0) << "This GPU does not exist"; - CHECK_LT(FLAGS_gpu_id, numGpus) << "This GPU does not exist"; - - cudaDeviceProp prop; - cudaGetDeviceProperties(&prop, FLAGS_gpu_id); - LOG(INFO) << "device name: " << prop.name; - } - cudaSetDevice(FLAGS_gpu_id); - - typedef GGNN - GGNN; - GGNN m_ggnn{FLAGS_base_filename, FLAGS_query_filename, - FLAGS_groundtruth_filename, L, static_cast(FLAGS_tau)}; - - m_ggnn.ggnnMain(FLAGS_graph_filename, FLAGS_refinement_iterations); - - auto query_function = [&m_ggnn](const float tau_query) { - cudaMemcpyToSymbol(c_tau_query, &tau_query, sizeof(float)); - LOG(INFO) << "--"; - LOG(INFO) << "Query with tau_query " << tau_query; - // faster for C@1 = 99% - // LOG(INFO) << "fast query (good for C@1)"; - // m_ggnn.queryLayer<32, 200, 256, 128>(); - // better for C@10 > 99% - LOG(INFO) << "regular query (good for C@10)"; - m_ggnn.queryLayer<32, 400, 448, 128>(); - LOG(INFO) << "extended query (good for C@100)"; - m_ggnn.queryLayer<64, 1000, 1024, 128>(); - // expensive, can get to 99.99% C@10 - // LOG(INFO) << "expensive query"; - // m_ggnn.queryLayer<128, 2000, 2048, 256>(); - }; - - if (FLAGS_grid_search) { - LOG(INFO) << "--"; - LOG(INFO) << "grid-search:"; - for (int i = 0; i <= 100; ++i) query_function(i * 0.01f); - } else { // by default, just execute a few queries - LOG(INFO) << "--"; - LOG(INFO) << "90, 95, 99% R@1, 99% C@10 (using -tau 0.5 " - "-refinement_iterations 2):"; - query_function(0.34f); - query_function(0.41f); - query_function(0.51f); - query_function(0.64f); - } - - printf("done! \n"); - gflags::ShutDownCommandLineFlags(); - return 0; -}
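[editor's note: illustrative sketch, not part of this patch]
Every benchmark deleted in this patch repeats the same cudaEvent_t timing
pattern around build(), refine() and the query kernels. A small RAII helper
(sketch, CUDA runtime API only; not part of the repository) would capture it:

    #include <cuda_runtime.h>

    // Measures elapsed GPU time between construction and elapsed_ms().
    struct CudaTimer {
      cudaEvent_t start{}, stop{};
      CudaTimer() {
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        cudaEventRecord(start);
      }
      float elapsed_ms() {
        cudaEventRecord(stop);
        cudaEventSynchronize(stop);
        float ms = 0.0f;
        cudaEventElapsedTime(&ms, start, stop);
        return ms;
      }
      ~CudaTimer() {
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
      }
    };

    // usage: CudaTimer t; m_ggnn.build(); LOG(INFO) << t.elapsed_ms();

[end of editor's note]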