Skip to content
22 changes: 14 additions & 8 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
# such behavior.
# Contact for this feature: gopalrs.


# Some variables like MSVC are defined only after project(), so put that first.
cmake_minimum_required(VERSION 3.15)
project(diskann)
Expand Down Expand Up @@ -52,6 +53,9 @@ endif()

include_directories(${PROJECT_SOURCE_DIR}/include)

if(NOT PYBIND)
set(DISKANN_RELEASE_UNUSED_TCMALLOC_MEMORY_AT_CHECKPOINTS ON)
endif()
# It's necessary to include tcmalloc headers only if calling into MallocExtension interface.
# For using tcmalloc in DiskANN tools, it's enough to just link with tcmalloc.
if (DISKANN_RELEASE_UNUSED_TCMALLOC_MEMORY_AT_CHECKPOINTS)
Expand Down Expand Up @@ -92,7 +96,9 @@ if (MSVC)
set(Boost_USE_STATIC_LIBS ON)
endif()

find_package(Boost COMPONENTS program_options)
if(NOT MSVC)
find_package(Boost COMPONENTS program_options)
endif()

# For Windows, fall back to nuget version if find_package didn't find it.
if (MSVC AND NOT Boost_FOUND)
Expand Down Expand Up @@ -219,13 +225,13 @@ if (MSVC)
# Tell CMake how to build the tcmalloc linker library from the submodule.
add_custom_target(build_libtcmalloc_minimal DEPENDS ${TCMALLOC_LINK_LIBRARY})
add_custom_command(OUTPUT ${TCMALLOC_LINK_LIBRARY}
COMMAND ${CMAKE_VS_MSBUILD_COMMAND} gperftools.sln /m /nologo
/t:libtcmalloc_minimal /p:Configuration="Release-Patch"
/property:Platform="x64"
/p:PlatformToolset=v${MSVC_TOOLSET_VERSION}
/p:WindowsTargetPlatformVersion=${CMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION}
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/gperftools)

COMMAND ${CMAKE_VS_MSBUILD_COMMAND} gperftools.sln /m /nologo
/t:libtcmalloc_minimal /p:Configuration="Release-Patch"
/property:Platform="x64"
/p:PlatformToolset=v${MSVC_TOOLSET_VERSION}
/p:WindowsTargetPlatformVersion=${CMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION}
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/gperftools)
add_library(libtcmalloc_minimal_for_exe STATIC IMPORTED)
add_library(libtcmalloc_minimal_for_dll STATIC IMPORTED)

Expand Down
9 changes: 9 additions & 0 deletions apps/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,15 @@ target_link_libraries(search_memory_index ${PROJECT_NAME} ${DISKANN_ASYNC_LIB} $
add_executable(build_disk_index build_disk_index.cpp)
target_link_libraries(build_disk_index ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} ${DISKANN_ASYNC_LIB} Boost::program_options)

add_executable(split_subgraph_index split_subgraph_index.cpp)
target_link_libraries(split_subgraph_index ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} ${DISKANN_ASYNC_LIB} Boost::program_options)

add_executable(build_subgraph_index build_subgraph_index.cpp)
target_link_libraries(build_subgraph_index ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} ${DISKANN_ASYNC_LIB} Boost::program_options)

add_executable(merge_subgraph_index merge_subgraph_index.cpp)
target_link_libraries(merge_subgraph_index ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} ${DISKANN_ASYNC_LIB} Boost::program_options)

add_executable(search_disk_index search_disk_index.cpp)
target_link_libraries(search_disk_index ${PROJECT_NAME} ${DISKANN_ASYNC_LIB} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} Boost::program_options)

Expand Down
193 changes: 193 additions & 0 deletions apps/build_subgraph_index.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.

#include <omp.h>
#include <boost/program_options.hpp>

#include "utils.h"
#include "disk_utils.h"
#include "math_utils.h"
#include "index.h"
#include "partition.h"
#include "program_options_utils.hpp"

namespace po = boost::program_options;

int main(int argc, char **argv)
{
std::string data_type, dist_fn, data_path, index_path_prefix, codebook_prefix, label_file, universal_label,
label_type;
uint32_t num_threads, R, L, disk_PQ, build_PQ, QD, Lf, filter_threshold, subshard_id;
float B, M;
bool append_reorder_data = false;
bool use_opq = false;

po::options_description desc{
program_options_utils::make_program_description("build_disk_index", "Build a disk-based index.")};
try
{
desc.add_options()("help,h", "Print information on arguments");

// Required parameters
po::options_description required_configs("Required");
required_configs.add_options()("data_type", po::value<std::string>(&data_type)->required(),
program_options_utils::DATA_TYPE_DESCRIPTION);
required_configs.add_options()("dist_fn", po::value<std::string>(&dist_fn)->required(),
program_options_utils::DISTANCE_FUNCTION_DESCRIPTION);
required_configs.add_options()("index_path_prefix", po::value<std::string>(&index_path_prefix)->required(),
program_options_utils::INDEX_PATH_PREFIX_DESCRIPTION);
required_configs.add_options()("data_path", po::value<std::string>(&data_path)->required(),
program_options_utils::INPUT_DATA_PATH);
required_configs.add_options()("search_DRAM_budget,B", po::value<float>(&B)->required(),
"DRAM budget in GB for searching the index to set the "
"compressed level for data while search happens");
required_configs.add_options()("build_DRAM_budget,M", po::value<float>(&M)->required(),
"DRAM budget in GB for building the index");

// Optional parameters
po::options_description optional_configs("Optional");
optional_configs.add_options()("num_threads,T",
po::value<uint32_t>(&num_threads)->default_value(omp_get_num_procs()),
program_options_utils::NUMBER_THREADS_DESCRIPTION);
optional_configs.add_options()("max_degree,R", po::value<uint32_t>(&R)->default_value(64),
program_options_utils::MAX_BUILD_DEGREE);
optional_configs.add_options()("Lbuild,L", po::value<uint32_t>(&L)->default_value(100),
program_options_utils::GRAPH_BUILD_COMPLEXITY);
optional_configs.add_options()("QD", po::value<uint32_t>(&QD)->default_value(0),
" Quantized Dimension for compression");
optional_configs.add_options()("codebook_prefix", po::value<std::string>(&codebook_prefix)->default_value(""),
"Path prefix for pre-trained codebook");
optional_configs.add_options()("PQ_disk_bytes", po::value<uint32_t>(&disk_PQ)->default_value(0),
"Number of bytes to which vectors should be compressed "
"on SSD; 0 for no compression");
optional_configs.add_options()("append_reorder_data", po::bool_switch()->default_value(false),
"Include full precision data in the index. Use only in "
"conjuction with compressed data on SSD.");
optional_configs.add_options()("build_PQ_bytes", po::value<uint32_t>(&build_PQ)->default_value(0),
program_options_utils::BUIlD_GRAPH_PQ_BYTES);
optional_configs.add_options()("use_opq", po::bool_switch()->default_value(false),
program_options_utils::USE_OPQ);
optional_configs.add_options()("label_file", po::value<std::string>(&label_file)->default_value(""),
program_options_utils::LABEL_FILE);
optional_configs.add_options()("universal_label", po::value<std::string>(&universal_label)->default_value(""),
program_options_utils::UNIVERSAL_LABEL);
optional_configs.add_options()("FilteredLbuild", po::value<uint32_t>(&Lf)->default_value(0),
program_options_utils::FILTERED_LBUILD);
optional_configs.add_options()("filter_threshold,F", po::value<uint32_t>(&filter_threshold)->default_value(0),
"Threshold to break up the existing nodes to generate new graph "
"internally where each node has a maximum F labels.");
optional_configs.add_options()("label_type", po::value<std::string>(&label_type)->default_value("uint"),
program_options_utils::LABEL_TYPE_DESCRIPTION);
optional_configs.add_options()("subshard_id", po::value<uint32_t>(&subshard_id)->default_value(0),
program_options_utils::SUBSHARD_ID_DESCRIPTION);

// Merge required and optional parameters
desc.add(required_configs).add(optional_configs);

po::variables_map vm;
po::store(po::parse_command_line(argc, argv, desc), vm);
if (vm.count("help"))
{
std::cout << desc;
return 0;
}
po::notify(vm);
if (vm["append_reorder_data"].as<bool>())
append_reorder_data = true;
if (vm["use_opq"].as<bool>())
use_opq = true;
}
catch (const std::exception &ex)
{
std::cerr << ex.what() << '\n';
return -1;
}

bool use_filters = (label_file != "") ? true : false;
diskann::Metric metric;
if (dist_fn == std::string("l2"))
metric = diskann::Metric::L2;
else if (dist_fn == std::string("mips"))
metric = diskann::Metric::INNER_PRODUCT;
else if (dist_fn == std::string("cosine"))
metric = diskann::Metric::COSINE;
else
{
std::cout << "Error. Only l2 and mips distance functions are supported" << std::endl;
return -1;
}

if (append_reorder_data)
{
if (disk_PQ == 0)
{
std::cout << "Error: It is not necessary to append data for reordering "
"when vectors are not compressed on disk."
<< std::endl;
return -1;
}
if (data_type != std::string("float"))
{
std::cout << "Error: Appending data for reordering currently only "
"supported for float data type."
<< std::endl;
return -1;
}
}

std::string params = std::string(std::to_string(R)) + " " + std::string(std::to_string(L)) + " " +
std::string(std::to_string(B)) + " " + std::string(std::to_string(M)) + " " +
std::string(std::to_string(num_threads)) + " " + std::string(std::to_string(disk_PQ)) + " " +
std::string(std::to_string(append_reorder_data)) + " " +
std::string(std::to_string(build_PQ)) + " " + std::string(std::to_string(QD));

try
{
if (label_file != "" && label_type == "ushort")
{
if (data_type == std::string("int8"))
return diskann::build_disk_index<int8_t>(data_path.c_str(), index_path_prefix.c_str(), params.c_str(),
metric, use_opq, codebook_prefix, use_filters, label_file,
universal_label, filter_threshold, Lf);
else if (data_type == std::string("uint8"))
return diskann::build_disk_index<uint8_t, uint16_t>(
data_path.c_str(), index_path_prefix.c_str(), params.c_str(), metric, use_opq, codebook_prefix,
use_filters, label_file, universal_label, filter_threshold, Lf);
else if (data_type == std::string("float"))
return diskann::build_disk_index<float, uint16_t>(
data_path.c_str(), index_path_prefix.c_str(), params.c_str(), metric, use_opq, codebook_prefix,
use_filters, label_file, universal_label, filter_threshold, Lf);
else
{
diskann::cerr << "Error. Unsupported data type" << std::endl;
return -1;
}
}
else
{
if (data_type == std::string("int8"))

return diskann::build_subgraph_index<int8_t>(data_path.c_str(), index_path_prefix.c_str(), params.c_str(),
metric, use_opq, codebook_prefix, use_filters, label_file, universal_label, filter_threshold, Lf, subshard_id);
//else if (data_type == std::string("uint8"))
// return diskann::build_disk_index<uint8_t>(data_path.c_str(), index_path_prefix.c_str(), params.c_str(),
// metric, use_opq, codebook_prefix, use_filters, label_file,
// universal_label, filter_threshold, Lf);
//else if (data_type == std::string("float"))
// return diskann::build_disk_index<float>(data_path.c_str(), index_path_prefix.c_str(), params.c_str(),
// metric, use_opq, codebook_prefix, use_filters, label_file,
// universal_label, filter_threshold, Lf);
else
{
diskann::cerr << "Error. Unsupported data type" << std::endl;
return -1;
}
}
}
catch (const std::exception &e)
{
std::cout << std::string(e.what()) << std::endl;
diskann::cerr << "Index build failed." << std::endl;
return -1;
}
}
Loading