9#include <unordered_set>
16 template<
typename dist_t>
24 loadIndex(location, s, nmslib, max_elements);
52 throw std::runtime_error(
"Not enough memory");
64 throw std::runtime_error(
"Not enough memory: HierarchicalNSW failed to allocate linklists");
71 constexpr bool operator()(std::pair<dist_t, tableint>
const &a,
72 std::pair<dist_t, tableint>
const &b)
const noexcept {
73 return a.first < b.first;
149 std::uniform_real_distribution<double> distribution(0.0, 1.0);
155 std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst>
161 std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>,
CompareByFirst> top_candidates;
162 std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>,
CompareByFirst> candidateSet;
167 top_candidates.emplace(dist, ep_id);
169 candidateSet.emplace(-dist, ep_id);
171 lowerBound = std::numeric_limits<dist_t>::max();
172 candidateSet.emplace(-lowerBound, ep_id);
174 visited_array[ep_id] = visited_array_tag;
176 while (!candidateSet.empty()) {
177 std::pair<dist_t, tableint> curr_el_pair = candidateSet.top();
178 if ((-curr_el_pair.first) > lowerBound && top_candidates.size() ==
ef_construction_) {
183 tableint curNodeNum = curr_el_pair.second;
197 _mm_prefetch((
char *) (visited_array + *(data + 1)), _MM_HINT_T0);
198 _mm_prefetch((
char *) (visited_array + *(data + 1) + 64), _MM_HINT_T0);
203 for (
size_t j = 0; j < size; j++) {
204 tableint candidate_id = *(datal + j);
207 _mm_prefetch((
char *) (visited_array + *(datal + j + 1)), _MM_HINT_T0);
210 if (visited_array[candidate_id] == visited_array_tag)
continue;
211 visited_array[candidate_id] = visited_array_tag;
216 candidateSet.emplace(-dist1, candidate_id);
222 top_candidates.emplace(dist1, candidate_id);
225 top_candidates.pop();
227 if (!top_candidates.empty())
228 lowerBound = top_candidates.top().first;
234 return top_candidates;
240 template <
bool has_deletions,
bool collect_metrics=false>
241 std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>,
CompareByFirst>
247 std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>,
CompareByFirst> top_candidates;
248 std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>,
CompareByFirst> candidate_set;
254 top_candidates.emplace(dist, ep_id);
255 candidate_set.emplace(-dist, ep_id);
257 lowerBound = std::numeric_limits<dist_t>::max();
258 candidate_set.emplace(-lowerBound, ep_id);
261 visited_array[ep_id] = visited_array_tag;
263 while (!candidate_set.empty()) {
265 std::pair<dist_t, tableint> current_node_pair = candidate_set.top();
267 if ((-current_node_pair.first) > lowerBound && (top_candidates.size() == ef || has_deletions ==
false)) {
272 tableint current_node_id = current_node_pair.second;
282 _mm_prefetch((
char *) (visited_array + *(data + 1)), _MM_HINT_T0);
283 _mm_prefetch((
char *) (visited_array + *(data + 1) + 64), _MM_HINT_T0);
285 _mm_prefetch((
char *) (data + 2), _MM_HINT_T0);
288 for (
size_t j = 1; j <= size; j++) {
289 int candidate_id = *(data + j);
292 _mm_prefetch((
char *) (visited_array + *(data + j + 1)), _MM_HINT_T0);
296 if (!(visited_array[candidate_id] == visited_array_tag)) {
298 visited_array[candidate_id] = visited_array_tag;
303 if (top_candidates.size() < ef || lowerBound > dist) {
304 candidate_set.emplace(-dist, candidate_id);
312 top_candidates.emplace(dist, candidate_id);
314 if (top_candidates.size() > ef)
315 top_candidates.pop();
317 if (!top_candidates.empty())
318 lowerBound = top_candidates.top().first;
325 return top_candidates;
329 std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>,
CompareByFirst> &top_candidates,
331 if (top_candidates.size() < M) {
335 std::priority_queue<std::pair<dist_t, tableint>> queue_closest;
336 std::vector<std::pair<dist_t, tableint>> return_list;
337 while (top_candidates.size() > 0) {
338 queue_closest.emplace(-top_candidates.top().first, top_candidates.top().second);
339 top_candidates.pop();
342 while (queue_closest.size()) {
343 if (return_list.size() >= M)
345 std::pair<dist_t, tableint> curent_pair = queue_closest.top();
346 dist_t dist_to_query = -curent_pair.first;
350 for (std::pair<dist_t, tableint> second_pair : return_list) {
355 if (curdist < dist_to_query) {
361 return_list.push_back(curent_pair);
365 for (std::pair<dist_t, tableint> curent_pair : return_list) {
366 top_candidates.emplace(-curent_pair.first, curent_pair.second);
388 std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>,
CompareByFirst> &top_candidates,
389 int level,
bool isUpdate) {
392 if (top_candidates.size() >
M_)
393 throw std::runtime_error(
"Should be not be more than M_ candidates returned by the heuristic");
395 std::vector<tableint> selectedNeighbors;
396 selectedNeighbors.reserve(
M_);
397 while (top_candidates.size() > 0) {
398 selectedNeighbors.push_back(top_candidates.top().second);
399 top_candidates.pop();
402 tableint next_closest_entry_point = selectedNeighbors.back();
411 if (*ll_cur && !isUpdate) {
412 throw std::runtime_error(
"The newly inserted element should have blank link list");
416 for (
size_t idx = 0; idx < selectedNeighbors.size(); idx++) {
417 if (data[idx] && !isUpdate)
418 throw std::runtime_error(
"Possible memory corruption");
420 throw std::runtime_error(
"Trying to make a link on a non-existent level");
422 data[idx] = selectedNeighbors[idx];
427 for (
size_t idx = 0; idx < selectedNeighbors.size(); idx++) {
429 std::unique_lock <std::mutex> lock(
link_list_locks_[selectedNeighbors[idx]]);
439 if (sz_link_list_other > Mcurmax)
440 throw std::runtime_error(
"Bad value of sz_link_list_other");
441 if (selectedNeighbors[idx] == cur_c)
442 throw std::runtime_error(
"Trying to connect an element to itself");
444 throw std::runtime_error(
"Trying to make a link on a non-existent level");
448 bool is_cur_c_present =
false;
450 for (
size_t j = 0; j < sz_link_list_other; j++) {
451 if (data[j] == cur_c) {
452 is_cur_c_present =
true;
459 if (!is_cur_c_present) {
460 if (sz_link_list_other < Mcurmax) {
461 data[sz_link_list_other] = cur_c;
468 std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>,
CompareByFirst> candidates;
469 candidates.emplace(d_max, cur_c);
471 for (
size_t j = 0; j < sz_link_list_other; j++) {
480 while (candidates.size() > 0) {
481 data[indx] = candidates.top().second;
503 return next_closest_entry_point;
515 std::priority_queue<std::pair<dist_t, tableint >> top_candidates;
520 for (
size_t level =
maxlevel_; level > 0; level--) {
528 for (
int i = 0; i < size; i++) {
531 throw std::runtime_error(
"cand error");
546 top_candidates.swap(top_candidates1);
551 top_candidates.swap(top_candidates1);
554 while (top_candidates.size() > k) {
555 top_candidates.pop();
557 return top_candidates;
562 throw std::runtime_error(
"Cannot resize, max element is less than the current number of elements");
575 if (data_level0_memory_new ==
nullptr)
576 throw std::runtime_error(
"Not enough memory: resizeIndex failed to allocate base layer");
580 char ** linkLists_new = (
char **) realloc(
linkLists_,
sizeof(
void *) * new_max_elements);
581 if (linkLists_new ==
nullptr)
582 throw std::runtime_error(
"Not enough memory: resizeIndex failed to allocate other layers");
589 std::ofstream output(location, std::ios::binary);
590 std::streampos position;
619 std::ifstream input(location, std::ios::binary);
621 if (!input.is_open())
622 throw std::runtime_error(
"Cannot open file");
625 input.seekg(0,input.end);
626 std::streampos total_filesize=input.tellg();
627 input.seekg(0,input.beg);
633 size_t max_elements = max_elements_i;
654 auto pos=input.tellg();
661 if(input.tellg() < 0 || input.tellg()>=total_filesize){
662 throw std::runtime_error(
"Index seems to be corrupted or unsupported");
665 unsigned int linkListSize;
667 if (linkListSize != 0) {
668 input.seekg(linkListSize,input.cur);
673 if(input.tellg()!=total_filesize)
674 throw std::runtime_error(
"Index seems to be corrupted or unsupported");
680 input.seekg(pos,input.beg);
684 throw std::runtime_error(
"Not enough memory: loadIndex failed to allocate level0");
695 linkLists_ = (
char **) malloc(
sizeof(
void *) * max_elements);
697 throw std::runtime_error(
"Not enough memory: loadIndex failed to allocate linklists");
703 unsigned int linkListSize;
705 if (linkListSize == 0) {
711 linkLists_[i] = (
char *) malloc(linkListSize);
713 throw std::runtime_error(
"Not enough memory: loadIndex failed to allocate linklist");
728 template<
typename data_t>
734 throw std::runtime_error(
"Label not found");
736 label_c = search->second;
740 std::vector<data_t> data;
741 data_t* data_ptr = (data_t*) data_ptrv;
742 for (
int i = 0; i < dim; i++) {
743 data.push_back(*data_ptr);
759 throw std::runtime_error(
"Label not found");
761 tableint internalId = search->second;
774 unsigned char *ll_cur = ((
unsigned char *)
get_linklist0(internalId))+2;
780 throw std::runtime_error(
"The requested to delete element is already deleted");
792 throw std::runtime_error(
"Label not found");
794 tableint internalId = search->second;
806 unsigned char *ll_cur = ((
unsigned char *)
get_linklist0(internalId))+2;
807 *ll_cur &= ~DELETE_MARK;
812 throw std::runtime_error(
"The requested to undelete element is not deleted");
822 unsigned char *ll_cur = ((
unsigned char*)
get_linklist0(internalId))+2;
827 return *((
unsigned short int *)ptr);
831 *((
unsigned short int*)(ptr))=*((
unsigned short int *)&size);
849 std::uniform_real_distribution<float> distribution(0.0, 1.0);
850 for (
int layer = 0; layer <= elemLevel; layer++) {
851 std::unordered_set<tableint> sCand;
852 std::unordered_set<tableint> sNeigh;
854 if (listOneHop.size() == 0)
857 sCand.insert(internalId);
859 for (
auto&& elOneHop : listOneHop) {
860 sCand.insert(elOneHop);
865 sNeigh.insert(elOneHop);
868 for (
auto&& elTwoHop : listTwoHop) {
869 sCand.insert(elTwoHop);
873 for (
auto&& neigh : sNeigh) {
877 std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>,
CompareByFirst> candidates;
878 size_t size = sCand.find(neigh) == sCand.end() ? sCand.size() : sCand.size() - 1;
880 for (
auto&& cand : sCand) {
885 if (candidates.size() < elementsToKeep) {
886 candidates.emplace(distance, cand);
888 if (distance < candidates.top().first) {
890 candidates.emplace(distance, cand);
902 size_t candSize = candidates.size();
905 for (
size_t idx = 0; idx < candSize; idx++) {
906 data[idx] = candidates.top().second;
917 tableint currObj = entryPointInternalId;
918 if (dataPointLevel < maxLevel) {
920 for (
int level = maxLevel; level > dataPointLevel; level--) {
932 for (
int i = 0; i < size; i++) {
948 if (dataPointLevel > maxLevel)
949 throw std::runtime_error(
"Level of item to be updated cannot be bigger than max level");
951 for (
int level = dataPointLevel; level >= 0; level--) {
953 currObj, dataPoint, level);
955 std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>,
CompareByFirst> filteredTopCandidates;
956 while (topCandidates.size() > 0) {
957 if (topCandidates.top().second != dataPointInternalId)
958 filteredTopCandidates.push(topCandidates.top());
965 if (filteredTopCandidates.size() > 0) {
970 filteredTopCandidates.pop();
982 std::vector<tableint> result(size);
984 memcpy(result.data(), ll,size *
sizeof(
tableint));
997 tableint existingInternalId = search->second;
998 templock_curr.unlock();
1007 return existingInternalId;
1011 throw std::runtime_error(
"The number of elements exceeds the specified limit");
1029 std::unique_lock <std::mutex> templock(
global);
1031 if (curlevel <= maxlevelcopy)
1047 throw std::runtime_error(
"Not enough memory: addPoint failed to allocate linklist");
1051 if ((
signed)currObj != -1) {
1053 if (curlevel < maxlevelcopy) {
1056 for (
int levelScope = maxlevelcopy; levelScope > curlevel; levelScope--) {
1059 bool changed =
true;
1068 for (
int i = 0; i < size; i++) {
1071 throw std::runtime_error(
"cand error");
1084 for (
int levelScope2 = std::min(curlevel, maxlevelcopy); levelScope2 >= 0; levelScope2--) {
1085 if (levelScope2 > maxlevelcopy || levelScope2 < 0)
1086 throw std::runtime_error(
"Level error");
1088 std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>,
CompareByFirst> top_candidates =
searchBaseLayer(
1089 currObj, data_point, levelScope2);
1093 top_candidates.pop();
1107 if (curlevel > maxlevelcopy) {
1114 std::priority_queue<std::pair<dist_t, labeltype >>
1116 std::priority_queue<std::pair<dist_t, labeltype >> result;
1122 for (
int level =
maxlevel_; level > 0; level--) {
1123 bool changed =
true;
1134 for (
int i = 0; i < size; i++) {
1137 throw std::runtime_error(
"cand error");
1149 std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>,
CompareByFirst> top_candidates;
1152 currObj, query_data, std::max(
ef_, k));
1156 currObj, query_data, std::max(
ef_, k));
1159 while (top_candidates.size() > k) {
1160 top_candidates.pop();
1162 while (top_candidates.size() > 0) {
1163 std::pair<dist_t, tableint> rez = top_candidates.top();
1164 result.push(std::pair<dist_t, labeltype>(rez.first,
getExternalLabel(rez.second)));
1165 top_candidates.pop();
1171 int connections_checked=0;
1178 std::unordered_set<tableint> s;
1179 for (
int j=0; j<size; j++){
1180 assert(data[j] > 0);
1182 assert (data[j] != i);
1183 inbound_connections_num[data[j]]++;
1185 connections_checked++;
1188 assert(s.size() == size);
1192 int min1=inbound_connections_num[0], max1=inbound_connections_num[0];
1194 assert(inbound_connections_num[i] > 0);
1195 min1=std::min(inbound_connections_num[i],min1);
1196 max1=std::max(inbound_connections_num[i],max1);
1198 std::cout <<
"Min inbound: " << min1 <<
", Max inbound:" << max1 <<
"\n";
1200 std::cout <<
"integrity ok, checked " << connections_checked <<
" connections\n";
char * data_level0_memory_
Definition hnswalg.h:116
size_t maxM0_
Definition hnswalg.h:96
linklistsizeint * get_linklist(tableint internal_id, int level) const
Definition hnswalg.h:379
size_t size_data_per_element_
Definition hnswalg.h:90
size_t offsetLevel0_
Definition hnswalg.h:114
static const unsigned char DELETE_MARK
Definition hnswalg.h:749
std::vector< int > element_levels_
Definition hnswalg.h:118
~HierarchicalNSW()
Definition hnswalg.h:77
double mult_
Definition hnswalg.h:99
std::priority_queue< std::pair< dist_t, tableint >, std::vector< std::pair< dist_t, tableint > >, CompareByFirst > searchBaseLayerST(tableint ep_id, const void *data_point, size_t ef) const
Definition hnswalg.h:242
unsigned short int getListCount(linklistsizeint *ptr) const
Definition hnswalg.h:826
std::atomic< long > metric_distance_computations
Definition hnswalg.h:237
size_t maxM_
Definition hnswalg.h:95
size_t label_offset_
Definition hnswalg.h:122
labeltype * getExternalLabeLp(tableint internal_id) const
Definition hnswalg.h:140
labeltype getExternalLabel(tableint internal_id) const
Definition hnswalg.h:130
std::atomic< long > metric_hops
Definition hnswalg.h:238
void saveIndex(const std::string &location)
Definition hnswalg.h:588
tableint mutuallyConnectNewElement(const void *, tableint cur_c, std::priority_queue< std::pair< dist_t, tableint >, std::vector< std::pair< dist_t, tableint > >, CompareByFirst > &top_candidates, int level, bool isUpdate)
Definition hnswalg.h:387
std::vector< std::mutex > link_list_update_locks_
Definition hnswalg.h:110
std::vector< tableint > getConnectionsWithLock(tableint internalId, int level)
Definition hnswalg.h:978
size_t data_size_
Definition hnswalg.h:120
std::priority_queue< std::pair< dist_t, labeltype > > searchKnn(const void *query_data, size_t k) const
Definition hnswalg.h:1115
void markDelete(labeltype label)
Definition hnswalg.h:755
std::vector< std::mutex > link_list_locks_
Definition hnswalg.h:106
void * dist_func_param_
Definition hnswalg.h:124
size_t max_elements_
Definition hnswalg.h:88
void unmarkDeletedInternal(tableint internalId)
Definition hnswalg.h:802
size_t offsetData_
Definition hnswalg.h:114
size_t ef_construction_
Definition hnswalg.h:97
VisitedListPool * visited_list_pool_
Definition hnswalg.h:103
static const tableint max_update_element_locks
Definition hnswalg.h:19
char * getDataByInternalId(tableint internal_id) const
Definition hnswalg.h:144
void repairConnectionsForUpdate(const void *dataPoint, tableint entryPointInternalId, tableint dataPointInternalId, int dataPointLevel, int maxLevel)
Definition hnswalg.h:916
size_t cur_element_count
Definition hnswalg.h:89
std::default_random_engine level_generator_
Definition hnswalg.h:127
void resizeIndex(size_t new_max_elements)
Definition hnswalg.h:560
HierarchicalNSW(SpaceInterface< dist_t > *s, const std::string &location, bool nmslib=false, size_t max_elements=0)
Definition hnswalg.h:23
char ** linkLists_
Definition hnswalg.h:117
std::mutex cur_element_count_guard_
Definition hnswalg.h:104
std::mutex global
Definition hnswalg.h:506
std::priority_queue< std::pair< dist_t, tableint > > searchKnnInternal(void *query_data, int k)
Definition hnswalg.h:514
int maxlevel_
Definition hnswalg.h:100
std::unordered_map< labeltype, tableint > label_lookup_
Definition hnswalg.h:125
DISTFUNC< dist_t > fstdistfunc_
Definition hnswalg.h:123
size_t size_links_per_element_
Definition hnswalg.h:91
std::default_random_engine update_probability_generator_
Definition hnswalg.h:128
void updatePoint(const void *dataPoint, tableint internalId, float updateNeighborProbability)
Definition hnswalg.h:838
size_t num_deleted_
Definition hnswalg.h:92
int getRandomLevel(double reverse_size)
Definition hnswalg.h:148
void loadIndex(const std::string &location, SpaceInterface< dist_t > *s, bool, size_t max_elements_i=0)
Definition hnswalg.h:618
HierarchicalNSW(SpaceInterface< dist_t > *)
Definition hnswalg.h:20
tableint enterpoint_node_
Definition hnswalg.h:111
void addPoint(const void *data_point, labeltype label)
Definition hnswalg.h:834
linklistsizeint * get_linklist0(tableint internal_id) const
Definition hnswalg.h:371
HierarchicalNSW(SpaceInterface< dist_t > *s, size_t max_elements, size_t M=16, size_t ef_construction=200, size_t random_seed=100)
Definition hnswalg.h:27
std::priority_queue< std::pair< dist_t, tableint >, std::vector< std::pair< dist_t, tableint > >, CompareByFirst > searchBaseLayer(tableint ep_id, const void *data_point, int layer)
Definition hnswalg.h:156
size_t ef_
Definition hnswalg.h:507
size_t M_
Definition hnswalg.h:94
void getNeighborsByHeuristic2(std::priority_queue< std::pair< dist_t, tableint >, std::vector< std::pair< dist_t, tableint > >, CompareByFirst > &top_candidates, const size_t M)
Definition hnswalg.h:328
size_t size_links_level0_
Definition hnswalg.h:113
void markDeletedInternal(tableint internalId)
Definition hnswalg.h:770
void unmarkDelete(labeltype label)
Definition hnswalg.h:788
void setEf(size_t ef)
Definition hnswalg.h:509
void checkIntegrity()
Definition hnswalg.h:1170
void setExternalLabel(tableint internal_id, labeltype label) const
Definition hnswalg.h:136
double revSize_
Definition hnswalg.h:99
void setListCount(linklistsizeint *ptr, unsigned short int size) const
Definition hnswalg.h:830
std::vector< data_t > getDataByLabel(labeltype label) const
Definition hnswalg.h:729
linklistsizeint * get_linklist_at_level(tableint internal_id, int level) const
Definition hnswalg.h:383
tableint addPoint(const void *data_point, labeltype label, int level)
Definition hnswalg.h:988
bool isMarkedDeleted(tableint internalId) const
Definition hnswalg.h:821
virtual size_t get_data_size()=0
virtual void * get_dist_func_param()=0
virtual DISTFUNC< MTYPE > get_dist_func()=0
Definition visited_list_pool.h:10
vl_type * mass
Definition visited_list_pool.h:13
vl_type curV
Definition visited_list_pool.h:12
Definition visited_list_pool.h:38
void releaseVisitedList(VisitedList *vl)
Definition visited_list_pool.h:65
VisitedList * getFreeVisitedList()
Definition visited_list_pool.h:50
Definition bruteforce.h:7
unsigned int linklistsizeint
Definition hnswalg.h:14
unsigned short int vl_type
Definition visited_list_pool.h:8
size_t labeltype
Definition hnswlib.h:117
static void writeBinaryPOD(std::ostream &out, const T &podRef)
Definition hnswlib.h:128
unsigned int tableint
Definition hnswalg.h:13
static void readBinaryPOD(std::istream &in, T &podRef)
Definition hnswlib.h:133
MTYPE(*)(const void *, const void *, const void *) DISTFUNC
Definition hnswlib.h:138
constexpr bool operator()(std::pair< dist_t, tableint > const &a, std::pair< dist_t, tableint > const &b) const noexcept
Definition hnswalg.h:71