Krotos Modules 3
Loading...
Searching...
No Matches
NearestNeighbourSearch.cpp
Go to the documentation of this file.
1//
2// NearestNeighbourSearch.cpp
3// vector_search
4//
5// Created by Chris Scott on 06/12/2022.
6//
7
9
10namespace krotos
11{
12
14 {
15 dataset = Dataset::read_embeddings(datasetPath);
16
17 if (dataset.filenames.size() == 0)
18 {
19 assert(false); // Empty dataset
20 return;
21 }
22
23 assert(dataset.filenames.size() >= 0);
24 const auto dim = dataset.vectors[dataset.filenames[0]].size();
25 const std::size_t n = 2 * dataset.filenames.size();
26
27 //mSink = std::make_unique<SinkFloat>(sinkChannelCount);
28
29 m_spaceInstance = std::make_unique <hnswlib::InnerProductSpace>(dim);
30 // The line below is for compatibility with hnswlib. space is a ptr to hnswlib::SpaceInterface,
31 // an abstract class which can't be instantiated as is - it is used as an interface class
32 // Assigning m_spaceInstance raw pointer here should be safe because m_spaceInstance
33 // will persist until owning class is destroyed
34 // If anyone can see a better way to do this outside of re-factoring hnswlib, please do !
35 space = m_spaceInstance.get();
36
37 alg = std::make_unique<hnswlib::BruteforceSearch<float>>(space, n);
38 //alg = new hnswlib::HierarchicalNSW<float>(space, n); // alternative
39
41 }
42
44
45 std::vector<std::pair<float, hnswlib::labeltype>> NearestNeighbourSearch::knnQuery(std::string query, int k) {
46 if (dataset.vectors.count(query) > 0)
47 return alg->searchKnnCloserFirst(dataset.vectors[query].data(), k);
48 else
49 return std::vector<std::pair<float, hnswlib::labeltype> >();
50 }
51
53 // Insert each vector into the index.
54 std::size_t index = 0;
55 for (auto word : dataset.filenames) {
56 alg->addPoint(dataset.vectors[word].data(), index);
57 ++index;
58 }
59 }
60
61 // Read a vector collection in csv format.
62 // Each line contains a filename and a comma-separated list of numbers.
63
64 Dataset Dataset::read_embeddings(const std::string& filename) {
65 std::ifstream file(filename);
66
67 if (!file.is_open()) {
68 assert(false); // File not found - should never actually get here as this method should not get called is file doesn't exist
69 return Dataset(); // Return empty dataset
70 }
71
72 std::vector<std::string> filenames;
73 std::map<std::string, std::vector<float>> vectors;
74 while (!file.eof()) {
75 std::string full_line;
76 std::getline(file, full_line);
77 std::istringstream line(full_line);
78
79 std::string key;
80 getline(line, key, ',');
81
82 std::string field;
83 std::vector<float> row;
84 while (getline(line, field, ',')) {
85 auto val = (float)atof(field.c_str());
86 row.push_back(val);
87 }
88
89 if (row.size() != 0) {
90 filenames.push_back(key);
91 vectors[key] = row;
92 }
93 }
94
95 Dataset res;
96 res.filenames = filenames;
97 res.vectors = vectors;
98 return res;
99 }
100
101 void NearestNeighbourSearch::getNearestNeighbours(const std::string& query, const int numMatches, std::vector<std::string>& returnResults)
102 {
103 // Perform a search
104 const auto results = knnQuery(query, numMatches); // find k matches (including query)
105
106 // Results will be empty if the query file wasn't in the dataset
107 if (results.size() > 0)
108 {
109 // Return the k closest matches (including query).
110 //assert(results[0].second == 0); // first match should be query
111 for (std::size_t i = 0; i < results.size(); ++i) {
112 // std::pair<distance score, index>
113 returnResults.push_back( dataset.filenames[results[i].second] );
114 }
115 }
116 }
117
118}
~NearestNeighbourSearch()
Definition NearestNeighbourSearch.cpp:43
hnswlib::SpaceInterface< float > * space
Definition NearestNeighbourSearch.h:51
std::vector< std::pair< float, hnswlib::labeltype > > knnQuery(std::string query, int k)
Definition NearestNeighbourSearch.cpp:45
Dataset dataset
Definition NearestNeighbourSearch.h:48
NearestNeighbourSearch(std::string datasetPath)
Definition NearestNeighbourSearch.cpp:13
void getNearestNeighbours(const std::string &query, const int numMatches, std::vector< std::string > &returnResults)
Definition NearestNeighbourSearch.cpp:101
std::unique_ptr< hnswlib::BruteforceSearch< float > > alg
Definition NearestNeighbourSearch.h:49
std::unique_ptr< hnswlib::InnerProductSpace > m_spaceInstance
Definition NearestNeighbourSearch.h:53
void addDatasetItems()
Definition NearestNeighbourSearch.cpp:52
Definition AirAbsorptionFilter.cpp:2
Definition NearestNeighbourSearch.h:22
std::map< std::string, std::vector< float > > vectors
Definition NearestNeighbourSearch.h:24
std::vector< std::string > filenames
Definition NearestNeighbourSearch.h:23
static Dataset read_embeddings(const std::string &filename)
Definition NearestNeighbourSearch.cpp:64