10 for (std::size_t i = 0; i < corpus.size(); ++i)
12 const auto document =
tokenize(corpus[i]);
14 for (
const auto& word : document)
27 std::vector<String> wordsWithNegativeIDF;
30 const float idf = std::log(
m_corpusSize - documents.size() + 0.5f) - std::log(documents.size() + 0.5f);
35 wordsWithNegativeIDF.push_back(word);
47 for (
const auto& word : wordsWithNegativeIDF)
56 words.addTokens(text.trim().toLowerCase().removeCharacters(
","),
false);
58 results.ensureStorageAllocated(words.size());
59 for (
auto word : words)
79 if (word.endsWith(
"ies"))
81 return word.replaceSection(word.length() - 3, 3,
"y");
83 else if (word.endsWith(
"es"))
85 return word.replaceSection(word.length() - 2, 2,
"");
87 else if (word.endsWith(
"s"))
89 return word.replaceSection(word.length() - 1, 1,
"");
96 const auto queryTerms =
tokenize(query);
99 for (
const auto& token : queryTerms)
116 const auto queryTerms =
tokenize(query);
118 std::vector<float> scores(ids.size(), 0.0f);
119 for (
const auto& token : queryTerms)
123 for (std::size_t i = 0; i < ids.size(); ++i)
125 const auto index = ids[i];
const float m_b
Definition BM25.h:53
float m_averageDocumentLength
Definition BM25.h:55
BM25(const std::vector< String > &corpus)
BM25 ranking algorithm.
Definition BM25.cpp:6
const float m_epsilon
Definition BM25.h:54
StringArray m_stopwords
Definition BM25.h:64
std::vector< int > m_documentLength
Definition BM25.h:61
int m_corpusSize
Definition BM25.h:56
String stemmer(String word)
Apply stemming to a word.
Definition BM25.cpp:72
std::vector< float > getScores(String query)
Calculate BM25 scores between query and all documents in corpus.
Definition BM25.cpp:94
std::unordered_map< String, std::unordered_map< int, int > > m_termToDocument
Definition BM25.h:59
const float m_k1
Definition BM25.h:52
StringArray tokenize(const String &text)
Apply tokenization to a text.
Definition BM25.cpp:53
float m_averageInverseDocumentFrequency
Definition BM25.h:57
void initialise(const std::vector< String > &corpus)
Initialise the BM25 ranking algorithm with a corpus.
Definition BM25.cpp:8
std::vector< float > getBatchScores(String query, const std::vector< std::size_t > &ids)
Calculate BM25 scores between query and a subset of documents in corpus.
Definition BM25.cpp:114
std::unordered_map< String, float > m_inverseDocumentFrequency
Definition BM25.h:60
Definition AirAbsorptionFilter.cpp:2