Krotos Modules 3
Loading...
Searching...
No Matches
BM25.cpp
Go to the documentation of this file.
1#include "BM25.h"
2
3namespace krotos
4{
5
6BM25::BM25(const std::vector<String>& corpus) { initialise(corpus); }
7
8void BM25::initialise(const std::vector<String>& corpus)
9{
10 for (std::size_t i = 0; i < corpus.size(); ++i)
11 {
12 const auto document = tokenize(corpus[i]);
13 m_documentLength.push_back(document.size());
14 for (const auto& word : document)
15 {
16 if (!m_termToDocument[word].contains(i))
17 {
18 m_termToDocument[word][i] = 0;
19 }
20 m_termToDocument[word][i] += 1;
21 }
22 }
23 m_corpusSize = (int)m_documentLength.size();
25 (float)std::reduce(m_documentLength.begin(), m_documentLength.end()) / std::max(m_corpusSize, 1);
26
27 std::vector<String> wordsWithNegativeIDF;
28 for (const auto& [word, documents] : m_termToDocument)
29 {
30 const float idf = std::log(m_corpusSize - documents.size() + 0.5f) - std::log(documents.size() + 0.5f);
32
33 if (idf < 0.0f)
34 {
35 wordsWithNegativeIDF.push_back(word);
36 }
37 }
38
40 for (const auto& [key, value] : m_inverseDocumentFrequency)
41 {
43 }
45
47 for (const auto& word : wordsWithNegativeIDF)
48 {
50 }
51}
52
53StringArray BM25::tokenize(const String& text)
54{
55 StringArray words;
56 words.addTokens(text.trim().toLowerCase().removeCharacters(","), false);
57 StringArray results;
58 results.ensureStorageAllocated(words.size());
59 for (auto word : words)
60 {
61 // check word isn't a stopword
62 if (!m_stopwords.contains(word))
63 {
64 // apply stemmer
65 word = stemmer(word);
66 results.add(word);
67 }
68 }
69 return results;
70}
71
72String BM25::stemmer(String word)
73{
74 // S-Stripper (a weak stemmer)
75 // University of Otago at INEX 2010
76 // X.-F. Jia, D. Alexander, V. Wood, A. Trotman (2010)
77 // Proceedings of INEX 2010, pp. 250-268
78
79 if (word.endsWith("ies"))
80 {
81 return word.replaceSection(word.length() - 3, 3, "y");
82 }
83 else if (word.endsWith("es"))
84 {
85 return word.replaceSection(word.length() - 2, 2, "");
86 }
87 else if (word.endsWith("s"))
88 {
89 return word.replaceSection(word.length() - 1, 1, "");
90 }
91 return word;
92}
93
94std::vector<float> BM25::getScores(String query)
95{
96 const auto queryTerms = tokenize(query);
97
98 std::vector<float> scores(m_corpusSize, 0.0f);
99 for (const auto& token : queryTerms)
100 {
101 if (m_termToDocument.contains(token))
102 {
103 for (const auto& [index, frequency] : m_termToDocument[token])
104 {
105 scores[index] +=
106 m_inverseDocumentFrequency[token] * frequency * (m_k1 + 1) /
107 (frequency + m_k1 * (1 - m_b + m_b * m_documentLength[index] / m_averageDocumentLength));
108 }
109 }
110 }
111 return scores;
112}
113
114std::vector<float> BM25::getBatchScores(String query, const std::vector<std::size_t>& ids)
115{
116 const auto queryTerms = tokenize(query);
117
118 std::vector<float> scores(ids.size(), 0.0f);
119 for (const auto& token : queryTerms)
120 {
121 if (m_termToDocument.contains(token))
122 {
123 for (std::size_t i = 0; i < ids.size(); ++i)
124 {
125 const auto index = ids[i];
126 const auto frequency = m_termToDocument[token][index];
127 scores[i] += m_inverseDocumentFrequency[token] * frequency * (m_k1 + 1) /
128 (frequency + m_k1 * (1 - m_b + m_b * m_documentLength[index] / m_averageDocumentLength));
129 }
130 }
131 }
132 return scores;
133}
134
135} // namespace krotos
const float m_b
Definition BM25.h:53
float m_averageDocumentLength
Definition BM25.h:55
BM25(const std::vector< String > &corpus)
BM25 ranking algorithm.
Definition BM25.cpp:6
const float m_epsilon
Definition BM25.h:54
StringArray m_stopwords
Definition BM25.h:64
std::vector< int > m_documentLength
Definition BM25.h:61
int m_corpusSize
Definition BM25.h:56
String stemmer(String word)
Apply stemming to a word.
Definition BM25.cpp:72
std::vector< float > getScores(String query)
Calculate BM25 scores between query and all documents in corpus.
Definition BM25.cpp:94
std::unordered_map< String, std::unordered_map< int, int > > m_termToDocument
Definition BM25.h:59
const float m_k1
Definition BM25.h:52
StringArray tokenize(const String &text)
Apply tokenization to a text.
Definition BM25.cpp:53
float m_averageInverseDocumentFrequency
Definition BM25.h:57
void initialise(const std::vector< String > &corpus)
Initialise the BM25 ranking algorithm with a corpus.
Definition BM25.cpp:8
std::vector< float > getBatchScores(String query, const std::vector< std::size_t > &ids)
Calculate BM25 scores between query and a subset of documents in corpus.
Definition BM25.cpp:114
std::unordered_map< String, float > m_inverseDocumentFrequency
Definition BM25.h:60
Definition AirAbsorptionFilter.cpp:2