Krotos Modules 3
Loading...
Searching...
No Matches
BM25.h
Go to the documentation of this file.
1#pragma once
2
3namespace krotos
4{
/**
 * ATIRE BM25 ranking function. This variant does not result in negative IDF values.
 *
 * Built from a corpus of documents (one String per document) and then queried
 * with getScores() / getBatchScores().
 *
 * NOTE(review): the single-argument constructor is not marked `explicit`, and the
 * query/word parameters are taken by value; confirm both are intentional before
 * changing them, since the definitions live in BM25.cpp.
 */
11class BM25
12{
13 public:
    /**
     * BM25 ranking algorithm.
     *
     * @param corpus Documents to rank against, one String per document.
     *               Presumably forwarded to initialise() - TODO confirm in BM25.cpp.
     */
18 BM25(const std::vector<String>& corpus);
    /**
     * Calculate BM25 scores between query and all documents in corpus.
     *
     * @param query Query text.
     * @return Scores as floats; presumably one per document, in corpus order - TODO confirm.
     */
24 std::vector<float> getScores(String query);
    /**
     * Calculate BM25 scores between query and a subset of documents in corpus.
     *
     * @param query Query text.
     * @param ids   Selects the subset of documents; presumably indices into the
     *              corpus passed at construction - TODO confirm.
     * @return Scores as floats; presumably one per entry of ids - TODO confirm.
     */
31 std::vector<float> getBatchScores(String query, const std::vector<std::size_t>& ids);
32
33 private:
    /**
     * Initialise the BM25 ranking algorithm with a corpus.
     *
     * @param corpus Documents to index, one String per document.
     */
38 void initialise(const std::vector<String>& corpus);
    /**
     * Apply tokenization to a text.
     *
     * @param text Text to split.
     * @return The tokens as a StringArray. Whether stop-word removal and stemming
     *         happen here is not visible in this header - verify in BM25.cpp.
     */
44 StringArray tokenize(const String& text);
    /**
     * Apply stemming to a word.
     *
     * @param word Word to stem.
     * @return The stemmed word.
     */
50 String stemmer(String word);
51
    // Fixed tuning constants. Presumably the usual BM25 k1 (term-frequency
    // saturation) and b (document-length normalisation) parameters, with epsilon
    // used as a floor factor for IDF values - TODO confirm against BM25.cpp.
52 const float m_k1{1.5f};
53 const float m_b{0.75f};
54 const float m_epsilon{0.25f};
    // NOTE(review): the symbol index for this file lists three further members,
    // m_averageDocumentLength (BM25.h:55), m_corpusSize (BM25.h:56) and
    // m_averageInverseDocumentFrequency (BM25.h:57), whose declarations are not
    // shown in this listing.
58
    // Keyed by term; the inner map is int -> int, presumably document index ->
    // occurrence count of the term in that document - TODO confirm.
59 std::unordered_map<String, std::unordered_map<int, int>> m_termToDocument;
    // Keyed by term; stores that term's inverse document frequency.
60 std::unordered_map<String, float> m_inverseDocumentFrequency;
    // One int per document; presumably the token count of each document - TODO confirm.
61 std::vector<int> m_documentLength;
62
    // English stop words, all lower case, including contracted forms and their
    // fragments (e.g. "don", "don't", "ll", "ve"). Presumably filtered out during
    // tokenization - verify in BM25.cpp.
63 // TODO: validate that removing stopwords is useful, and try a smaller set of stopwords
64 StringArray m_stopwords = {
65 "i", "me", "my", "myself", "we", "our", "ours", "ourselves",
66 "you", "you\'re", "you\'ve", "you\'ll", "you\'d", "your", "yours", "yourself",
67 "yourselves", "he", "him", "his", "himself", "she", "she\'s", "her",
68 "hers", "herself", "it", "it\'s", "its", "itself", "they", "them",
69 "their", "theirs", "themselves", "what", "which", "who", "whom", "this",
70 "that", "that\'ll", "these", "those", "am", "is", "are", "was",
71 "were", "be", "been", "being", "have", "has", "had", "having",
72 "do", "does", "did", "doing", "a", "an", "the", "and",
73 "but", "if", "or", "because", "as", "until", "while", "of",
74 "at", "by", "for", "with", "about", "against", "between", "into",
75 "through", "during", "before", "after", "above", "below", "to", "from",
76 "up", "down", "in", "out", "on", "off", "over", "under",
77 "again", "further", "then", "once", "here", "there", "when", "where",
78 "why", "how", "all", "any", "both", "each", "few", "more",
79 "most", "other", "some", "such", "no", "nor", "not", "only",
80 "own", "same", "so", "than", "too", "very", "s", "t",
81 "can", "will", "just", "don", "don\'t", "should", "should\'ve", "now",
82 "d", "ll", "m", "o", "re", "ve", "y", "ain",
83 "aren", "aren\'t", "couldn", "couldn\'t", "didn", "didn\'t", "doesn", "doesn\'t",
84 "hadn", "hadn\'t", "hasn", "hasn\'t", "haven", "haven\'t", "isn", "isn\'t",
85 "ma", "mightn", "mightn\'t", "mustn", "mustn\'t", "needn", "needn\'t", "shan",
86 "shan\'t", "shouldn", "shouldn\'t", "wasn", "wasn\'t", "weren", "weren\'t", "won",
87 "won\'t", "wouldn", "wouldn\'t"};
88};
89
90} // namespace krotos
ATIRE BM25 ranking function. This variant does not result in negative IDF values. Trotman,...
Definition BM25.h:12
const float m_b
Definition BM25.h:53
float m_averageDocumentLength
Definition BM25.h:55
BM25(const std::vector< String > &corpus)
BM25 ranking algorithm.
Definition BM25.cpp:6
const float m_epsilon
Definition BM25.h:54
StringArray m_stopwords
Definition BM25.h:64
std::vector< int > m_documentLength
Definition BM25.h:61
int m_corpusSize
Definition BM25.h:56
String stemmer(String word)
Apply stemming to a word.
Definition BM25.cpp:72
std::vector< float > getScores(String query)
Calculate BM25 scores between query and all documents in corpus.
Definition BM25.cpp:94
std::unordered_map< String, std::unordered_map< int, int > > m_termToDocument
Definition BM25.h:59
const float m_k1
Definition BM25.h:52
StringArray tokenize(const String &text)
Apply tokenization to a text.
Definition BM25.cpp:53
float m_averageInverseDocumentFrequency
Definition BM25.h:57
void initialise(const std::vector< String > &corpus)
Initialise the BM25 ranking algorithm with a corpus.
Definition BM25.cpp:8
std::vector< float > getBatchScores(String query, const std::vector< std::size_t > &ids)
Calculate BM25 scores between query and a subset of documents in corpus.
Definition BM25.cpp:114
std::unordered_map< String, float > m_inverseDocumentFrequency
Definition BM25.h:60
krotos
Definition AirAbsorptionFilter.cpp:2