18 BM25(
const std::vector<String>& corpus);
24 std::vector<float>
getScores(String query);
31 std::vector<float>
getBatchScores(String query,
const std::vector<std::size_t>& ids);
38 void initialise(
const std::vector<String>& corpus);
44 StringArray
tokenize(
const String& text);
53 const float m_b{0.75f};
65 "i",
"me",
"my",
"myself",
"we",
"our",
"ours",
"ourselves",
66 "you",
"you\'re",
"you\'ve",
"you\'ll",
"you\'d",
"your",
"yours",
"yourself",
67 "yourselves",
"he",
"him",
"his",
"himself",
"she",
"she\'s",
"her",
68 "hers",
"herself",
"it",
"it\'s",
"its",
"itself",
"they",
"them",
69 "their",
"theirs",
"themselves",
"what",
"which",
"who",
"whom",
"this",
70 "that",
"that\'ll",
"these",
"those",
"am",
"is",
"are",
"was",
71 "were",
"be",
"been",
"being",
"have",
"has",
"had",
"having",
72 "do",
"does",
"did",
"doing",
"a",
"an",
"the",
"and",
73 "but",
"if",
"or",
"because",
"as",
"until",
"while",
"of",
74 "at",
"by",
"for",
"with",
"about",
"against",
"between",
"into",
75 "through",
"during",
"before",
"after",
"above",
"below",
"to",
"from",
76 "up",
"down",
"in",
"out",
"on",
"off",
"over",
"under",
77 "again",
"further",
"then",
"once",
"here",
"there",
"when",
"where",
78 "why",
"how",
"all",
"any",
"both",
"each",
"few",
"more",
79 "most",
"other",
"some",
"such",
"no",
"nor",
"not",
"only",
80 "own",
"same",
"so",
"than",
"too",
"very",
"s",
"t",
81 "can",
"will",
"just",
"don",
"don\'t",
"should",
"should\'ve",
"now",
82 "d",
"ll",
"m",
"o",
"re",
"ve",
"y",
"ain",
83 "aren",
"aren\'t",
"couldn",
"couldn\'t",
"didn",
"didn\'t",
"doesn",
"doesn\'t",
84 "hadn",
"hadn\'t",
"hasn",
"hasn\'t",
"haven",
"haven\'t",
"isn",
"isn\'t",
85 "ma",
"mightn",
"mightn\'t",
"mustn",
"mustn\'t",
"needn",
"needn\'t",
"shan",
86 "shan\'t",
"shouldn",
"shouldn\'t",
"wasn",
"wasn\'t",
"weren",
"weren\'t",
"won",
87 "won\'t",
"wouldn",
"wouldn\'t"};
ATIRE BM25 ranking function; this variant does not result in negative IDF values (reference: Trotman et al.).
Definition BM25.h:12
const float m_b
Definition BM25.h:53
float m_averageDocumentLength
Definition BM25.h:55
BM25(const std::vector< String > &corpus)
BM25 ranking algorithm.
Definition BM25.cpp:6
const float m_epsilon
Definition BM25.h:54
StringArray m_stopwords
Definition BM25.h:64
std::vector< int > m_documentLength
Definition BM25.h:61
int m_corpusSize
Definition BM25.h:56
String stemmer(String word)
Apply stemming to a word.
Definition BM25.cpp:72
std::vector< float > getScores(String query)
Calculate BM25 scores between query and all documents in corpus.
Definition BM25.cpp:94
std::unordered_map< String, std::unordered_map< int, int > > m_termToDocument
Definition BM25.h:59
const float m_k1
Definition BM25.h:52
StringArray tokenize(const String &text)
Apply tokenization to a text.
Definition BM25.cpp:53
float m_averageInverseDocumentFrequency
Definition BM25.h:57
void initialise(const std::vector< String > &corpus)
Initialise the BM25 ranking algorithm with a corpus.
Definition BM25.cpp:8
std::vector< float > getBatchScores(String query, const std::vector< std::size_t > &ids)
Calculate BM25 scores between query and a subset of documents in corpus.
Definition BM25.cpp:114
std::unordered_map< String, float > m_inverseDocumentFrequency
Definition BM25.h:60
Definition AirAbsorptionFilter.cpp:2