Krotos Modules 3
Loading...
Searching...
No Matches
NER.cpp
Go to the documentation of this file.
1#include "NER.h"
2
3namespace krotos
4{
5 std::vector<NER::Entity> NER::removeOverlapping(std::vector<Entity> entities)
6 {
7 // descending sort based on score, using start and end index to break ties
8 std::sort(entities.begin(), entities.end(), [](const Entity& lhs, const Entity& rhs) {
9 return std::make_tuple(lhs.score, -lhs.startIndex, lhs.endIndex) >
10 std::make_tuple(rhs.score, -rhs.startIndex, rhs.endIndex);
11 });
12
13 std::vector<Entity> results;
14 std::set<std::pair<int, int>> indicesCovered;
15 for (const auto& entity : entities)
16 {
17 bool update = true;
18 for (const auto& [startCovered, endCovered] : indicesCovered)
19 {
20 if (!((entity.startIndex > endCovered) || (entity.endIndex < startCovered)))
21 {
22 update = false;
23 }
24 }
25 if (update)
26 {
27 results.push_back(entity);
28 indicesCovered.insert(std::make_pair(entity.startIndex, entity.endIndex));
29 }
30 }
31
32 // ascending sort based on start index
33 std::sort(results.begin(), results.end(),
34 [](const Entity& lhs, const Entity& rhs) { return lhs.startIndex < rhs.startIndex; });
35 return results;
36 }
37
38 std::tuple<float, String, String> NER::getFuzzySimilarity(String text,
39 const std::unordered_map<String, StringArray>& dictionary,
40 float similarityThreshold)
41 {
42 for (const auto& [category, lexicon] : dictionary)
43 {
44 auto maxScore = 0.0f;
45 String match = "";
46 for (const auto& entity : lexicon)
47 {
48 const auto score = stringSimilarity(text, entity);
49 if (score >= maxScore)
50 {
51 maxScore = score;
52 match = entity;
53 }
54 }
55 if (maxScore >= similarityThreshold)
56 return {maxScore, match, category};
57 }
58 return {-1.0f, "", ""};
59 }
60
61 std::unordered_map<String, StringArray> NER::findEntity(String text,
62 const std::unordered_map<String, StringArray>& dictionary,
63 float similarityThreshold)
64 {
65 // check maximum number of words in dictionary word-level n-grams
66 int maxNgrams = 1;
67 for (const auto& [category, lexicon] : dictionary)
68 {
69 for (const auto& phrase : lexicon)
70 {
71 StringArray words;
72 words.addTokens(phrase, false);
73 maxNgrams = std::max(words.size(), maxNgrams);
74 }
75 }
76
77 // pre-process input text
78 text = text.toLowerCase().replace(",", "").replace("_", " ");
79
80 // split input text into words/tokens
81 StringArray tokens;
82 tokens.addTokens(text, " ", "\"");
83
84 // search for matching named entities
85 std::vector<Entity> entities;
86 for (int n = 1; n <= maxNgrams; ++n)
87 {
88 auto ngramsResult = ngrams(tokens, n);
89 int currentIndex = 0;
90 for (const auto& token : ngramsResult)
91 {
92 auto [score, name, category] = getFuzzySimilarity(token, dictionary, similarityThreshold);
93 if (score > 0)
94 {
95 // find the start and end indices
96 int startIndex = text.indexOf(currentIndex, token);
97 int endIndex = startIndex + token.length();
98 if (n == 1)
99 {
100 currentIndex = endIndex;
101 }
102 else
103 {
104 StringArray words;
105 words.addTokens(token, " ", "\"");
106 auto length = words.joinIntoString(" ", 1).length();
107 currentIndex = endIndex - length - 1;
108 }
109 // storing best matching named entity
110 Entity entity(name, category, score, startIndex, endIndex);
111 entities.push_back(entity);
112 }
113 }
114 }
115
116 // post-process to handle entities with overlapping index ranges
117 entities = removeOverlapping(entities);
118
119 // return results as a dictionary
120 std::unordered_map<String, StringArray> results;
121 for (const auto& entity : entities)
122 {
123 results[entity.category].add(entity.name);
124 }
125 return results;
126 }
127
128 StringArray NER::ngrams(const StringArray& tokens, int n)
129 {
130 if (tokens.size() < n)
131 return {};
132 if (n == 1)
133 return tokens;
134
135 StringArray results;
136 for (int i = 0; i <= tokens.size() - n; ++i)
137 {
138 auto token = tokens.joinIntoString(" ", i, n);
139 results.add(token);
140 }
141 return results;
142 }
143
144 int NER::levenshteinDistance(const String& str1, const String& str2)
145 {
146 int m = str1.length();
147 int n = str2.length();
148
149 std::vector<int> prevRow(n + 1, 0);
150 std::vector<int> currRow(n + 1, 0);
151
152 for (int j = 0; j <= n; j++)
153 {
154 prevRow[j] = j;
155 }
156
157 for (int i = 1; i <= m; i++)
158 {
159 currRow[0] = i;
160
161 for (int j = 1; j <= n; j++)
162 {
163 if (str1[i - 1] == str2[j - 1])
164 {
165 currRow[j] = prevRow[j - 1];
166 }
167 else
168 {
169 currRow[j] = 1 + std::min(currRow[j - 1], std::min(prevRow[j], prevRow[j - 1]));
170 }
171 }
172 prevRow = currRow;
173 }
174 return currRow[n];
175 }
176
177 float NER::stringSimilarity(const String& str1, const String& str2)
178 {
179 if (str1.isEmpty() && str2.isEmpty())
180 return 1.0f;
181
182 const auto dist = levenshteinDistance(str1, str2);
183 return 1.0f - static_cast<float>(dist) / std::max(str1.length(), str2.length());
184 }
185
186} // namespace krotos
float stringSimilarity(const String &str1, const String &str2)
Compute the string similarity.
Definition NER.cpp:177
int levenshteinDistance(const String &str1, const String &str2)
Compute the Levenshtein distance between strings.
Definition NER.cpp:144
std::unordered_map< String, StringArray > findEntity(String text, const std::unordered_map< String, StringArray > &dictionary, float similarityThreshold=0.9f)
Search text for named entities held in dictionary.
Definition NER.cpp:61
std::tuple< float, String, String > getFuzzySimilarity(String text, const std::unordered_map< String, StringArray > &dictionary, float similarityThreshold)
Search for matching named entities using fuzzy string matching.
Definition NER.cpp:38
std::vector< Entity > removeOverlapping(std::vector< Entity > entities)
Remove overlapping entities (keep longest)
Definition NER.cpp:5
StringArray ngrams(const StringArray &tokens, int n=1)
Compute ngrams for the given StringArray.
Definition NER.cpp:128
Definition AirAbsorptionFilter.cpp:2
Definition NER.h:61