// Rank the candidate entities: the comparator returns true when lhs's
// (score, -startIndex, endIndex) tuple is lexicographically GREATER than
// rhs's, so the order is highest score first, then smallest startIndex
// (because of the negation), then largest endIndex.
8 std::sort(entities.begin(), entities.end(), [](
const Entity& lhs,
const Entity& rhs) {
9 return std::make_tuple(lhs.score, -lhs.startIndex, lhs.endIndex) >
10 std::make_tuple(rhs.score, -rhs.startIndex, rhs.endIndex);
// Entities kept so far, plus the (startIndex, endIndex) spans they occupy.
13 std::vector<Entity> results;
14 std::set<std::pair<int, int>> indicesCovered;
// Walk the entities in ranked order and compare each against every span
// that has already been recorded.
15 for (
const auto& entity : entities)
18 for (
const auto& [startCovered, endCovered] : indicesCovered)
// True when the entity is neither entirely after nor entirely before the
// recorded span, i.e. the two ranges overlap (touching endpoints count).
20 if (!((entity.startIndex > endCovered) || (entity.endIndex < startCovered)))
// NOTE(review): the statements between the overlap test and the two lines
// below are not visible in this listing; presumably the entity is only
// kept when no overlap was found -- confirm against the full source.
27 results.push_back(entity);
28 indicesCovered.insert(std::make_pair(entity.startIndex, entity.endIndex));
// Put the kept entities into ascending startIndex order.
33 std::sort(results.begin(), results.end(),
34 [](
const Entity& lhs,
const Entity& rhs) { return lhs.startIndex < rhs.startIndex; });
// Remaining parameters of getFuzzySimilarity (the line carrying the function
// name is not part of this excerpt): the category -> lexicon dictionary to
// search, and the threshold compared against maxScore before a match is
// returned.
39 const std::unordered_map<String, StringArray>& dictionary,
40 float similarityThreshold)
// Visit every lexicon entry of every category in the dictionary.
42 for (
const auto& [category, lexicon] : dictionary)
46 for (
const auto& entity : lexicon)
// NOTE(review): how `score` is computed and what is updated inside this
// branch are not visible here. Because the test is `>=`, a later entry with
// an equal score also enters the branch.
49 if (score >= maxScore)
// Return the (score, match, category) triple once maxScore reaches the
// threshold.
55 if (maxScore >= similarityThreshold)
56 return {maxScore, match, category};
// Fallback result: a score of -1 with empty name and category strings.
58 return {-1.0f,
"",
""};
// Remaining parameters of findEntity (the line carrying the function name is
// not part of this excerpt): the category -> lexicon dictionary and the
// similarity threshold that is forwarded to getFuzzySimilarity below.
62 const std::unordered_map<String, StringArray>& dictionary,
63 float similarityThreshold)
// Scan every phrase of every lexicon, tokenising it into `words` and raising
// maxNgrams to words.size() whenever that is larger.
// NOTE(review): the declaration/reset of `words` is not visible; presumably
// it is fresh per phrase so maxNgrams ends up as the longest phrase's token
// count -- confirm.
67 for (
const auto& [category, lexicon] : dictionary)
69 for (
const auto& phrase : lexicon)
72 words.addTokens(phrase,
false);
73 maxNgrams = std::max(words.size(), maxNgrams);
// Normalise the input text: lower-case it, drop commas, and turn
// underscores into spaces.
78 text = text.toLowerCase().replace(
",",
"").replace(
"_",
" ");
// Split the normalised text into tokens.
// NOTE(review): String/StringArray look like the JUCE types, in which case
// " " is the break character and "\"" the quote character -- confirm.
82 tokens.addTokens(text,
" ",
"\"");
// Candidate entities gathered across all n-gram sizes.
85 std::vector<Entity> entities;
// Try every n-gram size from 1 up to maxNgrams.
86 for (
int n = 1; n <= maxNgrams; ++n)
88 auto ngramsResult =
ngrams(tokens, n);
// Score each n-gram against the dictionary.
90 for (
const auto& token : ngramsResult)
92 auto [score, name, category] =
getFuzzySimilarity(token, dictionary, similarityThreshold);
// Locate the n-gram in the text, searching from currentIndex, and derive
// its end offset from its length.
// NOTE(review): no handling of a not-found result from indexOf is visible
// in this excerpt -- check the elided lines.
96 int startIndex = text.indexOf(currentIndex, token);
97 int endIndex = startIndex + token.length();
100 currentIndex = endIndex;
// NOTE(review): the branch structure around the next statements is not
// visible. They re-tokenise the n-gram, measure the string obtained by
// joining its words from index 1 onwards, and move currentIndex back to
// endIndex - length - 1; presumably this rewinds the search position to
// just after the n-gram's first word so overlapping n-grams can still be
// located -- confirm.
105 words.addTokens(token,
" ",
"\"");
106 auto length = words.joinIntoString(
" ", 1).length();
107 currentIndex = endIndex - length - 1;
// Record the candidate with its name, category, score and text span.
110 Entity entity(name, category, score, startIndex, endIndex);
111 entities.push_back(entity);
// Group the entity names by category for the returned map.
120 std::unordered_map<String, StringArray> results;
121 for (
const auto& entity : entities)
123 results[entity.category].add(entity.name);
// Fragment of a two-row dynamic-programming string comparison; the enclosing
// function header, the row initialisation, the row hand-over between
// iterations and the return statement are not part of this excerpt.
// m and n are the lengths of the two input strings.
146 int m = str1.length();
147 int n = str2.length();
// Two rows of n + 1 cells, both zero-initialised.
149 std::vector<int> prevRow(n + 1, 0);
150 std::vector<int> currRow(n + 1, 0);
// NOTE(review): body not visible; presumably seeds the first row -- confirm.
152 for (
int j = 0; j <= n; j++)
// One outer pass per character of str1, one inner pass per character of str2.
157 for (
int i = 1; i <= m; i++)
161 for (
int j = 1; j <= n; j++)
// Equal characters: carry the diagonal value over unchanged.
163 if (str1[i - 1] == str2[j - 1])
165 currRow[j] = prevRow[j - 1];
// Otherwise: one more than the smallest of the left, upper and diagonal
// neighbours.
169 currRow[j] = 1 + std::min(currRow[j - 1], std::min(prevRow[j], prevRow[j - 1]));
std::unordered_map< String, StringArray > findEntity(String text, const std::unordered_map< String, StringArray > &dictionary, float similarityThreshold=0.9f)
Search text for named entities held in dictionary.
Definition NER.cpp:61
std::tuple< float, String, String > getFuzzySimilarity(String text, const std::unordered_map< String, StringArray > &dictionary, float similarityThreshold)
Search for matching named entities using fuzzy string matching.
Definition NER.cpp:38