_n_e_r_8cpp_source.html

#include "NER.h"


namespace krotos

{


    std::vector<NER::Entity> NER::removeOverlapping(std::vector<Entity> entities)

    {

        // descending sort based on score, using start and end index to break ties

        std::sort(entities.begin(), entities.end(), [](const Entity& lhs, const Entity& rhs) {

            return std::make_tuple(lhs.score, -lhs.startIndex, lhs.endIndex) >

                   std::make_tuple(rhs.score, -rhs.startIndex, rhs.endIndex);

        });


        std::vector<Entity> results;

        std::set<std::pair<int, int>> indicesCovered;

        for (const auto& entity : entities)

        {

            bool update = true;

            for (const auto& [startCovered, endCovered] : indicesCovered)

            {

                if (!((entity.startIndex > endCovered) || (entity.endIndex < startCovered)))

                {

                    update = false;

                }

            }

            if (update)

            {

                results.push_back(entity);

                indicesCovered.insert(std::make_pair(entity.startIndex, entity.endIndex));

            }

        }


        // ascending sort based on start index

        std::sort(results.begin(), results.end(),

                  [](const Entity& lhs, const Entity& rhs) { return lhs.startIndex < rhs.startIndex; });

        return results;

    }


    /**
     * Find the dictionary entry that is most similar to `text`.
     *
     * Every phrase of every category is scored with stringSimilarity() and the
     * single best phrase whose score reaches `similarityThreshold` is reported.
     * All categories are examined before answering: the dictionary is an
     * unordered_map, so stopping at the first category that clears the threshold
     * would let a weaker match shadow a better one depending on hash order.
     * When several phrases share the best score, the one visited last wins.
     *
     * @param text                 text to look up (expected to be pre-processed by the caller)
     * @param dictionary           category -> list of phrases
     * @param similarityThreshold  minimum similarity a phrase must reach
     * @return {score, matching phrase, category}, or {-1.0f, "", ""} when no phrase
     *         reaches the threshold
     */
    std::tuple<float, String, String> NER::getFuzzySimilarity(String text,
                                                              const std::unordered_map<String, StringArray>& dictionary,
                                                              float similarityThreshold)
    {
        float  bestScore = -1.0f; // sentinel: nothing matched yet
        String bestMatch;
        String bestCategory;

        for (const auto& [category, lexicon] : dictionary)
        {
            for (const auto& entity : lexicon)
            {
                const auto score = stringSimilarity(text, entity);
                if (score >= similarityThreshold && score >= bestScore)
                {
                    bestScore    = score;
                    bestMatch    = entity;
                    bestCategory = category;
                }
            }
        }

        return {bestScore, bestMatch, bestCategory};
    }


    /**
     * Search `text` for named entities held in `dictionary`.
     *
     * The text is lower-cased, commas are removed and underscores become spaces.
     * It is then split on single spaces and every word-level n-gram (n from 1 up to
     * the longest dictionary phrase, counted in words) is fuzzy-matched against the
     * dictionary. Matches whose character ranges overlap are reduced by
     * removeOverlapping() before the result is assembled.
     *
     * @param text                 input text
     * @param dictionary           category -> list of phrases
     * @param similarityThreshold  minimum similarity for an n-gram to count as a match
     * @return category -> matched dictionary phrases, in order of appearance in the text
     */
    std::unordered_map<String, StringArray> NER::findEntity(String text,
                                                            const std::unordered_map<String, StringArray>& dictionary,
                                                            float similarityThreshold)
    {
        // check maximum number of words in dictionary word-level n-grams
        int maxNgrams = 1;
        for (const auto& [category, lexicon] : dictionary)
        {
            for (const auto& phrase : lexicon)
            {
                StringArray words;
                words.addTokens(phrase, false);
                maxNgrams = std::max(words.size(), maxNgrams);
            }
        }

        // pre-process input text
        text = text.toLowerCase().replace(",", "").replace("_", " ");

        // split input text into words/tokens
        StringArray tokens;
        tokens.addTokens(text, " ", "\"");

        // Character offset of every token inside `text`. Tokens are separated by exactly
        // one break character, so the next token starts one past the end of the previous
        // one. Computing the offsets up front (instead of searching for the n-gram text
        // from a cursor that only moved on a match) keeps the ranges exact even when an
        // n-gram also occurs inside an earlier, unmatched word ("rain" in "train rain").
        std::vector<int> tokenStarts;
        tokenStarts.reserve(static_cast<size_t>(tokens.size()));
        int offset = 0;
        for (const auto& token : tokens)
        {
            tokenStarts.push_back(offset);
            offset += token.length() + 1;
        }

        // search for matching named entities
        std::vector<Entity> entities;
        for (int n = 1; n <= maxNgrams && n <= tokens.size(); ++n)
        {
            // ngramsResult[i] is built from tokens[i] .. tokens[i + n - 1]
            const auto ngramsResult = ngrams(tokens, n);
            for (int i = 0; i < ngramsResult.size(); ++i)
            {
                const auto& candidate = ngramsResult[i];

                auto [score, name, category] = getFuzzySimilarity(candidate, dictionary, similarityThreshold);
                if (score > 0)
                {
                    // the n-gram starts where its first token starts
                    const int startIndex = tokenStarts[static_cast<size_t>(i)];
                    const int endIndex   = startIndex + candidate.length();

                    // storing best matching named entity
                    entities.emplace_back(name, category, score, startIndex, endIndex);
                }
            }
        }

        // post-process to handle entities with overlapping index ranges
        entities = removeOverlapping(entities);

        // return results as a dictionary
        std::unordered_map<String, StringArray> results;
        for (const auto& entity : entities)
        {
            results[entity.category].add(entity.name);
        }
        return results;
    }


    /**
     * Build the word-level n-grams of `tokens`.
     *
     * Element i of the result is tokens[i] .. tokens[i + n - 1] joined with single
     * spaces, so the result holds tokens.size() - n + 1 entries.
     *
     * @param tokens  words to combine
     * @param n       number of consecutive words per n-gram
     * @return the n-grams in order of appearance; empty when n < 1 or when there are
     *         fewer than n tokens
     */
    StringArray NER::ngrams(const StringArray& tokens, int n)
    {
        // n < 1 is not a valid n-gram size; without this guard n == 0 would produce
        // empty strings and a negative n would make joinIntoString() join to the end.
        if (n < 1 || tokens.size() < n)
            return {};

        if (n == 1)
            return tokens;

        StringArray results;
        const int lastStart = tokens.size() - n;
        for (int i = 0; i <= lastStart; ++i)
        {
            results.add(tokens.joinIntoString(" ", i, n));
        }
        return results;
    }


    /**
     * Compute the Levenshtein (edit) distance between two strings.
     *
     * Uses the two-row dynamic-programming formulation: prevRow holds the distances
     * for the first i-1 characters of str1, currRow is filled for the first i.
     *
     * @param str1  first string
     * @param str2  second string
     * @return minimum number of single-character insertions, deletions and
     *         substitutions needed to turn str1 into str2
     */
    int NER::levenshteinDistance(const String& str1, const String& str2)
    {
        const int m = str1.length();
        const int n = str2.length();

        std::vector<int> prevRow(n + 1, 0);
        std::vector<int> currRow(n + 1, 0);

        // distance from the empty prefix of str1 to each prefix of str2
        for (int j = 0; j <= n; ++j)
        {
            prevRow[j] = j;
        }

        for (int i = 1; i <= m; ++i)
        {
            currRow[0] = i;

            // looked up once per row rather than once per cell
            const auto current = str1[i - 1];

            for (int j = 1; j <= n; ++j)
            {
                if (current == str2[j - 1])
                {
                    currRow[j] = prevRow[j - 1];
                }
                else
                {
                    currRow[j] = 1 + std::min(currRow[j - 1], std::min(prevRow[j], prevRow[j - 1]));
                }
            }

            // the row just filled becomes the "previous" row; no copy needed
            prevRow.swap(currRow);
        }

        // prevRow is the last completed row. When str1 is empty the loop never runs and
        // this is still the initial row, giving n (returning currRow[n] would give 0).
        return prevRow[n];
    }


    /**
     * Normalised similarity between two strings.
     *
     * The Levenshtein distance is divided by the length of the longer string and
     * subtracted from one. Two empty strings are defined to be identical (1.0).
     *
     * @param str1  first string
     * @param str2  second string
     * @return 1 - levenshteinDistance(str1, str2) / max(length of str1, length of str2)
     */
    float NER::stringSimilarity(const String& str1, const String& str2)
    {
        const int longest = std::max(str1.length(), str2.length());

        // both inputs empty: nothing to compare, and the division below must be avoided
        if (longest == 0)
            return 1.0f;

        const auto edits = static_cast<float>(levenshteinDistance(str1, str2));
        return 1.0f - edits / static_cast<float>(longest);
    }


} // namespace krotos

NER.h

krotos::NER::stringSimilarity
float stringSimilarity(const String &str1, const String &str2)
Compute the string similarity.
Definition NER.cpp:177

krotos::NER::levenshteinDistance
int levenshteinDistance(const String &str1, const String &str2)
Compute the Levenshtein distance between strings.
Definition NER.cpp:144

krotos::NER::findEntity
std::unordered_map< String, StringArray > findEntity(String text, const std::unordered_map< String, StringArray > &dictionary, float similarityThreshold=0.9f)
Search text for named entities held in dictionary.
Definition NER.cpp:61

krotos::NER::getFuzzySimilarity
std::tuple< float, String, String > getFuzzySimilarity(String text, const std::unordered_map< String, StringArray > &dictionary, float similarityThreshold)
Search for matching named entities using fuzzy string matching.
Definition NER.cpp:38

krotos::NER::removeOverlapping
std::vector< Entity > removeOverlapping(std::vector< Entity > entities)
Remove overlapping entities (keep the highest-scoring; ties favour the earlier start, then the longer span)
Definition NER.cpp:5

krotos::NER::ngrams
StringArray ngrams(const StringArray &tokens, int n=1)
Compute ngrams for the given StringArray.
Definition NER.cpp:128

krotos
Definition AirAbsorptionFilter.cpp:2

krotos::NER::Entity
Definition NER.h:61