How to find matching keywords across multiple text blocks in C#

1 Answer

0 votes
using System;
using System.Collections.Generic;

/// <summary>
/// Console sample that finds the keywords shared by several blocks of text:
/// each text is tokenized into a set of words and the sets are intersected.
/// </summary>
class KeywordMatching
{
    /// <summary>
    /// Splits <paramref name="text"/> into a set of distinct, lower-cased words.
    /// </summary>
    /// <remarks>
    /// A word is a maximal run of characters for which
    /// <see cref="char.IsLetterOrDigit(char)"/> returns true. Every other
    /// character (spaces, punctuation, hyphens, ...) separates words and is
    /// discarded, so "data-driven" yields the two words "data" and "driven".
    /// </remarks>
    /// <param name="text">The text to tokenize. May be empty.</param>
    /// <returns>A new set holding each distinct word in lower case.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="text"/> is null.
    /// </exception>
    public static HashSet<string> Tokenize(string text)
    {
        if (text == null) {
            throw new ArgumentNullException(nameof(text));
        }

        HashSet<string> words = new HashSet<string>();

        // Index at which the word currently being scanned starts,
        // or -1 while the scan is between words.
        int wordStart = -1;

        for (int i = 0; i < text.Length; i++) {
            if (char.IsLetterOrDigit(text[i])) {
                if (wordStart < 0) {
                    wordStart = i;
                }
            }
            else if (wordStart >= 0) {
                // Slice the word out once instead of appending char by char.
                // Invariant lower-casing keeps tokens identical regardless of
                // the current culture (a Turkish culture would otherwise map
                // 'I' to dotless 'ı' and break matching against "i").
                words.Add(text.Substring(wordStart, i - wordStart).ToLowerInvariant());
                wordStart = -1;
            }
        }

        // Flush a word that runs up to the end of the text.
        if (wordStart >= 0) {
            words.Add(text.Substring(wordStart).ToLowerInvariant());
        }

        return words;
    }

    /// <summary>
    /// Returns the words that are present in every set of
    /// <paramref name="allSets"/> (the intersection of all sets).
    /// </summary>
    /// <param name="allSets">
    /// The word sets to intersect. The list and the sets it contains are
    /// not modified.
    /// </param>
    /// <returns>
    /// A new set with the common words; an empty set when
    /// <paramref name="allSets"/> contains no sets.
    /// </returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="allSets"/> is null.
    /// </exception>
    public static HashSet<string> FindMatchesMultiple(List<HashSet<string>> allSets)
    {
        if (allSets == null) {
            throw new ArgumentNullException(nameof(allSets));
        }

        if (allSets.Count == 0) {
            return new HashSet<string>();
        }

        // Work on a copy of the first set so the caller's data stays intact.
        HashSet<string> result = new HashSet<string>(allSets[0]);

        // Narrow the copy down with each remaining set. Once nothing is left
        // the intersection cannot grow again, so the loop stops early.
        for (int i = 1; i < allSets.Count && result.Count > 0; i++) {
            result.IntersectWith(allSets[i]);
        }

        return result;
    }

    /// <summary>
    /// Tokenizes three sample texts and prints the keywords common to all of them.
    /// </summary>
    static void Main()
    {
        // -------------------------------------------------------------
        // Three text blocks to compare
        // -------------------------------------------------------------
        string text1 =
            "Machine learning allows computers to learn from data. " +
            "It is widely used in modern applications.";

        string text2 =
            "Data science uses machine learning techniques. " +
            "Applications rely on data-driven models.";

        string text3 =
            "Modern applications of machine learning include data analysis, " +
            "automation, and intelligent systems.";

        // -------------------------------------------------------------
        // Tokenize all texts and collect the word sets in a list
        // -------------------------------------------------------------
        List<HashSet<string>> allSets = new List<HashSet<string>>()
        {
            Tokenize(text1),
            Tokenize(text2),
            Tokenize(text3)
        };

        // -------------------------------------------------------------
        // Find keyword matches across ALL texts
        // -------------------------------------------------------------
        HashSet<string> matches = FindMatchesMultiple(allSets);

        // -------------------------------------------------------------
        // Output results
        // -------------------------------------------------------------
        Console.WriteLine("Matched Keywords Across ALL Texts:");
        foreach (string w in matches) {
            Console.Write(w + " ");
        }

        Console.WriteLine();
    }
}



/*
run:

Matched Keywords Across ALL Texts:
machine learning data applications 

*/

 



answered 8 hours ago by avibootz

Related questions

...