How to find matching keywords across multiple text blocks in Ruby

1 Answer

0 votes
require 'set'

#
#    Tokenize text into a set of unique, lowercased words.
#    - A word is a maximal run of letters and digits (Unicode-aware, so
#      accented letters stay inside their word instead of splitting it)
#    - Everything else (punctuation, spaces, symbols) acts as a separator
#    - Words keep the order of their first appearance in the text
#
#    text    - String to split into words.
#    Returns - Set of String words; empty when the text contains no words.
#
def tokenize(text)
  # Scan first, downcase afterwards: only the matched words are lowercased,
  # so separator characters never influence what counts as a word.
  text.scan(/[[:alnum:]]+/).map(&:downcase).to_set
end

#
#    Find the keywords shared by every text.
#    -------------------------------------------------------------
#    all_sets - Array of word sets, one per text.
#    Returns  - the intersection of ALL sets:
#               * an empty Set when no sets are given
#               * a copy of the only set when exactly one is given
#               * otherwise a new Set holding the words of the first set
#                 that also occur in every other set, in first-set order
#    The input sets are never modified.
#
def find_matches_multiple(all_sets)
  return Set.new if all_sets.empty?

  # Seed with a copy of the first set, then narrow it down one set at a time.
  all_sets.drop(1).reduce(all_sets.first.dup) do |common, other|
    Set.new(common.select { |word| other.include?(word) })
  end
end

#
# -------------------------------------------------------------
# Three text blocks to compare
# -------------------------------------------------------------
#
text1 =
  "Machine learning allows computers to learn from data. " \
  "It is widely used in modern applications."

text2 =
  "Data science uses machine learning techniques. " \
  "Applications rely on data-driven models."

text3 =
  "Modern applications of machine learning include data analysis, " \
  "automation, and intelligent systems."

#
# -------------------------------------------------------------
# Tokenize all texts
# -------------------------------------------------------------
#
# One word set per text, collected into an array for multi-text comparison
all_sets = [text1, text2, text3].map { |text| tokenize(text) }

#
# -------------------------------------------------------------
# Find keyword matches across ALL texts
# -------------------------------------------------------------
#
matches = find_matches_multiple(all_sets)

#
# -------------------------------------------------------------
# Output results
# -------------------------------------------------------------
#
# The matches come back in the order the words appear in the first text, so
# sort them for stable alphabetical output. Joining avoids a trailing space
# and lets puts terminate the line with a newline.
puts "Matched Keywords Across ALL Texts:"
puts matches.sort.join(" ")



#
# run:
#
# Matched Keywords Across ALL Texts:
# applications data learning machine
#

 



answered 4 hours ago by avibootz

Related questions

...