How to group words by every prefix in Python

1 Answer

import re
from collections import defaultdict

def group_by_all_prefixes(s):
    """Map every prefix of every word in *s* to the words that start with it.

    The text is lowercased and split into maximal runs of ASCII letters
    (digits, punctuation and non-ASCII characters act as separators).
    Each word is filed under all of its prefixes, from its first letter
    up to the full word.

    A word that occurs several times in *s* is listed once per
    occurrence, and both the prefixes and the words inside each group
    keep the order in which they were first encountered.

    Returns a plain ``dict`` of ``prefix -> list of words``.
    """
    buckets = {}

    for match in re.finditer(r"[a-zA-Z]+", s.lower()):
        token = match.group()
        # end runs 1..len(token), so the last prefix is the whole word.
        for end in range(1, len(token) + 1):
            buckets.setdefault(token[:end], []).append(token)

    return buckets


s = "The Lowly inhabitants of the lowland were surprised to see the lower branches of the trees."

groups = group_by_all_prefixes(s)

# Keep only the prefixes whose group holds at least two entries
# (a word repeated in the sentence counts once per occurrence).
filtered = {p: groups[p] for p in groups if len(groups[p]) > 1}

# Layout 1: prefix followed by the raw Python list.
for prefix in filtered:
    words = filtered[prefix]
    print(prefix, ":", words)

print()

# Layout 2: prefix followed by the words as a comma-separated line.
for prefix in filtered:
    words = filtered[prefix]
    print(f"{prefix}: {', '.join(words)}")

print()

# Layout 3: prefix with its length and the size of its group.
for prefix in filtered:
    words = filtered[prefix]
    print(f"{prefix} | prefix_len={len(prefix)} | group_count={len(words)} | {words}")


'''
run:
  
t : ['the', 'the', 'to', 'the', 'the', 'trees']
th : ['the', 'the', 'the', 'the']
the : ['the', 'the', 'the', 'the']
l : ['lowly', 'lowland', 'lower']
lo : ['lowly', 'lowland', 'lower']
low : ['lowly', 'lowland', 'lower']
lowl : ['lowly', 'lowland']
o : ['of', 'of']
of : ['of', 'of']
s : ['surprised', 'see']

t: the, the, to, the, the, trees
th: the, the, the, the
the: the, the, the, the
l: lowly, lowland, lower
lo: lowly, lowland, lower
low: lowly, lowland, lower
lowl: lowly, lowland
o: of, of
of: of, of
s: surprised, see

t | prefix_len=1 | group_count=6 | ['the', 'the', 'to', 'the', 'the', 'trees']
th | prefix_len=2 | group_count=4 | ['the', 'the', 'the', 'the']
the | prefix_len=3 | group_count=4 | ['the', 'the', 'the', 'the']
l | prefix_len=1 | group_count=3 | ['lowly', 'lowland', 'lower']
lo | prefix_len=2 | group_count=3 | ['lowly', 'lowland', 'lower']
low | prefix_len=3 | group_count=3 | ['lowly', 'lowland', 'lower']
lowl | prefix_len=4 | group_count=2 | ['lowly', 'lowland']
o | prefix_len=1 | group_count=2 | ['of', 'of']
of | prefix_len=2 | group_count=2 | ['of', 'of']
s | prefix_len=1 | group_count=2 | ['surprised', 'see']

'''

70+ SQL courses for beginners and professionals

answered 2 days ago by avibootz
edited 2 days ago by avibootz

Most popular tags

How to group words by every prefix in Python

1 Answer

Related questions