How to group words in a string by the first N letters in C

1 Answer

0 votes
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

/*
 * One bucket of words that share the same leading prefix.
 * `words` is a growable array: `count` slots are in use out of
 * `capacity` allocated ones.
 */
typedef struct {
    /* Heap-allocated copy of the prefix that identifies this group. */
    char *prefix;
    /* Heap-allocated array of heap-allocated word copies, in insertion order. */
    char **words;
    /* Number of entries of `words` currently in use. */
    size_t count;
    /* Number of entries `words` has room for before it must grow. */
    size_t capacity;
} Group;

/*
 * Growable array of groups, kept in the order the groups were created.
 * A zero-initialized GroupList (`GroupList gl = {0};`) is a valid empty list.
 */
typedef struct {
    /* Heap-allocated array of groups; NULL while the list is empty. */
    Group *groups;
    /* Number of entries of `groups` currently in use. */
    size_t count;
    /* Number of entries `groups` has room for before it must grow. */
    size_t capacity;
} GroupList;

/*
 * Append a private copy of `word` to group `g`.
 *
 * The word array grows geometrically (4, 8, 16, ...) when full. The group
 * owns the copy it stores; the caller keeps ownership of `word`.
 *
 * Out-of-memory policy: the function has no way to report failure to its
 * caller, so it prints a message to stderr and terminates the program
 * instead of dereferencing a NULL pointer.
 */
void add_word_to_group(Group *g, const char *word) {
    /* Copy by hand: strdup() is POSIX, not ISO C before C23. */
    size_t len = strlen(word);
    char *copy = malloc(len + 1);
    if (!copy) {
        fprintf(stderr, "add_word_to_group: out of memory\n");
        exit(EXIT_FAILURE);
    }
    memcpy(copy, word, len + 1);

    if (g->count == g->capacity) {
        size_t new_capacity = g->capacity ? g->capacity * 2 : 4;
        /* Keep the old pointer until realloc is known to have succeeded. */
        char **grown = realloc(g->words, new_capacity * sizeof *grown);
        if (!grown) {
            free(copy);
            fprintf(stderr, "add_word_to_group: out of memory\n");
            exit(EXIT_FAILURE);
        }
        g->words = grown;
        g->capacity = new_capacity;
    }

    g->words[g->count++] = copy;
}

/*
 * Return the group in `gl` whose prefix equals `prefix`, creating an empty
 * one at the end of the list if none exists yet.
 *
 * Lookup is a linear scan using strcmp, so comparison is case-sensitive.
 * A newly created group stores its own copy of `prefix`.
 *
 * The returned pointer points into `gl->groups` and is only valid until the
 * next call that creates a group, because growing the array may move it.
 *
 * Out-of-memory policy: prints a message to stderr and terminates the
 * program rather than returning a pointer into an invalid array.
 */
Group *find_or_create_group(GroupList *gl, const char *prefix) {
    for (size_t i = 0; i < gl->count; i++) {
        if (strcmp(gl->groups[i].prefix, prefix) == 0) {
            return &gl->groups[i];
        }
    }

    /* Copy by hand: strdup() is POSIX, not ISO C before C23. */
    size_t prefix_len = strlen(prefix);
    char *prefix_copy = malloc(prefix_len + 1);
    if (!prefix_copy) {
        fprintf(stderr, "find_or_create_group: out of memory\n");
        exit(EXIT_FAILURE);
    }
    memcpy(prefix_copy, prefix, prefix_len + 1);

    if (gl->count == gl->capacity) {
        size_t new_capacity = gl->capacity ? gl->capacity * 2 : 4;
        /* Keep the old pointer until realloc is known to have succeeded. */
        Group *grown = realloc(gl->groups, new_capacity * sizeof *grown);
        if (!grown) {
            free(prefix_copy);
            fprintf(stderr, "find_or_create_group: out of memory\n");
            exit(EXIT_FAILURE);
        }
        gl->groups = grown;
        gl->capacity = new_capacity;
    }

    Group *g = &gl->groups[gl->count++];
    g->prefix = prefix_copy;
    g->words = NULL;
    g->count = 0;
    g->capacity = 0;
    return g;
}

/*
 * Split `s` into words and file each word under its first `n` letters.
 *
 * A word is a maximal run of characters for which isalpha() is true;
 * everything else is a separator. Each word is lower-cased before it is
 * grouped and stored. Words shorter than `n` letters are skipped. With
 * `n == 0` every word lands in a single group whose prefix is "".
 *
 * Groups are appended to `gl` in order of first appearance, and words keep
 * their order of appearance inside each group (duplicates are kept).
 *
 * `n` may be any value: the prefix is taken in place from the word buffer,
 * so there is no fixed-size prefix buffer to overflow.
 *
 * A NULL `s` or `gl` is treated as "nothing to do". If memory for a word
 * cannot be allocated, a message is printed to stderr and the program
 * terminates.
 */
void group_by_first_n_letters(const char *s, size_t n, GroupList *gl) {
    if (!s || !gl) {
        return;
    }

    const char *p = s;

    while (*p) {
        /* Skip separators. */
        while (*p && !isalpha((unsigned char)*p)) {
            p++;
        }
        if (!*p) {
            break;
        }

        /* Consume one word. */
        const char *start = p;
        while (*p && isalpha((unsigned char)*p)) {
            p++;
        }

        size_t len = (size_t)(p - start);
        if (len < n) {
            continue;
        }

        char *word = malloc(len + 1);
        if (!word) {
            fprintf(stderr, "group_by_first_n_letters: out of memory\n");
            exit(EXIT_FAILURE);
        }
        for (size_t i = 0; i < len; i++) {
            word[i] = (char)tolower((unsigned char)start[i]);
        }
        word[len] = '\0';

        /*
         * Temporarily cut the word at n characters so it can serve as the
         * lookup key, then restore it. n <= len here, so word[n] is always
         * inside the buffer (at worst it is the terminator itself).
         */
        char saved = word[n];
        word[n] = '\0';
        Group *g = find_or_create_group(gl, word);
        word[n] = saved;

        /* The group stores its own copy, so the scratch buffer is ours to free. */
        add_word_to_group(g, word);
        free(word);
    }
}

void print_groups(const GroupList *gl) {
    for (size_t i = 0; i < gl->count; i++) {
        printf("%s: ", gl->groups[i].prefix);
        for (size_t j = 0; j < gl->groups[i].count; j++)
            printf("%s ", gl->groups[i].words[j]);
        printf("\n");
    }
}

/*
 * Release every allocation owned by `gl` (each stored word, each word
 * array, each prefix, and the group array itself) and reset `gl` to the
 * empty state so it can be reused or safely freed again.
 */
static void free_group_list(GroupList *gl) {
    for (size_t i = 0; i < gl->count; i++) {
        Group *g = &gl->groups[i];
        for (size_t j = 0; j < g->count; j++) {
            free(g->words[j]);
        }
        free(g->words);
        free(g->prefix);
    }
    free(gl->groups);

    gl->groups = NULL;
    gl->count = 0;
    gl->capacity = 0;
}

/*
 * Demo: group the words of a sample sentence by their first three letters,
 * print the groups, then release all memory held by the group list.
 */
int main(void) {
    const char *s =
        "The lowly inhabitants of the lowland were surprised to see "
        "the lower branches of the trees.";

    GroupList gl = {0};

    group_by_first_n_letters(s, 3, &gl);
    print_groups(&gl);

    free_group_list(&gl);

    return 0;
}



/*
run:

the: the the the the 
low: lowly lowland lower 
inh: inhabitants 
wer: were 
sur: surprised 
see: see 
bra: branches 
tre: trees 

*/

 



answered Mar 13 by avibootz
...