How to get the word before the last word from a string (edge‑case‑safe) in C

1 Answer

#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <stdbool.h>
#include <stdlib.h>

/*
C has no built-in Unicode-aware regex, so a helper function,
is_japanese_comma, detects the specific 3-byte UTF-8 sequence
of the Japanese delimiter.

Unsigned char casting: the UTF-8 bytes of non-ASCII characters
are always > 127, so each byte is cast to unsigned char before
it is compared or passed to <ctype.h> functions such as isspace().
*/

/**
 * Checks if a byte is a standard ASCII separator (whitespace or punctuation).
 *
 * A byte is a separator when isspace() accepts it or when it is one of the
 * punctuation characters in the table below (comma, period, '!', '?', ';',
 * ':', double and single quote, the three bracket pairs, hyphen, slash and
 * backslash).
 *
 * @param c  Byte to classify, already converted to unsigned char so it is a
 *           valid argument for the <ctype.h> functions.
 * @return   true if c is a separator, false otherwise. The NUL byte is never
 *           a separator.
 */
bool is_ascii_separator(unsigned char c) {
    static const char punct[] = ",.!?;:\"'()[]{}-/\\";

    /* strchr() treats the terminating NUL as part of the searched string,
     * so searching for 0 would "find" it; reject NUL up front. */
    if (c == '\0') {
        return false;
    }
    if (isspace(c)) {
        return true;
    }
    return strchr(punct, c) != NULL;
}

/**
 * Tests whether the bytes starting at s are the UTF-8 encoding of the
 * Japanese comma '、' (0xE3 0x80 0x81).
 *
 * The bytes are examined strictly in order and the check stops at the first
 * mismatch, so s[1] is read only when s[0] matched, and s[2] only when s[1]
 * matched as well.
 *
 * @param s  Pointer to the first byte to inspect.
 * @return   true if the three-byte sequence is present at s, false otherwise.
 */
bool is_japanese_comma(const unsigned char *s) {
    if (s[0] != 0xE3) {
        return false;
    }
    if (s[1] != 0x80) {
        return false;
    }
    return s[2] == 0x81;
}

/**
 * Returns the length in bytes of the separator starting at p, or 0 when p
 * points at a word byte: 1 for an ASCII separator, 3 for the Japanese comma.
 */
static size_t separator_length(const char *p) {
    if (is_ascii_separator((unsigned char)*p)) {
        return 1;
    }
    if (is_japanese_comma((const unsigned char *)p)) {
        return 3;
    }
    return 0;
}

/**
 * Extracts the word before the last word.
 *
 * A word is a maximal run of bytes that are neither ASCII separators
 * (is_ascii_separator) nor part of a Japanese comma (is_japanese_comma).
 * The text is scanned once, remembering only where the two most recent
 * words start and how long they are, so the only allocation is the result.
 *
 * @param text  NUL-terminated input string; may be NULL.
 * @return      Heap-allocated copy of the second-to-last word that the
 *              caller must free, or NULL if text is NULL, contains fewer
 *              than two words, or the allocation fails.
 */
char* get_word_before_last(const char *text) {
    if (text == NULL) {
        return NULL;
    }

    const char *prev_start = NULL;   /* second most recent completed word */
    size_t prev_len = 0;
    const char *last_start = NULL;   /* most recent completed word */
    size_t last_len = 0;
    const char *word_start = NULL;   /* word currently being scanned, if any */

    const char *p = text;
    while (*p != '\0') {
        size_t sep_len = separator_length(p);

        if (sep_len == 0) {
            if (word_start == NULL) {
                word_start = p;
            }
            p++;
            continue;
        }

        /* A separator closes the word in progress (if there is one). */
        if (word_start != NULL) {
            prev_start = last_start;
            prev_len = last_len;
            last_start = word_start;
            last_len = (size_t)(p - word_start);
            word_start = NULL;
        }

        /* A matched Japanese comma has three non-NUL bytes, so skipping
         * sep_len bytes cannot step past the string terminator. */
        p += sep_len;
    }

    /* Flush a word that runs up to the end of the string. */
    if (word_start != NULL) {
        prev_start = last_start;
        prev_len = last_len;
        last_start = word_start;
        last_len = (size_t)(p - word_start);
    }

    /* prev_start is set only once at least two words have been seen. */
    if (prev_start == NULL) {
        return NULL;
    }

    char *result = malloc(prev_len + 1);
    if (result == NULL) {
        return NULL;
    }
    memcpy(result, prev_start, prev_len);
    result[prev_len] = '\0';
    return result;
}

/**
 * Test driver: runs get_word_before_last over a fixed set of sample strings
 * and prints each input followed by the extracted word, or "null" when the
 * function returns NULL.
 */
int main() {
    static const char *const samples[] = {
        "python c++",
        "  many   spaces   here   now  ",
        "OneWord",
        "",
        "   ",
        "Hello, world!",
        "Tabs\tand\nnewlines work too",
        "Unicode 世界、こんにちは",
        "Ends with punctuation.",
        "Multiple words, with punctuation, here!",
        "state-of-the-art program example"
    };
    const size_t sample_count = sizeof samples / sizeof samples[0];

    printf("=== Testing: Get Word Before Last ===\n\n");

    for (size_t idx = 0; idx < sample_count; idx++) {
        const char *input = samples[idx];
        char *word = get_word_before_last(input);

        printf("Input: \"%s\"\n", input);
        if (word != NULL) {
            printf("Output: %s\n", word);
        } else {
            printf("Output: %s\n", "null");
        }
        printf("----------------------------------------\n");

        /* The returned string is heap-allocated; free(NULL) is a no-op. */
        free(word);
    }

    return 0;
}

/*
OUTPUT:

=== Testing: Get Word Before Last ===

Input: "python c++"
Output: python
----------------------------------------
Input: "  many   spaces   here   now  "
Output: here
----------------------------------------
Input: "OneWord"
Output: null
----------------------------------------
Input: ""
Output: null
----------------------------------------
Input: "   "
Output: null
----------------------------------------
Input: "Hello, world!"
Output: Hello
----------------------------------------
Input: "Tabs	and
newlines work too"
Output: work
----------------------------------------
Input: "Unicode 世界、こんにちは"
Output: 世界
----------------------------------------
Input: "Ends with punctuation."
Output: with
----------------------------------------
Input: "Multiple words, with punctuation, here!"
Output: punctuation
----------------------------------------
Input: "state-of-the-art program example"
Output: program
----------------------------------------

*/
70+ SQL courses for beginners and professionals
answered Mar 28 by avibootz
Most popular tags

How to get the word before the last word from a string (edge‑case‑safe) in C

1 Answer

Related questions