How to implement basic text standardization in C++

1 Answer

0 votes
#include <cctype>
#include <codecvt> // codecvt_utf8
#include <iostream>
#include <regex>
#include <string>

/// @brief Standardize text so that superficially different sentences compare equal.
///
/// The input is treated as a UTF-8 byte sequence and reduced in a single pass:
///   - non-ASCII characters are dropped,
///   - ASCII letters are lower-cased,
///   - everything that is not a letter, digit, underscore or whitespace is removed,
///   - each run of whitespace (including whitespace that becomes adjacent once
///     punctuation is removed) collapses to one space character.
///
/// Leading and trailing whitespace is collapsed to a single space, not trimmed.
///
/// @param text Input text, expected to be UTF-8. Malformed sequences do not
///             cause an error; their bytes are simply discarded.
/// @return The standardized string.
std::string standardize_text(const std::string& text) {
    std::string result;
    result.reserve(text.size());

    // Set when whitespace has been seen since the last emitted character; the
    // single replacement space is written lazily so a whole run yields one ' '.
    bool pending_space = false;

    for (const char raw : text) {
        // <cctype> functions require a value representable as unsigned char;
        // passing a negative plain char is undefined behaviour.
        const auto byte = static_cast<unsigned char>(raw);

        // Every byte of a multi-byte UTF-8 sequence has the high bit set, so
        // skipping those bytes removes exactly the non-ASCII characters without
        // needing a decoder (and without throwing on malformed input).
        if (byte >= 0x80) {
            continue;
        }

        if (std::isspace(byte)) {
            pending_space = true;
            continue;
        }

        // Anything that is not a word character is punctuation or a control
        // character and is removed.
        if (!std::isalnum(byte) && byte != '_') {
            continue;
        }

        if (pending_space) {
            result += ' ';
            pending_space = false;
        }
        result += static_cast<char>(std::tolower(byte));
    }

    // A trailing whitespace run still contributes its single space.
    if (pending_space) {
        result += ' ';
    }

    return result;
}

// Demo driver: standardizes two sample sentences and prints each result on
// its own line.
int main() {
    const std::string first_input{"the Quick, BROWN Fox Isnt Jumps OVER the lazy dog!!!"};
    const std::string second_input{"The quick;   BROWN big Fox Isn't Jumps    OVER the lãzy dog!"};

    // Both inputs are standardized before anything is written to stdout.
    const std::string first_output = standardize_text(first_input);
    const std::string second_output = standardize_text(second_input);

    std::cout << first_output << std::endl
              << second_output << std::endl;

    return 0;
}



/*
run: 
 
the quick brown fox isnt jumps over the lazy dog
the quick brown big fox isnt jumps over the lzy dog
 
*/

 



answered Nov 25, 2024 by avibootz
...