(기록용) Counting word frequency with functional programming in C++.

이경헌·2023년 10월 20일
#include <range/v3/view/chunk_by.hpp>
#include <range/v3/view/split_when.hpp>

import std;

struct case_insensitive_hash{
    constexpr std::size_t operator()(std::string_view text) const{
        return std::accumulate(text.begin(), text.end(), 0UZ, [](std::size_t acc, char c){
            return acc ^ std::hash<char>{}(c) + 0x9e3779b9 + (acc << 6) + (acc >> 2);
        });
    }
};

struct case_insensitive_equal{
    constexpr bool operator()(std::string_view lhs, std::string_view rhs) const{
        if (lhs.size() != rhs.size()){
            return false;
        }

        return std::ranges::equal(lhs, rhs, [](char c1, char c2) {
            return std::tolower(c1) == std::tolower(c2);
        });
    }
};

std::string read_file(const std::filesystem::path &path){
    std::fstream file { path };
    std::stringstream buffer;
    buffer << file.rdbuf();

    return buffer.str();
}

std::string to_lower(std::string text){
    std::ranges::for_each(text, [](char &c){
        c = std::tolower(c);
    });
    return text;
}

int main(int argc, char **argv) {
    auto text = read_file(argv[1]);
    auto words = text
            | ranges::views::split_when([](char c) {
                return !std::isalpha(c);
            })
            | std::views::transform([](auto &&rng) {
                return std::string_view { &*rng.begin(), static_cast<std::size_t>(std::ranges::distance(rng)) };
            });

    std::unordered_map<std::string_view, std::size_t, case_insensitive_hash, case_insensitive_equal> frequencies;
    for (std::string_view word : words){
        ++frequencies[word];
    }

    auto sorted_frequencies = frequencies
            | std::views::transform([](auto &&kv){
                auto &&[key, value] = kv;
                return std::make_pair(value, key);
            })
            | std::ranges::to<std::vector>();
    std::ranges::sort(sorted_frequencies, std::greater{}, [](const auto &kv) { return kv.first; });

    auto result = sorted_frequencies
            | ranges::views::chunk_by([](const auto &kv1, const auto &kv2) {
                return kv1.first == kv2.first;
            })
            | std::views::transform([](auto &&rng) {
                return std::make_pair(
                        rng.front().first,
                        rng | std::views::values
                            | std::views::transform([](std::string_view text) {
                                return to_lower(std::string { text });
                            })
                            | std::ranges::to<std::vector>()
                );
            });

    auto header = std::format("{: >10} | {: >10} | {}", "Occurrence", "Count", "Words");
    std::println("{}\n{:-^{}}", header, "", header.size());

    for (auto &&[frequency, words] : result){
        std::println("{: >10} | {: >10} | {}", frequency, words.size(), words);
    }
}

No heap allocation except loading text into string, unordered_map construction, and lowering words at the final stage. Hashing and equal comparsion is consistent for character casing.

Compiler: Clang 17
External libraries: range-v3

profile
Undergraduate student in Korea University. Major in electrical engineering and computer science.

0개의 댓글