Shamrock 2025.10.0
Astrophysical Code
Loading...
Searching...
No Matches
string_histogram.cpp
Go to the documentation of this file.
1// -------------------------------------------------------//
2//
3// SHAMROCK code for hydrodynamics
4// Copyright (c) 2021-2026 Timothée David--Cléris <tim.shamrock@proton.me>
5// SPDX-License-Identifier: CeCILL Free Software License Agreement v2.1
6// Shamrock is licensed under the CeCILL 2.1 License, see LICENSE for more information
7//
8// -------------------------------------------------------//
9
16#include "shambase/checksum.hpp"
18#include "shambase/string.hpp"
23#include "shamcomm/wrapper.hpp"
24#include <unordered_map>
25#include <string>
26#include <vector>
27
28namespace {
29 inline std::vector<std::string> _gather_strings_internal(
30 const std::vector<std::string> &inputs, const std::string &delimiter, bool is_allgather) {
31 std::string accum_loc = "";
32 for (auto &s : inputs) {
33 accum_loc += s + delimiter;
34 }
35 std::string recv = "";
36
37 if (is_allgather) {
39 return shambase::split_str(recv, delimiter);
40 }
41
42 shamalgs::collective::gather_str(accum_loc, recv);
43 if (shamcomm::world_rank() == 0) {
44 return shambase::split_str(recv, delimiter);
45 }
46
47 return {};
48 }
49
50 std::unordered_map<std::string, int> _string_histogram_all_fetch(
51 const std::vector<std::string> &inputs, std::string delimiter, bool is_allgather) {
52
53 auto splitted = _gather_strings_internal(inputs, delimiter, is_allgather);
54
55 if (is_allgather || shamcomm::world_rank() == 0) {
56 std::unordered_map<std::string, int> histogram;
57 for (size_t i = 0; i < splitted.size(); i++) {
58 histogram[splitted[i]] += 1;
59 }
60 return histogram;
61 }
62
63 return {};
64 }
65
66 inline auto hash_inputs(const std::vector<std::string> &inputs) {
67 std::vector<u64_2> fnv1a_in(inputs.size());
68 for (size_t i = 0; i < inputs.size(); i++) {
69 fnv1a_in[i] = {
70 shambase::fnv1a_hash(inputs[i].data(), inputs[i].size()), shamcomm::world_rank()};
71 }
72 return fnv1a_in;
73 }
74
75 struct CaseInfo {
76 u64 min_rank_id;
77 u64 count;
78 };
79
80 auto data_to_case_info(const std::vector<u64_2> &data) {
81 std::unordered_map<u64, CaseInfo> hash_case_info = {};
82 for (size_t i = 0; i < data.size(); i++) {
83 auto hash = data[i].x();
84 auto rank = data[i].y();
85
86 if (hash_case_info.find(hash) == hash_case_info.end()) {
87 hash_case_info[hash] = {rank, 1};
88 } else {
89 hash_case_info[hash].count += 1;
90 hash_case_info[hash].min_rank_id = std::min(hash_case_info[hash].min_rank_id, rank);
91 }
92 }
93 return hash_case_info;
94 }
95
96 std::unordered_map<std::string, int> _string_histogram_hash_fetch(
97 const std::vector<std::string> &inputs, std::string delimiter, bool is_allgather) {
98
99 // compute the hash of the inputs
100 std::vector<u64_2> fnv1a_in = hash_inputs(inputs);
101
102 // gather the hashes
103 std::vector<u64_2> fnv1a_recv;
104 shamalgs::collective::vector_allgatherv(fnv1a_in, fnv1a_recv, MPI_COMM_WORLD);
105
106 // list all the unique hashes, the minimum rank id for the first occurrence and their count
107 std::unordered_map<u64, CaseInfo> hash_case_info = data_to_case_info(fnv1a_recv);
108
109 // restrict the inputs to the minimum rank id for the first occurrence
110 std::vector<std::string> restricted_inputs = {};
111 for (size_t i = 0; i < inputs.size(); i++) {
112 auto hash = fnv1a_in[i].x();
113 if (hash_case_info[hash].min_rank_id == shamcomm::world_rank()) {
114 restricted_inputs.push_back(inputs[i]);
115 }
116 }
117
118 auto histogram = _string_histogram_all_fetch(restricted_inputs, delimiter, is_allgather);
119
120 // override the histogram from the hash case info
121 for (auto &[word, cnt] : histogram) {
122 auto fnv = shambase::fnv1a_hash(word.data(), word.size());
123 cnt = static_cast<int>(hash_case_info[fnv].count);
124 }
125
126 return histogram;
127 }
128
129} // namespace
130
131std::unordered_map<std::string, int> shamalgs::collective::string_histogram(
132 const std::vector<std::string> &inputs, std::string delimiter, bool hash_based) {
133
134 if (hash_based) {
135 return _string_histogram_hash_fetch(inputs, delimiter, false);
136 }
137
138 return _string_histogram_all_fetch(inputs, delimiter, false);
139}
140
141std::unordered_map<std::string, int> shamalgs::collective::all_string_histogram(
142 const std::vector<std::string> &inputs, std::string delimiter, bool hash_based) {
143
144 if (hash_based) {
145 return _string_histogram_hash_fetch(inputs, delimiter, true);
146 }
147
148 return _string_histogram_all_fetch(inputs, delimiter, true);
149}
std::uint64_t u64
64 bit unsigned integer
std::vector< int > vector_allgatherv(const std::vector< T > &send_vec, const MPI_Datatype &send_type, std::vector< T > &recv_vec, const MPI_Datatype &recv_type, const MPI_Comm comm)
allgatherv on vector with size query (size querying variant of vector_allgatherv_ks) //TODO add fault...
Definition exchanges.hpp:98
MPI string gather / allgather helpers (declarations; implementations in shamalgs/src/collective/gathe...
void allgather_str(const std::string &send_vec, std::string &recv_vec)
Allgathers a string from all nodes and concatenates it in a std::string.
void gather_str(const std::string &send_vec, std::string &recv_vec)
Gathers a string from all nodes and store the result in a std::string.
u64 fnv1a_hash(const char *data, size_t size)
Compute the FNV-1a hash of a given data.
Definition checksum.hpp:31
std::vector< std::string > split_str(std::string s, std::string delimiter)
Splits a string into a vector of substrings according to a delimiter.
Definition string.hpp:290
i32 world_rank()
Gives the rank of the current process in the MPI communicator.
Definition worldInfo.cpp:40
This file contains the definition for the stacktrace related functionality.
MPI string gather / allgather helpers (declarations; implementations in shamalgs/src/collective/gathe...
std::unordered_map< std::string, int > string_histogram(const std::vector< std::string > &inputs, std::string delimiter, bool hash_based)
Constructs a histogram from a vector of strings, counting occurrences of each unique string.
std::unordered_map< std::string, int > all_string_histogram(const std::vector< std::string > &inputs, std::string delimiter, bool hash_based)
same as string_histogram but with result return on every rank
Functions related to the MPI communicator.