42 std::array<i32, 9> seq = {0};
61 std::array<i32, 9> seq = {0};
78 std::array<i32, 9> seq = {0};
95 std::array<f64, 9> seq = {0};
112 std::array<f64, 9> seq = {0};
129 std::array<f64, 30> coefs = {0};
133 coefs[3] = 0.16666666666666666;
134 coefs[4] = 0.041666666666666664;
135 coefs[5] = 0.008333333333333333;
136 coefs[6] = 0.001388888888888889;
137 coefs[7] = 0.0001984126984126984;
138 coefs[8] = 2.48015873015873e-05;
139 coefs[9] = 2.7557319223985893e-06;
140 coefs[10] = 2.755731922398589e-07;
141 coefs[11] = 2.505210838544172e-08;
142 coefs[12] = 2.08767569878681e-09;
143 coefs[13] = 1.6059043836821613e-10;
144 coefs[14] = 1.1470745597729725e-11;
145 coefs[15] = 7.647163731819816e-13;
146 coefs[16] = 4.779477332387385e-14;
147 coefs[17] = 2.8114572543455206e-15;
148 coefs[18] = 1.5619206968586225e-16;
149 coefs[19] = 8.22063524662433e-18;
150 coefs[20] = 4.110317623312165e-19;
151 coefs[21] = 1.9572941063391263e-20;
152 coefs[22] = 8.896791392450574e-22;
153 coefs[23] = 3.868170170630684e-23;
154 coefs[24] = 1.6117375710961184e-24;
155 coefs[25] = 6.446950284384474e-26;
156 coefs[26] = 2.4795962632247976e-27;
157 coefs[27] = 9.183689863795546e-29;
158 coefs[28] = 3.279889237069838e-30;
159 coefs[29] = 1.1309962886447716e-31;
177 template<
class T,
class Extents1,
class Layout1,
class Accessor1>
180 std::array<i32, 9> &seq_mk,
181 std::array<f64, 9> &seq_theta_mk,
182 const std::mdspan<T, Extents1, Layout1, Accessor1> &A,
187 m_star = seq_mk[K - 1];
192 mat_L1_norm<T>(A, norm_A);
194 i32 s_tilde =
static_cast<i32>(sycl::ceil(
196 static_cast<f64>(0.0),
197 static_cast<f64>(sycl::log2(norm_A / seq_theta_mk[K - 1])))));
201 for (; k < (K + 1) && !cond; k++) {
202 cond = (norm_A <= seq_theta_mk[k - 1]) && (norm_A <= seq_theta_mk[K - 1]);
203 m_star = cond * seq_mk[k - 1] + !cond * m_star;
204 k_star = cond * k + !cond * k_star;
206 i32 k_choice = sham::min(K, k);
207 auto ld_7 = [&](
i32 s_val) {
208 k_star = k_choice - 1;
209 s_star = sham::max(
static_cast<i32>(0), s_val);
210 m_star = seq_mk[k_star - 1];
213 auto ld_8 = [&](
i32 s_val) {
214 k_star = k_choice - 2;
215 s_star = sham::max(
static_cast<i32>(1), s_val + 1);
216 m_star = seq_mk[k_star - 1];
219 auto ld_9 = [&](
i32 s_val) {
220 k_star = k_choice - 3;
221 s_star = sham::max(
static_cast<i32>(2), s_val + 2);
222 m_star = seq_mk[k_star - 1];
225 f64 cmp_1 = norm_A / (1 << s_tilde);
227 i32 val_2 = ((k_choice >= 8) && (cmp_1 <= 2 * seq_theta_mk[k_choice - 3])) * 2;
228 i32 val_3 = ((k_choice >= 9) && (cmp_1 <= 4 * seq_theta_mk[k_choice - 4])) * 3;
229 i32 val_1 = ((k_choice >= 7) && (cmp_1 <= seq_theta_mk[k_choice - 2]));
231 i32 val = sham::max(val_1, sham::max(val_2, val_3));
233 auto process_val = [&](
int val) {
236 }
else if (val == 2) {
238 }
else if (val == 3) {
277 std::array<f64, 30> &bi_seq,
279 const std::mdspan<T, Extents1, Layout1, Accessor1> &A,
280 const std::mdspan<T, Extents2, Layout2, Accessor2> &F,
281 const std::mdspan<T, Extents3, Layout3, Accessor3> &B,
282 const std::mdspan<T, Extents4, Layout4, Accessor4> &I,
283 const std::mdspan<T, Extents5, Layout5, Accessor5> &Id) {
286 for (
auto k = r - 1; k >= 0; k--) {
287 mat_set_identity<T>(I);
288 mat_set_identity<T>(Id);
292 for (
auto j = 1; j <= q; j++) {
294 mat_gemm<T, U>(1, A, Id, 0, I);
296 mat_axpy_beta<T, U>(bi_seq[cc], I, 1, B);
298 mat_set_identity<T>(Id);
301 mat_axpy_beta<T, U>(1, B, 1, F);
302 mat_axpy_beta<T, U>(1 - cond, Id, cond, I);
304 mat_gemm<T, U>(1, F, I, 0, B);
307 mat_plus_equal_scalar_id<T, U>(F, 1);
338 const std::mdspan<T, Extents1, Layout1, Accessor1> &A,
339 const std::mdspan<T, Extents2, Layout2, Accessor2> &F,
340 const std::mdspan<T, Extents3, Layout3, Accessor3> &B,
341 const std::mdspan<T, Extents4, Layout4, Accessor4> &I,
342 const std::mdspan<T, Extents5, Layout5, Accessor5> &Id,
343 const size_t size_A) {
350 i32 k_star{0}, m_star{0}, s_star{0};
352 order_scale<T>(K, seq_mk, seq_ntheta_mk, A, size_A, k_star, m_star, s_star);
353 i32 r = seq_rk[k_star - 1];
354 i32 q = seq_qk[k_star - 1];
356 i32 pw = (1 << s_star);
357 f64 scale_factor = 1.0 / pw;
358 mat_mul_scalar<T>(A, scale_factor);
361 taylor_eval<T, U>(r, q, seq_bi, size_A, A, F, B, I, Id);
364 mat_set_identity<T>(Id);
365 mat_set_identity<T>(I);
367 for (
auto j = 1; j <= pw; j++) {
369 mat_gemm<T, U>(1, F, Id, 0, I);
double f64
Alias for double.
std::int32_t i32
32 bit integer
namespace for math utility
void taylor_eval(const i32 r, const i32 q, std::array< f64, 30 > &bi_seq, const size_t size, const std::mdspan< T, Extents1, Layout1, Accessor1 > &A, const std::mdspan< T, Extents2, Layout2, Accessor2 > &F, const std::mdspan< T, Extents3, Layout3, Accessor3 > &B, const std::mdspan< T, Extents4, Layout4, Accessor4 > &I, const std::mdspan< T, Extents5, Layout5, Accessor5 > &Id)
This function compute the Taylor polynomial up to order m_star.
constexpr auto sequence_nheta_mk()
precomputed optimal sequence based on backward error analysis
void order_scale(const i32 K, std::array< i32, 9 > &seq_mk, std::array< f64, 9 > &seq_theta_mk, const std::mdspan< T, Extents1, Layout1, Accessor1 > &A, const size_t size_A, i32 &k_star, i32 &m_star, i32 &s_star)
this function compute the Taylor's polynomial order (m_star) the optimal number of matrix product dur...
constexpr auto define_bexp_coef()
1/(i!)
constexpr auto sequence_rk()
precomputed optimal Paterson-Stockmeyer polynomial degrees
constexpr auto sequence_qk()
precomputed optimal Paterson-Stockmeyer intergers (it's used to compute the matrix power)
constexpr auto sequence_theta_mk()
precomputed optimal sequence based on backward error analysis
void mat_exp(const i32 K, const std::mdspan< T, Extents1, Layout1, Accessor1 > &A, const std::mdspan< T, Extents2, Layout2, Accessor2 > &F, const std::mdspan< T, Extents3, Layout3, Accessor3 > &B, const std::mdspan< T, Extents4, Layout4, Accessor4 > &I, const std::mdspan< T, Extents5, Layout5, Accessor5 > &Id, const size_t size_A)
matrix scaling-squaring Talylor-based matrix exponential
constexpr auto sequence_mk()
precomputed optimal Taylor's polynomial orders