// polymult.cpp (Set A, with all inner pragmas kept, and .user/.id/.dest removed) #include "ntt.h" // Dilithium modulus and 1/N mod q for N = 256 coeff_t q = 8380417; coeff_t inv_n = 8347681; // 256 * 8347681 ≡ 1 (mod 8380417) //double_coeff_t v = 20159; /*coeff_t mod(double_coeff_t A) { #pragma HLS inline OFF //double_coeff_t v = (double_coeff_t) ((1<<26) + 1664)/q; double_coeff_t t = (v * A + (1 << 25)) >> 26; t = t * q; coeff_t val; if (A < t) val = A - t + q; else val = A - t; return val; }*/ /* Kyber ap_uint<13> m = 5039; coeff_t mod(double_coeff_t A) { #pragma HLS pipeline II = 1 coeff_t val; ap_uint<36> t123 = m * A; ap_uint<12> t = (t123 >> 24); ap_uint<24> ta = t * q; ap_uint<24> c = A - ta; if (c > q) val = (coeff_t) (c - q); else val = (coeff_t) c; return val; } coeff_t modadd(coeff_t x, coeff_t y) { #pragma HLS inline coeff_t w = x + y; return (coeff_t)(w - (w < q ? (coeff_t)0 : q)); } coeff_t modsub(coeff_t x, coeff_t y) { #pragma HLS inline coeff_t s = x + (x > y ? (coeff_t)0 : q); return (coeff_t)(s - y); } */ //----------------------------------------- // Start "For Dilithium" // mod, modadd, modsub for Dilithium // Generic modular reduction for coeff_t / double_coeff_t // Works for any |A| < 2^63 and any |q| < 2^31. 
// Reduce A modulo the global q.
// A is converted to long long, reduced with the built-in % operator, and a
// negative remainder is shifted up by q, so for positive q the result lies in
// [0, q).
static inline coeff_t mod(double_coeff_t A) {
    #pragma HLS inline
    long long x = (long long)A;
    long long q_long = (long long)q;
    long long r = x % q_long;
    if (r < 0) r += q_long;
    return (coeff_t)r;
}

// Modular addition: returns (a + b) mod q.
// Only one conditional subtraction of q is applied, so the result is fully
// reduced only when both operands are already in [0, q).
static inline coeff_t modadd(coeff_t a, coeff_t b) {
    #pragma HLS inline
    long long q_long = (long long)q;
    long long s = (long long)a + (long long)b;
    if (s >= q_long) s -= q_long;
    return (coeff_t)s;
}

// Modular subtraction: returns (a - b) mod q.
// Only one conditional addition of q is applied, so the result is fully
// reduced only when both operands are already in [0, q).
static inline coeff_t modsub(coeff_t a, coeff_t b) {
    #pragma HLS inline
    long long q_long = (long long)q;
    long long d = (long long)a - (long long)b;
    if (d < 0) d += q_long;
    return (coeff_t)d;
}

// -----------------------------------------------------------------------------
// 512-point NTT core (for Dilithium negacyclic convolution)
// -----------------------------------------------------------------------------

// Multiplication modulo q, using existing mod() on double_coeff_t.
// Both operands are cast to double_coeff_t before the multiply so that the
// product handed to mod() is computed in the wider type.
static inline coeff_t mul_mod(coeff_t a, coeff_t b) {
    double_coeff_t prod = (double_coeff_t)a * (double_coeff_t)b;
    return mod(prod);
}

// Precomputed stage twiddles for length-512 NTT over q = 8380417
// Stage index s corresponds to len = 2^(s+1): 2,4,8,...,512
// ntt_512() multiplies its running twiddle by the stage entry once per
// butterfly.
static const coeff_t NTT_WLEN[9] = {
    (coeff_t)8380416, // len = 2
    (coeff_t)4808194, // len = 4
    (coeff_t)4614810, // len = 8
    (coeff_t)2883726, // len = 16
    (coeff_t)6250525, // len = 32
    (coeff_t)7044481, // len = 64
    (coeff_t)3241972, // len = 128
    (coeff_t)6644104, // len = 256
    (coeff_t)1921994  // len = 512
};

// Stage twiddles selected by ntt_512() when invert == true.
static const coeff_t NTT_WLEN_INV[9] = {
    (coeff_t)8380416, // len = 2
    (coeff_t)3572223, // len = 4
    (coeff_t)3761513, // len = 8
    (coeff_t)5234739, // len = 16
    (coeff_t)3764867, // len = 32
    (coeff_t)3227876, // len = 64
    (coeff_t)6621070, // len = 128
    (coeff_t)6125690, // len = 256
    (coeff_t)527981   // len = 512
};

// 512^{-1} mod 8380417
static const coeff_t INV_NTT512 = (coeff_t)8364049;

// In-place iterative radix-2 NTT of size 512.
//   a      : 512 coefficients, overwritten with the transform.
//   invert : false -> stage twiddles from NTT_WLEN;
//            true  -> stage twiddles from NTT_WLEN_INV, followed by a final
//                     multiplication of every element by INV_NTT512.
// The array is first permuted into bit-reversed order and then processed in
// stages with block length len = 2, 4, ..., 512.
static void ntt_512(coeff_t a[512], bool invert) {
    // Bit-reversal permutation.
    // j is advanced as a reversed counter: the carry starts at the top bit
    // (256) and moves downwards, so j is the bit-reversal of i.
    unsigned int j = 0;
    for (unsigned int i = 1; i < 512; ++i) {
        unsigned int bit = 512 >> 1;
        while (j & bit) {
            j ^= bit;
            bit >>= 1;
        }
        j ^= bit;
        // Swap each pair only once.
        if (i < j) {
            coeff_t tmp = a[i];
            a[i] = a[j];
            a[j] = tmp;
        }
    }

    int len = 2;
    int stage = 0;
    while (len <= 512) {
        coeff_t wlen = invert ? NTT_WLEN_INV[stage] : NTT_WLEN[stage];
        int half = len >> 1;
        for (int i = 0; i < 512; i += len) {
            // w restarts at 1 for every block and is stepped by wlen.
            coeff_t w = (coeff_t)1;
            for (int j = 0; j < half; ++j) {
                coeff_t u = a[i + j];
                coeff_t v = mul_mod(a[i + j + half], w);
                a[i + j] = modadd(u, v);
                a[i + j + half] = modsub(u, v);
                w = mul_mod(w, wlen);
            }
        }
        len <<= 1;
        stage++;
    }

    if (invert) {
        // Multiply by 512^{-1} mod q to finish inverse NTT
        for (int i = 0; i < 512; ++i) {
            a[i] = mul_mod(a[i], INV_NTT512);
        }
    }
}

// Negacyclic convolution via 512-point NTT:
// c(x) = a(x) * b(x) mod (x^Nt + 1, q), Nt = 256
// The inputs are zero-padded to 2*Nt, multiplied pointwise in the NTT domain,
// transformed back, and the upper half is subtracted from the lower half.
// ntt_512() is fixed at 512 points, so this routine is only valid when
// 2 * Nt == 512.
static void poly_mult_dil_core(coeff_t c[Nt], const coeff_t a[Nt], const coeff_t b[Nt]) {
    const int N2 = 2 * Nt; // 512 for Nt = 256
    coeff_t A[N2];
    coeff_t B[N2];
    coeff_t C[N2];

    // Copy inputs and zero-pad
    for (int i = 0; i < Nt; ++i) {
        #pragma HLS pipeline II=1
        A[i] = a[i];
        B[i] = b[i];
    }
    for (int i = Nt; i < N2; ++i) {
        #pragma HLS pipeline II=1
        A[i] = 0;
        B[i] = 0;
    }

    // Forward 512-point NTT of both sequences
    ntt_512(A, false);
    ntt_512(B, false);

    // Pointwise multiplication in NTT domain
    for (int i = 0; i < N2; ++i) {
        #pragma HLS pipeline II=1
        C[i] = mul_mod(A[i], B[i]);
    }

    // Inverse 512-point NTT -> length-512 cyclic convolution
    ntt_512(C, true);

    // Negacyclic fold: c[k] = C[k] - C[k + Nt] mod q
    for (int k = 0; k < Nt; ++k) {
        #pragma HLS pipeline II=1
        c[k] = modsub(C[k], C[k + Nt]);
    }
}
// END "For Dilithium"

//----------------------------------------------------------
// Streaming NTT pipeline.
//
// NOTE(review): every `hls::stream` from here on appears without a template
// argument list. The element types were presumably lost when this copy of the
// file was produced; restore them from the original before compiling.
//
// NOTE(review): all twiddle constants hard-coded in the stage functions below
// are smaller than 3329, while q is set to 8380417 at the top of this file.
// They look like leftovers from a q = 3329 configuration; confirm before
// relying on these stages with the current q.
//----------------------------------------------------------

// Butterfly with the twiddle applied after the subtraction:
//   x = a + b,  y = w * (a - b), both reduced modulo q.
// NOTE(review): unlike mul_mod(), `w * y` is not explicitly widened to
// double_coeff_t; whether the product is formed at full width depends on how
// coeff_t is declared in ntt.h. Confirm.
void butterfly_unit_dif(coeff_t w, coeff_t a, coeff_t b, coeff_t &x, coeff_t &y) {
    #pragma HLS pipeline II = 1
    x = modadd(a, b);
    y = modsub(a, b);
    y = mod(w * y);
}

// Butterfly with the twiddle applied to b before the add/subtract:
//   x = a + w*b,  y = a - w*b, both reduced modulo q.
// NOTE(review): `w * b` has the same widening question as in
// butterfly_unit_dif().
void butterfly_unit_dit(coeff_t w, coeff_t a, coeff_t b, coeff_t &x, coeff_t &y) {
    #pragma HLS pipeline II = 1
    coeff_t wb = mod(w * b);
    x = modadd(a, wb);
    y = modsub(a, wb);
}

// Calls ap_wait_n(1) when __SYNTHESIS__ is defined; otherwise the body is
// empty.
void delay_cycle() {
#ifdef __SYNTHESIS__
    ap_wait_n(1);
#endif
}

// Forward streaming stages 1-6 share one shape, parameterised by a block
// length H (64, 32, 16, 8, 4, 2 for stages 1..6):
//   1. preload H samples from `a` into fifo[64 .. 64+H-1];
//   2. for each twiddle j: read H more samples, pair sample k with fifo[64+k]
//      through butterfly_unit_dit (buffered value first, fresh value second),
//      write the first butterfly output to `b` immediately and park the
//      second in fifo[k];
//   3. write the H parked values to `b`; in every group except the last, the
//      same loop also refills fifo[64 ..] with the next H input samples.
// The DEPENDENCE pragmas declare the inter-iteration RAW dependence on fifo
// to be false. `int x, y` (and `ind` where present) are declared but unused.

// Stage 1: H = 64, single twiddle 1729, one group.
void ntt_stage1 (hls::stream &a, hls::stream &b, coeff_t fifo[]) {
    #pragma HLS dataflow
    coeff_t twiddle_coeff = 1729;
    #pragma HLS DEPENDENCE variable = fifo inter RAW false
    int x, y;
    coeff_t a_, b_, it, bf1, bf2, tf;
    // Preload the first half of the block.
    for (int i = 0; i < 64; i++) {
        #pragma HLS pipeline
        it = a.read();
        fifo[i + 64] = it;
    }
    for (int j = 0; j < 1; j++) {
        #pragma HLS DEPENDENCE variable = fifo inter RAW false
        int iter = 0;
        for (int k = 0; k < 64; k++) {
            #pragma HLS pipeline II = 1
            #pragma HLS DEPENDENCE variable = fifo inter RAW false
            a_ = fifo[iter + 64];
            b_ = a.read();
            tf = twiddle_coeff;
            butterfly_unit_dit(tf, a_, b_, bf1, bf2);
            b.write(bf1);
            fifo[iter] = bf2;
            iter++;
            delay_cycle();
        }
        // Flush the parked second outputs.
        for (int i = 0; i < 64; i++) {
            #pragma HLS pipeline II = 1
            #pragma HLS DEPENDENCE variable = fifo inter RAW false
            b.write(fifo[i]);
            delay_cycle();
        }
    }
}

// Stage 2: H = 32, two groups.
void ntt_stage2 (hls::stream &a, hls::stream &b, coeff_t fifo[]) {
    #pragma HLS dataflow
    coeff_t twiddle_coeffs[2] = {2580, 3289};
    #pragma HLS DEPENDENCE variable = fifo inter RAW false
    int x, y;
    coeff_t a_, b_, it, bf1, bf2, tf;
    for (int i = 0; i < 32; i++) {
        #pragma HLS pipeline
        it = a.read();
        fifo[i + 64] = it;
    }
    for (int j = 0; j < 2; j++) {
        #pragma HLS DEPENDENCE variable = fifo inter RAW false
        int iter = 0;
        for (int k = 0; k < 32; k++) {
            #pragma HLS pipeline II = 1
            #pragma HLS DEPENDENCE variable = fifo inter RAW false
            a_ = fifo[iter + 64];
            b_ = a.read();
            tf = twiddle_coeffs[j];
            butterfly_unit_dit(tf, a_, b_, bf1, bf2);
            b.write(bf1);
            fifo[iter] = bf2;
            iter++;
            delay_cycle();
        }
        for (int i = 0; i < 32; i++) {
            #pragma HLS pipeline II = 1
            #pragma HLS DEPENDENCE variable = fifo inter RAW false
            b.write(fifo[i]);
            delay_cycle();
            // Refill for the next group while flushing (skipped on the last group).
            if (j < 1) {
                it = a.read();
                fifo[i + 64] = it;
            }
        }
    }
}

// Stage 3: H = 16, four groups.
void ntt_stage3 (hls::stream &a, hls::stream &b, coeff_t fifo[]) {
    #pragma HLS dataflow
    coeff_t twiddle_coeffs[4] = {2642, 630, 1897, 848};
    #pragma HLS DEPENDENCE variable = fifo inter RAW false
    int x, y;
    coeff_t a_, b_, it, bf1, bf2, tf;
    for (int i = 0; i < 16; i++) {
        #pragma HLS pipeline
        it = a.read();
        fifo[i + 64] = it;
    }
    for (int j = 0; j < 4; j++) {
        #pragma HLS DEPENDENCE variable = fifo inter RAW false
        int iter = 0;
        for (int k = 0; k < 16; k++) {
            #pragma HLS pipeline II = 1
            #pragma HLS DEPENDENCE variable = fifo inter RAW false
            a_ = fifo[iter + 64];
            b_ = a.read();
            tf = twiddle_coeffs[j];
            butterfly_unit_dit(tf, a_, b_, bf1, bf2);
            b.write(bf1);
            fifo[iter] = bf2;
            iter++;
            delay_cycle();
        }
        for (int i = 0; i < 16; i++) {
            #pragma HLS pipeline II = 1
            #pragma HLS DEPENDENCE variable = fifo inter RAW false
            b.write(fifo[i]);
            delay_cycle();
            if (j < 3) {
                it = a.read();
                fifo[i + 64] = it;
            }
        }
    }
}

// Stage 4: H = 8, eight groups.
void ntt_stage4 (hls::stream &a, hls::stream &b, coeff_t fifo[]) {
    #pragma HLS dataflow
    coeff_t twiddle_coeffs[8] = {1062, 1919, 193, 797, 2786, 3260, 569, 1746};
    #pragma HLS DEPENDENCE variable = fifo inter RAW false
    int x, y;
    coeff_t a_, b_, it, bf1, bf2, tf;
    for (int i = 0; i < 8; i++) {
        #pragma HLS pipeline
        it = a.read();
        fifo[i + 64] = it;
    }
    for (int j = 0; j < 8; j++) {
        #pragma HLS DEPENDENCE variable = fifo inter RAW false
        int iter = 0;
        int ind = 1;
        for (int k = 0; k < 8; k++) {
            #pragma HLS pipeline II = 1
            #pragma HLS DEPENDENCE variable = fifo inter RAW false
            a_ = fifo[iter + 64];
            b_ = a.read();
            tf = twiddle_coeffs[j];
            butterfly_unit_dit(tf, a_, b_, bf1, bf2);
            b.write(bf1);
            fifo[iter] = bf2;
            iter++;
            delay_cycle();
        }
        for (int i = 0; i < 8; i++) {
            #pragma HLS pipeline II = 1
            #pragma HLS DEPENDENCE variable = fifo inter RAW false
            b.write(fifo[i]);
            delay_cycle();
            if (j < 7) {
                it = a.read();
                fifo[i + 64] = it;
            }
        }
    }
}

// Stage 5: H = 4, sixteen groups.
void ntt_stage5 (hls::stream &a, hls::stream &b, coeff_t fifo[]) {
    #pragma HLS dataflow
    coeff_t twiddle_coeffs[16] = {296, 2447, 1339, 1476, 3046, 56, 2240, 1333,
                                  1426, 2094, 535, 2882, 2393, 2879, 1974, 821};
    #pragma HLS DEPENDENCE variable = fifo inter RAW false
    int x, y;
    coeff_t a_, b_, it, bf1, bf2, tf;
    for (int i = 0; i < 4; i++) {
        #pragma HLS pipeline
        it = a.read();
        fifo[i + 64] = it;
    }
    for (int j = 0; j < 16; j++) {
        #pragma HLS DEPENDENCE variable = fifo inter RAW false
        int iter = 0;
        int ind = 1;
        for (int k = 0; k < 4; k = k + 1) {
            #pragma HLS pipeline II = 1
            #pragma HLS DEPENDENCE variable = fifo inter RAW false
            a_ = fifo[iter + 64];
            b_ = a.read();
            tf = twiddle_coeffs[j];
            butterfly_unit_dit(tf, a_, b_, bf1, bf2);
            b.write(bf1);
            fifo[iter] = bf2;
            iter++;
            delay_cycle();
        }
        for (int i = 0; i < 4; i++) {
            #pragma HLS pipeline II = 1
            #pragma HLS DEPENDENCE variable = fifo inter RAW false
            b.write(fifo[i]);
            delay_cycle();
            if (j < 15) {
                it = a.read();
                fifo[i + 64] = it;
            }
        }
    }
}

// Stage 6: H = 2, thirty-two groups.
void ntt_stage6 (hls::stream &a, hls::stream &b, coeff_t fifo[]) {
    #pragma HLS dataflow
    coeff_t twiddle_coeffs[32] = {289, 331, 3253, 1756, 1197, 2304, 2277, 2055,
                                  650, 1977, 2513, 632, 2865, 33, 1320, 1915,
                                  2319, 1435, 807, 452, 1438, 2868, 1534, 2402,
                                  2647, 2617, 1481, 648, 2474, 3110, 1227, 910};
    #pragma HLS DEPENDENCE variable = fifo inter RAW false
    int x, y;
    coeff_t a_, b_, it, bf1, bf2, tf;
    for (int i = 0; i < 2; i++) {
        #pragma HLS pipeline
        it = a.read();
        fifo[i + 64] = it;
    }
    for (int j = 0; j < 32; j++) {
        #pragma HLS DEPENDENCE variable = fifo inter RAW false
        int iter = 0;
        int ind = 1;
        for (int k = 0; k < 2; k = k + 1) {
            #pragma HLS pipeline II = 1
            #pragma HLS DEPENDENCE variable = fifo inter RAW false
            a_ = fifo[iter + 64];
            b_ = a.read();
            tf = twiddle_coeffs[j];
            butterfly_unit_dit(tf, a_, b_, bf1, bf2);
            b.write(bf1);
            fifo[iter] = bf2;
            iter++;
            delay_cycle();
        }
        for (int i = 0; i < 2; i++) {
            #pragma HLS pipeline II = 1
            #pragma HLS DEPENDENCE variable = fifo inter RAW false
            b.write(fifo[i]);
            delay_cycle();
            if (j < 31) {
                it = a.read();
                fifo[i + 64] = it;
            }
        }
    }
}

// Stage 7: butterflies on adjacent samples. Each of the 64 iterations consumes
// two inputs (u, then t) and emits both butterfly outputs back to back.
// fifo is not accessed in this function; u is prefetched for the next pair
// except after the last one.
void ntt_stage7 (hls::stream &a, hls::stream &b, coeff_t fifo[]) {
    #pragma HLS inline off
    #pragma HLS DEPENDENCE variable = fifo inter RAW false
    coeff_t twiddle_coeffs[64] = {17, 2761, 583, 2649, 1637, 723, 2288, 1100,
                                  1409, 2662, 3281, 233, 756, 2156, 3015, 3050,
                                  1703, 1651, 2789, 1789, 1847, 952, 1461, 2687,
                                  939, 2308, 2437, 2388, 733, 2337, 268, 641,
                                  1584, 2298, 2037, 3220, 375, 2549, 2090, 1645,
                                  1063, 319, 2773, 757, 2099, 561, 2466, 2594,
                                  2804, 1092, 403, 1026, 1143, 2150, 2775, 886,
                                  1722, 1212, 1874, 1029, 2110, 2935, 885, 2154};
    int x, y;
    coeff_t u, t, it, bf1, bf2;
    u = a.read();
    for (int j = 0; j < 64; j++) {
        #pragma HLS pipeline II = 1
        #pragma HLS DEPENDENCE variable = fifo inter RAW false
        t = a.read();
        butterfly_unit_dit(twiddle_coeffs[j], u, t, bf1, bf2);
        b.write(bf1);
        b.write(bf2);
        if (j < 63) u = a.read();
    }
}

// Inverse stage 1: same adjacent-pair structure as ntt_stage7, but using
// butterfly_unit_dif. fifo is not accessed in this function.
void intt_stage1 (hls::stream &a, hls::stream &b, coeff_t fifo[]) {
    coeff_t twiddle_coeffs[64] = {1175, 2444, 394, 1219, 2300, 1455, 2117, 1607,
                                  2443, 554, 1179, 2186, 2303, 2926, 2237, 525,
                                  735, 863, 2768, 1230, 2572, 556, 3010, 2266,
                                  1684, 1239, 780, 2954, 109, 1292, 1031, 1745,
                                  2688, 3061, 992, 2596, 941, 892, 1021, 2390,
                                  642, 1868, 2377, 1482, 1540, 540, 1678, 1626,
                                  279, 314, 1173, 2573, 3096, 48, 667, 1920,
                                  2229, 1041, 2606, 1692, 680, 2746, 568, 3312};
    #pragma HLS inline off
    #pragma HLS DEPENDENCE variable = fifo inter RAW false
    int x, y;
    coeff_t u, t, it, bf1, bf2;
    u = a.read();
    for (int j = 0; j < 64; j++) {
        #pragma HLS pipeline II = 1
        #pragma HLS DEPENDENCE variable = fifo inter RAW false
        t = a.read();
        butterfly_unit_dif(twiddle_coeffs[j], u, t, bf1, bf2);
        b.write(bf1);
        b.write(bf2);
        if (j < 63) u = a.read();
    }
}

// Inverse stages 2-6 follow the preload / butterfly / flush-and-refill shape of
// the forward stages, with block length H = 2, 4, 8, 16, 32 and
// butterfly_unit_dif. The twiddle index `ind` is advanced from the running
// butterfly counter `count` rather than taken from j.

// Inverse stage 2: H = 2, thirty-two groups; ind advances every 2 butterflies.
void intt_stage2 (hls::stream &a, hls::stream &b, coeff_t fifo[]) {
    #pragma HLS dataflow
    coeff_t twiddle_coeffs[32] = {2419, 2102, 219, 855, 2681, 1848, 712, 682,
                                  927, 1795, 461, 1891, 2877, 2522, 1894, 1010,
                                  1414, 2009, 3296, 464, 2697, 816, 1352, 2679,
                                  1274, 1052, 1025, 2132, 1573, 76, 2998, 3040};
    #pragma HLS DEPENDENCE variable = fifo inter RAW false
    int x, y;
    coeff_t a_, b_, it, bf1, bf2, tf;
    for (int i = 0; i < 2; i++) {
        #pragma HLS pipeline
        it = a.read();
        fifo[i + 64] = it;
    }
    int ind = 0;
    int count = 0;
    for (int j = 0; j < 32; j++) {
        #pragma HLS DEPENDENCE variable = fifo inter RAW false
        int iter = 0;
        for (int k = 0; k < 2; k = k + 1) {
            #pragma HLS pipeline II = 1
            #pragma HLS DEPENDENCE variable = fifo inter RAW false
            a_ = fifo[iter + 64];
            b_ = a.read();
            tf = twiddle_coeffs[ind];
            butterfly_unit_dif(tf, a_, b_, bf1, bf2);
            b.write(bf1);
            fifo[iter] = bf2;
            iter++;
            count++;
            if (count % 2 == 0) ind++;
            delay_cycle();
        }
        for (int i = 0; i < 2; i++) {
            #pragma HLS pipeline II = 1
            #pragma HLS DEPENDENCE variable = fifo inter RAW false
            b.write(fifo[i]);
            delay_cycle();
            if (j < 31) {
                it = a.read();
                fifo[i + 64] = it;
            }
        }
    }
}

// Inverse stage 3: H = 4, sixteen groups; ind advances every 4 butterflies.
// `m` is declared but unused.
void intt_stage3 (hls::stream &a, hls::stream &b, coeff_t fifo[]) {
    #pragma HLS dataflow
    coeff_t twiddle_coeffs[16] = {2508, 1355, 450, 936, 447, 2794, 1235, 1903,
                                  1996, 1089, 3273, 283, 1853, 1990, 882, 3033};
    #pragma HLS DEPENDENCE variable = fifo inter RAW false
    int x, y;
    int m = 4;
    coeff_t a_, b_, it, bf1, bf2, tf;
    for (int i = 0; i < 4; i++) {
        #pragma HLS pipeline
        it = a.read();
        fifo[i + 64] = it;
    }
    int ind = 0;
    int count = 0;
    for (int j = 0; j < 16; j++) {
        #pragma HLS DEPENDENCE variable = fifo inter RAW false
        int iter = 0;
        for (int k = 0; k < 4; k = k + 1) {
            #pragma HLS pipeline II = 1
            #pragma HLS DEPENDENCE variable = fifo inter RAW false
            a_ = fifo[iter + 64];
            b_ = a.read();
            tf = twiddle_coeffs[ind];
            butterfly_unit_dif(tf, a_, b_, bf1, bf2);
            b.write(bf1);
            fifo[iter] = bf2;
            iter++;
            count++;
            if (count % 4 == 0) ind++;
            delay_cycle();
        }
        for (int i = 0; i < 4; i++) {
            #pragma HLS pipeline II = 1
            #pragma HLS DEPENDENCE variable = fifo inter RAW false
            b.write(fifo[i]);
            delay_cycle();
            if (j < 15) {
                it = a.read();
                fifo[i + 64] = it;
            }
        }
    }
}

// Inverse stage 4: H = 8, eight groups; ind advances every 8 butterflies.
void intt_stage4 (hls::stream &a, hls::stream &b, coeff_t fifo[]) {
    #pragma HLS dataflow
    coeff_t twiddle_coeffs[8] = {1583, 2760, 69, 543, 2532, 3136, 1410, 2267};
    #pragma HLS DEPENDENCE variable = fifo inter RAW false
    int x, y;
    coeff_t a_, b_, it, bf1, bf2, tf;
    for (int i = 0; i < 8; i++) {
        #pragma HLS pipeline
        it = a.read();
        fifo[i + 64] = it;
    }
    int ind = 0;
    int count = 0;
    for (int j = 0; j < 8; j++) {
        #pragma HLS DEPENDENCE variable = fifo inter RAW false
        int iter = 0;
        for (int k = 0; k < 8; k = k + 1) {
            #pragma HLS pipeline II = 1
            #pragma HLS DEPENDENCE variable = fifo inter RAW false
            a_ = fifo[iter + 64];
            b_ = a.read();
            tf = twiddle_coeffs[ind];
            butterfly_unit_dif(tf, a_, b_, bf1, bf2);
            b.write(bf1);
            fifo[iter] = bf2;
            iter++;
            count++;
            if (count % 8 == 0) ind++;
            delay_cycle();
        }
        for (int i = 0; i < 8; i++) {
            #pragma HLS pipeline II = 1
            #pragma HLS DEPENDENCE variable = fifo inter RAW false
            b.write(fifo[i]);
            delay_cycle();
            if (j < 7) {
                it = a.read();
                fifo[i + 64] = it;
            }
        }
    }
}

// Inverse stage 5: H = 16, four groups; ind advances every 16 butterflies.
void intt_stage5 (hls::stream &a, hls::stream &b, coeff_t fifo[]) {
    #pragma HLS dataflow
    coeff_t twiddle_coeffs[4] = {2481, 1432, 2699, 687};
    #pragma HLS DEPENDENCE variable = fifo inter RAW false
    int x, y;
    coeff_t a_, b_, it, bf1, bf2, tf;
    for (int i = 0; i < 16; i++) {
        #pragma HLS pipeline
        it = a.read();
        fifo[i + 64] = it;
    }
    int ind = 0;
    int count = 0;
    for (int j = 0; j < 4; j++) {
        #pragma HLS DEPENDENCE variable = fifo inter RAW false
        int iter = 0;
        for (int k = 0; k < 16; k = k + 1) {
            #pragma HLS pipeline II = 1
            #pragma HLS DEPENDENCE variable = fifo inter RAW false
            a_ = fifo[iter + 64];
            b_ = a.read();
            tf = twiddle_coeffs[ind];
            butterfly_unit_dif(tf, a_, b_, bf1, bf2);
            b.write(bf1);
            fifo[iter] = bf2;
            iter++;
            count++;
            if (count % 16 == 0) ind++;
            delay_cycle();
        }
        for (int i = 0; i < 16; i++) {
            #pragma HLS pipeline II = 1
            #pragma HLS DEPENDENCE variable = fifo inter RAW false
            b.write(fifo[i]);
            delay_cycle();
            if (j < 3) {
                it = a.read();
                fifo[i + 64] = it;
            }
        }
    }
}

// Inverse stage 6: H = 32, two groups. Here ind is bumped exactly once, when
// count reaches 32, i.e. between the first and the second group.
void intt_stage6 (hls::stream &a, hls::stream &b, coeff_t fifo[]) {
    #pragma HLS dataflow
    coeff_t twiddle_coeffs[2] = {40, 749};
    #pragma HLS DEPENDENCE variable = fifo inter RAW false
    int x, y;
    coeff_t a_, b_, it, bf1, bf2, tf;
    for (int i = 0; i < 32; i++) {
        #pragma HLS pipeline
        it = a.read();
        fifo[i + 64] = it;
    }
    int ind = 0;
    int count = 0;
    for (int j = 0; j < 2; j++) {
        #pragma HLS DEPENDENCE variable = fifo inter RAW false
        int iter = 0;
        for (int k = 0; k < 32; k = k + 1) {
            #pragma HLS pipeline II = 1
            #pragma HLS DEPENDENCE variable = fifo inter RAW false
            a_ = fifo[iter + 64];
            b_ = a.read();
            tf = twiddle_coeffs[ind];
            butterfly_unit_dif(tf, a_, b_, bf1, bf2);
            b.write(bf1);
            fifo[iter] = bf2;
            iter++;
            count++;
            if (count == 32) ind++;
            delay_cycle();
        }
        for (int i = 0; i < 32; i++) {
            #pragma HLS pipeline II = 1
            #pragma HLS DEPENDENCE variable = fifo inter RAW false
            b.write(fifo[i]);
            delay_cycle();
            if (j < 1) {
                it = a.read();
                fifo[i + 64] = it;
            }
        }
    }
}

// Inverse stage 7: H = 64, one group, fixed twiddle 1600. Both butterfly
// outputs are additionally multiplied by the global inv_n and reduced before
// being emitted / parked.
// NOTE(review): `bf1 * inv_n` and `bf2 * inv_n` are not explicitly widened to
// double_coeff_t (compare mul_mod()); confirm against the coeff_t declaration.
void intt_stage7 (hls::stream &a, hls::stream &b, coeff_t fifo[]) {
    #pragma HLS inline off
    #pragma HLS DEPENDENCE variable = fifo inter RAW false
    int x, y;
    coeff_t a_, b_, it, bf1, bf2, bfn1, bfn2, tf;
    for (int i = 0; i < 64; i++) {
        #pragma HLS pipeline
        it = a.read();
        fifo[i + 64] = it;
    }
    for (int j = 0; j < 1; j++) {
        #pragma HLS DEPENDENCE variable = fifo inter RAW false
        int iter = 0;
        for (int k = 0; k < 64; k = k + 1) {
            #pragma HLS pipeline II = 1
            #pragma HLS DEPENDENCE variable = fifo inter RAW false
            a_ = fifo[iter + 64];
            b_ = a.read();
            tf = 1600;
            butterfly_unit_dif(tf, a_, b_, bf1, bf2);
            bfn1 = mod(bf1 * inv_n);
            bfn2 = mod(bf2 * inv_n);
            b.write(bfn1);
            fifo[iter] = bfn2;
            iter++;
            delay_cycle();
        }
        for (int i = 0; i < 64; i++) {
            #pragma HLS pipeline II = 1
            #pragma HLS DEPENDENCE variable = fifo inter RAW false
            b.write(fifo[i]);
            delay_cycle();
        }
    }
}

// NOTE(review): the text from here to the `#pragma HLS dataflow` below is
// truncated in this copy of the file. The body of read_inputs after
// `for (i=0; i`, the header and loop of the next function (presumably
// write_outputs, which is called further down), and the header of the function
// after it (presumably ct_ntt, referenced by the commented-out top level) are
// missing. The remaining tokens are kept exactly as found; restore the original
// text from version control.
void read_inputs (hls::stream &input, hls::stream &se, hls::stream &so) {
    coeff_t_stream x;
    coeff_t a;
    int i;
    for (i=0; i &se, hls::stream &so, hls::stream &output) {
    coeff_t a1, a0;
    coeff_t_stream y;
    int i;
    y.last = 0;
    for (i=0; i &input, hls::stream &output) {
    // Forward pipeline body: read_inputs feeds two stream chains (s*e and s*o),
    // each passing through ntt_stage1..7, and write_outputs consumes both.
    // Every fifo array is sized 64 + H for its stage because the stages keep
    // their preload area at offset 64 (fe7/fo7 are passed to ntt_stage7, which
    // does not access them).
    #pragma HLS dataflow
    hls::stream s0o("s0o"), s1o("s1o"), s2o("s2o"), s3o("s3o"),
                s4o("s4o"), s5o("s5o"), s6o("s6o"), s7o("s7o"),
                s0e("s0e"), s1e("s1e"), s2e("s2e"), s3e("s3e"),
                s4e("s4e"), s5e("s5e"), s6e("s6e"), s7e("s7e");
    coeff_t fo7[65], fo6[66], fo5[68], fo4[72], fo3[80], fo2[96], fo1[128];
    coeff_t fe7[65], fe6[66], fe5[68], fe4[72], fe3[80], fe2[96], fe1[128];
    coeff_t_stream x, y;
    #pragma HLS STREAM variable = s7o depth = 1
    #pragma HLS STREAM variable = s6o depth = 2
    #pragma HLS STREAM variable = s5o depth = 4
    #pragma HLS STREAM variable = s4o depth = 8
    #pragma HLS STREAM variable = s3o depth = 16
    #pragma HLS STREAM variable = s2o depth = 32
    #pragma HLS STREAM variable = s1o depth = 64
    #pragma HLS STREAM variable = s0o depth = 128
    #pragma HLS STREAM variable = s7e depth = 1
    #pragma HLS STREAM variable = s6e depth = 2
    #pragma HLS STREAM variable = s5e depth = 4
    #pragma HLS STREAM variable = s4e depth = 8
    #pragma HLS STREAM variable = s3e depth = 16
    #pragma HLS STREAM variable = s2e depth = 32
    #pragma HLS STREAM variable = s1e depth = 64
    #pragma HLS STREAM variable = s0e depth = 128
    read_inputs(input, s0e, s0o);
    ntt_stage1 (s0e, s1e, fe1);
    ntt_stage1 (s0o, s1o, fo1);
    ntt_stage2 (s1e, s2e, fe2);
    ntt_stage2 (s1o, s2o, fo2);
    ntt_stage3 (s2e, s3e, fe3);
    ntt_stage3 (s2o, s3o, fo3);
    ntt_stage4 (s3e, s4e, fe4);
    ntt_stage4 (s3o, s4o, fo4);
    ntt_stage5 (s4e, s5e, fe5);
    ntt_stage5 (s4o, s5o, fo5);
    ntt_stage6 (s5e, s6e, fe6);
    ntt_stage6 (s5o, s6o, fo6);
    ntt_stage7 (s6e, s7e, fe7);
    ntt_stage7 (s6o, s7o, fo7);
    write_outputs(s7e, s7o, output);
}

// Inverse streaming pipeline: read_inputs splits the input into the s*e / s*o
// chains, each chain runs intt_stage1..7, and write_outputs consumes both.
// The fifo sizes mirror the forward pipeline in reverse order (stage 7 now
// needs 128 entries, stage 2 needs 66); fe1/fo1 are passed to intt_stage1,
// which does not access them.
void gs_intt (hls::stream &input, hls::stream &output) {
    #pragma HLS dataflow
    hls::stream s0o("s0o"), s1o("s1o"), s2o("s2o"), s3o("s3o"),
                s4o("s4o"), s5o("s5o"), s6o("s6o"), s7o("s7o"),
                s0e("s0e"), s1e("s1e"), s2e("s2e"), s3e("s3e"),
                s4e("s4e"), s5e("s5e"), s6e("s6e"), s7e("s7e");
    coeff_t fo7[128], fo6[96], fo5[80], fo4[72], fo3[68], fo2[66], fo1[65];
    coeff_t fe7[128], fe6[96], fe5[80], fe4[72], fe3[68], fe2[66], fe1[65];
    coeff_t_stream x, y;
    #pragma HLS STREAM variable = s7o depth = 1
    #pragma HLS STREAM variable = s6o depth = 2
    #pragma HLS STREAM variable = s5o depth = 4
    #pragma HLS STREAM variable = s4o depth = 8
    #pragma HLS STREAM variable = s3o depth = 16
    #pragma HLS STREAM variable = s2o depth = 32
    #pragma HLS STREAM variable = s1o depth = 64
    #pragma HLS STREAM variable = s0o depth = 128
    #pragma HLS STREAM variable = s7e depth = 1
    #pragma HLS STREAM variable = s6e depth = 2
    #pragma HLS STREAM variable = s5e depth = 4
    #pragma HLS STREAM variable = s4e depth = 8
    #pragma HLS STREAM variable = s3e depth = 16
    #pragma HLS STREAM variable = s2e depth = 32
    #pragma HLS STREAM variable = s1e depth = 64
    #pragma HLS STREAM variable = s0e depth = 128
    read_inputs(input, s0e, s0o);
    intt_stage1 (s0e, s1e, fe1);
    intt_stage1 (s0o, s1o, fo1);
    intt_stage2 (s1e, s2e, fe2);
    intt_stage2 (s1o, s2o, fo2);
    intt_stage3 (s2e, s3e, fe3);
    intt_stage3 (s2o, s3o, fo3);
    intt_stage4 (s3e, s4e, fe4);
    intt_stage4 (s3o, s4o, fo4);
    intt_stage5 (s4e, s5e, fe5);
    intt_stage5 (s4o, s5o, fo5);
    intt_stage6 (s5e, s6e, fe6);
    intt_stage6 (s5o, s6o, fo6);
    intt_stage7 (s6e, s7e, fe7);
    intt_stage7 (s6o, s7o, fo7);
    write_outputs(s7e, s7o, output);
}

// NOTE(review): truncated in this copy. The body of stream_split after
// `for (i=0; i` and the header of the following function (presumably
// point_wise_mult, referenced by the commented-out top level) are missing.
// Tokens are kept exactly as found; restore from version control.
void stream_split (hls::stream &input, hls::stream &input1, hls::stream &input2) {
    coeff_t_stream_big x;
    double_coeff_t a;
    coeff_t_stream x1, x2;
    coeff_t a1, a2;
    int i;
    for (i=0; i &input1, hls::stream &input2, hls::stream &output) {
    coeff_t_stream xe, xo, ye, yo, z;
    coeff_t ae, be, ce, ao, bo, co, c1, c2, c2s, c3, c4;
    int i;
    // Consecutive entries of this table sum to 3329 (17 + 3312, 2761 + 568, ...),
    // i.e. each pair is a value and its negation modulo 3329.
    coeff_t pm_factors[128] = {17, 3312, 2761, 568, 583, 2746, 2649, 680,
                               1637, 1692, 723, 2606, 2288, 1041, 1100, 2229,
                               1409, 1920, 2662, 667, 3281, 48, 233, 3096,
                               756, 2573, 2156, 1173, 3015, 314, 3050, 279,
                               1703, 1626, 1651, 1678, 2789, 540, 1789, 1540,
                               1847, 1482, 952, 2377, 1461, 1868, 2687, 642,
                               939, 2390, 2308, 1021, 2437, 892, 2388, 941,
                               733, 2596, 2337, 992, 268, 3061, 641, 2688,
                               1584, 1745, 2298, 1031, 2037, 1292, 3220, 109,
                               375, 2954, 2549, 780, 2090, 1239, 1645, 1684,
                               1063, 2266, 319, 3010, 2773, 556, 757, 2572,
                               2099, 1230, 561, 2768, 2466, 863, 2594, 735,
                               2804, 525, 1092, 2237, 403, 2926, 1026, 2303,
                               1143, 2186, 2150, 1179, 2775, 554, 886, 2443,
                               1722, 1607, 1212, 2117, 1874, 1455, 1029, 2300,
                               2110, 1219, 2935, 394, 885, 2444, 2154, 1175};
    z.last = 0;
    // NOTE(review): the remainder of this function is missing; the words after
    // `i` on the next line look like the tail of a section-header comment that
    // was fused into the code when the text in between was lost.
    for (i=0; i internal stream conversion helpers (only at top level)
// -----------------------------------------------------------------------------

// Copies up to Nt beats from the AXI stream into the internal stream,
// converting each beat's data field to double_coeff_t and forwarding its last
// flag.
// NOTE(review): the loop stops early when a beat has last set; a consumer that
// always reads Nt items (as the active top-level function does) would then
// wait for data that never arrives. Confirm the intended packet length.
static void axis_to_internal_input(hls::stream &axis_in, hls::stream &int_in) {
    coeff_axis_big_t a;
    coeff_t_stream_big x;
    for (int i = 0; i < Nt; i++) {
        #pragma HLS pipeline II = 1
        a = axis_in.read();
        x.value = (double_coeff_t)a.data;
        x.last = a.last;
        int_in.write(x);
        // Optional: break on TLAST if you want to be robust to shorter packets
        if (a.last) break;
    }
}

// Copies up to Nt items from the internal stream to the AXI output stream,
// casting each value to ap_uint<32> and forwarding the last flag. Stops after
// the first item whose last flag is set.
static void internal_to_axis_output(hls::stream &int_out, hls::stream &axis_out) {
    coeff_t_stream x;
    coeff_axis_t a;
    for (int i = 0; i < Nt; i++) {
        #pragma HLS pipeline II = 1
        x = int_out.read();
        a.data = (ap_uint<32>)x.value;
        a.last = x.last;
        // Mark all bytes valid; side channels are disabled in this ap_axiu config
        a.keep = -1;
        a.strb = -1;
        axis_out.write(a);
        if (x.last) break;
    }
}

// -----------------------------------------------------------------------------
// Top-level function with AXI4-Stream ports (for DMA) and internal NTT pipeline
// (disabled: kept for reference; it chains ct_ntt -> point_wise_mult -> gs_intt)
// -----------------------------------------------------------------------------
/*
int poly_mult_dil (hls::stream &input, hls::stream &output) {
#pragma HLS INTERFACE axis register port=input
#pragma HLS INTERFACE axis register port=output
#pragma HLS INTERFACE s_axilite port=return bundle=CTRL_BUS
#pragma HLS dataflow
    // Internal streams using the original coeff_t_stream{,_big} types
    hls::stream in_internal("in_internal");
    hls::stream input1("input1"), input2("input2");
    hls::stream middle1("middle1"), middle2("middle2");
    hls::stream middle3("middle3"), out_internal("out_internal");
    axis_to_internal_input(input, in_internal);
    stream_split(in_internal, input1, input2);
    ct_ntt(input1, middle1);
    ct_ntt(input2, middle2);
    point_wise_mult(middle1, middle2, middle3);
    gs_intt(middle3, out_internal);
    internal_to_axis_output(out_internal, output);
    return 0;
}
*/
// For
Dilithium // ----------------------------------------------------------------------------- // Top-level function: now uses array-based 512-NTT core instead of Kyber NTT // ----------------------------------------------------------------------------- int poly_mult_dil(hls::stream &input, hls::stream &output) { #pragma HLS INTERFACE axis register port=input #pragma HLS INTERFACE axis register port=output #pragma HLS INTERFACE s_axilite port=return bundle=CTRL_BUS #pragma HLS dataflow // Internal streams (same as before) hls::stream in_internal("in_internal"); hls::stream input1("input1"), input2("input2"); hls::stream out_internal("out_internal"); // Existing helpers: keep as they are in your file axis_to_internal_input(input, in_internal); stream_split(in_internal, input1, input2); // Local polynomial buffers coeff_t poly_a[Nt]; coeff_t poly_b[Nt]; coeff_t poly_c[Nt]; // Read Nt coefficients for each polynomial from the two internal streams for (int i = 0; i < Nt; ++i) { #pragma HLS pipeline II=1 coeff_t_stream x1 = input1.read(); coeff_t_stream x2 = input2.read(); poly_a[i] = x1.value; poly_b[i] = x2.value; } // Core negacyclic multiplication via 512-point NTT poly_mult_dil_core(poly_c, poly_a, poly_b); // Stream result back out as coeff_t_stream (value + last) for (int i = 0; i < Nt; ++i) { #pragma HLS pipeline II=1 coeff_t_stream y; y.value = poly_c[i]; y.last = (i == Nt - 1) ? (bit)1 : (bit)0; out_internal.write(y); } // Existing helper: convert internal stream to AXI output internal_to_axis_output(out_internal, output); return 0; }