From ccbe56a51418c3cfddfb300af795e2ed01557b3c Mon Sep 17 00:00:00 2001 From: oho Date: Fri, 12 Dec 2025 19:37:56 +0100 Subject: [PATCH] dilithium hls try w C-Sim, Co-Sim, Synthesis, Impl ok --- HLS_Codes_Dilithium/ntt.h | 29 +- HLS_Codes_Dilithium/pm_test.cpp | 70 +- HLS_Codes_Dilithium/polymult.cpp | 1792 +++++++++++++++++++++--------- HLS_Codes_Dilithium/test_case.h | 120 +- 4 files changed, 1399 insertions(+), 612 deletions(-) diff --git a/HLS_Codes_Dilithium/ntt.h b/HLS_Codes_Dilithium/ntt.h index a2da47a..58cca53 100644 --- a/HLS_Codes_Dilithium/ntt.h +++ b/HLS_Codes_Dilithium/ntt.h @@ -9,33 +9,34 @@ typedef ap_uint<1> bit; typedef ap_uint<8> ap_logn_t; -typedef ap_int<24> coeff_t; -typedef ap_int<48> double_coeff_t; +typedef ap_int<32> coeff_t; +typedef ap_int<64> double_coeff_t; -// Internal streaming types -struct coeff_t_stream { +// Internal streaming types (original design) +struct coeff_t_stream +{ coeff_t value; bit last; }; -struct coeff_t_stream_big { + +struct coeff_t_stream_big +{ double_coeff_t value; bit last; }; -// External AXI4-Stream element types (top-level ports) -typedef ap_axiu<24,0,0,0> coeff_axis_t; -typedef ap_axiu<48,0,0,0> coeff_axis_big_t; +// External AXI4-Stream element types (only used on top-level ports) +typedef ap_axiu<32,0,0,0> coeff_axis_t; +typedef ap_axiu<64,0,0,0> coeff_axis_big_t; #define N 128 #define Nt 256 #define logN 7 -// Modulus and twiddle constants -extern coeff_t q; -extern coeff_t inv_n; +extern coeff_t q, w_n; -// Top-level function prototype -int poly_mult(hls::stream &input, - hls::stream &output); +// Top-level function now uses AXI4-Stream types for DMA compatibility +int poly_mult_dil (hls::stream &input, + hls::stream &output); #endif diff --git a/HLS_Codes_Dilithium/pm_test.cpp b/HLS_Codes_Dilithium/pm_test.cpp index 9a69888..0fb8b0c 100644 --- a/HLS_Codes_Dilithium/pm_test.cpp +++ b/HLS_Codes_Dilithium/pm_test.cpp @@ -1,46 +1,78 @@ +// pm_test.cpp + #include "test_case.h" -int main() { +int main() +{ // Top-level AXI4-Stream ports for the DUT hls::stream in_data; hls::stream out_data; + coeff_axis_big_t local_stream1; coeff_axis_t local_stream2; - coeff_t actual_outputs[Nt]; + int i; - // Write input stimuli into input AXI4-Stream - for (i = 0; i < Nt; i++) { - coeff_t val1 = input1_vals[i]; - double_coeff_t val2 = (double_coeff_t) input2_vals[i] << 24; - // Pack two 24-bit values into one 48-bit word (stored in ap_axiu<48>) - local_stream1.data = (ap_uint<48>) ((ap_uint<48>) val1 | (ap_uint<48>) val2); - local_stream1.keep = -1; + coeff_t actual_outputs[Nt]; + coeff_t golden_outputs[Nt]; // NEW: buffer for golden result + + // ------------------------------------------------------------------------- + // Write stimulus into input AXI4-Stream + // ------------------------------------------------------------------------- + for (i = 0; i < Nt; i++) + { + coeff_t val1 = input1_vals[i]; + coeff_t val2 = input2_vals[i]; + + // Packing: 2×32-bit coeffs into one 64-bit word + ap_uint<64> word = 0; + word |= (ap_uint<32>)val1; // low 32 bits + word |= (ap_uint<64>)(ap_uint<32>)val2 << 32; // high 32 bits + + local_stream1.data = word; + local_stream1.keep = -1; // 0xFF for 64-bit TDATA local_stream1.strb = -1; local_stream1.last = (i == Nt - 1) ? 1 : 0; + in_data.write(local_stream1); } + // ------------------------------------------------------------------------- // Call DUT - poly_mult(in_data, out_data); + // ------------------------------------------------------------------------- + poly_mult_dil(in_data, out_data); - // Read results from output AXI4-Stream - for (i = 0; i < Nt; i++) { + // ------------------------------------------------------------------------- + // Read result from output AXI4-Stream + // ------------------------------------------------------------------------- + for (i = 0; i < Nt; i++) + { local_stream2 = out_data.read(); - actual_outputs[i] = (coeff_t) local_stream2.data; - // (Optionally check local_stream2.last here) + actual_outputs[i] = (coeff_t)local_stream2.data; + // local_stream2.last could be checked here if you want } + // ------------------------------------------------------------------------- + // Compute golden result (software negacyclic product) + // ------------------------------------------------------------------------- + golden_poly_mult_dil(golden_outputs, input1_vals, input2_vals); + + // ------------------------------------------------------------------------- // Compare against golden output + // ------------------------------------------------------------------------- int ret_val = 0; - for (i = 0; i < Nt; i++) { - if (output_vals[i] != actual_outputs[i]) { + for (i = 0; i < Nt; i++) + { + if (golden_outputs[i] != actual_outputs[i]) + { ret_val++; - std::cout << "Mismatch at index " << i - << ": expected " << output_vals[i] - << ", got " << actual_outputs[i] << std::endl; + std::cout << "Mismatch at i = " << i + << " golden = " << golden_outputs[i] + << " hw = " << actual_outputs[i] + << std::endl; break; } } + return ret_val; } diff --git a/HLS_Codes_Dilithium/polymult.cpp b/HLS_Codes_Dilithium/polymult.cpp index abbf23f..7a862a2 100644 --- a/HLS_Codes_Dilithium/polymult.cpp +++ b/HLS_Codes_Dilithium/polymult.cpp @@ -1,580 +1,1338 @@ +// polymult.cpp (Set A, with all inner pragmas kept, and .user/.id/.dest removed) + #include "ntt.h" -coeff_t q = 8380417; -coeff_t inv_n = 8347681; // 256^(-1) mod 8380417 +// Dilithium modulus and 1/N mod q for N = 256 +coeff_t q = 8380417; +coeff_t inv_n = 8347681; // 256 * 8347681 ≡ 1 (mod 8380417) -// Precomputed constant for Barrett reduction (m and shift for mod function) -static ap_uint<24> m = 8396807; // floor(2^46 / q) -/** - * Modular reduction: returns A mod q, for -q^2 < A < q^2. - * Uses Barrett-like reduction with one subtraction step. - */ -coeff_t mod(double_coeff_t A) { - #pragma HLS pipeline II=1 - ap_uint<48> Au = (ap_uint<48>) A; // treat A as unsigned for reduction - ap_uint<72> t123 = (ap_uint<72>) Au * m; // 48+24=72-bit multiplication - ap_uint<24> t = (ap_uint<24>) (t123 >> 46); // approximate quotient - ap_uint<48> ta = (ap_uint<48>) t * (ap_uint<48>) q; - ap_uint<48> c = (ap_uint<48>) (Au - ta); - coeff_t val; - if (c >= (ap_uint<48>) q) { - val = (coeff_t) (c - q); - } else { - val = (coeff_t) c; - } - return val; +//double_coeff_t v = 20159; + +/*coeff_t mod(double_coeff_t A) +{ + #pragma HLS inline OFF + //double_coeff_t v = (double_coeff_t) ((1<<26) + 1664)/q; + double_coeff_t t = (v * A + (1 << 25)) >> 26; + t = t * q; + coeff_t val; + if (A < t) + val = A - t + q; + else + val = A - t; + return val; +}*/ + + +/* Kyber +ap_uint<13> m = 5039; + +coeff_t mod(double_coeff_t A) +{ + #pragma HLS pipeline II = 1 + coeff_t val; + ap_uint<36> t123 = m * A; + ap_uint<12> t = (t123 >> 24); + ap_uint<24> ta = t * q; + ap_uint<24> c = A - ta; + if (c > q) + val = (coeff_t) (c - q); + else + val = (coeff_t) c; + return val; } -coeff_t modadd(coeff_t x, coeff_t y) { +coeff_t modadd(coeff_t x, coeff_t y) +{ + #pragma HLS inline + coeff_t w = x + y; + return (coeff_t)(w - (w < q ? (coeff_t)0 : q)); +} + +coeff_t modsub(coeff_t x, coeff_t y) +{ + #pragma HLS inline + coeff_t s = x + (x > y ? (coeff_t)0 : q); + return (coeff_t)(s - y); +} +*/ + +//----------------------------------------- +// Start "For Dilithium" + +// mod, modadd, modsub for Dilithium +// Generic modular reduction for coeff_t / double_coeff_t +// Works for any |A| < 2^63 and any |q| < 2^31. +static inline coeff_t mod(double_coeff_t A) +{ #pragma HLS inline - coeff_t w = x + y; - return (coeff_t)(w - (w < q ? (coeff_t)0 : q)); + + long long x = (long long)A; + long long q_long = (long long)q; + + long long r = x % q_long; + if (r < 0) + r += q_long; + + return (coeff_t)r; } -coeff_t modsub(coeff_t x, coeff_t y) { +// Modular addition: returns (a + b) mod q +static inline coeff_t modadd(coeff_t a, coeff_t b) +{ #pragma HLS inline - coeff_t s = x + (x > y ? (coeff_t)0 : q); - return (coeff_t)(s - y); + + long long q_long = (long long)q; + long long s = (long long)a + (long long)b; + + if (s >= q_long) + s -= q_long; + + return (coeff_t)s; } -// Butterfly operations (DIT and DIF) -void butterfly_unit_dif(coeff_t w, coeff_t a, coeff_t b, coeff_t &x, coeff_t &y) { - #pragma HLS pipeline II=1 - x = modadd(a, b); - y = modsub(a, b); - y = mod((double_coeff_t) w * y); -} -void butterfly_unit_dit(coeff_t w, coeff_t a, coeff_t b, coeff_t &x, coeff_t &y) { - #pragma HLS pipeline II=1 - coeff_t wb = mod((double_coeff_t) w * b); - x = modadd(a, wb); - y = modsub(a, wb); +// Modular subtraction: returns (a - b) mod q +static inline coeff_t modsub(coeff_t a, coeff_t b) +{ + #pragma HLS inline + + long long q_long = (long long)q; + long long d = (long long)a - (long long)b; + + if (d < 0) + d += q_long; + + return (coeff_t)d; } -// One-cycle delay (for simulation/synthesis timing) -void delay_cycle() { - #ifdef __SYNTHESIS__ - ap_wait_n(1); - #endif + +// ----------------------------------------------------------------------------- +// 512-point NTT core (for Dilithium negacyclic convolution) +// ----------------------------------------------------------------------------- + +// Multiplication modulo q, using existing mod() on double_coeff_t +static inline coeff_t mul_mod(coeff_t a, coeff_t b) +{ + double_coeff_t prod = (double_coeff_t)a * (double_coeff_t)b; + return mod(prod); } -// Cooley-Tukey NTT stages (for 128-point NTT on even and odd halves) -void ntt_stage1(hls::stream &a, hls::stream &b, coeff_t fifo[]) { - #pragma HLS dataflow - coeff_t twiddle_coeff = 4808194; // zetas[1] - #pragma HLS DEPENDENCE variable=fifo inter RAW false +// Precomputed stage twiddles for length-512 NTT over q = 8380417 +// Stage index s corresponds to len = 2^(s+1): 2,4,8,...,512 +static const coeff_t NTT_WLEN[9] = { + (coeff_t)8380416, // len = 2 + (coeff_t)4808194, // len = 4 + (coeff_t)4614810, // len = 8 + (coeff_t)2883726, // len = 16 + (coeff_t)6250525, // len = 32 + (coeff_t)7044481, // len = 64 + (coeff_t)3241972, // len = 128 + (coeff_t)6644104, // len = 256 + (coeff_t)1921994 // len = 512 +}; - coeff_t it, a_, b_, bf1, bf2; - // Read 64 values into FIFO - for (int i = 0; i < 64; i++) { - #pragma HLS pipeline - it = a.read(); - fifo[i + 64] = it; - } - // Single iteration (j=0) since stage1 uses one twiddle - int iter = 0; - for (int k = 0; k < 64; k++) { - #pragma HLS pipeline II=1 - a_ = fifo[iter + 64]; - b_ = a.read(); - butterfly_unit_dit(twiddle_coeff, a_, b_, bf1, bf2); - b.write(bf1); - fifo[iter] = bf2; - iter++; - delay_cycle(); - } - // Drain FIFO to output stream - for (int i = 0; i < 64; i++) { - #pragma HLS pipeline II=1 - b.write(fifo[i]); - delay_cycle(); - } -} +static const coeff_t NTT_WLEN_INV[9] = { + (coeff_t)8380416, // len = 2 + (coeff_t)3572223, // len = 4 + (coeff_t)3761513, // len = 8 + (coeff_t)5234739, // len = 16 + (coeff_t)3764867, // len = 32 + (coeff_t)3227876, // len = 64 + (coeff_t)6621070, // len = 128 + (coeff_t)6125690, // len = 256 + (coeff_t)527981 // len = 512 +}; -void ntt_stage2(hls::stream &a, hls::stream &b, coeff_t fifo[]) { - #pragma HLS dataflow - coeff_t twiddle_coeffs[2] = {3765607, 3761513}; // zetas[2], zetas[3] - #pragma HLS DEPENDENCE variable=fifo inter RAW false +// 512^{-1} mod 8380417 +static const coeff_t INV_NTT512 = (coeff_t)8364049; - coeff_t it, a_, b_, bf1, bf2, tf; - // Read 32 values into FIFO - for (int i = 0; i < 32; i++) { - #pragma HLS pipeline - it = a.read(); - fifo[i + 64] = it; - } - // Two iterations (j=0,1) for stage2 - for (int j = 0; j < 2; j++) { - int iter = 0; - for (int k = 0; k < 32; k++) { - #pragma HLS pipeline II=1 - a_ = fifo[iter + 64]; - b_ = a.read(); - tf = twiddle_coeffs[j]; - butterfly_unit_dit(tf, a_, b_, bf1, bf2); - b.write(bf1); - fifo[iter] = bf2; - iter++; - delay_cycle(); +// In-place iterative radix-2 NTT of size 512 +static void ntt_512(coeff_t a[512], bool invert) +{ + // Bit-reversal permutation + unsigned int j = 0; + for (unsigned int i = 1; i < 512; ++i) { + unsigned int bit = 512 >> 1; + while (j & bit) { + j ^= bit; + bit >>= 1; } - // Move results from FIFO to output for this iteration - for (int i = 0; i < 32; i++) { - #pragma HLS pipeline II=1 - b.write(fifo[i]); - delay_cycle(); - if (j < 1) { - // Refill FIFO for next iteration - it = a.read(); - fifo[i + 64] = it; + j ^= bit; + if (i < j) { + coeff_t tmp = a[i]; + a[i] = a[j]; + a[j] = tmp; + } + } + + int len = 2; + int stage = 0; + while (len <= 512) { + coeff_t wlen = invert ? NTT_WLEN_INV[stage] : NTT_WLEN[stage]; + int half = len >> 1; + + for (int i = 0; i < 512; i += len) { + coeff_t w = (coeff_t)1; + for (int j = 0; j < half; ++j) { + coeff_t u = a[i + j]; + coeff_t v = mul_mod(a[i + j + half], w); + a[i + j] = modadd(u, v); + a[i + j + half] = modsub(u, v); + w = mul_mod(w, wlen); } } - } -} -void ntt_stage3(hls::stream &a, hls::stream &b, coeff_t fifo[]) { - #pragma HLS dataflow - coeff_t twiddle_coeffs[4] = {5178923, 5496691, 5234739, 5178987}; // zetas[4..7] - #pragma HLS DEPENDENCE variable=fifo inter RAW false - - coeff_t it, a_, b_, bf1, bf2, tf; - // Read 16 values into FIFO - for (int i = 0; i < 16; i++) { - #pragma HLS pipeline - it = a.read(); - fifo[i + 64] = it; + len <<= 1; + stage++; } - // Four iterations (j=0..3) - for (int j = 0; j < 4; j++) { - int iter = 0; - for (int k = 0; k < 16; k++) { - #pragma HLS pipeline II=1 - a_ = fifo[iter + 64]; - b_ = a.read(); - tf = twiddle_coeffs[j]; - butterfly_unit_dit(tf, a_, b_, bf1, bf2); - b.write(bf1); - fifo[iter] = bf2; - iter++; - delay_cycle(); - } - for (int i = 0; i < 16; i++) { - #pragma HLS pipeline II=1 - b.write(fifo[i]); - delay_cycle(); - if (j < 3) { - it = a.read(); - fifo[i + 64] = it; - } + + if (invert) { + // Multiply by 512^{-1} mod q to finish inverse NTT + for (int i = 0; i < 512; ++i) { + a[i] = mul_mod(a[i], INV_NTT512); } } } -void ntt_stage4(hls::stream &a, hls::stream &b, coeff_t fifo[]) { - #pragma HLS dataflow - coeff_t twiddle_coeffs[8] = {7778734, 3542485, 2682288, 2129892, 3764867, 7375178, 557458, 7159240}; // zetas[8..15] - #pragma HLS DEPENDENCE variable=fifo inter RAW false +// Negacyclic convolution via 512-point NTT: +// c(x) = a(x) * b(x) mod (x^Nt + 1, q), Nt = 256 +static void poly_mult_dil_core(coeff_t c[Nt], + const coeff_t a[Nt], + const coeff_t b[Nt]) +{ + const int N2 = 2 * Nt; // 512 for Nt = 256 + coeff_t A[N2]; + coeff_t B[N2]; + coeff_t C[N2]; - coeff_t it, a_, b_, bf1, bf2, tf; - // Read 8 values into FIFO - for (int i = 0; i < 8; i++) { - #pragma HLS pipeline - it = a.read(); - fifo[i + 64] = it; - } - for (int j = 0; j < 8; j++) { - int iter = 0; - for (int k = 0; k < 8; k++) { - #pragma HLS pipeline II=1 - a_ = fifo[iter + 64]; - b_ = a.read(); - tf = twiddle_coeffs[j]; - butterfly_unit_dit(tf, a_, b_, bf1, bf2); - b.write(bf1); - fifo[iter] = bf2; - iter++; - delay_cycle(); - } - for (int i = 0; i < 8; i++) { - #pragma HLS pipeline II=1 - b.write(fifo[i]); - delay_cycle(); - if (j < 7) { - it = a.read(); - fifo[i + 64] = it; - } - } - } -} - -void ntt_stage5(hls::stream &a, hls::stream &b, coeff_t fifo[]) { - #pragma HLS dataflow - coeff_t twiddle_coeffs[16] = {6444997, 1935420, 758451, 3144429, 4509984, 2341984, 3246732, 5860400, - 2312402, 804963, 725031, 3379856, 3427835, 2667861, 5128059, 3006285}; // zetas[16..31] - #pragma HLS DEPENDENCE variable=fifo inter RAW false - - coeff_t it, a_, b_, bf1, bf2, tf; - // Read 4 values into FIFO - for (int i = 0; i < 4; i++) { - #pragma HLS pipeline - it = a.read(); - fifo[i + 64] = it; - } - for (int j = 0; j < 16; j++) { - int iter = 0; - for (int k = 0; k < 4; k++) { - #pragma HLS pipeline II=1 - a_ = fifo[iter + 64]; - b_ = a.read(); - tf = twiddle_coeffs[j]; - butterfly_unit_dit(tf, a_, b_, bf1, bf2); - b.write(bf1); - fifo[iter] = bf2; - iter++; - delay_cycle(); - } - for (int i = 0; i < 4; i++) { - #pragma HLS pipeline II=1 - b.write(fifo[i]); - delay_cycle(); - if (j < 15) { - it = a.read(); - fifo[i + 64] = it; - } - } - } -} - -void ntt_stage6(hls::stream &a, hls::stream &b, coeff_t fifo[]) { - #pragma HLS dataflow - coeff_t twiddle_coeffs[32] = {3464972, 3314078, 2117899, 6534358, 2054587, 5011888, 2700113, 1217931, - 5833231, 2344214, 3782571, 4605192, 1703062, 5540785, 1319459, 1890611, - 4940651, 781404, 3266285, 816525, 2535052, 4276470, 3967860, 2047244, - 1578017, 327500, 730000, 5730796, 671093, 1925063, 3915834, 4083499}; // zetas[32..63] - #pragma HLS DEPENDENCE variable=fifo inter RAW false - - coeff_t it, a_, b_, bf1, bf2, tf; - // Read 2 values into FIFO - for (int i = 0; i < 2; i++) { - #pragma HLS pipeline - it = a.read(); - fifo[i + 64] = it; - } - for (int j = 0; j < 32; j++) { - int iter = 0; - for (int k = 0; k < 2; k++) { - #pragma HLS pipeline II=1 - a_ = fifo[iter + 64]; - b_ = a.read(); - tf = twiddle_coeffs[j]; - butterfly_unit_dit(tf, a_, b_, bf1, bf2); - b.write(bf1); - fifo[iter] = bf2; - iter++; - delay_cycle(); - } - for (int i = 0; i < 2; i++) { - #pragma HLS pipeline II=1 - b.write(fifo[i]); - delay_cycle(); - if (j < 31) { - it = a.read(); - fifo[i + 64] = it; - } - } - } -} - -void ntt_stage7(hls::stream &a, hls::stream &b, coeff_t fifo[]) { - #pragma HLS inline off - coeff_t twiddle_coeffs[64] = {3073009, 5307408, 1059855, 7320562, 2447023, 5933394, 792093, 7588324, - 2905547, 5474870, 1638942, 6741475, 1572578, 6794011, 832358, 5458059, - 3529344, 360527, 2590147, 2255688, 2160675, 6219742, 1474570, 539386, - 5079153, 3308886, 4520271, 3650694, 4642538, 4400500, 807498, 136874, - 3787775, 4592642, 5308709, 708402, 776149, 4379844, 92198, 210900, - 6520686, 5057309, 3766986, 725250, 674483, 2092149, 334831, 4235840, - 663807, 3469593, 4168073, 752744, 4608668, 717773, 1803252, 606508, - 816722, 2933738, 1919820, 4873877, 1486229, 1590146, 6600782, 503907}; // zetas[64..127] - #pragma HLS DEPENDENCE variable=fifo inter RAW false - - coeff_t u, t, bf1, bf2; - // Initial read - u = a.read(); - for (int j = 0; j < 64; j++) { + // Copy inputs and zero-pad + for (int i = 0; i < Nt; ++i) { #pragma HLS pipeline II=1 - t = a.read(); - butterfly_unit_dit(twiddle_coeffs[j], u, t, bf1, bf2); - b.write(bf1); - b.write(bf2); - if (j < 63) { - u = a.read(); - } + A[i] = a[i]; + B[i] = b[i]; } -} - -// Gentleman-Sande INTT stages (for 128-point INTT on even and odd halves) -void intt_stage1(hls::stream &a, hls::stream &b, coeff_t fifo[]) { - #pragma HLS inline off - coeff_t twiddle_coeffs[64] = {7325939, 2236726, 7985040, 7159498, 2220417, 6925862, 626953, 677441, - 5474870, 2905547, 6133394, 2447023, 7320562, 1059855, 5307408, 3073009, - 4504440, 780313, 4388586, 1744897, 6219742, 2160675, 2255688, 2590147, - 360527, 3529344, 5458059, 832358, 6794011, 1572578, 6741475, 1638942, - 5474870, 2905547, 792093, 5933394, 7588324, 792093, 5933394, 7588324, - 792093, 5933394, 792093, 5933394, 792093, 5933394, 792093, 5933394, - 792093, 5933394, 792093, 5933394, 792093, 5933394, 792093, 5933394, - 792093, 5933394, 792093, 5933394, 792093, 5933394, 792093, 5933394}; - // (The above array is filled with proper values for zetas[127..64] negated) - #pragma HLS DEPENDENCE variable=fifo inter RAW false - - coeff_t u, t, bf1, bf2; - // Initial read - u = a.read(); - for (int j = 0; j < 64; j++) { + for (int i = Nt; i < N2; ++i) { #pragma HLS pipeline II=1 - t = a.read(); - butterfly_unit_dif(twiddle_coeffs[j], u, t, bf1, bf2); - b.write(bf1); - b.write(bf2); - if (j < 63) { - u = a.read(); - } + A[i] = 0; + B[i] = 0; } -} -// (Note: The intt_stage1 array above is truncated in this snippet due to its length. -// In the actual code, it should contain 64 values corresponding to q - zetas[127..64].) + // Forward 512-point NTT of both sequences + ntt_512(A, false); + ntt_512(B, false); -void intt_stage2(hls::stream &a, hls::stream &b, coeff_t fifo[]) { - #pragma HLS dataflow - coeff_t twiddle_coeffs[32] = {2419, 2102, 219, 855, 2681, 1848, 712, 682, - 927, 1795, 461, 1891, 2877, 2522, 1894, 1010, - 1414, 2009, 3296, 464, 2697, 816, 1352, 2679, - 1274, 1052, 1025, 2132, 1573, 76, 2998, 3040}; - // (Update values above to q - zetas[63..32]) - #pragma HLS DEPENDENCE variable=fifo inter RAW false - - coeff_t it, a_, b_, bf1, bf2, tf; - // Read 2 values into FIFO - for (int i = 0; i < 2; i++) { - #pragma HLS pipeline - it = a.read(); - fifo[i + 64] = it; - } - int ind = 0; - for (int j = 0; j < 32; j++) { - int iter = 0; - for (int k = 0; k < 2; k++) { - #pragma HLS pipeline II=1 - a_ = fifo[iter + 64]; - b_ = a.read(); - tf = twiddle_coeffs[ind]; - butterfly_unit_dif(tf, a_, b_, bf1, bf2); - b.write(bf1); - fifo[iter] = bf2; - iter++; - if (++ind, (ind, ind)) {} // placeholder to increment ind - delay_cycle(); - } - for (int i = 0; i < 2; i++) { - #pragma HLS pipeline II=1 - b.write(fifo[i]); - delay_cycle(); - if (j < 31) { - it = a.read(); - fifo[i + 64] = it; - } - } - } -} - -// (The intt_stage2 through intt_stage7 functions should similarly use the updated -// inverse twiddle arrays based on q - zetas in reverse order. Due to length, they are -// not fully expanded here but must be filled with the correct constants.) - -void intt_stage7(hls::stream &a, hls::stream &b, coeff_t fifo[]) { - #pragma HLS inline off - coeff_t inv_twiddle = 3572223; // q - zetas[1] - coeff_t a_, b_, bf1, bf2, bf1n, bf2n; - // Read 64 values into FIFO - for (int i = 0; i < 64; i++) { - #pragma HLS pipeline - fifo[i + 64] = a.read(); - } - // Single iteration - int iter = 0; - for (int k = 0; k < 64; k++) { + // Pointwise multiplication in NTT domain + for (int i = 0; i < N2; ++i) { #pragma HLS pipeline II=1 - a_ = fifo[iter + 64]; - b_ = a.read(); - butterfly_unit_dif(inv_twiddle, a_, b_, bf1, bf2); - bf1n = mod((double_coeff_t) bf1 * inv_n); - bf2n = mod((double_coeff_t) bf2 * inv_n); - b.write(bf1n); - fifo[iter] = bf2n; - iter++; - delay_cycle(); + C[i] = mul_mod(A[i], B[i]); } - for (int i = 0; i < 64; i++) { + + // Inverse 512-point NTT -> length-512 cyclic convolution + ntt_512(C, true); + + // Negacyclic fold: c[k] = C[k] - C[k + Nt] mod q + for (int k = 0; k < Nt; ++k) { #pragma HLS pipeline II=1 - b.write(fifo[i]); - delay_cycle(); + c[k] = modsub(C[k], C[k + Nt]); } } -// Splitting input stream of 256 values into two 128-length streams (even and odd indices) -void stream_split(hls::stream &input, - hls::stream &input1, - hls::stream &input2) { - coeff_t_stream_big x; - double_coeff_t A; - coeff_t_stream x1, x2; - coeff_t a1, a2; - for (int i = 0; i < Nt; i++) { - #pragma HLS pipeline II=1 - x = input.read(); - A = x.value; - // Upper 24 bits (a1) and lower 24 bits (a2) from 48-bit input - a1 = (coeff_t) (A >> 24); - a2 = (coeff_t) (A & 0xFFFFFF); - x1.last = (i == Nt - 1) ? 1 : 0; - x2.last = (i == Nt - 1) ? 1 : 0; - x1.value = a1; - x2.value = a2; - input1.write(x1); - input2.write(x2); - } + +// END "For Dilithium" +//---------------------------------------------------------- + + + +void butterfly_unit_dif(coeff_t w, coeff_t a, coeff_t b, coeff_t &x, coeff_t &y) +{ + #pragma HLS pipeline II = 1 + x = modadd(a, b); + y = modsub(a, b); + y = mod(w * y); } -// Pointwise multiplication of two polynomials in NTT domain (128-point segments) -void point_wise_mult(hls::stream &input1, - hls::stream &input2, - hls::stream &output) { - coeff_t_stream xe, xo, ye, yo, z; - coeff_t ae, ao, be, bo; - coeff_t c1, c2, c2s, c3, c4, ce, co; - // Precomputed factors for combining even/odd results (zetas[128..255] and negatives) - const coeff_t pm_factors[256] = { - 1753, 8378664, 6444997, 1935420, 2076525, 6303892, 170554, 8219863, - 2861582, 5518835, 4736363, 3644054, 1284551, 7095866, 4674408, 3706009, - 1703515, 6676902, 6270501, 211, 210306, 8170111, 5026888, 3353529, - 3821789, 4558628, 5349716, 3030701, 4762485, 3617932, 694359, 7686058, - 7180203, 1203993, 5380777, 2999640, 1470738, 182767, 6775507, 1604910, - 3953406, 4427011, 7645474, 7344583, 2301921, 3079873, 5457470, 2922947, - 3160165, 5220252, 6822694, 1557723, 3485688, 489920, 4470900, 6951308, - 3631685, 1292402, 2457654, 5922763, 3084048, 5296369, 3889861, 4490556, - 4786681, 359373, 1200966, 7202573, 7314419, 106599, 735782, 496854, - 2048786, 490557, 7260057, 1120366, 5088054, 6318234, 7347057, 966360, - 4760745, 6182010, 6561879, 1818538, 5907988, 2607559, 782297, 7006181, - 5742811, 466956, 1318919, 1078884, 2303021, 6077396, 4208760, 7534770, - 784477, 1483457, 2560207, 3936377, 3744674, 4588648, 5123635, 6000581, - 6717385, 2817068, 3969034, 2474430, 6920900, 1459517, 2932921, 2880620, - 4090318, 4299625, 6783841, 2529645, 4251080, 1195350, 4615261, 1989427, - 4947661, 7030184, 1746361, 3473799, 2817213, 6337613, 6413348, 595005, - 3853325, 573861, 7558651, 323795, 5319769, 3124755, 2851397, 3919588, - 6166293, 2212410, 7601862, 195655, 6841930, 2185851, 494078, 5892136, - 4477829, 610638, 3146478, 2235709, 7506764, 873653, 1326355, 2692427, - 3633128, 4748095, 4533405, 855796, 5848968, 6652853, 4890346, 693609, - 1535434, 3299512, 781075, 5930470, 2493616, 1850670, 4069982, 1282899, - 5778393, 2602024, 7287008, 333697, 3315151, 5058045, 1326139, 3661528, - 364298, 8016119, 1417858, 6962559, 2212614, 6167803, 807224, 3811192, - 6770466, 1616378, 6143691, 2236726, 7325939, 1054478, 5307408, 3073009, - 7159498, 2220417, 677441, 626953, 6925862, 2220417, 7159498, 2220417, - 7159498, 2220417, 7159498, 2220417, 7159498, 2220417, 7159498, 2220417, - 7159498, 2220417, 7159498, 2220417, 7159498, 2220417, 7159498, 2220417 - }; - // (Above pm_factors array contains 128 pairs: zetas[128..255] and their negatives, interleaved.) - - z.last = 0; - for (int i = 0; i < N; i++) { - #pragma HLS pipeline II=1 - xe = input1.read(); - xo = input1.read(); - ye = input2.read(); - yo = input2.read(); - ae = xe.value; - ao = xo.value; - be = ye.value; - bo = yo.value; - c1 = mod((double_coeff_t) ae * be); - c2 = mod((double_coeff_t) ao * bo); - c2s = mod((double_coeff_t) c2 * pm_factors[i * 2]); // factor even index - // pm_factors array is interleaved: for index i, pm_factors[2*i] = ζ^(some), [2*i+1] = -ζ^(some). - // Here we use the appropriate factor for c2 (even part) and effectively include sign in formula. - c3 = mod((double_coeff_t) ae * bo); - c4 = mod((double_coeff_t) ao * be); - ce = modadd(c1, c2s); - co = modadd(c3, c4); - z.value = ce; - output.write(z); - if (i == N - 1) { - z.last = 1; - } - z.value = co; - output.write(z); - } +void butterfly_unit_dit(coeff_t w, coeff_t a, coeff_t b, coeff_t &x, coeff_t &y) +{ + #pragma HLS pipeline II = 1 + coeff_t wb = mod(w * b); + x = modadd(a, wb); + y = modsub(a, wb); } -// AXI4-Stream to internal stream conversion for input +void delay_cycle() +{ + #ifdef __SYNTHESIS__ + ap_wait_n(1); + #endif +} + +void ntt_stage1 (hls::stream &a, hls::stream &b, coeff_t fifo[]) +{ + #pragma HLS dataflow + coeff_t twiddle_coeff = 1729; + + #pragma HLS DEPENDENCE variable = fifo inter RAW false + + int x, y; + coeff_t a_, b_, it, bf1, bf2, tf; + + for (int i = 0; i < 64; i++) + { + #pragma HLS pipeline + it = a.read(); + fifo[i + 64] = it; + } + + for (int j = 0; j < 1; j++) + { + #pragma HLS DEPENDENCE variable = fifo inter RAW false + int iter = 0; + for (int k = 0; k < 64; k++) + { + #pragma HLS pipeline II = 1 + #pragma HLS DEPENDENCE variable = fifo inter RAW false + a_ = fifo[iter + 64]; + b_ = a.read(); + tf = twiddle_coeff; + butterfly_unit_dit(tf, a_, b_, bf1, bf2); + b.write(bf1); + fifo[iter] = bf2; + iter++; + delay_cycle(); + } + + for (int i = 0; i < 64; i++) + { + #pragma HLS pipeline II = 1 + #pragma HLS DEPENDENCE variable = fifo inter RAW false + b.write(fifo[i]); + delay_cycle(); + } + } +} + +void ntt_stage2 (hls::stream &a, hls::stream &b, coeff_t fifo[]) +{ + #pragma HLS dataflow + coeff_t twiddle_coeffs[2] = {2580, 3289}; + + #pragma HLS DEPENDENCE variable = fifo inter RAW false + + int x, y; + coeff_t a_, b_, it, bf1, bf2, tf; + + for (int i = 0; i < 32; i++) + { + #pragma HLS pipeline + it = a.read(); + fifo[i + 64] = it; + } + + for (int j = 0; j < 2; j++) + { + #pragma HLS DEPENDENCE variable = fifo inter RAW false + int iter = 0; + for (int k = 0; k < 32; k++) + { + #pragma HLS pipeline II = 1 + #pragma HLS DEPENDENCE variable = fifo inter RAW false + a_ = fifo[iter + 64]; + b_ = a.read(); + tf = twiddle_coeffs[j]; + butterfly_unit_dit(tf, a_, b_, bf1, bf2); + b.write(bf1); + fifo[iter] = bf2; + iter++; + delay_cycle(); + } + + for (int i = 0; i < 32; i++) + { + #pragma HLS pipeline II = 1 + #pragma HLS DEPENDENCE variable = fifo inter RAW false + b.write(fifo[i]); + delay_cycle(); + if (j < 1) + { + it = a.read(); + fifo[i + 64] = it; + } + } + } +} + +void ntt_stage3 (hls::stream &a, hls::stream &b, coeff_t fifo[]) +{ + #pragma HLS dataflow + coeff_t twiddle_coeffs[4] = {2642, 630, 1897, 848}; + + #pragma HLS DEPENDENCE variable = fifo inter RAW false + + int x, y; + coeff_t a_, b_, it, bf1, bf2, tf; + + for (int i = 0; i < 16; i++) + { + #pragma HLS pipeline + it = a.read(); + fifo[i + 64] = it; + } + + for (int j = 0; j < 4; j++) + { + #pragma HLS DEPENDENCE variable = fifo inter RAW false + int iter = 0; + for (int k = 0; k < 16; k++) + { + #pragma HLS pipeline II = 1 + #pragma HLS DEPENDENCE variable = fifo inter RAW false + a_ = fifo[iter + 64]; + b_ = a.read(); + tf = twiddle_coeffs[j]; + butterfly_unit_dit(tf, a_, b_, bf1, bf2); + b.write(bf1); + fifo[iter] = bf2; + iter++; + delay_cycle(); + } + + for (int i = 0; i < 16; i++) + { + #pragma HLS pipeline II = 1 + #pragma HLS DEPENDENCE variable = fifo inter RAW false + b.write(fifo[i]); + delay_cycle(); + if (j < 3) + { + it = a.read(); + fifo[i + 64] = it; + } + } + } +} + +void ntt_stage4 (hls::stream &a, hls::stream &b, coeff_t fifo[]) +{ + #pragma HLS dataflow + coeff_t twiddle_coeffs[8] = {1062, 1919, 193, 797, 2786, 3260, 569, 1746}; + + #pragma HLS DEPENDENCE variable = fifo inter RAW false + + int x, y; + coeff_t a_, b_, it, bf1, bf2, tf; + + for (int i = 0; i < 8; i++) + { + #pragma HLS pipeline + it = a.read(); + fifo[i + 64] = it; + } + + for (int j = 0; j < 8; j++) + { + #pragma HLS DEPENDENCE variable = fifo inter RAW false + int iter = 0; + int ind = 1; + for (int k = 0; k < 8; k++) + { + #pragma HLS pipeline II = 1 + #pragma HLS DEPENDENCE variable = fifo inter RAW false + a_ = fifo[iter + 64]; + b_ = a.read(); + tf = twiddle_coeffs[j]; + butterfly_unit_dit(tf, a_, b_, bf1, bf2); + b.write(bf1); + fifo[iter] = bf2; + iter++; + delay_cycle(); + } + + for (int i = 0; i < 8; i++) + { + #pragma HLS pipeline II = 1 + #pragma HLS DEPENDENCE variable = fifo inter RAW false + b.write(fifo[i]); + delay_cycle(); + if (j < 7) + { + it = a.read(); + fifo[i + 64] = it; + } + } + } +} + +void ntt_stage5 (hls::stream &a, hls::stream &b, coeff_t fifo[]) +{ + #pragma HLS dataflow + coeff_t twiddle_coeffs[16] = {296, 2447, 1339, 1476, 3046, 56, 2240, 1333, + 1426, 2094, 535, 2882, 2393, 2879, 1974, 821}; + + #pragma HLS DEPENDENCE variable = fifo inter RAW false + + int x, y; + coeff_t a_, b_, it, bf1, bf2, tf; + + for (int i = 0; i < 4; i++) + { + #pragma HLS pipeline + it = a.read(); + fifo[i + 64] = it; + } + + for (int j = 0; j < 16; j++) + { + #pragma HLS DEPENDENCE variable = fifo inter RAW false + int iter = 0; + int ind = 1; + for (int k = 0; k < 4; k = k + 1) + { + #pragma HLS pipeline II = 1 + #pragma HLS DEPENDENCE variable = fifo inter RAW false + a_ = fifo[iter + 64]; + b_ = a.read(); + tf = twiddle_coeffs[j]; + butterfly_unit_dit(tf, a_, b_, bf1, bf2); + b.write(bf1); + fifo[iter] = bf2; + iter++; + delay_cycle(); + } + + for (int i = 0; i < 4; i++) + { + #pragma HLS pipeline II = 1 + #pragma HLS DEPENDENCE variable = fifo inter RAW false + b.write(fifo[i]); + delay_cycle(); + if (j < 15) + { + it = a.read(); + fifo[i + 64] = it; + } + } + } +} + +void ntt_stage6 (hls::stream &a, hls::stream &b, coeff_t fifo[]) +{ + #pragma HLS dataflow + coeff_t twiddle_coeffs[32] = {289, 331, 3253, 1756, 1197, 2304, 2277, 2055, + 650, 1977, 2513, 632, 2865, 33, 1320, 1915, + 2319, 1435, 807, 452, 1438, 2868, 1534, 2402, + 2647, 2617, 1481, 648, 2474, 3110, 1227, 910}; + + #pragma HLS DEPENDENCE variable = fifo inter RAW false + + int x, y; + coeff_t a_, b_, it, bf1, bf2, tf; + + for (int i = 0; i < 2; i++) + { + #pragma HLS pipeline + it = a.read(); + fifo[i + 64] = it; + } + + for (int j = 0; j < 32; j++) + { + #pragma HLS DEPENDENCE variable = fifo inter RAW false + int iter = 0; + int ind = 1; + for (int k = 0; k < 2; k = k + 1) + { + #pragma HLS pipeline II = 1 + #pragma HLS DEPENDENCE variable = fifo inter RAW false + a_ = fifo[iter + 64]; + b_ = a.read(); + tf = twiddle_coeffs[j]; + butterfly_unit_dit(tf, a_, b_, bf1, bf2); + b.write(bf1); + fifo[iter] = bf2; + iter++; + delay_cycle(); + } + + for (int i = 0; i < 2; i++) + { + #pragma HLS pipeline II = 1 + #pragma HLS DEPENDENCE variable = fifo inter RAW false + b.write(fifo[i]); + delay_cycle(); + if (j < 31) + { + it = a.read(); + fifo[i + 64] = it; + } + } + } +} + +void ntt_stage7 (hls::stream &a, hls::stream &b, coeff_t fifo[]) +{ + #pragma HLS inline off + + #pragma HLS DEPENDENCE variable = fifo inter RAW false + coeff_t twiddle_coeffs[64] = {17, 2761, 583, 2649, 1637, 723, 2288, 1100, + 1409, 2662, 3281, 233, 756, 2156, 3015, 3050, + 1703, 1651, 2789, 1789, 1847, 952, 1461, 2687, + 939, 2308, 2437, 2388, 733, 2337, 268, 641, + 1584, 2298, 2037, 3220, 375, 2549, 2090, 1645, + 1063, 319, 2773, 757, 2099, 561, 2466, 2594, + 2804, 1092, 403, 1026, 1143, 2150, 2775, 886, + 1722, 1212, 1874, 1029, 2110, 2935, 885, 2154}; + int x, y; + coeff_t u, t, it, bf1, bf2; + + u = a.read(); + + for (int j = 0; j < 64; j++) + { + #pragma HLS pipeline II = 1 + #pragma HLS DEPENDENCE variable = fifo inter RAW false + + t = a.read(); + butterfly_unit_dit(twiddle_coeffs[j], u, t, bf1, bf2); + b.write(bf1); + b.write(bf2); + if (j < 63) + u = a.read(); + } +} + +void intt_stage1 (hls::stream &a, hls::stream &b, coeff_t fifo[]) +{ + coeff_t twiddle_coeffs[64] = {1175, 2444, 394, 1219, 2300, 1455, 2117, 1607, + 2443, 554, 1179, 2186, 2303, 2926, 2237, 525, + 735, 863, 2768, 1230, 2572, 556, 3010, 2266, + 1684, 1239, 780, 2954, 109, 1292, 1031, 1745, + 2688, 3061, 992, 2596, 941, 892, 1021, 2390, + 642, 1868, 2377, 1482, 1540, 540, 1678, 1626, + 279, 314, 1173, 2573, 3096, 48, 667, 1920, + 2229, 1041, 2606, 1692, 680, 2746, 568, 3312}; + + #pragma HLS inline off + #pragma HLS DEPENDENCE variable = fifo inter RAW false + int x, y; + coeff_t u, t, it, bf1, bf2; + + u = a.read(); + + for (int j = 0; j < 64; j++) + { + #pragma HLS pipeline II = 1 + #pragma HLS DEPENDENCE variable = fifo inter RAW false + t = a.read(); + butterfly_unit_dif(twiddle_coeffs[j], u, t, bf1, bf2); + b.write(bf1); + b.write(bf2); + if (j < 63) + u = a.read(); + } +} + +void intt_stage2 (hls::stream &a, hls::stream &b, coeff_t fifo[]) +{ + #pragma HLS dataflow + coeff_t twiddle_coeffs[32] = {2419, 2102, 219, 855, 2681, 1848, 712, 682, + 927, 1795, 461, 1891, 2877, 2522, 1894, 1010, + 1414, 2009, 3296, 464, 2697, 816, 1352, 2679, + 1274, 1052, 1025, 2132, 1573, 76, 2998, 3040}; + + #pragma HLS DEPENDENCE variable = fifo inter RAW false + + int x, y; + coeff_t a_, b_, it, bf1, bf2, tf; + + for (int i = 0; i < 2; i++) + { + #pragma HLS pipeline + it = a.read(); + fifo[i + 64] = it; + } + + int ind = 0; + int count = 0; + for (int j = 0; j < 32; j++) + { + #pragma HLS DEPENDENCE variable = fifo inter RAW false + int iter = 0; + for (int k = 0; k < 2; k = k + 1) + { + #pragma HLS pipeline II = 1 + #pragma HLS DEPENDENCE variable = fifo inter RAW false + a_ = fifo[iter + 64]; + b_ = a.read(); + tf = twiddle_coeffs[ind]; + butterfly_unit_dif(tf, a_, b_, bf1, bf2); + b.write(bf1); + fifo[iter] = bf2; + iter++; + count++; + if (count % 2 == 0) + ind++; + delay_cycle(); + } + + for (int i = 0; i < 2; i++) + { + #pragma HLS pipeline II = 1 + #pragma HLS DEPENDENCE variable = fifo inter RAW false + b.write(fifo[i]); + delay_cycle(); + if (j < 31) + { + it = a.read(); + fifo[i + 64] = it; + } + } + } +} + +void intt_stage3 (hls::stream &a, hls::stream &b, coeff_t fifo[]) +{ + #pragma HLS dataflow + coeff_t twiddle_coeffs[16] = {2508, 1355, 450, 936, 447, 2794, 1235, 1903, + 1996, 1089, 3273, 283, 1853, 1990, 882, 3033}; + + #pragma HLS DEPENDENCE variable = fifo inter RAW false + + int x, y; + int m = 4; + coeff_t a_, b_, it, bf1, bf2, tf; + + for (int i = 0; i < 4; i++) + { + #pragma HLS pipeline + it = a.read(); + fifo[i + 64] = it; + } + + int ind = 0; + int count = 0; + for (int j = 0; j < 16; j++) + { + #pragma HLS DEPENDENCE variable = fifo inter RAW false + int iter = 0; + for (int k = 0; k < 4; k = k + 1) + { + #pragma HLS pipeline II = 1 + #pragma HLS DEPENDENCE variable = fifo inter RAW false + a_ = fifo[iter + 64]; + b_ = a.read(); + tf = twiddle_coeffs[ind]; + butterfly_unit_dif(tf, a_, b_, bf1, bf2); + b.write(bf1); + fifo[iter] = bf2; + iter++; + count++; + if (count % 4 == 0) + ind++; + delay_cycle(); + } + + for (int i = 0; i < 4; i++) + { + #pragma HLS pipeline II = 1 + #pragma HLS DEPENDENCE variable = fifo inter RAW false + b.write(fifo[i]); + delay_cycle(); + if (j < 15) + { + it = a.read(); + fifo[i + 64] = it; + } + } + } +} + +void intt_stage4 (hls::stream &a, hls::stream &b, coeff_t fifo[]) +{ + #pragma HLS dataflow + coeff_t twiddle_coeffs[8] = {1583, 2760, 69, 543, 2532, 3136, 1410, 2267}; + + #pragma HLS DEPENDENCE variable = fifo inter RAW false + + int x, y; + coeff_t a_, b_, it, bf1, bf2, tf; + + for (int i = 0; i < 8; i++) + { + #pragma HLS pipeline + it = a.read(); + fifo[i + 64] = it; + } + + int ind = 0; + int count = 0; + for (int j = 0; j < 8; j++) + { + #pragma HLS DEPENDENCE variable = fifo inter RAW false + int iter = 0; + for (int k = 0; k < 8; k = k + 1) + { + #pragma HLS pipeline II = 1 + #pragma HLS DEPENDENCE variable = fifo inter RAW false + a_ = fifo[iter + 64]; + b_ = a.read(); + tf = twiddle_coeffs[ind]; + butterfly_unit_dif(tf, a_, b_, bf1, bf2); + b.write(bf1); + fifo[iter] = bf2; + iter++; + count++; + if (count % 8 == 0) + ind++; + delay_cycle(); + } + + for (int i = 0; i < 8; i++) + { + #pragma HLS pipeline II = 1 + #pragma HLS DEPENDENCE variable = fifo inter RAW false + b.write(fifo[i]); + delay_cycle(); + if (j < 7) + { + it = a.read(); + fifo[i + 64] = it; + } + } + } +} + +void intt_stage5 (hls::stream &a, hls::stream &b, coeff_t fifo[]) +{ + #pragma HLS dataflow + coeff_t twiddle_coeffs[4] = {2481, 1432, 2699, 687}; + + #pragma HLS DEPENDENCE variable = fifo inter RAW false + + int x, y; + coeff_t a_, b_, it, bf1, bf2, tf; + + for (int i = 0; i < 16; i++) + { + #pragma HLS pipeline + it = a.read(); + fifo[i + 64] = it; + } + + int ind = 0; + int count = 0; + for (int j = 0; j < 4; j++) + { + #pragma HLS DEPENDENCE variable = fifo inter RAW false + int iter = 0; + for (int k = 0; k < 16; k = k + 1) + { + #pragma HLS pipeline II = 1 + #pragma HLS DEPENDENCE variable = fifo inter RAW false + a_ = fifo[iter + 64]; + b_ = a.read(); + tf = twiddle_coeffs[ind]; + butterfly_unit_dif(tf, a_, b_, bf1, bf2); + b.write(bf1); + fifo[iter] = bf2; + iter++; + count++; + if (count % 16 == 0) + ind++; + delay_cycle(); + } + + for (int i = 0; i < 16; i++) + { + #pragma HLS pipeline II = 1 + #pragma HLS DEPENDENCE variable = fifo inter RAW false + b.write(fifo[i]); + delay_cycle(); + if (j < 3) + { + it = a.read(); + fifo[i + 64] = it; + } + } + } +} + +void intt_stage6 (hls::stream &a, hls::stream &b, coeff_t fifo[]) +{ + #pragma HLS dataflow + coeff_t twiddle_coeffs[2] = {40, 749}; + + #pragma HLS DEPENDENCE variable = fifo inter RAW false + + int x, y; + coeff_t a_, b_, it, bf1, bf2, tf; + + for (int i = 0; i < 32; i++) + { + #pragma HLS pipeline + it = a.read(); + fifo[i + 64] = it; + } + + int ind = 0; + int count = 0; + for (int j = 0; j < 2; j++) + { + #pragma HLS DEPENDENCE variable = fifo inter RAW false + int iter = 0; + for (int k = 0; k < 32; k = k + 1) + { + #pragma HLS pipeline II = 1 + #pragma HLS DEPENDENCE variable = fifo inter RAW false + a_ = fifo[iter + 64]; + b_ = a.read(); + tf = twiddle_coeffs[ind]; + butterfly_unit_dif(tf, a_, b_, bf1, bf2); + b.write(bf1); + fifo[iter] = bf2; + iter++; + count++; + if (count == 32) + ind++; + delay_cycle(); + } + + for (int i = 0; i < 32; i++) + { + #pragma HLS pipeline II = 1 + #pragma HLS DEPENDENCE variable = fifo inter RAW false + b.write(fifo[i]); + delay_cycle(); + if (j < 1) + { + it = a.read(); + fifo[i + 64] = it; + } + } + } +} + +void intt_stage7 (hls::stream &a, hls::stream &b, coeff_t fifo[]) +{ + #pragma HLS inline off + #pragma HLS DEPENDENCE variable = fifo inter RAW false + int x, y; + coeff_t a_, b_, it, bf1, bf2, bfn1, bfn2, tf; + + for (int i = 0; i < 64; i++) + { + #pragma HLS pipeline + it = a.read(); + fifo[i + 64] = it; + } + + for (int j = 0; j < 1; j++) + { + #pragma HLS DEPENDENCE variable = fifo inter RAW false + int iter = 0; + for (int k = 0; k < 64; k = k + 1) + { + #pragma HLS pipeline II = 1 + #pragma HLS DEPENDENCE variable = fifo inter RAW false + a_ = fifo[iter + 64]; + b_ = a.read(); + tf = 1600; + butterfly_unit_dif(tf, a_, b_, bf1, bf2); + bfn1 = mod(bf1 * inv_n); + bfn2 = mod(bf2 * inv_n); + b.write(bfn1); + fifo[iter] = bfn2; + iter++; + delay_cycle(); + } + + for (int i = 0; i < 64; i++) + { + #pragma HLS pipeline II = 1 + #pragma HLS DEPENDENCE variable = fifo inter RAW false + b.write(fifo[i]); + delay_cycle(); + } + } +} + +void read_inputs (hls::stream &input, hls::stream &se, hls::stream &so) +{ + coeff_t_stream x; + coeff_t a; + int i; + + for (i=0; i &se, hls::stream &so, hls::stream &output) +{ + coeff_t a1, a0; + coeff_t_stream y; + int i; + + y.last = 0; + for (i=0; i &input, hls::stream &output) +{ + #pragma HLS dataflow + + hls::stream s0o("s0o"), s1o("s1o"), s2o("s2o"), s3o("s3o"), + s4o("s4o"), s5o("s5o"), s6o("s6o"), s7o("s7o"), + s0e("s0e"), s1e("s1e"), s2e("s2e"), s3e("s3e"), + s4e("s4e"), s5e("s5e"), s6e("s6e"), s7e("s7e"); + + coeff_t fo7[65], fo6[66], fo5[68], fo4[72], fo3[80], fo2[96], fo1[128]; + coeff_t fe7[65], fe6[66], fe5[68], fe4[72], fe3[80], fe2[96], fe1[128]; + + coeff_t_stream x, y; + + #pragma HLS STREAM variable = s7o depth = 1 + #pragma HLS STREAM variable = s6o depth = 2 + #pragma HLS STREAM variable = s5o depth = 4 + #pragma HLS STREAM variable = s4o depth = 8 + #pragma HLS STREAM variable = s3o depth = 16 + #pragma HLS STREAM variable = s2o depth = 32 + #pragma HLS STREAM variable = s1o depth = 64 + #pragma HLS STREAM variable = s0o depth = 128 + + #pragma HLS STREAM variable = s7e depth = 1 + #pragma HLS STREAM variable = s6e depth = 2 + #pragma HLS STREAM variable = s5e depth = 4 + #pragma HLS STREAM variable = s4e depth = 8 + #pragma HLS STREAM variable = s3e depth = 16 + #pragma HLS STREAM variable = s2e depth = 32 + #pragma HLS STREAM variable = s1e depth = 64 + #pragma HLS STREAM variable = s0e depth = 128 + + + read_inputs(input, s0e, s0o); + + ntt_stage1 (s0e, s1e, fe1); + ntt_stage1 (s0o, s1o, fo1); + + ntt_stage2 (s1e, s2e, fe2); + ntt_stage2 (s1o, s2o, fo2); + + ntt_stage3 (s2e, s3e, fe3); + ntt_stage3 (s2o, s3o, fo3); + + ntt_stage4 (s3e, s4e, fe4); + ntt_stage4 (s3o, s4o, fo4); + + ntt_stage5 (s4e, s5e, fe5); + ntt_stage5 (s4o, s5o, fo5); + + ntt_stage6 (s5e, s6e, fe6); + ntt_stage6 (s5o, s6o, fo6); + + ntt_stage7 (s6e, s7e, fe7); + ntt_stage7 (s6o, s7o, fo7); + + write_outputs(s7e, s7o, output); +} + +void gs_intt (hls::stream &input, hls::stream &output) +{ + #pragma HLS dataflow + + hls::stream s0o("s0o"), s1o("s1o"), s2o("s2o"), s3o("s3o"), + s4o("s4o"), s5o("s5o"), s6o("s6o"), s7o("s7o"), + s0e("s0e"), s1e("s1e"), s2e("s2e"), s3e("s3e"), + s4e("s4e"), s5e("s5e"), s6e("s6e"), s7e("s7e"); + + coeff_t fo7[128], fo6[96], fo5[80], fo4[72], fo3[68], fo2[66], fo1[65]; + coeff_t fe7[128], fe6[96], fe5[80], fe4[72], fe3[68], fe2[66], fe1[65]; + + coeff_t_stream x, y; + + #pragma HLS STREAM variable = s7o depth = 1 + #pragma HLS STREAM variable = s6o depth = 2 + #pragma HLS STREAM variable = s5o depth = 4 + #pragma HLS STREAM variable = s4o depth = 8 + #pragma HLS STREAM variable = s3o depth = 16 + #pragma HLS STREAM variable = s2o depth = 32 + #pragma HLS STREAM variable = s1o depth = 64 + #pragma HLS STREAM variable = s0o depth = 128 + + #pragma HLS STREAM variable = s7e depth = 1 + #pragma HLS STREAM variable = s6e depth = 2 + #pragma HLS STREAM variable = s5e depth = 4 + #pragma HLS STREAM variable = s4e depth = 8 + #pragma HLS STREAM variable = s3e depth = 16 + #pragma HLS STREAM variable = s2e depth = 32 + #pragma HLS STREAM variable = s1e depth = 64 + #pragma HLS STREAM variable = s0e depth = 128 + + read_inputs(input, s0e, s0o); + + intt_stage1 (s0e, s1e, fe1); + intt_stage1 (s0o, s1o, fo1); + + intt_stage2 (s1e, s2e, fe2); + intt_stage2 (s1o, s2o, fo2); + + intt_stage3 (s2e, s3e, fe3); + intt_stage3 (s2o, s3o, fo3); + + intt_stage4 (s3e, s4e, fe4); + intt_stage4 (s3o, s4o, fo4); + + intt_stage5 (s4e, s5e, fe5); + intt_stage5 (s4o, s5o, fo5); + + intt_stage6 (s5e, s6e, fe6); + intt_stage6 (s5o, s6o, fo6); + + intt_stage7 (s6e, s7e, fe7); + intt_stage7 (s6o, s7o, fo7); + + write_outputs(s7e, s7o, output); +} + +void stream_split (hls::stream &input, + hls::stream &input1, + hls::stream &input2) +{ + + coeff_t_stream_big x; + double_coeff_t a; + coeff_t_stream x1, x2; + coeff_t a1, a2; + int i; + + for (i=0; i &input1, + hls::stream &input2, + hls::stream &output) +{ + coeff_t_stream xe, xo, ye, yo, z; + coeff_t ae, be, ce, ao, bo, co, c1, c2, c2s, c3, c4; + int i; + + coeff_t pm_factors[128] = {17, 3312, 2761, 568, 583, 2746, 2649, 680, + 1637, 1692, 723, 2606, 2288, 1041, 1100, 2229, + 1409, 1920, 2662, 667, 3281, 48, 233, 3096, + 756, 2573, 2156, 1173, 3015, 314, 3050, 279, + 1703, 1626, 1651, 1678, 2789, 540, 1789, 1540, + 1847, 1482, 952, 2377, 1461, 1868, 2687, 642, + 939, 2390, 2308, 1021, 2437, 892, 2388, 941, + 733, 2596, 2337, 992, 268, 3061, 641, 2688, + 1584, 1745, 2298, 1031, 2037, 1292, 3220, 109, + 375, 2954, 2549, 780, 2090, 1239, 1645, 1684, + 1063, 2266, 319, 3010, 2773, 556, 757, 2572, + 2099, 1230, 561, 2768, 2466, 863, 2594, 735, + 2804, 525, 1092, 2237, 403, 2926, 1026, 2303, + 1143, 2186, 2150, 1179, 2775, 554, 886, 2443, + 1722, 1607, 1212, 2117, 1874, 1455, 1029, 2300, + 2110, 1219, 2935, 394, 885, 2444, 2154, 1175}; + + z.last = 0; + for (i=0; i internal stream conversion helpers (only at top level) +// ----------------------------------------------------------------------------- + static void axis_to_internal_input(hls::stream &axis_in, - hls::stream &int_in) { - coeff_axis_big_t a; - coeff_t_stream_big x; - for (int i = 0; i < Nt; i++) { - #pragma HLS pipeline II=1 - a = axis_in.read(); - x.value = (double_coeff_t) a.data; - x.last = a.last; - int_in.write(x); - if (a.last) break; - } + hls::stream &int_in) +{ + coeff_axis_big_t a; + coeff_t_stream_big x; + + for (int i = 0; i < Nt; i++) + { + #pragma HLS pipeline II = 1 + a = axis_in.read(); + + x.value = (double_coeff_t)a.data; + x.last = a.last; + + int_in.write(x); + + // Optional: break on TLAST if you want to be robust to shorter packets + if (a.last) + break; + } } -// Internal stream to AXI4-Stream conversion for output static void internal_to_axis_output(hls::stream &int_out, - hls::stream &axis_out) { - coeff_t_stream x; - coeff_axis_t a; - for (int i = 0; i < Nt; i++) { - #pragma HLS pipeline II=1 - x = int_out.read(); - a.data = (ap_uint<24>) x.value; - a.last = x.last; - a.keep = -1; - a.strb = -1; - axis_out.write(a); - if (x.last) break; - } + hls::stream &axis_out) +{ + coeff_t_stream x; + coeff_axis_t a; + + for (int i = 0; i < Nt; i++) + { + #pragma HLS pipeline II = 1 + x = int_out.read(); + + a.data = (ap_uint<32>)x.value; + a.last = x.last; + + // Mark all bytes valid; side channels are disabled in this ap_axiu config + a.keep = -1; + a.strb = -1; + + axis_out.write(a); + + if (x.last) + break; + } } -// Top-level function -int poly_mult(hls::stream &input, - hls::stream &output) { +// ----------------------------------------------------------------------------- +// Top-level function with AXI4-Stream ports (for DMA) and internal NTT pipeline +// ----------------------------------------------------------------------------- +/* +int poly_mult_dil (hls::stream &input, + hls::stream &output) +{ + #pragma HLS INTERFACE axis register port=input + #pragma HLS INTERFACE axis register port=output + #pragma HLS INTERFACE s_axilite port=return bundle=CTRL_BUS + #pragma HLS dataflow + + // Internal streams using the original coeff_t_stream{,_big} types + hls::stream in_internal("in_internal"); + hls::stream input1("input1"), input2("input2"); + hls::stream middle1("middle1"), middle2("middle2"); + hls::stream middle3("middle3"), out_internal("out_internal"); + + axis_to_internal_input(input, in_internal); + stream_split(in_internal, input1, input2); + ct_ntt(input1, middle1); + ct_ntt(input2, middle2); + point_wise_mult(middle1, middle2, middle3); + gs_intt(middle3, out_internal); + internal_to_axis_output(out_internal, output); + + return 0; +} +*/ + +// For Dilithium +// ----------------------------------------------------------------------------- +// Top-level function: now uses array-based 512-NTT core instead of Kyber NTT +// ----------------------------------------------------------------------------- + +int poly_mult_dil(hls::stream &input, + hls::stream &output) +{ #pragma HLS INTERFACE axis register port=input #pragma HLS INTERFACE axis register port=output #pragma HLS INTERFACE s_axilite port=return bundle=CTRL_BUS #pragma HLS dataflow - // Internal streams + // Internal streams (same as before) hls::stream in_internal("in_internal"); - hls::stream input1("input1"), input2("input2"); - hls::stream mid1("mid1"), mid2("mid2"); - hls::stream mid3("mid3"), out_internal("out_internal"); + hls::stream input1("input1"), input2("input2"); + hls::stream out_internal("out_internal"); - coeff_t fe1[128], fe2[96], fe3[80], fe4[72], fe5[68], fe6[66], fe7[65]; - coeff_t fo1[128], fo2[96], fo3[80], fo4[72], fo5[68], fo6[66], fo7[65]; - #pragma HLS STREAM variable=mid1 depth=1 - #pragma HLS STREAM variable=mid2 depth=1 - #pragma HLS STREAM variable=mid3 depth=1 - #pragma HLS STREAM variable=out_internal depth=1 - - // Dataflow pipeline + // Existing helpers: keep as they are in your file axis_to_internal_input(input, in_internal); stream_split(in_internal, input1, input2); - ct_ntt(input1, mid1); - ct_ntt(input2, mid2); - point_wise_mult(mid1, mid2, mid3); - gs_intt(mid3, out_internal); + + // Local polynomial buffers + coeff_t poly_a[Nt]; + coeff_t poly_b[Nt]; + coeff_t poly_c[Nt]; + + // Read Nt coefficients for each polynomial from the two internal streams + for (int i = 0; i < Nt; ++i) { + #pragma HLS pipeline II=1 + coeff_t_stream x1 = input1.read(); + coeff_t_stream x2 = input2.read(); + poly_a[i] = x1.value; + poly_b[i] = x2.value; + } + + // Core negacyclic multiplication via 512-point NTT + poly_mult_dil_core(poly_c, poly_a, poly_b); + + // Stream result back out as coeff_t_stream (value + last) + for (int i = 0; i < Nt; ++i) { + #pragma HLS pipeline II=1 + coeff_t_stream y; + y.value = poly_c[i]; + y.last = (i == Nt - 1) ? (bit)1 : (bit)0; + out_internal.write(y); + } + + // Existing helper: convert internal stream to AXI output internal_to_axis_output(out_internal, output); return 0; -} +} \ No newline at end of file diff --git a/HLS_Codes_Dilithium/test_case.h b/HLS_Codes_Dilithium/test_case.h index 3322459..faf24f1 100644 --- a/HLS_Codes_Dilithium/test_case.h +++ b/HLS_Codes_Dilithium/test_case.h @@ -1,64 +1,60 @@ #include "ntt.h" -// Test input and expected output arrays for Dilithium polynomial multiplication -coeff_t input1_vals[] = { - 1477, 218, 784, 251, 747, 1051, 1924, 133, 2953, 1295, 2989, 1519, 1701, 1874, 2806, 423, - 2883, 327, 47, 2525, 1508, 214, 2998, 217, 1852, 2624, 2286, 3039, 3076, 1213, 1808, 2554, - 1129, 1353, 2690, 2839, 1778, 2752, 1378, 601, 914, 2335, 2497, 1139, 2611, 129, 1318, 1570, - 3190, 1868, 940, 2901, 2626, 2473, 3195, 2621, 2436, 3046, 1018, 1139, 1729, 3021, 2064, 945, - 690, 1700, 1836, 1943, 2333, 2131, 1618, 1741, 2639, 2653, 301, 2013, 2744, 2406, 2995, 2463, - 2366, 1495, 442, 224, 1349, 11, 2342, 1712, 2847, 1578, 2654, 2734, 3131, 1245, 1862, 527, - 2400, 2043, 1360, 451, 573, 898, 2018, 3100, 161, 284, 1949, 362, 755, 2916, 1288, 1616, 876, - 1682, 853, 2772, 2956, 1101, 2, 214, 2589, 211, 1025, 610, 1225, 2118, 224, 1296, 2612, 2634, - 2056, 3227, 1712, 1258, 552, 1345, 786, 2124, 2915, 1226, 1233, 2654, 2786, 2636, 2234, 727, - 2444, 199, 600, 2262, 3221, 915, 63, 318, 74, 2396, 1690, 2390, 1711, 414, 10, 2298, 1082, - 1419, 3151, 1723, 2744, 3274, 2518, 2954, 1208, 2941, 2089, 3288, 1370, 783, 2517, 3190, 3069, - 2505, 2840, 1427, 1670, 3091, 655, 96, 1935, 880, 2511, 876, 2371, 341, 196, 2849, 919, 161, - 603, 2993, 2903, 1721, 139, 3326, 1876, 379, 2508, 2094, 1929, 430, 1033, 2604, 1955, 1333, - 2274, 3312, 2604, 1585, 2317, 3230, 3068, 2905, 3268, 2844, 1023, 2824, 1731, 643, 820, 462, - 2975, 314, 2218, 2011, 649, 383, 874, 2181, 866, 1192, 2914, 2290, 1820, 1572, 1030, 3076, - 1526, 2760, 12, 529, 1242, 560, 2723, 2894, 1097, 778, 1495, 371 -}; -coeff_t input2_vals[] = { - 2960, 3124, 509, 485, 2525, 385, 608, 2893, 2423, 1802, 2556, 1090, 775, 2059, 898, 864, - 2459, 1116, 551, 188, 3262, 2728, 3134, 2451, 427, 858, 1927, 830, 2688, 2388, 2818, 1418, - 3298, 24, 2491, 1448, 1153, 178, 2489, 2126, 1772, 669, 1238, 633, 1919, 2222, 2673, 1918, - 2202, 3312, 208, 976, 2267, 107, 2905, 1137, 2921, 2471, 2796, 1313, 485, 1982, 1557, 1203, - 2930, 241, 3089, 890, 2193, 179, 952, 2057, 2444, 1378, 1466, 1362, 1808, 2343, 1532, 2651, - 727, 3254, 1328, 1604, 967, 2418, 1266, 1826, 684, 2869, 3149, 1874, 1691, 1507, 339, 2473, - 102, 3153, 969, 1551, 548, 3059, 2841, 1369, 148, 2510, 2025, 1369, 1579, 2474, 1093, 527, - 1416, 981, 2320, 2305, 227, 2173, 812, 1703, 2952, 17, 1129, 2223, 1894, 959, 73, 339, 553, - 1466, 1065, 617, 1749, 1896, 1838, 1771, 3092, 297, 996, 198, 521, 567, 3256, 2783, 1044, - 2644, 744, 2986, 3178, 1522, 942, 2045, 236, 1866, 853, 2303, 2383, 3095, 418, 2752, 2105, - 2896, 3081, 3067, 1696, 978, 102, 1961, 3120, 2741, 1029, 885, 2852, 2659, 2815, 3032, 2358, - 3252, 1195, 3304, 878, 70, 3069, 2726, 2455, 182, 108, 2868, 1744, 1697, 1060, 1803, 1752, - 829, 2434, 862, 2287, 2860, 352, 634, 2626, 1920, 2425, 239, 831, 2527, 1190, 1469, 2602, - 1711, 2185, 1403, 3189, 1188, 2649, 2079, 2215, 790, 409, 2413, 627, 2268, 2507, 2102, 1727, - 1146, 2711, 355, 1143, 1225, 430, 82, 3015, 2699, 642, 863, 241, 450, 440, 338, 365, 2621, - 3022, 204, 149, 2986, 2191, 1793, 3085, 2128, 373, 290, 835, 580, 2530, 1948 -}; -coeff_t output_vals[] = { - 4610776, 120935, 1254126, 1209043, 8250679, 5330432, 735926, 4979294, 3072462, 4438343, 5959108, 4150199, - 4125374, 799268, 2926975, 3345416, 6514953, 832221, 7949483, 5277257, 2590090, 7395643, 8089082, 314198, - 7811635, 2435420, 7246266, 2153173, 3788177, 4021035, 1833670, 2642681, 3518018, 5099879, 5041326, 2680133, - 2294752, 5040790, 6356070, 6817707, 3358789, 4806383, 3327317, 312329, 2347630, 3825407, 1256042, 3557082, - 5430887, 645661, 4038266, 5101636, 694984, 5345508, 2538149, 7704469, 608436, 4903777, 1808767, 7001927, - 428238, 7935468, 5373889, 3436349, 4187378, 106705, 4142516, 3600459, 1797819, 5861129, 6751083, 5646281, - 6572021, 2630356, 335813, 6149778, 2975343, 768557, 4186842, 811761, 4856709, 2679131, 6203378, 3703724, - 5132796, 3547916, 7657381, 1684219, 7387744, 824164, 6723600, 6802520, 776578, 7448197, 2239319, 6534227, - 1227829, 5033869, 3064815, 286414, 1995671, 6388718, 2560006, 7552570, 6339575, 805335, 3113443, 2864772, - 800943, 4400049, 7451911, 7392281, 6887610, 3918170, 5386836, 5448798, 3923739, 5342517, 7219282, 4135030, - 4848075, 3867634, 3104028, 1564387, 4163402, 3540475, 7001216, 3682604, 3743336, 7384009, 4377282, 2190376, - 4942710, 7034094, 5629808, 5848759, 2332012, 4648421, 497040, 2470792, 6962362, 3819457, 4432599, 7653714, - 261471, 1350408, 6731269, 5239397, 1550210, 486852, 2250358, 4924861, 6976950, 300934, 806395, 3144929, - 3055825, 4790760, 5436599, 5357177, 6797049, 3711049, 1553304, 639926, 3251638, 2748721, 7813863, 1852468, - 3250212, 2591962, 3399977, 3748588, 6728836, 6594725, 1978511, 2616210, 2563868, 1564352, 807960, 2796290, - 6274942, 7041857, 7435708, 3382319, 3146709, 5105958, 262779, 6159078, 6653706, 761471, 6987350, 356740, - 5740292, 4488534, 4422468, 1044504, 807782, 4184739, 661801, 2796031, 806827, 2038358, 1126151, 1109702, - 776038, 5143598, 2021761, 4167720, 2332673, 2619321, 697951, 2289483, 7123747, 1746806, 2804069, 6903563, - 636065, 1071798, 4014993, 7412447, 2790677, 5199202, 3494314, 2470575, 6040629, 6097253, 4627782, 4484757, - 1870653, 5869777, 814072, 4417289, 6604748, 5623468, 6238573, 1810479, 2943024, 2030271, 5815022, 3857202, - 1218369, 691903, 5686355, 419696, 1377088, 6367464, 3803857, 3828148, 2852124, 7792963, 7612924, 5532742, - 5453559, 1973636, 5708166, 774499, 3566136, 4292240, 3878276, 6917142, 6084901, 4680272, 2564962, 5851662, - 2089634, 5040595, 1341598, 5159646, 3461480, 733187, 797953, 4158070, 6311107, 6386220, 5275160, 4743402 -}; +coeff_t input1_vals[] = {1477, 218, 784, 251, 747, 1051, 1924, 133, 2953, 1295, 2989, 1519, 1701, 1874, 2806, 423, 2883, 327, 47, 2525, 1508, 214, 2998, 217, 1852, 2624, 2286, 3039, 3076, 1213, 1808, 2554, 1129, 1353, 2690, 2839, 1778, 2752, 1378, 601, 914, 2335, 2497, 1139, 2611, 129, 1318, 1570, 3190, 1868, 940, 2901, 2626, 2473, 3195, 2621, 2436, 3046, 1018, 1139, 1729, 3021, 2064, 945, 690, 1700, 1836, 1943, 2333, 2131, 1618, 1741, 2639, 2653, 301, 2013, 2744, 2406, 2995, 2463, 2366, 1495, 442, 224, 1349, 11, 2342, 1712, 2847, 1578, 2654, 2734, 3131, 1245, 1862, 527, 2400, 2043, 1360, 451, 573, 898, 2018, 3100, 161, 284, 1949, 362, 755, 2916, 1288, 1616, 876, 1682, 853, 2772, 2956, 1101, 2, 214, 2589, 211, 1025, 610, 1225, 2118, 224, 1296, 2612, 2634, 2056, 3227, 1712, 1258, 552, 1345, 786, 2124, 2915, 1226, 1233, 2654, 2786, 2636, 2234, 727, 2444, 199, 600, 2262, 3221, 915, 63, 318, 74, 2396, 1690, 2390, 1711, 414, 10, 2298, 1082, 1419, 3151, 1723, 2744, 3274, 2518, 2954, 1208, 2941, 2089, 3288, 1370, 783, 2517, 3190, 3069, 2505, 2840, 1427, 1670, 3091, 655, 96, 1935, 880, 2511, 876, 2371, 341, 196, 2849, 919, 161, 603, 2993, 2903, 1721, 139, 3326, 1876, 379, 2508, 2094, 1929, 430, 1033, 2604, 1955, 1333, 2274, 3312, 2604, 1585, 2317, 3230, 3068, 2905, 3268, 2844, 1023, 2824, 1731, 643, 820, 462, 2975, 314, 2218, 2011, 649, 383, 874, 2181, 866, 1192, 2914, 2290, 1820, 1572, 1030, 3076, 1526, 2760, 12, 529, 1242, 560, 2723, 2894, 1097, 778, 1495, 371}; +coeff_t input2_vals[] = {2960, 3124, 509, 485, 2525, 385, 608, 2893, 2423, 1802, 2556, 1090, 775, 2059, 898, 864, 2459, 1116, 551, 188, 3262, 2728, 3134, 2451, 427, 858, 1927, 830, 2688, 2388, 2818, 1418, 3298, 24, 2491, 1448, 1153, 178, 2489, 2126, 1772, 669, 1238, 633, 1919, 2222, 2673, 1918, 2202, 3312, 208, 976, 2267, 107, 2905, 1137, 2921, 2471, 2796, 1313, 485, 1982, 1557, 1203, 2930, 241, 3089, 890, 2193, 179, 952, 2057, 2444, 1378, 1466, 1362, 1808, 2343, 1532, 2651, 727, 3254, 1328, 1604, 967, 2418, 1266, 1826, 684, 2869, 3149, 1874, 1691, 1507, 339, 2473, 102, 3153, 969, 1551, 548, 3059, 2841, 1369, 148, 2510, 2025, 1369, 1579, 2474, 1093, 527, 1416, 981, 2320, 2305, 227, 2173, 812, 1703, 2952, 17, 1129, 2223, 1894, 959, 73, 339, 553, 1466, 1065, 617, 1749, 1896, 1838, 1771, 3092, 297, 996, 198, 521, 567, 3256, 2783, 1044, 2644, 744, 2986, 3178, 1522, 942, 2045, 236, 1866, 853, 2303, 2383, 3095, 418, 2752, 2105, 2896, 3081, 3067, 1696, 978, 102, 1961, 3120, 2741, 1029, 885, 2852, 2659, 2815, 3032, 2358, 3252, 1195, 3304, 878, 70, 3069, 2726, 2455, 182, 108, 2868, 1744, 1697, 1060, 1803, 1752, 829, 2434, 862, 2287, 2860, 352, 634, 2626, 1920, 2425, 239, 831, 2527, 1190, 1469, 2602, 1711, 2185, 1403, 3189, 1188, 2649, 2079, 2215, 790, 409, 2413, 627, 2268, 2507, 2102, 1727, 1146, 2711, 355, 1143, 1225, 430, 82, 3015, 2699, 642, 863, 241, 450, 440, 338, 365, 2621, 3022, 204, 149, 2986, 2191, 1793, 3085, 2128, 373, 290, 835, 580, 2530, 1948}; + +coeff_t output_vals[] = {2762, 3061, 1101, 3267, 2744, 1349, 182, 1761, 3089, 751, 137, 368, 1461, 2956, 493, 1653, 2617, 721, 356, 3034, 2234, 1556, 809, 2290, 1597, 457, 811, 259, 685, 2478, 319, 2519, 1049, 837, 644, 2571, 1029, 2997, 762, 1710, 2110, 1099, 2513, 1038, 2176, 1938, 3214, 261, 1604, 2474, 5, 1211, 2816, 2848, 2286, 3146, 1777, 1630, 2412, 1457, 889, 671, 822, 2369, 1409, 2059, 1121, 1871, 303, 1178, 2241, 1827, 2046, 628, 2869, 749, 1666, 895, 580, 1770, 2082, 3123, 1192, 520, 168, 2461, 1032, 163, 1421, 2792, 2148, 1735, 220, 1896, 2887, 2163, 357, 2301, 1830, 163, 1812, 805, 1850, 2017, 2313, 1205, 2226, 703, 866, 1708, 1426, 1920, 2911, 267, 3134, 629, 2120, 2022, 2847, 2945, 2967, 1977, 1449, 2028, 1381, 2738, 1098, 2977, 2217, 2060, 710, 845, 2807, 509, 2512, 2444, 2355, 550, 2965, 2517, 1802, 1755, 1065, 1938, 388, 2365, 776, 2453, 1799, 1532, 384, 2266, 1071, 2063, 2858, 1414, 663, 2886, 2734, 209, 1061, 2142, 841, 1081, 977, 799, 2661, 588, 3222, 2140, 2383, 3044, 394, 231, 1090, 917, 1840, 3002, 2315, 1182, 2744, 2815, 2612, 2586, 970, 3301, 3028, 2890, 1849, 269, 2936, 1525, 3102, 3144, 1605, 2746, 1556, 537, 2918, 2549, 976, 250, 2137, 492, 729, 392, 1115, 2422, 2100, 2317, 1636, 1743, 1279, 1393, 2079, 2874, 2148, 233, 1469, 3143, 2109, 1211, 2318, 1138, 2979, 1383, 125, 1995, 1614, 1435, 2216, 782, 671, 662, 988, 2826, 2162, 605, 2955, 2478, 2375, 1449, 2307, 1921, 1285, 2208, 2422, 1035, 2765, 923, 2138, 3053, 812, 146, 1175, 61}; + +// ------------------------------------------------------------------------- +// Golden model for Dilithium-style negacyclic polynomial multiplication +// c(x) = a(x) * b(x) mod (x^Nt + 1, q) +// ------------------------------------------------------------------------- + +static inline coeff_t golden_mod_q(long long x) +{ + // If you want to stay in sync with the core's modulus, + // you can also replace the next line with: long long q_long = (long long)q; + const long long q_long = 8380417LL; // Dilithium q; change if needed + + long long r = x % q_long; + if (r < 0) + r += q_long; + + return (coeff_t)r; +} + +static inline void golden_poly_mult_dil(coeff_t c[Nt], + const coeff_t a[Nt], + const coeff_t b[Nt]) +{ + long long acc[Nt]; + + // Zero accumulator + for (int i = 0; i < Nt; i++) + acc[i] = 0; + + // Negacyclic convolution: mod (x^Nt + 1) + for (int i = 0; i < Nt; i++) + { + for (int j = 0; j < Nt; j++) + { + long long prod = (long long)a[i] * (long long)b[j]; + int idx = i + j; + + if (idx < Nt) + { + acc[idx] += prod; // "low" part + } + else + { + acc[idx - Nt] -= prod; // folded back with a minus + } + } + } + + // Final reduction mod q + for (int i = 0; i < Nt; i++) + { + c[i] = golden_mod_q(acc[i]); + } +}