// Mirror of https://github.com/saymrwulf/pqc-accelerate.git
// Synced 2026-05-14 20:48:07 +00:00
// Original listing: 1338 lines, 31 KiB, C++, no trailing newline.
// polymult.cpp (Set A, with all inner pragmas kept, and .user/.id/.dest removed)
|
|
|
|
#include "ntt.h"
|
|
|
|
// Dilithium modulus and 1/N mod q for N = 256
|
|
// Modulus read by mod(), modadd() and modsub(); 8380417 = 2^23 - 2^13 + 1.
coeff_t q = 8380417;
|
|
// Inverse of 256 modulo q; intt_stage7 multiplies both butterfly outputs by it.
coeff_t inv_n = 8347681; // 256 * 8347681 ≡ 1 (mod 8380417)
|
|
|
|
|
|
//double_coeff_t v = 20159;
|
|
|
|
/*coeff_t mod(double_coeff_t A)
|
|
{
|
|
#pragma HLS inline OFF
|
|
//double_coeff_t v = (double_coeff_t) ((1<<26) + 1664)/q;
|
|
double_coeff_t t = (v * A + (1 << 25)) >> 26;
|
|
t = t * q;
|
|
coeff_t val;
|
|
if (A < t)
|
|
val = A - t + q;
|
|
else
|
|
val = A - t;
|
|
return val;
|
|
}*/
|
|
|
|
|
|
/* Kyber
|
|
ap_uint<13> m = 5039;
|
|
|
|
coeff_t mod(double_coeff_t A)
|
|
{
|
|
#pragma HLS pipeline II = 1
|
|
coeff_t val;
|
|
ap_uint<36> t123 = m * A;
|
|
ap_uint<12> t = (t123 >> 24);
|
|
ap_uint<24> ta = t * q;
|
|
ap_uint<24> c = A - ta;
|
|
if (c > q)
|
|
val = (coeff_t) (c - q);
|
|
else
|
|
val = (coeff_t) c;
|
|
return val;
|
|
}
|
|
|
|
coeff_t modadd(coeff_t x, coeff_t y)
|
|
{
|
|
#pragma HLS inline
|
|
coeff_t w = x + y;
|
|
return (coeff_t)(w - (w < q ? (coeff_t)0 : q));
|
|
}
|
|
|
|
coeff_t modsub(coeff_t x, coeff_t y)
|
|
{
|
|
#pragma HLS inline
|
|
coeff_t s = x + (x > y ? (coeff_t)0 : q);
|
|
return (coeff_t)(s - y);
|
|
}
|
|
*/
|
|
|
|
//-----------------------------------------
|
|
// Start "For Dilithium"
|
|
|
|
// mod, modadd, modsub for Dilithium
|
|
// Generic modular reduction for coeff_t / double_coeff_t
|
|
// Works for any |A| < 2^63 and any |q| < 2^31.
|
|
static inline coeff_t mod(double_coeff_t A)
|
|
{
|
|
#pragma HLS inline
|
|
|
|
long long x = (long long)A;
|
|
long long q_long = (long long)q;
|
|
|
|
long long r = x % q_long;
|
|
if (r < 0)
|
|
r += q_long;
|
|
|
|
return (coeff_t)r;
|
|
}
|
|
|
|
// Modular addition: returns (a + b) mod q
|
|
static inline coeff_t modadd(coeff_t a, coeff_t b)
|
|
{
|
|
#pragma HLS inline
|
|
|
|
long long q_long = (long long)q;
|
|
long long s = (long long)a + (long long)b;
|
|
|
|
if (s >= q_long)
|
|
s -= q_long;
|
|
|
|
return (coeff_t)s;
|
|
}
|
|
|
|
// Modular subtraction: returns (a - b) mod q
|
|
static inline coeff_t modsub(coeff_t a, coeff_t b)
|
|
{
|
|
#pragma HLS inline
|
|
|
|
long long q_long = (long long)q;
|
|
long long d = (long long)a - (long long)b;
|
|
|
|
if (d < 0)
|
|
d += q_long;
|
|
|
|
return (coeff_t)d;
|
|
}
|
|
|
|
|
|
// -----------------------------------------------------------------------------
|
|
// 512-point NTT core (for Dilithium negacyclic convolution)
|
|
// -----------------------------------------------------------------------------
|
|
|
|
// Multiplication modulo q, using existing mod() on double_coeff_t
|
|
static inline coeff_t mul_mod(coeff_t a, coeff_t b)
|
|
{
|
|
double_coeff_t prod = (double_coeff_t)a * (double_coeff_t)b;
|
|
return mod(prod);
|
|
}
|
|
|
|
// Precomputed stage twiddles for length-512 NTT over q = 8380417
|
|
// Stage index s corresponds to len = 2^(s+1): 2,4,8,...,512
|
|
static const coeff_t NTT_WLEN[9] = {
|
|
(coeff_t)8380416, // len = 2
|
|
(coeff_t)4808194, // len = 4
|
|
(coeff_t)4614810, // len = 8
|
|
(coeff_t)2883726, // len = 16
|
|
(coeff_t)6250525, // len = 32
|
|
(coeff_t)7044481, // len = 64
|
|
(coeff_t)3241972, // len = 128
|
|
(coeff_t)6644104, // len = 256
|
|
(coeff_t)1921994 // len = 512
|
|
};
|
|
|
|
// Stage twiddles selected by ntt_512() when invert is true; indexed like
// NTT_WLEN (entry s belongs to span len = 2^(s+1)).
static const coeff_t NTT_WLEN_INV[9] = {
    (coeff_t)8380416, (coeff_t)3572223, (coeff_t)3761513,   // len =   2,   4,   8
    (coeff_t)5234739, (coeff_t)3764867, (coeff_t)3227876,   // len =  16,  32,  64
    (coeff_t)6621070, (coeff_t)6125690, (coeff_t)527981     // len = 128, 256, 512
};
|
|
|
|
// 512^{-1} mod 8380417
|
|
// Multiplied into every element by ntt_512() when invert is true
// (512 * 8364049 = 511 * 8380417 + 1).
static const coeff_t INV_NTT512 = (coeff_t)8364049;
|
|
|
|
// In-place iterative radix-2 NTT of size 512
|
|
static void ntt_512(coeff_t a[512], bool invert)
|
|
{
|
|
// Bit-reversal permutation
|
|
unsigned int j = 0;
|
|
for (unsigned int i = 1; i < 512; ++i) {
|
|
unsigned int bit = 512 >> 1;
|
|
while (j & bit) {
|
|
j ^= bit;
|
|
bit >>= 1;
|
|
}
|
|
j ^= bit;
|
|
if (i < j) {
|
|
coeff_t tmp = a[i];
|
|
a[i] = a[j];
|
|
a[j] = tmp;
|
|
}
|
|
}
|
|
|
|
int len = 2;
|
|
int stage = 0;
|
|
while (len <= 512) {
|
|
coeff_t wlen = invert ? NTT_WLEN_INV[stage] : NTT_WLEN[stage];
|
|
int half = len >> 1;
|
|
|
|
for (int i = 0; i < 512; i += len) {
|
|
coeff_t w = (coeff_t)1;
|
|
for (int j = 0; j < half; ++j) {
|
|
coeff_t u = a[i + j];
|
|
coeff_t v = mul_mod(a[i + j + half], w);
|
|
a[i + j] = modadd(u, v);
|
|
a[i + j + half] = modsub(u, v);
|
|
w = mul_mod(w, wlen);
|
|
}
|
|
}
|
|
|
|
len <<= 1;
|
|
stage++;
|
|
}
|
|
|
|
if (invert) {
|
|
// Multiply by 512^{-1} mod q to finish inverse NTT
|
|
for (int i = 0; i < 512; ++i) {
|
|
a[i] = mul_mod(a[i], INV_NTT512);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Negacyclic convolution via 512-point NTT:
|
|
// c(x) = a(x) * b(x) mod (x^Nt + 1, q), Nt = 256
|
|
static void poly_mult_dil_core(coeff_t c[Nt],
|
|
const coeff_t a[Nt],
|
|
const coeff_t b[Nt])
|
|
{
|
|
const int N2 = 2 * Nt; // 512 for Nt = 256
|
|
coeff_t A[N2];
|
|
coeff_t B[N2];
|
|
coeff_t C[N2];
|
|
|
|
// Copy inputs and zero-pad
|
|
for (int i = 0; i < Nt; ++i) {
|
|
#pragma HLS pipeline II=1
|
|
A[i] = a[i];
|
|
B[i] = b[i];
|
|
}
|
|
for (int i = Nt; i < N2; ++i) {
|
|
#pragma HLS pipeline II=1
|
|
A[i] = 0;
|
|
B[i] = 0;
|
|
}
|
|
|
|
// Forward 512-point NTT of both sequences
|
|
ntt_512(A, false);
|
|
ntt_512(B, false);
|
|
|
|
// Pointwise multiplication in NTT domain
|
|
for (int i = 0; i < N2; ++i) {
|
|
#pragma HLS pipeline II=1
|
|
C[i] = mul_mod(A[i], B[i]);
|
|
}
|
|
|
|
// Inverse 512-point NTT -> length-512 cyclic convolution
|
|
ntt_512(C, true);
|
|
|
|
// Negacyclic fold: c[k] = C[k] - C[k + Nt] mod q
|
|
for (int k = 0; k < Nt; ++k) {
|
|
#pragma HLS pipeline II=1
|
|
c[k] = modsub(C[k], C[k + Nt]);
|
|
}
|
|
}
|
|
|
|
|
|
// END "For Dilithium"
|
|
//----------------------------------------------------------
|
|
|
|
|
|
|
|
// Butterfly with the twiddle applied after the subtraction:
//   x = (a + b) mod q
//   y = w * (a - b) mod q
void butterfly_unit_dif(coeff_t w, coeff_t a, coeff_t b, coeff_t &x, coeff_t &y)
{
#pragma HLS pipeline II = 1
    x = modadd(a, b);
    coeff_t diff = modsub(a, b);
    y = mod(w * diff);
}
|
|
|
|
// Butterfly with the twiddle applied to b before the add/subtract:
//   x = (a + w*b) mod q
//   y = (a - w*b) mod q
void butterfly_unit_dit(coeff_t w, coeff_t a, coeff_t b, coeff_t &x, coeff_t &y)
{
#pragma HLS pipeline II = 1
    coeff_t scaled_b = mod(w * b);
    x = modadd(a, scaled_b);
    y = modsub(a, scaled_b);
}
|
|
|
|
// Calls ap_wait_n(1) when __SYNTHESIS__ is defined; otherwise the function
// body is empty.
void delay_cycle()
{
#if defined(__SYNTHESIS__)
    ap_wait_n(1);
#endif
}
|
|
|
|
// Forward streaming stage 1: one group of 64 DIT butterflies, twiddle 1729.
//   a    : input stream, 128 values consumed
//   b    : output stream, 128 values produced
//   fifo : scratch buffer, indices 0..127 are used
// Sequence: buffer the first 64 inputs in fifo[64..127]; pair each buffered
// value with one of the next 64 inputs, sending the butterfly sum to `b`
// immediately and parking the difference in fifo[0..63]; finally flush the
// parked differences to `b`.
void ntt_stage1 (hls::stream<coeff_t> &a, hls::stream<coeff_t> &b, coeff_t fifo[])
{
#pragma HLS dataflow
    coeff_t twiddle_coeff = 1729;

#pragma HLS DEPENDENCE variable = fifo inter RAW false

    // Preload the delay line.
    for (int i = 0; i < 64; i++)
    {
#pragma HLS pipeline
        fifo[i + 64] = a.read();
    }

    for (int j = 0; j < 1; j++)
    {
#pragma HLS DEPENDENCE variable = fifo inter RAW false
        // Butterflies: delayed sample vs. fresh sample.
        for (int k = 0; k < 64; k++)
        {
#pragma HLS pipeline II = 1
#pragma HLS DEPENDENCE variable = fifo inter RAW false
            coeff_t sum, diff;
            coeff_t delayed = fifo[k + 64];
            coeff_t fresh = a.read();
            butterfly_unit_dit(twiddle_coeff, delayed, fresh, sum, diff);
            b.write(sum);
            fifo[k] = diff;
            delay_cycle();
        }

        // Flush the parked differences.
        for (int i = 0; i < 64; i++)
        {
#pragma HLS pipeline II = 1
#pragma HLS DEPENDENCE variable = fifo inter RAW false
            b.write(fifo[i]);
            delay_cycle();
        }
    }
}
|
|
|
|
// Forward streaming stage 2: 2 groups of 32 DIT butterflies, one twiddle per
// group (group j uses twiddle_coeffs[j]).
//   a    : input stream, 128 values consumed
//   b    : output stream, 128 values produced
//   fifo : scratch buffer, indices 0..31 and 64..95 are used
// Per group: butterfly fifo[64+k] against a fresh input, write the sum to `b`,
// park the difference in fifo[k]; then flush the parked values to `b` while
// (except after the last group) refilling fifo[64..] with the next 32 inputs.
void ntt_stage2 (hls::stream<coeff_t> &a, hls::stream<coeff_t> &b, coeff_t fifo[])
{
#pragma HLS dataflow
    coeff_t twiddle_coeffs[2] = {2580, 3289};

#pragma HLS DEPENDENCE variable = fifo inter RAW false

    // Preload the delay line for the first group.
    for (int i = 0; i < 32; i++)
    {
#pragma HLS pipeline
        fifo[i + 64] = a.read();
    }

    for (int j = 0; j < 2; j++)
    {
#pragma HLS DEPENDENCE variable = fifo inter RAW false
        for (int k = 0; k < 32; k++)
        {
#pragma HLS pipeline II = 1
#pragma HLS DEPENDENCE variable = fifo inter RAW false
            coeff_t sum, diff;
            coeff_t delayed = fifo[k + 64];
            coeff_t fresh = a.read();
            coeff_t tf = twiddle_coeffs[j];
            butterfly_unit_dit(tf, delayed, fresh, sum, diff);
            b.write(sum);
            fifo[k] = diff;
            delay_cycle();
        }

        for (int i = 0; i < 32; i++)
        {
#pragma HLS pipeline II = 1
#pragma HLS DEPENDENCE variable = fifo inter RAW false
            b.write(fifo[i]);
            delay_cycle();
            if (j < 1)
            {
                // Refill the delay line for the next group.
                fifo[i + 64] = a.read();
            }
        }
    }
}
|
|
|
|
// Forward streaming stage 3: 4 groups of 16 DIT butterflies, one twiddle per
// group (group j uses twiddle_coeffs[j]).
//   a    : input stream, 128 values consumed
//   b    : output stream, 128 values produced
//   fifo : scratch buffer, indices 0..15 and 64..79 are used
// Per group: butterfly fifo[64+k] against a fresh input, write the sum to `b`,
// park the difference in fifo[k]; then flush the parked values to `b` while
// (except after the last group) refilling fifo[64..] with the next 16 inputs.
void ntt_stage3 (hls::stream<coeff_t> &a, hls::stream<coeff_t> &b, coeff_t fifo[])
{
#pragma HLS dataflow
    coeff_t twiddle_coeffs[4] = {2642, 630, 1897, 848};

#pragma HLS DEPENDENCE variable = fifo inter RAW false

    // Preload the delay line for the first group.
    for (int i = 0; i < 16; i++)
    {
#pragma HLS pipeline
        fifo[i + 64] = a.read();
    }

    for (int j = 0; j < 4; j++)
    {
#pragma HLS DEPENDENCE variable = fifo inter RAW false
        for (int k = 0; k < 16; k++)
        {
#pragma HLS pipeline II = 1
#pragma HLS DEPENDENCE variable = fifo inter RAW false
            coeff_t sum, diff;
            coeff_t delayed = fifo[k + 64];
            coeff_t fresh = a.read();
            coeff_t tf = twiddle_coeffs[j];
            butterfly_unit_dit(tf, delayed, fresh, sum, diff);
            b.write(sum);
            fifo[k] = diff;
            delay_cycle();
        }

        for (int i = 0; i < 16; i++)
        {
#pragma HLS pipeline II = 1
#pragma HLS DEPENDENCE variable = fifo inter RAW false
            b.write(fifo[i]);
            delay_cycle();
            if (j < 3)
            {
                // Refill the delay line for the next group.
                fifo[i + 64] = a.read();
            }
        }
    }
}
|
|
|
|
// Forward streaming stage 4: 8 groups of 8 DIT butterflies, one twiddle per
// group (group j uses twiddle_coeffs[j]).
//   a    : input stream, 128 values consumed
//   b    : output stream, 128 values produced
//   fifo : scratch buffer, indices 0..7 and 64..71 are used
// Per group: butterfly fifo[64+k] against a fresh input, write the sum to `b`,
// park the difference in fifo[k]; then flush the parked values to `b` while
// (except after the last group) refilling fifo[64..] with the next 8 inputs.
void ntt_stage4 (hls::stream<coeff_t> &a, hls::stream<coeff_t> &b, coeff_t fifo[])
{
#pragma HLS dataflow
    coeff_t twiddle_coeffs[8] = {1062, 1919, 193, 797, 2786, 3260, 569, 1746};

#pragma HLS DEPENDENCE variable = fifo inter RAW false

    // Preload the delay line for the first group.
    for (int i = 0; i < 8; i++)
    {
#pragma HLS pipeline
        fifo[i + 64] = a.read();
    }

    for (int j = 0; j < 8; j++)
    {
#pragma HLS DEPENDENCE variable = fifo inter RAW false
        for (int k = 0; k < 8; k++)
        {
#pragma HLS pipeline II = 1
#pragma HLS DEPENDENCE variable = fifo inter RAW false
            coeff_t sum, diff;
            coeff_t delayed = fifo[k + 64];
            coeff_t fresh = a.read();
            coeff_t tf = twiddle_coeffs[j];
            butterfly_unit_dit(tf, delayed, fresh, sum, diff);
            b.write(sum);
            fifo[k] = diff;
            delay_cycle();
        }

        for (int i = 0; i < 8; i++)
        {
#pragma HLS pipeline II = 1
#pragma HLS DEPENDENCE variable = fifo inter RAW false
            b.write(fifo[i]);
            delay_cycle();
            if (j < 7)
            {
                // Refill the delay line for the next group.
                fifo[i + 64] = a.read();
            }
        }
    }
}
|
|
|
|
// Forward streaming stage 5: 16 groups of 4 DIT butterflies, one twiddle per
// group (group j uses twiddle_coeffs[j]).
//   a    : input stream, 128 values consumed
//   b    : output stream, 128 values produced
//   fifo : scratch buffer, indices 0..3 and 64..67 are used
// Per group: butterfly fifo[64+k] against a fresh input, write the sum to `b`,
// park the difference in fifo[k]; then flush the parked values to `b` while
// (except after the last group) refilling fifo[64..] with the next 4 inputs.
void ntt_stage5 (hls::stream<coeff_t> &a, hls::stream<coeff_t> &b, coeff_t fifo[])
{
#pragma HLS dataflow
    coeff_t twiddle_coeffs[16] = {296, 2447, 1339, 1476, 3046, 56, 2240, 1333,
                                  1426, 2094, 535, 2882, 2393, 2879, 1974, 821};

#pragma HLS DEPENDENCE variable = fifo inter RAW false

    // Preload the delay line for the first group.
    for (int i = 0; i < 4; i++)
    {
#pragma HLS pipeline
        fifo[i + 64] = a.read();
    }

    for (int j = 0; j < 16; j++)
    {
#pragma HLS DEPENDENCE variable = fifo inter RAW false
        for (int k = 0; k < 4; k++)
        {
#pragma HLS pipeline II = 1
#pragma HLS DEPENDENCE variable = fifo inter RAW false
            coeff_t sum, diff;
            coeff_t delayed = fifo[k + 64];
            coeff_t fresh = a.read();
            coeff_t tf = twiddle_coeffs[j];
            butterfly_unit_dit(tf, delayed, fresh, sum, diff);
            b.write(sum);
            fifo[k] = diff;
            delay_cycle();
        }

        for (int i = 0; i < 4; i++)
        {
#pragma HLS pipeline II = 1
#pragma HLS DEPENDENCE variable = fifo inter RAW false
            b.write(fifo[i]);
            delay_cycle();
            if (j < 15)
            {
                // Refill the delay line for the next group.
                fifo[i + 64] = a.read();
            }
        }
    }
}
|
|
|
|
// Forward streaming stage 6: 32 groups of 2 DIT butterflies, one twiddle per
// group (group j uses twiddle_coeffs[j]).
//   a    : input stream, 128 values consumed
//   b    : output stream, 128 values produced
//   fifo : scratch buffer, indices 0..1 and 64..65 are used
// Per group: butterfly fifo[64+k] against a fresh input, write the sum to `b`,
// park the difference in fifo[k]; then flush the parked values to `b` while
// (except after the last group) refilling fifo[64..] with the next 2 inputs.
void ntt_stage6 (hls::stream<coeff_t> &a, hls::stream<coeff_t> &b, coeff_t fifo[])
{
#pragma HLS dataflow
    coeff_t twiddle_coeffs[32] = {289, 331, 3253, 1756, 1197, 2304, 2277, 2055,
                                  650, 1977, 2513, 632, 2865, 33, 1320, 1915,
                                  2319, 1435, 807, 452, 1438, 2868, 1534, 2402,
                                  2647, 2617, 1481, 648, 2474, 3110, 1227, 910};

#pragma HLS DEPENDENCE variable = fifo inter RAW false

    // Preload the delay line for the first group.
    for (int i = 0; i < 2; i++)
    {
#pragma HLS pipeline
        fifo[i + 64] = a.read();
    }

    for (int j = 0; j < 32; j++)
    {
#pragma HLS DEPENDENCE variable = fifo inter RAW false
        for (int k = 0; k < 2; k++)
        {
#pragma HLS pipeline II = 1
#pragma HLS DEPENDENCE variable = fifo inter RAW false
            coeff_t sum, diff;
            coeff_t delayed = fifo[k + 64];
            coeff_t fresh = a.read();
            coeff_t tf = twiddle_coeffs[j];
            butterfly_unit_dit(tf, delayed, fresh, sum, diff);
            b.write(sum);
            fifo[k] = diff;
            delay_cycle();
        }

        for (int i = 0; i < 2; i++)
        {
#pragma HLS pipeline II = 1
#pragma HLS DEPENDENCE variable = fifo inter RAW false
            b.write(fifo[i]);
            delay_cycle();
            if (j < 31)
            {
                // Refill the delay line for the next group.
                fifo[i + 64] = a.read();
            }
        }
    }
}
|
|
|
|
// Forward streaming stage 7: 64 DIT butterflies on adjacent input pairs, each
// with its own twiddle (pair j uses twiddle_coeffs[j]).
//   a    : input stream, 128 values consumed
//   b    : output stream, 128 values produced (sum, then difference, per pair)
//   fifo : not accessed by this stage; kept for a uniform stage signature
// The first operand of the next pair is read at the end of each iteration
// (except the last), so the stream access order is unchanged from the
// "read u before the loop" structure.
void ntt_stage7 (hls::stream<coeff_t> &a, hls::stream<coeff_t> &b, coeff_t fifo[])
{
#pragma HLS inline off

#pragma HLS DEPENDENCE variable = fifo inter RAW false
    coeff_t twiddle_coeffs[64] = {17, 2761, 583, 2649, 1637, 723, 2288, 1100,
                                  1409, 2662, 3281, 233, 756, 2156, 3015, 3050,
                                  1703, 1651, 2789, 1789, 1847, 952, 1461, 2687,
                                  939, 2308, 2437, 2388, 733, 2337, 268, 641,
                                  1584, 2298, 2037, 3220, 375, 2549, 2090, 1645,
                                  1063, 319, 2773, 757, 2099, 561, 2466, 2594,
                                  2804, 1092, 403, 1026, 1143, 2150, 2775, 886,
                                  1722, 1212, 1874, 1029, 2110, 2935, 885, 2154};
    coeff_t first, second, sum, diff;

    first = a.read();

    for (int j = 0; j < 64; j++)
    {
#pragma HLS pipeline II = 1
#pragma HLS DEPENDENCE variable = fifo inter RAW false

        second = a.read();
        butterfly_unit_dit(twiddle_coeffs[j], first, second, sum, diff);
        b.write(sum);
        b.write(diff);
        if (j < 63)
            first = a.read();
    }
}
|
|
|
|
// Inverse streaming stage 1: 64 DIF butterflies on adjacent input pairs, each
// with its own twiddle (pair j uses twiddle_coeffs[j]).
//   a    : input stream, 128 values consumed
//   b    : output stream, 128 values produced (sum, then scaled difference)
//   fifo : not accessed by this stage; kept for a uniform stage signature
void intt_stage1 (hls::stream<coeff_t> &a, hls::stream<coeff_t> &b, coeff_t fifo[])
{
    coeff_t twiddle_coeffs[64] = {1175, 2444, 394, 1219, 2300, 1455, 2117, 1607,
                                  2443, 554, 1179, 2186, 2303, 2926, 2237, 525,
                                  735, 863, 2768, 1230, 2572, 556, 3010, 2266,
                                  1684, 1239, 780, 2954, 109, 1292, 1031, 1745,
                                  2688, 3061, 992, 2596, 941, 892, 1021, 2390,
                                  642, 1868, 2377, 1482, 1540, 540, 1678, 1626,
                                  279, 314, 1173, 2573, 3096, 48, 667, 1920,
                                  2229, 1041, 2606, 1692, 680, 2746, 568, 3312};

#pragma HLS inline off
#pragma HLS DEPENDENCE variable = fifo inter RAW false
    coeff_t first, second, sum, diff;

    first = a.read();

    for (int j = 0; j < 64; j++)
    {
#pragma HLS pipeline II = 1
#pragma HLS DEPENDENCE variable = fifo inter RAW false
        second = a.read();
        butterfly_unit_dif(twiddle_coeffs[j], first, second, sum, diff);
        b.write(sum);
        b.write(diff);
        // Fetch the first operand of the next pair, if there is one.
        if (j < 63)
            first = a.read();
    }
}
|
|
|
|
// Inverse streaming stage 2: 32 groups of 2 DIF butterflies, one twiddle per
// group (group j uses twiddle_coeffs[j]).
//   a    : input stream, 128 values consumed
//   b    : output stream, 128 values produced
//   fifo : scratch buffer, indices 0..1 and 64..65 are used
// Per group: butterfly fifo[64+k] against a fresh input, write the sum to `b`,
// park the scaled difference in fifo[k]; then flush the parked values to `b`
// while (except after the last group) refilling fifo[64..] with new inputs.
void intt_stage2 (hls::stream<coeff_t> &a, hls::stream<coeff_t> &b, coeff_t fifo[])
{
#pragma HLS dataflow
    coeff_t twiddle_coeffs[32] = {2419, 2102, 219, 855, 2681, 1848, 712, 682,
                                  927, 1795, 461, 1891, 2877, 2522, 1894, 1010,
                                  1414, 2009, 3296, 464, 2697, 816, 1352, 2679,
                                  1274, 1052, 1025, 2132, 1573, 76, 2998, 3040};

#pragma HLS DEPENDENCE variable = fifo inter RAW false

    // Preload the delay line for the first group.
    for (int i = 0; i < 2; i++)
    {
#pragma HLS pipeline
        fifo[i + 64] = a.read();
    }

    for (int j = 0; j < 32; j++)
    {
#pragma HLS DEPENDENCE variable = fifo inter RAW false
        for (int k = 0; k < 2; k++)
        {
#pragma HLS pipeline II = 1
#pragma HLS DEPENDENCE variable = fifo inter RAW false
            coeff_t sum, diff;
            coeff_t delayed = fifo[k + 64];
            coeff_t fresh = a.read();
            coeff_t tf = twiddle_coeffs[j];
            butterfly_unit_dif(tf, delayed, fresh, sum, diff);
            b.write(sum);
            fifo[k] = diff;
            delay_cycle();
        }

        for (int i = 0; i < 2; i++)
        {
#pragma HLS pipeline II = 1
#pragma HLS DEPENDENCE variable = fifo inter RAW false
            b.write(fifo[i]);
            delay_cycle();
            if (j < 31)
            {
                // Refill the delay line for the next group.
                fifo[i + 64] = a.read();
            }
        }
    }
}
|
|
|
|
// Inverse streaming stage 3: 16 groups of 4 DIF butterflies, one twiddle per
// group (group j uses twiddle_coeffs[j]).
//   a    : input stream, 128 values consumed
//   b    : output stream, 128 values produced
//   fifo : scratch buffer, indices 0..3 and 64..67 are used
// Per group: butterfly fifo[64+k] against a fresh input, write the sum to `b`,
// park the scaled difference in fifo[k]; then flush the parked values to `b`
// while (except after the last group) refilling fifo[64..] with new inputs.
void intt_stage3 (hls::stream<coeff_t> &a, hls::stream<coeff_t> &b, coeff_t fifo[])
{
#pragma HLS dataflow
    coeff_t twiddle_coeffs[16] = {2508, 1355, 450, 936, 447, 2794, 1235, 1903,
                                  1996, 1089, 3273, 283, 1853, 1990, 882, 3033};

#pragma HLS DEPENDENCE variable = fifo inter RAW false

    // Preload the delay line for the first group.
    for (int i = 0; i < 4; i++)
    {
#pragma HLS pipeline
        fifo[i + 64] = a.read();
    }

    for (int j = 0; j < 16; j++)
    {
#pragma HLS DEPENDENCE variable = fifo inter RAW false
        for (int k = 0; k < 4; k++)
        {
#pragma HLS pipeline II = 1
#pragma HLS DEPENDENCE variable = fifo inter RAW false
            coeff_t sum, diff;
            coeff_t delayed = fifo[k + 64];
            coeff_t fresh = a.read();
            coeff_t tf = twiddle_coeffs[j];
            butterfly_unit_dif(tf, delayed, fresh, sum, diff);
            b.write(sum);
            fifo[k] = diff;
            delay_cycle();
        }

        for (int i = 0; i < 4; i++)
        {
#pragma HLS pipeline II = 1
#pragma HLS DEPENDENCE variable = fifo inter RAW false
            b.write(fifo[i]);
            delay_cycle();
            if (j < 15)
            {
                // Refill the delay line for the next group.
                fifo[i + 64] = a.read();
            }
        }
    }
}
|
|
|
|
// Inverse streaming stage 4: 8 groups of 8 DIF butterflies, one twiddle per
// group (group j uses twiddle_coeffs[j]).
//   a    : input stream, 128 values consumed
//   b    : output stream, 128 values produced
//   fifo : scratch buffer, indices 0..7 and 64..71 are used
// Per group: butterfly fifo[64+k] against a fresh input, write the sum to `b`,
// park the scaled difference in fifo[k]; then flush the parked values to `b`
// while (except after the last group) refilling fifo[64..] with new inputs.
void intt_stage4 (hls::stream<coeff_t> &a, hls::stream<coeff_t> &b, coeff_t fifo[])
{
#pragma HLS dataflow
    coeff_t twiddle_coeffs[8] = {1583, 2760, 69, 543, 2532, 3136, 1410, 2267};

#pragma HLS DEPENDENCE variable = fifo inter RAW false

    // Preload the delay line for the first group.
    for (int i = 0; i < 8; i++)
    {
#pragma HLS pipeline
        fifo[i + 64] = a.read();
    }

    for (int j = 0; j < 8; j++)
    {
#pragma HLS DEPENDENCE variable = fifo inter RAW false
        for (int k = 0; k < 8; k++)
        {
#pragma HLS pipeline II = 1
#pragma HLS DEPENDENCE variable = fifo inter RAW false
            coeff_t sum, diff;
            coeff_t delayed = fifo[k + 64];
            coeff_t fresh = a.read();
            coeff_t tf = twiddle_coeffs[j];
            butterfly_unit_dif(tf, delayed, fresh, sum, diff);
            b.write(sum);
            fifo[k] = diff;
            delay_cycle();
        }

        for (int i = 0; i < 8; i++)
        {
#pragma HLS pipeline II = 1
#pragma HLS DEPENDENCE variable = fifo inter RAW false
            b.write(fifo[i]);
            delay_cycle();
            if (j < 7)
            {
                // Refill the delay line for the next group.
                fifo[i + 64] = a.read();
            }
        }
    }
}
|
|
|
|
// Inverse streaming stage 5: 4 groups of 16 DIF butterflies, one twiddle per
// group (group j uses twiddle_coeffs[j]).
//   a    : input stream, 128 values consumed
//   b    : output stream, 128 values produced
//   fifo : scratch buffer, indices 0..15 and 64..79 are used
// Per group: butterfly fifo[64+k] against a fresh input, write the sum to `b`,
// park the scaled difference in fifo[k]; then flush the parked values to `b`
// while (except after the last group) refilling fifo[64..] with new inputs.
void intt_stage5 (hls::stream<coeff_t> &a, hls::stream<coeff_t> &b, coeff_t fifo[])
{
#pragma HLS dataflow
    coeff_t twiddle_coeffs[4] = {2481, 1432, 2699, 687};

#pragma HLS DEPENDENCE variable = fifo inter RAW false

    // Preload the delay line for the first group.
    for (int i = 0; i < 16; i++)
    {
#pragma HLS pipeline
        fifo[i + 64] = a.read();
    }

    for (int j = 0; j < 4; j++)
    {
#pragma HLS DEPENDENCE variable = fifo inter RAW false
        for (int k = 0; k < 16; k++)
        {
#pragma HLS pipeline II = 1
#pragma HLS DEPENDENCE variable = fifo inter RAW false
            coeff_t sum, diff;
            coeff_t delayed = fifo[k + 64];
            coeff_t fresh = a.read();
            coeff_t tf = twiddle_coeffs[j];
            butterfly_unit_dif(tf, delayed, fresh, sum, diff);
            b.write(sum);
            fifo[k] = diff;
            delay_cycle();
        }

        for (int i = 0; i < 16; i++)
        {
#pragma HLS pipeline II = 1
#pragma HLS DEPENDENCE variable = fifo inter RAW false
            b.write(fifo[i]);
            delay_cycle();
            if (j < 3)
            {
                // Refill the delay line for the next group.
                fifo[i + 64] = a.read();
            }
        }
    }
}
|
|
|
|
// Inverse streaming stage 6: 2 groups of 32 DIF butterflies, one twiddle per
// group (group j uses twiddle_coeffs[j]).
//   a    : input stream, 128 values consumed
//   b    : output stream, 128 values produced
//   fifo : scratch buffer, indices 0..31 and 64..95 are used
// Per group: butterfly fifo[64+k] against a fresh input, write the sum to `b`,
// park the scaled difference in fifo[k]; then flush the parked values to `b`
// while (except after the last group) refilling fifo[64..] with new inputs.
void intt_stage6 (hls::stream<coeff_t> &a, hls::stream<coeff_t> &b, coeff_t fifo[])
{
#pragma HLS dataflow
    coeff_t twiddle_coeffs[2] = {40, 749};

#pragma HLS DEPENDENCE variable = fifo inter RAW false

    // Preload the delay line for the first group.
    for (int i = 0; i < 32; i++)
    {
#pragma HLS pipeline
        fifo[i + 64] = a.read();
    }

    for (int j = 0; j < 2; j++)
    {
#pragma HLS DEPENDENCE variable = fifo inter RAW false
        for (int k = 0; k < 32; k++)
        {
#pragma HLS pipeline II = 1
#pragma HLS DEPENDENCE variable = fifo inter RAW false
            coeff_t sum, diff;
            coeff_t delayed = fifo[k + 64];
            coeff_t fresh = a.read();
            coeff_t tf = twiddle_coeffs[j];
            butterfly_unit_dif(tf, delayed, fresh, sum, diff);
            b.write(sum);
            fifo[k] = diff;
            delay_cycle();
        }

        for (int i = 0; i < 32; i++)
        {
#pragma HLS pipeline II = 1
#pragma HLS DEPENDENCE variable = fifo inter RAW false
            b.write(fifo[i]);
            delay_cycle();
            if (j < 1)
            {
                // Refill the delay line for the next group.
                fifo[i + 64] = a.read();
            }
        }
    }
}
|
|
|
|
// Inverse streaming stage 7: one group of 64 DIF butterflies with twiddle
// 1600, followed by multiplication of both butterfly outputs by the global
// inv_n.
//   a    : input stream, 128 values consumed
//   b    : output stream, 128 values produced
//   fifo : scratch buffer, indices 0..127 are used
// Sequence: buffer the first 64 inputs in fifo[64..127]; butterfly each
// buffered value with one of the next 64 inputs, write the scaled sum to `b`
// and park the scaled difference in fifo[0..63]; finally flush the parked
// values to `b`.
void intt_stage7 (hls::stream<coeff_t> &a, hls::stream<coeff_t> &b, coeff_t fifo[])
{
#pragma HLS inline off
#pragma HLS DEPENDENCE variable = fifo inter RAW false

    // Preload the delay line.
    for (int i = 0; i < 64; i++)
    {
#pragma HLS pipeline
        fifo[i + 64] = a.read();
    }

    for (int j = 0; j < 1; j++)
    {
#pragma HLS DEPENDENCE variable = fifo inter RAW false
        for (int k = 0; k < 64; k++)
        {
#pragma HLS pipeline II = 1
#pragma HLS DEPENDENCE variable = fifo inter RAW false
            coeff_t sum, diff;
            coeff_t delayed = fifo[k + 64];
            coeff_t fresh = a.read();
            coeff_t tf = 1600;
            butterfly_unit_dif(tf, delayed, fresh, sum, diff);

            // Final scaling of both outputs.
            coeff_t scaled_sum = mod(sum * inv_n);
            coeff_t scaled_diff = mod(diff * inv_n);
            b.write(scaled_sum);
            fifo[k] = scaled_diff;
            delay_cycle();
        }

        // Flush the parked values.
        for (int i = 0; i < 64; i++)
        {
#pragma HLS pipeline II = 1
#pragma HLS DEPENDENCE variable = fifo inter RAW false
            b.write(fifo[i]);
            delay_cycle();
        }
    }
}
|
|
|
|
// De-interleave Nt input elements: the value of every even-indexed element
// goes to `se`, the value of every odd-indexed element goes to `so`.
// The `last` flag of the input elements is not forwarded.
void read_inputs (hls::stream<coeff_t_stream> &input, hls::stream<coeff_t> &se, hls::stream<coeff_t> &so)
{
    for (int idx = 0; idx < Nt; idx++)
    {
#pragma HLS pipeline II = 1
        coeff_t_stream elem = input.read();
        coeff_t coeff = elem.value;
        if (idx % 2 != 0)
            so.write(coeff);
        else
            se.write(coeff);
    }
}
|
|
|
|
// Re-interleave two coefficient streams into one element stream.
// For each of N iterations one value is taken from `se` and one from `so`
// and they are written to `output` in that order; `last` is set only on the
// very final element written.
void write_outputs (hls::stream<coeff_t> &se, hls::stream<coeff_t> &so, hls::stream<coeff_t_stream> &output)
{
    coeff_t_stream elem;
    elem.last = 0;

    for (int pair = 0; pair < N; pair++)
    {
#pragma HLS pipeline II = 1
        coeff_t even_val = se.read();
        coeff_t odd_val = so.read();

        elem.value = even_val;
        output.write(elem);

        elem.value = odd_val;
        if (pair == N-1)
            elem.last = 1;
        output.write(elem);
    }
}
|
|
|
|
// Streaming forward transform built from ntt_stage1..ntt_stage7.
// read_inputs() splits the input into even- and odd-indexed coefficient
// streams; each is pushed through its own chain of seven stages, and
// write_outputs() interleaves the two results again.
// NOTE(review): every stage twiddle in this pipeline is below 3329 and the
// active top-level comment says the Kyber NTT was replaced, so these stages
// look like they were written for the Kyber modulus while `q` in this file is
// the Dilithium modulus — confirm before using this path.
void ct_ntt (hls::stream<coeff_t_stream> &input, hls::stream<coeff_t_stream> &output)
{
#pragma HLS dataflow

    // Inter-stage streams: sKo = odd lane after stage K, sKe = even lane.
    hls::stream<coeff_t> s0o("s0o"), s1o("s1o"), s2o("s2o"), s3o("s3o"),
                         s4o("s4o"), s5o("s5o"), s6o("s6o"), s7o("s7o"),
                         s0e("s0e"), s1e("s1e"), s2e("s2e"), s3e("s3e"),
                         s4e("s4e"), s5e("s5e"), s6e("s6e"), s7e("s7e");

    // Per-stage scratch buffers (foK/feK belong to stage K, odd/even lane).
    coeff_t fo7[65], fo6[66], fo5[68], fo4[72], fo3[80], fo2[96], fo1[128];
    coeff_t fe7[65], fe6[66], fe5[68], fe4[72], fe3[80], fe2[96], fe1[128];

#pragma HLS STREAM variable = s7o depth = 1
#pragma HLS STREAM variable = s6o depth = 2
#pragma HLS STREAM variable = s5o depth = 4
#pragma HLS STREAM variable = s4o depth = 8
#pragma HLS STREAM variable = s3o depth = 16
#pragma HLS STREAM variable = s2o depth = 32
#pragma HLS STREAM variable = s1o depth = 64
#pragma HLS STREAM variable = s0o depth = 128

#pragma HLS STREAM variable = s7e depth = 1
#pragma HLS STREAM variable = s6e depth = 2
#pragma HLS STREAM variable = s5e depth = 4
#pragma HLS STREAM variable = s4e depth = 8
#pragma HLS STREAM variable = s3e depth = 16
#pragma HLS STREAM variable = s2e depth = 32
#pragma HLS STREAM variable = s1e depth = 64
#pragma HLS STREAM variable = s0e depth = 128

    read_inputs(input, s0e, s0o);

    ntt_stage1 (s0e, s1e, fe1);
    ntt_stage1 (s0o, s1o, fo1);

    ntt_stage2 (s1e, s2e, fe2);
    ntt_stage2 (s1o, s2o, fo2);

    ntt_stage3 (s2e, s3e, fe3);
    ntt_stage3 (s2o, s3o, fo3);

    ntt_stage4 (s3e, s4e, fe4);
    ntt_stage4 (s3o, s4o, fo4);

    ntt_stage5 (s4e, s5e, fe5);
    ntt_stage5 (s4o, s5o, fo5);

    ntt_stage6 (s5e, s6e, fe6);
    ntt_stage6 (s5o, s6o, fo6);

    ntt_stage7 (s6e, s7e, fe7);
    ntt_stage7 (s6o, s7o, fo7);

    write_outputs(s7e, s7o, output);
}
|
|
|
|
// Streaming inverse transform built from intt_stage1..intt_stage7.
// read_inputs() splits the input into even- and odd-indexed coefficient
// streams; each is pushed through its own chain of seven stages, and
// write_outputs() interleaves the two results again.
// NOTE(review): every stage twiddle in this pipeline is below 3329 and the
// active top-level comment says the Kyber NTT was replaced, so these stages
// look like they were written for the Kyber modulus while `q` in this file is
// the Dilithium modulus — confirm before using this path.
void gs_intt (hls::stream<coeff_t_stream> &input, hls::stream<coeff_t_stream> &output)
{
#pragma HLS dataflow

    // Inter-stage streams: sKo = odd lane after stage K, sKe = even lane.
    hls::stream<coeff_t> s0o("s0o"), s1o("s1o"), s2o("s2o"), s3o("s3o"),
                         s4o("s4o"), s5o("s5o"), s6o("s6o"), s7o("s7o"),
                         s0e("s0e"), s1e("s1e"), s2e("s2e"), s3e("s3e"),
                         s4e("s4e"), s5e("s5e"), s6e("s6e"), s7e("s7e");

    // Per-stage scratch buffers (foK/feK belong to stage K, odd/even lane).
    coeff_t fo7[128], fo6[96], fo5[80], fo4[72], fo3[68], fo2[66], fo1[65];
    coeff_t fe7[128], fe6[96], fe5[80], fe4[72], fe3[68], fe2[66], fe1[65];

#pragma HLS STREAM variable = s7o depth = 1
#pragma HLS STREAM variable = s6o depth = 2
#pragma HLS STREAM variable = s5o depth = 4
#pragma HLS STREAM variable = s4o depth = 8
#pragma HLS STREAM variable = s3o depth = 16
#pragma HLS STREAM variable = s2o depth = 32
#pragma HLS STREAM variable = s1o depth = 64
#pragma HLS STREAM variable = s0o depth = 128

#pragma HLS STREAM variable = s7e depth = 1
#pragma HLS STREAM variable = s6e depth = 2
#pragma HLS STREAM variable = s5e depth = 4
#pragma HLS STREAM variable = s4e depth = 8
#pragma HLS STREAM variable = s3e depth = 16
#pragma HLS STREAM variable = s2e depth = 32
#pragma HLS STREAM variable = s1e depth = 64
#pragma HLS STREAM variable = s0e depth = 128

    read_inputs(input, s0e, s0o);

    intt_stage1 (s0e, s1e, fe1);
    intt_stage1 (s0o, s1o, fo1);

    intt_stage2 (s1e, s2e, fe2);
    intt_stage2 (s1o, s2o, fo2);

    intt_stage3 (s2e, s3e, fe3);
    intt_stage3 (s2o, s3o, fo3);

    intt_stage4 (s3e, s4e, fe4);
    intt_stage4 (s3o, s4o, fo4);

    intt_stage5 (s4e, s5e, fe5);
    intt_stage5 (s4o, s5o, fo5);

    intt_stage6 (s5e, s6e, fe6);
    intt_stage6 (s5o, s6o, fo6);

    intt_stage7 (s6e, s7e, fe7);
    intt_stage7 (s6o, s7o, fo7);

    write_outputs(s7e, s7o, output);
}
|
|
|
|
// Split Nt double-width elements into two coefficient streams.
// For each input word the upper bits [double_coeff_t::width-1 : coeff_t::width]
// go to `input1` and the lower bits [coeff_t::width-1 : 0] go to `input2`.
// `last` on both outputs is regenerated from the element index (set on
// element Nt-1 only); the incoming `last` flag is not consulted.
void stream_split (hls::stream<coeff_t_stream_big> &input,
                   hls::stream<coeff_t_stream> &input1,
                   hls::stream<coeff_t_stream> &input2)
{
    coeff_t_stream upper_elem, lower_elem;
    coeff_t upper, lower;

    for (int idx = 0; idx < Nt; idx++)
    {
#pragma HLS pipeline II = 1
        coeff_t_stream_big packed = input.read();
        double_coeff_t word = packed.value;

        upper = word(double_coeff_t::width - 1, coeff_t::width);
        lower = word(coeff_t::width - 1, 0);

        const int end_flag = (idx == Nt-1) ? 1 : 0;
        upper_elem.last = end_flag;
        lower_elem.last = end_flag;

        upper_elem.value = upper;
        lower_elem.value = lower;

        input1.write(upper_elem);
        input2.write(lower_elem);
    }
}
|
|
|
|
// Pairwise product of two transformed streams, N pairs in total.
// Each iteration reads an (even, odd) pair from both inputs and treats them
// as degree-1 polynomials (ae + ao*X) and (be + bo*X):
//   ce = ae*be + ao*bo*pm_factors[i]
//   co = ae*bo + ao*be
// ce is written first, then co; `last` is set on the final co only.
void point_wise_mult (hls::stream<coeff_t_stream> &input1,
                      hls::stream<coeff_t_stream> &input2,
                      hls::stream<coeff_t_stream> &output)
{
    coeff_t pm_factors[128] = {17, 3312, 2761, 568, 583, 2746, 2649, 680,
                               1637, 1692, 723, 2606, 2288, 1041, 1100, 2229,
                               1409, 1920, 2662, 667, 3281, 48, 233, 3096,
                               756, 2573, 2156, 1173, 3015, 314, 3050, 279,
                               1703, 1626, 1651, 1678, 2789, 540, 1789, 1540,
                               1847, 1482, 952, 2377, 1461, 1868, 2687, 642,
                               939, 2390, 2308, 1021, 2437, 892, 2388, 941,
                               733, 2596, 2337, 992, 268, 3061, 641, 2688,
                               1584, 1745, 2298, 1031, 2037, 1292, 3220, 109,
                               375, 2954, 2549, 780, 2090, 1239, 1645, 1684,
                               1063, 2266, 319, 3010, 2773, 556, 757, 2572,
                               2099, 1230, 561, 2768, 2466, 863, 2594, 735,
                               2804, 525, 1092, 2237, 403, 2926, 1026, 2303,
                               1143, 2186, 2150, 1179, 2775, 554, 886, 2443,
                               1722, 1607, 1212, 2117, 1874, 1455, 1029, 2300,
                               2110, 1219, 2935, 394, 885, 2444, 2154, 1175};

    coeff_t_stream result;
    result.last = 0;

    for (int pair = 0; pair < N; pair++)
    {
#pragma HLS pipeline II = 1
        // Stream reads stay in the order: input1 even, input1 odd,
        // input2 even, input2 odd.
        coeff_t_stream lhs_even = input1.read();
        coeff_t_stream lhs_odd = input1.read();
        coeff_t_stream rhs_even = input2.read();
        coeff_t_stream rhs_odd = input2.read();

        coeff_t ao = lhs_odd.value;
        coeff_t bo = rhs_odd.value;
        coeff_t ae = lhs_even.value;
        coeff_t be = rhs_even.value;

        coeff_t even_even = mod (ae * be);
        coeff_t odd_odd = mod (ao * bo);
        coeff_t odd_odd_scaled = mod (odd_odd * pm_factors[pair]);
        coeff_t even_odd = mod (ae * bo);
        coeff_t odd_even = mod (ao * be);

        coeff_t ce = modadd (even_even, odd_odd_scaled);
        coeff_t co = modadd (even_odd, odd_even);

        result.value = ce;
        output.write(result);

        if (pair == N-1)
            result.last = 1;
        result.value = co;
        output.write(result);
    }
}
|
|
|
|
// -----------------------------------------------------------------------------
|
|
// AXI4-Stream <-> internal stream conversion helpers (only at top level)
|
|
// -----------------------------------------------------------------------------
|
|
|
|
// Convert one AXI4-Stream packet into exactly Nt internal elements.
//   axis_in : AXI4-Stream input; `data` is copied into `value`
//   int_in  : internal stream; always receives Nt elements
// Beats are read until TLAST is seen or Nt beats have been taken. If TLAST
// arrives early, no further beats are read and the remaining elements are
// written with value 0, because stream_split() unconditionally consumes Nt
// elements and would otherwise block on an empty stream.
// `last` is set on element Nt-1 when the packet has ended by then.
static void axis_to_internal_input(hls::stream<coeff_axis_big_t> &axis_in,
                                   hls::stream<coeff_t_stream_big> &int_in)
{
    bool packet_done = false;

    for (int i = 0; i < Nt; i++)
    {
#pragma HLS pipeline II = 1
        coeff_t_stream_big x;

        if (!packet_done)
        {
            coeff_axis_big_t a = axis_in.read();
            x.value = (double_coeff_t)a.data;
            if (a.last)
                packet_done = true;
        }
        else
        {
            // Short packet: pad with zero coefficients.
            x.value = 0;
        }

        x.last = (packet_done && i == Nt - 1) ? 1 : 0;

        int_in.write(x);
    }
}
|
|
|
|
// Convert up to Nt internal elements into AXI4-Stream beats.
// `value` is copied into `data` (as ap_uint<32>), `last` into TLAST, and
// keep/strb are set to all ones. The loop ends after the element whose
// `last` flag is set, or after Nt elements, whichever comes first.
static void internal_to_axis_output(hls::stream<coeff_t_stream> &int_out,
                                    hls::stream<coeff_axis_t> &axis_out)
{
    coeff_axis_t beat;

    for (int n = 0; n < Nt; n++)
    {
#pragma HLS pipeline II = 1
        coeff_t_stream elem = int_out.read();

        beat.data = (ap_uint<32>)elem.value;
        beat.last = elem.last;

        // All bytes valid; other side channels are disabled in this ap_axiu config
        beat.keep = -1;
        beat.strb = -1;

        axis_out.write(beat);

        if (elem.last)
            break;
    }
}
|
|
|
|
// -----------------------------------------------------------------------------
|
|
// Top-level function with AXI4-Stream ports (for DMA) and internal NTT pipeline
|
|
// -----------------------------------------------------------------------------
|
|
/*
|
|
int poly_mult_dil (hls::stream<coeff_axis_big_t> &input,
|
|
hls::stream<coeff_axis_t> &output)
|
|
{
|
|
#pragma HLS INTERFACE axis register port=input
|
|
#pragma HLS INTERFACE axis register port=output
|
|
#pragma HLS INTERFACE s_axilite port=return bundle=CTRL_BUS
|
|
#pragma HLS dataflow
|
|
|
|
// Internal streams using the original coeff_t_stream{,_big} types
|
|
hls::stream<coeff_t_stream_big> in_internal("in_internal");
|
|
hls::stream<coeff_t_stream> input1("input1"), input2("input2");
|
|
hls::stream<coeff_t_stream> middle1("middle1"), middle2("middle2");
|
|
hls::stream<coeff_t_stream> middle3("middle3"), out_internal("out_internal");
|
|
|
|
axis_to_internal_input(input, in_internal);
|
|
stream_split(in_internal, input1, input2);
|
|
ct_ntt(input1, middle1);
|
|
ct_ntt(input2, middle2);
|
|
point_wise_mult(middle1, middle2, middle3);
|
|
gs_intt(middle3, out_internal);
|
|
internal_to_axis_output(out_internal, output);
|
|
|
|
return 0;
|
|
}
|
|
*/
|
|
|
|
// For Dilithium
|
|
// -----------------------------------------------------------------------------
|
|
// Top-level function: now uses array-based 512-NTT core instead of Kyber NTT
|
|
// -----------------------------------------------------------------------------
|
|
|
|
int poly_mult_dil(hls::stream<coeff_axis_big_t> &input,
|
|
hls::stream<coeff_axis_t> &output)
|
|
{
|
|
#pragma HLS INTERFACE axis register port=input
|
|
#pragma HLS INTERFACE axis register port=output
|
|
#pragma HLS INTERFACE s_axilite port=return bundle=CTRL_BUS
|
|
#pragma HLS dataflow
|
|
|
|
// Internal streams (same as before)
|
|
hls::stream<coeff_t_stream_big> in_internal("in_internal");
|
|
hls::stream<coeff_t_stream> input1("input1"), input2("input2");
|
|
hls::stream<coeff_t_stream> out_internal("out_internal");
|
|
|
|
// Existing helpers: keep as they are in your file
|
|
axis_to_internal_input(input, in_internal);
|
|
stream_split(in_internal, input1, input2);
|
|
|
|
// Local polynomial buffers
|
|
coeff_t poly_a[Nt];
|
|
coeff_t poly_b[Nt];
|
|
coeff_t poly_c[Nt];
|
|
|
|
// Read Nt coefficients for each polynomial from the two internal streams
|
|
for (int i = 0; i < Nt; ++i) {
|
|
#pragma HLS pipeline II=1
|
|
coeff_t_stream x1 = input1.read();
|
|
coeff_t_stream x2 = input2.read();
|
|
poly_a[i] = x1.value;
|
|
poly_b[i] = x2.value;
|
|
}
|
|
|
|
// Core negacyclic multiplication via 512-point NTT
|
|
poly_mult_dil_core(poly_c, poly_a, poly_b);
|
|
|
|
// Stream result back out as coeff_t_stream (value + last)
|
|
for (int i = 0; i < Nt; ++i) {
|
|
#pragma HLS pipeline II=1
|
|
coeff_t_stream y;
|
|
y.value = poly_c[i];
|
|
y.last = (i == Nt - 1) ? (bit)1 : (bit)0;
|
|
out_internal.write(y);
|
|
}
|
|
|
|
// Existing helper: convert internal stream to AXI output
|
|
internal_to_axis_output(out_internal, output);
|
|
|
|
return 0;
|
|
} |