// polymult.cpp  (Set A, with all inner pragmas kept, and .user/.id/.dest removed)

#include "ntt.h"

// Dilithium prime modulus; read by mod(), modadd() and modsub() in this file.
coeff_t q     = 8380417;
// Multiplicative inverse of N = 256 modulo q; intt_stage7 scales its butterfly
// outputs by this value.
coeff_t inv_n = 8347681;  // 256 * 8347681 = 255 * 8380417 + 1, i.e. ≡ 1 (mod 8380417)


//double_coeff_t v = 20159;

/*coeff_t mod(double_coeff_t A)
{
	#pragma HLS inline OFF
	//double_coeff_t v = (double_coeff_t) ((1<<26) + 1664)/q;
	double_coeff_t t = (v * A + (1 << 25)) >> 26;
	t = t * q;
	coeff_t val;
	if (A < t)
		val = A - t + q;
	else
		val = A - t;
	return val;
}*/


/* Kyber
ap_uint<13> m = 5039;

coeff_t mod(double_coeff_t A)
{
	#pragma HLS pipeline II = 1
	coeff_t val;
	ap_uint<36> t123 = m * A;
	ap_uint<12> t = (t123 >> 24);
	ap_uint<24> ta = t * q;
	ap_uint<24> c = A - ta;
	if (c > q)
		val = (coeff_t) (c - q);
	else
		val = (coeff_t) c;
	return val;
}

coeff_t modadd(coeff_t x, coeff_t y)
{
	#pragma HLS inline
	coeff_t w = x + y;
	return (coeff_t)(w - (w < q ? (coeff_t)0 : q));
}

coeff_t modsub(coeff_t x, coeff_t y)
{
	#pragma HLS inline
	coeff_t s = x + (x > y ? (coeff_t)0 : q);
	return (coeff_t)(s - y);
}
*/

//-----------------------------------------
// Start "For Dilithium"

// mod, modadd, modsub for Dilithium
// Reduce a (possibly double-width) value to its canonical residue in [0, q).
// The value and q are widened to long long first, so this is valid for
// |A| < 2^63 and |q| < 2^31.
static inline coeff_t mod(double_coeff_t A)
{
    #pragma HLS inline

    const long long modulus   = (long long)q;
    const long long remainder = (long long)A % modulus;

    // C++ '%' takes the sign of the dividend; lift negative remainders into range.
    return (coeff_t)(remainder < 0 ? remainder + modulus : remainder);
}

// (a + b) mod q for operands already reduced to [0, q): the sum is below 2q,
// so one conditional subtraction is enough.
static inline coeff_t modadd(coeff_t a, coeff_t b)
{
    #pragma HLS inline

    const long long modulus = (long long)q;
    const long long sum     = (long long)a + (long long)b;

    return (coeff_t)(sum < modulus ? sum : sum - modulus);
}

// (a - b) mod q for operands already reduced to [0, q): the difference is
// above -q, so one conditional addition of q is enough.
static inline coeff_t modsub(coeff_t a, coeff_t b)
{
    #pragma HLS inline

    const long long modulus = (long long)q;
    const long long diff    = (long long)a - (long long)b;

    return (coeff_t)(diff >= 0 ? diff : diff + modulus);
}


// -----------------------------------------------------------------------------
// 512-point NTT core (for Dilithium negacyclic convolution)
// -----------------------------------------------------------------------------

// (a * b) mod q. Both operands are widened to double_coeff_t before the
// multiply so the full product reaches mod() untruncated.
static inline coeff_t mul_mod(coeff_t a, coeff_t b)
{
    const double_coeff_t wide_a = (double_coeff_t)a;
    const double_coeff_t wide_b = (double_coeff_t)b;
    return mod(wide_a * wide_b);
}

// Precomputed stage twiddles for length-512 NTT over q = 8380417
// Stage index s corresponds to len = 2^(s+1): 2,4,8,...,512
// ntt_512() reads entry [stage] as the per-stage multiplier "wlen" when
// invert == false.
static const coeff_t NTT_WLEN[9] = {
    (coeff_t)8380416, // len = 2   (= q - 1, i.e. -1 mod q)
    (coeff_t)4808194, // len = 4
    (coeff_t)4614810, // len = 8
    (coeff_t)2883726, // len = 16
    (coeff_t)6250525, // len = 32
    (coeff_t)7044481, // len = 64
    (coeff_t)3241972, // len = 128
    (coeff_t)6644104, // len = 256
    (coeff_t)1921994  // len = 512
};

// Per-stage multipliers used by ntt_512() when invert == true.
// NOTE(review): presumably the modular inverses of the NTT_WLEN entries; only
// the first two were spot-checked (8380416 is its own inverse, and
// 3572223 = q - 4808194) -- confirm the rest against the generator script.
static const coeff_t NTT_WLEN_INV[9] = {
    (coeff_t)8380416, // len = 2
    (coeff_t)3572223, // len = 4
    (coeff_t)3761513, // len = 8
    (coeff_t)5234739, // len = 16
    (coeff_t)3764867, // len = 32
    (coeff_t)3227876, // len = 64
    (coeff_t)6621070, // len = 128
    (coeff_t)6125690, // len = 256
    (coeff_t)527981   // len = 512
};

// 512^{-1} mod 8380417
// (2 * 8364049 = 16728098 = 8347681 + 8380417, and 8347681 is 256^{-1} mod q.)
static const coeff_t INV_NTT512 = (coeff_t)8364049;

// 512-point radix-2 transform over Z_q, computed in place on `a`.
//   a      : 512 coefficients, overwritten with the result
//   invert : false -> per-stage multipliers come from NTT_WLEN;
//            true  -> multipliers come from NTT_WLEN_INV and every output is
//                     finally scaled by INV_NTT512
static void ntt_512(coeff_t a[512], bool invert)
{
    // Step 1: reorder the input into bit-reversed index order. `rev` tracks
    // the bit-reversed counterpart of `idx` by adding one from the top bit down.
    unsigned int rev = 0;
    for (unsigned int idx = 1; idx < 512; ++idx) {
        unsigned int mask = 256;
        for (; rev & mask; mask >>= 1)
            rev ^= mask;
        rev ^= mask;

        // Swap each pair only once.
        if (idx < rev) {
            coeff_t held = a[idx];
            a[idx] = a[rev];
            a[rev] = held;
        }
    }

    // Step 2: nine butterfly passes with block length 2, 4, ..., 512.
    for (int stage = 0, len = 2; len <= 512; ++stage, len <<= 1) {
        const coeff_t wlen = invert ? NTT_WLEN_INV[stage] : NTT_WLEN[stage];
        const int half = len >> 1;

        for (int base = 0; base < 512; base += len) {
            // Running power of wlen, restarted at 1 for every block.
            coeff_t w = (coeff_t)1;
            for (int k = 0; k < half; ++k) {
                const int lo = base + k;
                const int hi = lo + half;

                coeff_t u = a[lo];
                coeff_t v = mul_mod(a[hi], w);
                a[lo] = modadd(u, v);
                a[hi] = modsub(u, v);
                w = mul_mod(w, wlen);
            }
        }
    }

    // Step 3: the inverse direction additionally needs the 1/512 scaling.
    if (!invert)
        return;

    for (int n = 0; n < 512; ++n) {
        a[n] = mul_mod(a[n], INV_NTT512);
    }
}

// Multiply two degree-(Nt-1) polynomials modulo (x^Nt + 1, q).
// Both inputs are zero-padded to 2*Nt points and multiplied with the
// 512-point transform (ntt_512), then the upper half of the product is
// subtracted from the lower half to apply x^Nt = -1.
//   c : output, Nt coefficients
//   a : first operand, Nt coefficients
//   b : second operand, Nt coefficients
static void poly_mult_dil_core(coeff_t       c[Nt],
                               const coeff_t a[Nt],
                               const coeff_t b[Nt])
{
    const int N2 = 2 * Nt; // ntt_512 is hard-wired to 512 points, i.e. Nt = 256
    coeff_t lhs[N2];
    coeff_t rhs[N2];
    coeff_t prod[N2];

    // Lower halves: the operands themselves.
    for (int n = 0; n < Nt; ++n) {
        #pragma HLS pipeline II=1
        lhs[n] = a[n];
        rhs[n] = b[n];
    }
    // Upper halves: zero padding so the cyclic product does not wrap.
    for (int n = 0; n < Nt; ++n) {
        #pragma HLS pipeline II=1
        lhs[Nt + n] = 0;
        rhs[Nt + n] = 0;
    }

    // Into the transform domain.
    ntt_512(lhs, false);
    ntt_512(rhs, false);

    // Coefficient-wise product.
    for (int n = 0; n < N2; ++n) {
        #pragma HLS pipeline II=1
        prod[n] = mul_mod(lhs[n], rhs[n]);
    }

    // Back to coefficients: prod now holds the length-512 cyclic convolution.
    ntt_512(prod, true);

    // Fold with x^Nt = -1: c[k] = prod[k] - prod[k + Nt] (mod q).
    for (int k = 0; k < Nt; ++k) {
        #pragma HLS pipeline II=1
        const coeff_t low  = prod[k];
        const coeff_t high = prod[k + Nt];
        c[k] = modsub(low, high);
    }
}


// END "For Dilithium"
//----------------------------------------------------------


// Butterfly used by the intt_stage* functions:
//   x = (a + b) mod q
//   y = w * (a - b) mod q
void butterfly_unit_dif(coeff_t w, coeff_t a, coeff_t b, coeff_t &x, coeff_t &y)
{
	#pragma HLS pipeline II = 1
	const coeff_t sum  = modadd(a, b);
	const coeff_t diff = modsub(a, b);
	x = sum;
	// The twiddle is applied after the subtraction.
	y = mod(w * diff);
}

// Butterfly used by the ntt_stage* functions:
//   x = (a + w*b) mod q
//   y = (a - w*b) mod q
void butterfly_unit_dit(coeff_t w, coeff_t a, coeff_t b, coeff_t &x, coeff_t &y)
{
	#pragma HLS pipeline II = 1
	// The twiddle is applied to b before the add/subtract.
	const coeff_t scaled = mod(w * b);
	const coeff_t sum    = modadd(a, scaled);
	const coeff_t diff   = modsub(a, scaled);
	x = sum;
	y = diff;
}

// Calls ap_wait_n(1) when built for synthesis (__SYNTHESIS__ defined) and is an
// empty function otherwise, e.g. in C simulation.
// NOTE(review): presumably a one-clock-cycle stall used to space out the
// stream writes in the stage loops -- confirm against the HLS documentation.
void delay_cycle()
{
	#ifdef __SYNTHESIS__
		ap_wait_n(1);
	#endif
}

// Streaming forward stage 1 (called from ct_ntt): a single group of 64
// butterfly_unit_dit operations pairing input element k with element k + 64.
//   a    : input stream; 128 coefficients are consumed
//   b    : output stream; 128 coefficients are produced (the 64 bf1 results
//          first, then the 64 buffered bf2 results)
//   fifo : scratch buffer; [64..127] holds the first 64 inputs, [0..63] holds
//          the bf2 results until they are flushed
// NOTE(review): the twiddle 1729 is below 3329 and looks like a Kyber constant,
// while q in this file is the Dilithium modulus -- confirm before reusing.
void ntt_stage1 (hls::stream<coeff_t> &a, hls::stream<coeff_t> &b, coeff_t fifo[])
{
	#pragma HLS dataflow
	coeff_t twiddle_coeff = 1729;

	// Tells HLS to assume no inter-iteration read-after-write dependence on fifo.
	#pragma HLS DEPENDENCE variable = fifo inter RAW false

	int x, y; // unused
	coeff_t a_, b_, it, bf1, bf2, tf;

	// Buffer the first half of the input.
	for (int i = 0; i < 64; i++)
	{
		#pragma HLS pipeline
		it = a.read();
		fifo[i + 64] = it;
	}

	for (int j = 0; j < 1; j++)
	{
		#pragma HLS DEPENDENCE variable = fifo inter RAW false
		int iter = 0;
		// Butterfly each buffered value with the matching second-half input.
		for (int k = 0; k < 64; k++)
		{
			#pragma HLS pipeline II = 1
			#pragma HLS DEPENDENCE variable = fifo inter RAW false
			a_ = fifo[iter + 64];
			b_ = a.read();
			tf = twiddle_coeff;
			butterfly_unit_dit(tf, a_, b_, bf1, bf2);
			b.write(bf1);
			fifo[iter] = bf2;
			iter++;
			delay_cycle();
		}

		// Flush the buffered bf2 results.
		for (int i = 0; i < 64; i++)
		{
			#pragma HLS pipeline II = 1
			#pragma HLS DEPENDENCE variable = fifo inter RAW false
			b.write(fifo[i]);
			delay_cycle();
		}
	}
}

// Streaming forward stage 2 (called from ct_ntt): 2 groups of 32
// butterfly_unit_dit operations, one twiddle per group.
//   a    : input stream; 128 coefficients are consumed
//   b    : output stream; 128 coefficients are produced (per group: 32 bf1
//          results, then the 32 buffered bf2 results)
//   fifo : scratch buffer; [64..95] holds the first half of the current group,
//          [0..31] holds the bf2 results until they are flushed
// NOTE(review): the twiddles are below 3329 and look like Kyber constants,
// while q in this file is the Dilithium modulus -- confirm before reusing.
void ntt_stage2 (hls::stream<coeff_t> &a, hls::stream<coeff_t> &b, coeff_t fifo[])
{
	#pragma HLS dataflow
	coeff_t twiddle_coeffs[2] = {2580, 3289};

	// Tells HLS to assume no inter-iteration read-after-write dependence on fifo.
	#pragma HLS DEPENDENCE variable = fifo inter RAW false

	int x, y; // unused
	coeff_t a_, b_, it, bf1, bf2, tf;

	// Buffer the first half of group 0.
	for (int i = 0; i < 32; i++)
	{
		#pragma HLS pipeline
		it = a.read();
		fifo[i + 64] = it;
	}

	for (int j = 0; j < 2; j++)
	{
		#pragma HLS DEPENDENCE variable = fifo inter RAW false
		int iter = 0;
		// Butterfly each buffered value with the matching second-half input.
		for (int k = 0; k < 32; k++)
		{
			#pragma HLS pipeline II = 1
			#pragma HLS DEPENDENCE variable = fifo inter RAW false
			a_ = fifo[iter + 64];
			b_ = a.read();
			tf = twiddle_coeffs[j];
			butterfly_unit_dit(tf, a_, b_, bf1, bf2);
			b.write(bf1);
			fifo[iter] = bf2;
			iter++;
			delay_cycle();
		}

		// Flush the bf2 results; unless this is the last group, refill the
		// upper buffer with the first half of the next group at the same time.
		for (int i = 0; i < 32; i++)
		{
			#pragma HLS pipeline II = 1
			#pragma HLS DEPENDENCE variable = fifo inter RAW false
			b.write(fifo[i]);
			delay_cycle();
			if (j < 1)
			{
				it = a.read();
				fifo[i + 64] = it;
			}
		}
	}
}

// Streaming forward stage 3 (called from ct_ntt): 4 groups of 16
// butterfly_unit_dit operations, one twiddle per group.
//   a    : input stream; 128 coefficients are consumed
//   b    : output stream; 128 coefficients are produced (per group: 16 bf1
//          results, then the 16 buffered bf2 results)
//   fifo : scratch buffer; [64..79] holds the first half of the current group,
//          [0..15] holds the bf2 results until they are flushed
// NOTE(review): the twiddles are below 3329 and look like Kyber constants,
// while q in this file is the Dilithium modulus -- confirm before reusing.
void ntt_stage3 (hls::stream<coeff_t> &a, hls::stream<coeff_t> &b, coeff_t fifo[])
{
	#pragma HLS dataflow
	coeff_t twiddle_coeffs[4] = {2642, 630, 1897, 848};

	// Tells HLS to assume no inter-iteration read-after-write dependence on fifo.
	#pragma HLS DEPENDENCE variable = fifo inter RAW false

	int x, y; // unused
	coeff_t a_, b_, it, bf1, bf2, tf;

	// Buffer the first half of group 0.
	for (int i = 0; i < 16; i++)
	{
		#pragma HLS pipeline
		it = a.read();
		fifo[i + 64] = it;
	}

	for (int j = 0; j < 4; j++)
	{
		#pragma HLS DEPENDENCE variable = fifo inter RAW false
		int iter = 0;
		// Butterfly each buffered value with the matching second-half input.
		for (int k = 0; k < 16; k++)
		{
			#pragma HLS pipeline II = 1
			#pragma HLS DEPENDENCE variable = fifo inter RAW false
			a_ = fifo[iter + 64];
			b_ = a.read();
			tf = twiddle_coeffs[j];
			butterfly_unit_dit(tf, a_, b_, bf1, bf2);
			b.write(bf1);
			fifo[iter] = bf2;
			iter++;
			delay_cycle();
		}

		// Flush the bf2 results; unless this is the last group, refill the
		// upper buffer with the first half of the next group at the same time.
		for (int i = 0; i < 16; i++)
		{
			#pragma HLS pipeline II = 1
			#pragma HLS DEPENDENCE variable = fifo inter RAW false
			b.write(fifo[i]);
			delay_cycle();
			if (j < 3)
			{
				it = a.read();
				fifo[i + 64] = it;
			}
		}
	}
}

// Streaming forward stage 4 (called from ct_ntt): 8 groups of 8
// butterfly_unit_dit operations, one twiddle per group.
//   a    : input stream; 128 coefficients are consumed
//   b    : output stream; 128 coefficients are produced (per group: 8 bf1
//          results, then the 8 buffered bf2 results)
//   fifo : scratch buffer; [64..71] holds the first half of the current group,
//          [0..7] holds the bf2 results until they are flushed
// NOTE(review): the twiddles are below 3329 and look like Kyber constants,
// while q in this file is the Dilithium modulus -- confirm before reusing.
void ntt_stage4 (hls::stream<coeff_t> &a, hls::stream<coeff_t> &b, coeff_t fifo[])
{
	#pragma HLS dataflow
	coeff_t twiddle_coeffs[8] = {1062, 1919, 193, 797, 2786, 3260, 569, 1746};

	// Tells HLS to assume no inter-iteration read-after-write dependence on fifo.
	#pragma HLS DEPENDENCE variable = fifo inter RAW false

	int x, y; // unused
	coeff_t a_, b_, it, bf1, bf2, tf;

	// Buffer the first half of group 0.
	for (int i = 0; i < 8; i++)
	{
		#pragma HLS pipeline
		it = a.read();
		fifo[i + 64] = it;
	}

	for (int j = 0; j < 8; j++)
	{
		#pragma HLS DEPENDENCE variable = fifo inter RAW false
		int iter = 0;
		int ind = 1; // unused
		// Butterfly each buffered value with the matching second-half input.
		for (int k = 0; k < 8; k++)
		{
			#pragma HLS pipeline II = 1
			#pragma HLS DEPENDENCE variable = fifo inter RAW false
			a_ = fifo[iter + 64];
			b_ = a.read();
			tf = twiddle_coeffs[j];
			butterfly_unit_dit(tf, a_, b_, bf1, bf2);
			b.write(bf1);
			fifo[iter] = bf2;
			iter++;
			delay_cycle();
		}

		// Flush the bf2 results; unless this is the last group, refill the
		// upper buffer with the first half of the next group at the same time.
		for (int i = 0; i < 8; i++)
		{
			#pragma HLS pipeline II = 1
			#pragma HLS DEPENDENCE variable = fifo inter RAW false
			b.write(fifo[i]);
			delay_cycle();
			if (j < 7)
			{
				it = a.read();
				fifo[i + 64] = it;
			}
		}
	}
}

// Streaming forward stage 5 (called from ct_ntt): 16 groups of 4
// butterfly_unit_dit operations, one twiddle per group.
//   a    : input stream; 128 coefficients are consumed
//   b    : output stream; 128 coefficients are produced (per group: 4 bf1
//          results, then the 4 buffered bf2 results)
//   fifo : scratch buffer; [64..67] holds the first half of the current group,
//          [0..3] holds the bf2 results until they are flushed
// NOTE(review): the twiddles are below 3329 and look like Kyber constants,
// while q in this file is the Dilithium modulus -- confirm before reusing.
void ntt_stage5 (hls::stream<coeff_t> &a, hls::stream<coeff_t> &b, coeff_t fifo[])
{
	#pragma HLS dataflow
	coeff_t twiddle_coeffs[16] = {296, 2447, 1339, 1476, 3046, 56, 2240, 1333,
								  1426, 2094, 535, 2882, 2393, 2879, 1974, 821};

	// Tells HLS to assume no inter-iteration read-after-write dependence on fifo.
	#pragma HLS DEPENDENCE variable = fifo inter RAW false

	int x, y; // unused
	coeff_t a_, b_, it, bf1, bf2, tf;

	// Buffer the first half of group 0.
	for (int i = 0; i < 4; i++)
	{
		#pragma HLS pipeline
		it = a.read();
		fifo[i + 64] = it;
	}

	for (int j = 0; j < 16; j++)
	{
		#pragma HLS DEPENDENCE variable = fifo inter RAW false
		int iter = 0;
		int ind = 1; // unused
		// Butterfly each buffered value with the matching second-half input.
		for (int k = 0; k < 4; k = k + 1)
		{
			#pragma HLS pipeline II = 1
			#pragma HLS DEPENDENCE variable = fifo inter RAW false
			a_ = fifo[iter + 64];
			b_ = a.read();
			tf = twiddle_coeffs[j];
			butterfly_unit_dit(tf, a_, b_, bf1, bf2);
			b.write(bf1);
			fifo[iter] = bf2;
			iter++;
			delay_cycle();
		}

		// Flush the bf2 results; unless this is the last group, refill the
		// upper buffer with the first half of the next group at the same time.
		for (int i = 0; i < 4; i++)
		{
			#pragma HLS pipeline II = 1
			#pragma HLS DEPENDENCE variable = fifo inter RAW false
			b.write(fifo[i]);
			delay_cycle();
			if (j < 15)
			{
				it = a.read();
				fifo[i + 64] = it;
			}
		}
	}
}

// Streaming forward stage 6 (called from ct_ntt): 32 groups of 2
// butterfly_unit_dit operations, one twiddle per group.
//   a    : input stream; 128 coefficients are consumed
//   b    : output stream; 128 coefficients are produced (per group: 2 bf1
//          results, then the 2 buffered bf2 results)
//   fifo : scratch buffer; [64..65] holds the first half of the current group,
//          [0..1] holds the bf2 results until they are flushed
// NOTE(review): the twiddles are below 3329 and look like Kyber constants,
// while q in this file is the Dilithium modulus -- confirm before reusing.
void ntt_stage6 (hls::stream<coeff_t> &a, hls::stream<coeff_t> &b, coeff_t fifo[])
{
	#pragma HLS dataflow
	coeff_t twiddle_coeffs[32] = {289, 331, 3253, 1756, 1197, 2304, 2277, 2055,
								  650, 1977, 2513, 632, 2865, 33, 1320, 1915,
								  2319, 1435, 807, 452, 1438, 2868, 1534, 2402,
								  2647, 2617, 1481, 648, 2474, 3110, 1227, 910};

	// Tells HLS to assume no inter-iteration read-after-write dependence on fifo.
	#pragma HLS DEPENDENCE variable = fifo inter RAW false

	int x, y; // unused
	coeff_t a_, b_, it, bf1, bf2, tf;

	// Buffer the first half of group 0.
	for (int i = 0; i < 2; i++)
	{
		#pragma HLS pipeline
		it = a.read();
		fifo[i + 64] = it;
	}

	for (int j = 0; j < 32; j++)
	{
		#pragma HLS DEPENDENCE variable = fifo inter RAW false
		int iter = 0;
		int ind = 1; // unused
		// Butterfly each buffered value with the matching second-half input.
		for (int k = 0; k < 2; k = k + 1)
		{
			#pragma HLS pipeline II = 1
			#pragma HLS DEPENDENCE variable = fifo inter RAW false
			a_ = fifo[iter + 64];
			b_ = a.read();
			tf = twiddle_coeffs[j];
			butterfly_unit_dit(tf, a_, b_, bf1, bf2);
			b.write(bf1);
			fifo[iter] = bf2;
			iter++;
			delay_cycle();
		}

		// Flush the bf2 results; unless this is the last group, refill the
		// upper buffer with the first half of the next group at the same time.
		for (int i = 0; i < 2; i++)
		{
			#pragma HLS pipeline II = 1
			#pragma HLS DEPENDENCE variable = fifo inter RAW false
			b.write(fifo[i]);
			delay_cycle();
			if (j < 31)
			{
				it = a.read();
				fifo[i + 64] = it;
			}
		}
	}
}

// Streaming forward stage 7 (called from ct_ntt): 64 butterfly_unit_dit
// operations on adjacent input pairs, each with its own twiddle.
//   a    : input stream; 128 coefficients are consumed
//   b    : output stream; 128 coefficients are produced (bf1 then bf2 per pair)
//   fifo : not read or written here; only named in the DEPENDENCE pragmas
// NOTE(review): the twiddles are below 3329 and look like Kyber constants,
// while q in this file is the Dilithium modulus -- confirm before reusing.
void ntt_stage7 (hls::stream<coeff_t> &a, hls::stream<coeff_t> &b,  coeff_t fifo[])
{
	#pragma HLS inline off

	#pragma HLS DEPENDENCE variable = fifo inter RAW false
	coeff_t twiddle_coeffs[64] = {17, 2761, 583, 2649, 1637, 723, 2288, 1100,
								  1409, 2662, 3281, 233, 756, 2156, 3015, 3050,
								  1703, 1651, 2789, 1789, 1847, 952, 1461, 2687,
								  939, 2308, 2437, 2388, 733, 2337, 268, 641,
							 	  1584, 2298, 2037, 3220, 375, 2549, 2090, 1645,
								  1063, 319, 2773, 757, 2099, 561, 2466, 2594,
								  2804, 1092, 403, 1026, 1143, 2150, 2775, 886,
								  1722, 1212, 1874, 1029, 2110, 2935, 885, 2154};
	int x, y; // unused
	coeff_t u, t, it, bf1, bf2; // `it` is unused

	// First element of the first pair; later pairs are fetched at the end of
	// the previous loop iteration.
	u = a.read();

	for (int j = 0; j < 64; j++)
	{
		#pragma HLS pipeline II = 1
		#pragma HLS DEPENDENCE variable = fifo inter RAW false

		t = a.read();
		butterfly_unit_dit(twiddle_coeffs[j], u, t, bf1, bf2);
		b.write(bf1);
		b.write(bf2);
		// Skip the prefetch after the last pair so exactly 128 values are read.
		if (j < 63)
			u = a.read();
	}
}

// Streaming inverse stage 1 (called from gs_intt): 64 butterfly_unit_dif
// operations on adjacent input pairs, each with its own twiddle.
//   a    : input stream; 128 coefficients are consumed
//   b    : output stream; 128 coefficients are produced (bf1 then bf2 per pair)
//   fifo : not read or written here; only named in the DEPENDENCE pragmas
// NOTE(review): the twiddles are below 3329 and look like Kyber constants,
// while q in this file is the Dilithium modulus -- confirm before reusing.
void intt_stage1 (hls::stream<coeff_t> &a, hls::stream<coeff_t> &b, coeff_t fifo[])
{
	coeff_t twiddle_coeffs[64] = {1175, 2444, 394, 1219, 2300, 1455, 2117, 1607,
								  2443,  554, 1179, 2186, 2303, 2926, 2237,  525,
								  735,  863, 2768, 1230, 2572,  556, 3010, 2266,
								  1684, 1239,  780, 2954,  109, 1292, 1031, 1745,
								  2688, 3061,  992, 2596,  941,  892, 1021, 2390,
								  642, 1868, 2377, 1482, 1540,  540, 1678, 1626,
								  279,  314, 1173, 2573, 3096,   48,  667, 1920,
								  2229, 1041, 2606, 1692,  680, 2746,  568, 3312};

	#pragma HLS inline off
	#pragma HLS DEPENDENCE variable = fifo inter RAW false
	int x, y; // unused
	coeff_t u, t, it, bf1, bf2; // `it` is unused

	// First element of the first pair; later pairs are fetched at the end of
	// the previous loop iteration.
	u = a.read();

	for (int j = 0; j < 64; j++)
	{
		#pragma HLS pipeline II = 1
		#pragma HLS DEPENDENCE variable = fifo inter RAW false
		t = a.read();
		butterfly_unit_dif(twiddle_coeffs[j], u, t, bf1, bf2);
		b.write(bf1);
		b.write(bf2);
		// Skip the prefetch after the last pair so exactly 128 values are read.
		if (j < 63)
			u = a.read();
	}
}

// Streaming inverse stage 2 (called from gs_intt): 32 groups of 2
// butterfly_unit_dif operations, one twiddle per group.
//   a    : input stream; 128 coefficients are consumed
//   b    : output stream; 128 coefficients are produced (per group: 2 bf1
//          results, then the 2 buffered bf2 results)
//   fifo : scratch buffer; [64..65] holds the first half of the current group,
//          [0..1] holds the bf2 results until they are flushed
// NOTE(review): the twiddles are below 3329 and look like Kyber constants,
// while q in this file is the Dilithium modulus -- confirm before reusing.
void intt_stage2 (hls::stream<coeff_t> &a, hls::stream<coeff_t> &b, coeff_t fifo[])
{
	#pragma HLS dataflow
	coeff_t twiddle_coeffs[32] = {2419, 2102,  219,  855, 2681, 1848,  712,  682,
								  927, 1795,  461, 1891, 2877, 2522, 1894, 1010,
								  1414, 2009, 3296,  464, 2697,  816, 1352, 2679,
								  1274, 1052, 1025, 2132, 1573,   76, 2998, 3040};

	// Tells HLS to assume no inter-iteration read-after-write dependence on fifo.
	#pragma HLS DEPENDENCE variable = fifo inter RAW false

	int x, y; // unused
	coeff_t a_, b_, it, bf1, bf2, tf;

	// Buffer the first half of group 0.
	for (int i = 0; i < 2; i++)
	{
		#pragma HLS pipeline
		it = a.read();
		fifo[i + 64] = it;
	}

	// ind is the twiddle index; count is the running butterfly count. ind is
	// bumped after every 2nd butterfly, so group j uses twiddle_coeffs[j].
	int ind = 0;
	int count = 0;
	for (int j = 0; j < 32; j++)
	{
		#pragma HLS DEPENDENCE variable = fifo inter RAW false
		int iter = 0;
		// Butterfly each buffered value with the matching second-half input.
		for (int k = 0; k < 2; k = k + 1)
		{
			#pragma HLS pipeline II = 1
			#pragma HLS DEPENDENCE variable = fifo inter RAW false
			a_ = fifo[iter + 64];
			b_ = a.read();
			tf = twiddle_coeffs[ind];
			butterfly_unit_dif(tf, a_, b_, bf1, bf2);
			b.write(bf1);
			fifo[iter] = bf2;
			iter++;
			count++;
			if (count % 2 == 0)
				ind++;
			delay_cycle();
		}

		// Flush the bf2 results; unless this is the last group, refill the
		// upper buffer with the first half of the next group at the same time.
		for (int i = 0; i < 2; i++)
		{
			#pragma HLS pipeline II = 1
			#pragma HLS DEPENDENCE variable = fifo inter RAW false
			b.write(fifo[i]);
			delay_cycle();
			if (j < 31)
			{
				it = a.read();
				fifo[i + 64] = it;
			}
		}
	}
}

// Streaming inverse stage 3 (called from gs_intt): 16 groups of 4
// butterfly_unit_dif operations, one twiddle per group.
//   a    : input stream; 128 coefficients are consumed
//   b    : output stream; 128 coefficients are produced (per group: 4 bf1
//          results, then the 4 buffered bf2 results)
//   fifo : scratch buffer; [64..67] holds the first half of the current group,
//          [0..3] holds the bf2 results until they are flushed
// NOTE(review): the twiddles are below 3329 and look like Kyber constants,
// while q in this file is the Dilithium modulus -- confirm before reusing.
void intt_stage3 (hls::stream<coeff_t> &a, hls::stream<coeff_t> &b, coeff_t fifo[])
{
	#pragma HLS dataflow
	coeff_t twiddle_coeffs[16] = {2508, 1355,  450,  936,  447, 2794, 1235, 1903,
								  1996, 1089, 3273,  283, 1853, 1990,  882, 3033};

	// Tells HLS to assume no inter-iteration read-after-write dependence on fifo.
	#pragma HLS DEPENDENCE variable = fifo inter RAW false

	int x, y; // unused
	int m = 4; // unused
	coeff_t a_, b_, it, bf1, bf2, tf;

	// Buffer the first half of group 0.
	for (int i = 0; i < 4; i++)
	{
		#pragma HLS pipeline
		it = a.read();
		fifo[i + 64] = it;
	}

	// ind is the twiddle index; count is the running butterfly count. ind is
	// bumped after every 4th butterfly, so group j uses twiddle_coeffs[j].
	int ind = 0;
	int count = 0;
	for (int j = 0; j < 16; j++)
	{
		#pragma HLS DEPENDENCE variable = fifo inter RAW false
		int iter = 0;
		// Butterfly each buffered value with the matching second-half input.
		for (int k = 0; k < 4; k = k + 1)
		{
			#pragma HLS pipeline II = 1
			#pragma HLS DEPENDENCE variable = fifo inter RAW false
			a_ = fifo[iter + 64];
			b_ = a.read();
			tf = twiddle_coeffs[ind];
			butterfly_unit_dif(tf, a_, b_, bf1, bf2);
			b.write(bf1);
			fifo[iter] = bf2;
			iter++;
			count++;
			if (count % 4 == 0)
				ind++;
			delay_cycle();
		}

		// Flush the bf2 results; unless this is the last group, refill the
		// upper buffer with the first half of the next group at the same time.
		for (int i = 0; i < 4; i++)
		{
			#pragma HLS pipeline II = 1
			#pragma HLS DEPENDENCE variable = fifo inter RAW false
			b.write(fifo[i]);
			delay_cycle();
			if (j < 15)
			{
				it = a.read();
				fifo[i + 64] = it;
			}
		}
	}
}

// Streaming inverse stage 4 (called from gs_intt): 8 groups of 8
// butterfly_unit_dif operations, one twiddle per group.
//   a    : input stream; 128 coefficients are consumed
//   b    : output stream; 128 coefficients are produced (per group: 8 bf1
//          results, then the 8 buffered bf2 results)
//   fifo : scratch buffer; [64..71] holds the first half of the current group,
//          [0..7] holds the bf2 results until they are flushed
// NOTE(review): the twiddles are below 3329 and look like Kyber constants,
// while q in this file is the Dilithium modulus -- confirm before reusing.
void intt_stage4 (hls::stream<coeff_t> &a, hls::stream<coeff_t> &b, coeff_t fifo[])
{
	#pragma HLS dataflow
	coeff_t twiddle_coeffs[8] = {1583, 2760, 69, 543, 2532, 3136, 1410, 2267};

	// Tells HLS to assume no inter-iteration read-after-write dependence on fifo.
	#pragma HLS DEPENDENCE variable = fifo inter RAW false

	int x, y; // unused
	coeff_t a_, b_, it, bf1, bf2, tf;

	// Buffer the first half of group 0.
	for (int i = 0; i < 8; i++)
	{
		#pragma HLS pipeline
		it = a.read();
		fifo[i + 64] = it;
	}

	// ind is the twiddle index; count is the running butterfly count. ind is
	// bumped after every 8th butterfly, so group j uses twiddle_coeffs[j].
	int ind = 0;
	int count = 0;
	for (int j = 0; j < 8; j++)
	{
		#pragma HLS DEPENDENCE variable = fifo inter RAW false
		int iter = 0;
		// Butterfly each buffered value with the matching second-half input.
		for (int k = 0; k < 8; k = k + 1)
		{
			#pragma HLS pipeline II = 1
			#pragma HLS DEPENDENCE variable = fifo inter RAW false
			a_ = fifo[iter + 64];
			b_ = a.read();
			tf = twiddle_coeffs[ind];
			butterfly_unit_dif(tf, a_, b_, bf1, bf2);
			b.write(bf1);
			fifo[iter] = bf2;
			iter++;
			count++;
			if (count % 8 == 0)
				ind++;
			delay_cycle();
		}

		// Flush the bf2 results; unless this is the last group, refill the
		// upper buffer with the first half of the next group at the same time.
		for (int i = 0; i < 8; i++)
		{
			#pragma HLS pipeline II = 1
			#pragma HLS DEPENDENCE variable = fifo inter RAW false
			b.write(fifo[i]);
			delay_cycle();
			if (j < 7)
			{
				it = a.read();
				fifo[i + 64] = it;
			}
		}
	}
}

// Streaming inverse stage 5 (called from gs_intt): 4 groups of 16
// butterfly_unit_dif operations, one twiddle per group.
//   a    : input stream; 128 coefficients are consumed
//   b    : output stream; 128 coefficients are produced (per group: 16 bf1
//          results, then the 16 buffered bf2 results)
//   fifo : scratch buffer; [64..79] holds the first half of the current group,
//          [0..15] holds the bf2 results until they are flushed
// NOTE(review): the twiddles are below 3329 and look like Kyber constants,
// while q in this file is the Dilithium modulus -- confirm before reusing.
void intt_stage5 (hls::stream<coeff_t> &a, hls::stream<coeff_t> &b, coeff_t fifo[])
{
	#pragma HLS dataflow
	coeff_t twiddle_coeffs[4] = {2481, 1432, 2699,  687};

	// Tells HLS to assume no inter-iteration read-after-write dependence on fifo.
	#pragma HLS DEPENDENCE variable = fifo inter RAW false

	int x, y; // unused
	coeff_t a_, b_, it, bf1, bf2, tf;

	// Buffer the first half of group 0.
	for (int i = 0; i < 16; i++)
	{
		#pragma HLS pipeline
		it = a.read();
		fifo[i + 64] = it;
	}

	// ind is the twiddle index; count is the running butterfly count. ind is
	// bumped after every 16th butterfly, so group j uses twiddle_coeffs[j].
	int ind = 0;
	int count = 0;
	for (int j = 0; j < 4; j++)
	{
		#pragma HLS DEPENDENCE variable = fifo inter RAW false
		int iter = 0;
		// Butterfly each buffered value with the matching second-half input.
		for (int k = 0; k < 16; k = k + 1)
		{
			#pragma HLS pipeline II = 1
			#pragma HLS DEPENDENCE variable = fifo inter RAW false
			a_ = fifo[iter + 64];
			b_ = a.read();
			tf = twiddle_coeffs[ind];
			butterfly_unit_dif(tf, a_, b_, bf1, bf2);
			b.write(bf1);
			fifo[iter] = bf2;
			iter++;
			count++;
			if (count % 16 == 0)
				ind++;
			delay_cycle();
		}

		// Flush the bf2 results; unless this is the last group, refill the
		// upper buffer with the first half of the next group at the same time.
		for (int i = 0; i < 16; i++)
		{
			#pragma HLS pipeline II = 1
			#pragma HLS DEPENDENCE variable = fifo inter RAW false
			b.write(fifo[i]);
			delay_cycle();
			if (j < 3)
			{
				it = a.read();
				fifo[i + 64] = it;
			}
		}
	}
}

// Streaming inverse stage 6 (called from gs_intt): 2 groups of 32
// butterfly_unit_dif operations, one twiddle per group.
//   a    : input stream; 128 coefficients are consumed
//   b    : output stream; 128 coefficients are produced (per group: 32 bf1
//          results, then the 32 buffered bf2 results)
//   fifo : scratch buffer; [64..95] holds the first half of the current group,
//          [0..31] holds the bf2 results until they are flushed
// NOTE(review): the twiddles are below 3329 and look like Kyber constants,
// while q in this file is the Dilithium modulus -- confirm before reusing.
void intt_stage6 (hls::stream<coeff_t> &a, hls::stream<coeff_t> &b, coeff_t fifo[])
{
	#pragma HLS dataflow
	coeff_t twiddle_coeffs[2] = {40,  749};

	// Tells HLS to assume no inter-iteration read-after-write dependence on fifo.
	#pragma HLS DEPENDENCE variable = fifo inter RAW false

	int x, y; // unused
	coeff_t a_, b_, it, bf1, bf2, tf;

	// Buffer the first half of group 0.
	for (int i = 0; i < 32; i++)
	{
		#pragma HLS pipeline
		it = a.read();
		fifo[i + 64] = it;
	}

	// ind is the twiddle index; count is the running butterfly count. ind is
	// bumped once, after the 32nd butterfly, so group j uses twiddle_coeffs[j].
	int ind = 0;
	int count = 0;
	for (int j = 0; j < 2; j++)
	{
		#pragma HLS DEPENDENCE variable = fifo inter RAW false
		int iter = 0;
		// Butterfly each buffered value with the matching second-half input.
		for (int k = 0; k < 32; k = k + 1)
		{
			#pragma HLS pipeline II = 1
			#pragma HLS DEPENDENCE variable = fifo inter RAW false
			a_ = fifo[iter + 64];
			b_ = a.read();
			tf = twiddle_coeffs[ind];
			butterfly_unit_dif(tf, a_, b_, bf1, bf2);
			b.write(bf1);
			fifo[iter] = bf2;
			iter++;
			count++;
			if (count == 32)
				ind++;
			delay_cycle();
		}

		// Flush the bf2 results; unless this is the last group, refill the
		// upper buffer with the first half of the next group at the same time.
		for (int i = 0; i < 32; i++)
		{
			#pragma HLS pipeline II = 1
			#pragma HLS DEPENDENCE variable = fifo inter RAW false
			b.write(fifo[i]);
			delay_cycle();
			if (j < 1)
			{
				it = a.read();
				fifo[i + 64] = it;
			}
		}
	}
}

// Streaming inverse stage 7 (called from gs_intt): a single group of 64
// butterfly_unit_dif operations pairing input element k with element k + 64,
// with both butterfly outputs multiplied by inv_n (mod q) afterwards.
//   a    : input stream; 128 coefficients are consumed
//   b    : output stream; 128 coefficients are produced (the 64 scaled bf1
//          results first, then the 64 buffered scaled bf2 results)
//   fifo : scratch buffer; [64..127] holds the first 64 inputs, [0..63] holds
//          the scaled bf2 results until they are flushed
// NOTE(review): the twiddle 1600 is below 3329 and looks like a Kyber constant,
// whereas inv_n and q in this file are Dilithium values -- confirm before reusing.
void intt_stage7 (hls::stream<coeff_t> &a, hls::stream<coeff_t> &b,  coeff_t fifo[])
{
	#pragma HLS inline off
	// Tells HLS to assume no inter-iteration read-after-write dependence on fifo.
	#pragma HLS DEPENDENCE variable = fifo inter RAW false
	int x, y; // unused
	coeff_t a_, b_, it, bf1, bf2, bfn1, bfn2, tf;

	// Buffer the first half of the input.
	for (int i = 0; i < 64; i++)
	{
		#pragma HLS pipeline
		it = a.read();
		fifo[i + 64] = it;
	}

	for (int j = 0; j < 1; j++)
	{
		#pragma HLS DEPENDENCE variable = fifo inter RAW false
		int iter = 0;
		// Butterfly each buffered value with the matching second-half input.
		for (int k = 0; k < 64; k = k + 1)
		{
			#pragma HLS pipeline II = 1
			#pragma HLS DEPENDENCE variable = fifo inter RAW false
			a_ = fifo[iter + 64];
			b_ = a.read();
			tf = 1600;
			butterfly_unit_dif(tf, a_, b_, bf1, bf2);
			// Final scaling by inv_n.
			bfn1 = mod(bf1 * inv_n);
			bfn2 = mod(bf2 * inv_n);
			b.write(bfn1);
			fifo[iter] = bfn2;
			iter++;
			delay_cycle();
		}

		// Flush the buffered scaled bf2 results.
		for (int i = 0; i < 64; i++)
		{
			#pragma HLS pipeline II = 1
			#pragma HLS DEPENDENCE variable = fifo inter RAW false
			b.write(fifo[i]);
			delay_cycle();
		}
	}
}

// De-interleaves Nt incoming coefficients: items at even positions go to `se`,
// items at odd positions go to `so`. Only the `value` field of each incoming
// item is forwarded.
void read_inputs (hls::stream<coeff_t_stream> &input, hls::stream<coeff_t> &se, hls::stream<coeff_t> &so)
{
	coeff_t_stream beat;
	coeff_t coeff;

	for (int idx = 0; idx < Nt; idx++)
	{
		#pragma HLS pipeline II = 1
		beat = input.read();
		coeff = beat.value;

		const bool odd_position = (idx % 2 != 0);
		if (odd_position)
			so.write(coeff);
		else
			se.write(coeff);
	}
}

// Re-interleaves the two half streams: for each of N iterations one item from
// `se` and one from `so` are read and written to `output`, `se` item first.
// `last` is 0 on every item except the final `so` item, where it is 1.
void write_outputs (hls::stream<coeff_t> &se, hls::stream<coeff_t> &so, hls::stream<coeff_t_stream> &output)
{
	coeff_t_stream beat;
	beat.last = 0;

	for (int pair = 0; pair < N; pair++)
	{
		#pragma HLS pipeline II = 1
		const coeff_t from_even = se.read();
		const coeff_t from_odd  = so.read();

		beat.value = from_even;
		output.write(beat);

		// Raise `last` only for the very last item of the frame.
		if (pair == N-1)
			beat.last = 1;
		beat.value = from_odd;
		output.write(beat);
	}
}

// Streaming forward transform built from the ntt_stage1..ntt_stage7 functions.
// read_inputs splits the Nt incoming items into an even-position and an
// odd-position stream; each half runs through its own seven-stage chain, and
// write_outputs interleaves the two results again.
//   input  : Nt items to transform
//   output : transformed items (2 per write_outputs iteration, N iterations)
// NOTE(review): the stage functions use twiddles that look like Kyber constants,
// while q in this file is the Dilithium modulus, and the only visible caller is
// inside a commented-out top level -- confirm before reusing.
void ct_ntt (hls::stream<coeff_t_stream> &input, hls::stream<coeff_t_stream> &output)
{
	#pragma HLS dataflow

	// sKo / sKe: odd / even half after stage K (s0* is the output of read_inputs).
	hls::stream<coeff_t> s0o("s0o"), s1o("s1o"), s2o("s2o"), s3o("s3o"),
						 s4o("s4o"), s5o("s5o"), s6o("s6o"), s7o("s7o"),
						 s0e("s0e"), s1e("s1e"), s2e("s2e"), s3e("s3e"),
						 s4e("s4e"), s5e("s5e"), s6e("s6e"), s7e("s7e");

	// Per-stage scratch buffers. Stage K indexes up to 64 + (64 >> (K-1)) - 1
	// (stage 1: 127, stage 2: 95, ..., stage 6: 65); stage 7 does not touch its buffer.
	coeff_t fo7[65], fo6[66], fo5[68], fo4[72], fo3[80], fo2[96], fo1[128];
	coeff_t fe7[65], fe6[66], fe5[68], fe4[72], fe3[80], fe2[96], fe1[128];

	coeff_t_stream x, y; // unused

	#pragma HLS STREAM variable = s7o depth = 1
	#pragma HLS STREAM variable = s6o depth = 2
	#pragma HLS STREAM variable = s5o depth = 4
	#pragma HLS STREAM variable = s4o depth = 8
	#pragma HLS STREAM variable = s3o depth = 16
	#pragma HLS STREAM variable = s2o depth = 32
	#pragma HLS STREAM variable = s1o depth = 64
	#pragma HLS STREAM variable = s0o depth = 128

	#pragma HLS STREAM variable = s7e depth = 1
	#pragma HLS STREAM variable = s6e depth = 2
	#pragma HLS STREAM variable = s5e depth = 4
	#pragma HLS STREAM variable = s4e depth = 8
	#pragma HLS STREAM variable = s3e depth = 16
	#pragma HLS STREAM variable = s2e depth = 32
	#pragma HLS STREAM variable = s1e depth = 64
	#pragma HLS STREAM variable = s0e depth = 128


	read_inputs(input, s0e, s0o);

	ntt_stage1 (s0e, s1e, fe1);
	ntt_stage1 (s0o, s1o, fo1);

	ntt_stage2 (s1e, s2e, fe2);
	ntt_stage2 (s1o, s2o, fo2);

	ntt_stage3 (s2e, s3e, fe3);
	ntt_stage3 (s2o, s3o, fo3);

	ntt_stage4 (s3e, s4e, fe4);
	ntt_stage4 (s3o, s4o, fo4);

	ntt_stage5 (s4e, s5e, fe5);
	ntt_stage5 (s4o, s5o, fo5);

	ntt_stage6 (s5e, s6e, fe6);
	ntt_stage6 (s5o, s6o, fo6);

	ntt_stage7 (s6e, s7e, fe7);
	ntt_stage7 (s6o, s7o, fo7);

	write_outputs(s7e, s7o, output);
}

// Streaming inverse transform built from the intt_stage1..intt_stage7 functions.
// read_inputs splits the Nt incoming items into an even-position and an
// odd-position stream; each half runs through its own seven-stage chain (the
// last stage also scales by inv_n), and write_outputs interleaves the results.
//   input  : Nt items to transform
//   output : transformed items (2 per write_outputs iteration, N iterations)
// NOTE(review): the stage functions use twiddles that look like Kyber constants,
// while q and inv_n in this file are Dilithium values, and the only visible
// caller is inside a commented-out top level -- confirm before reusing.
void gs_intt (hls::stream<coeff_t_stream> &input, hls::stream<coeff_t_stream> &output)
{
	#pragma HLS dataflow

	// sKo / sKe: odd / even half after stage K (s0* is the output of read_inputs).
	hls::stream<coeff_t> s0o("s0o"), s1o("s1o"), s2o("s2o"), s3o("s3o"),
						 s4o("s4o"), s5o("s5o"), s6o("s6o"), s7o("s7o"),
						 s0e("s0e"), s1e("s1e"), s2e("s2e"), s3e("s3e"),
						 s4e("s4e"), s5e("s5e"), s6e("s6e"), s7e("s7e");

	// Per-stage scratch buffers; sizes mirror ct_ntt in reverse because the
	// inverse stages go from the smallest group size (stage 1) to the largest
	// (stage 7). Stage 1 does not touch its buffer.
	coeff_t fo7[128], fo6[96], fo5[80], fo4[72], fo3[68], fo2[66], fo1[65];
	coeff_t fe7[128], fe6[96], fe5[80], fe4[72], fe3[68], fe2[66], fe1[65];

	coeff_t_stream x, y; // unused

	#pragma HLS STREAM variable = s7o depth = 1
	#pragma HLS STREAM variable = s6o depth = 2
	#pragma HLS STREAM variable = s5o depth = 4
	#pragma HLS STREAM variable = s4o depth = 8
	#pragma HLS STREAM variable = s3o depth = 16
	#pragma HLS STREAM variable = s2o depth = 32
	#pragma HLS STREAM variable = s1o depth = 64
	#pragma HLS STREAM variable = s0o depth = 128

	#pragma HLS STREAM variable = s7e depth = 1
	#pragma HLS STREAM variable = s6e depth = 2
	#pragma HLS STREAM variable = s5e depth = 4
	#pragma HLS STREAM variable = s4e depth = 8
	#pragma HLS STREAM variable = s3e depth = 16
	#pragma HLS STREAM variable = s2e depth = 32
	#pragma HLS STREAM variable = s1e depth = 64
	#pragma HLS STREAM variable = s0e depth = 128

	read_inputs(input, s0e, s0o);

	intt_stage1 (s0e, s1e, fe1);
	intt_stage1 (s0o, s1o, fo1);

	intt_stage2 (s1e, s2e, fe2);
	intt_stage2 (s1o, s2o, fo2);

	intt_stage3 (s2e, s3e, fe3);
	intt_stage3 (s2o, s3o, fo3);

	intt_stage4 (s3e, s4e, fe4);
	intt_stage4 (s3o, s4o, fo4);

	intt_stage5 (s4e, s5e, fe5);
	intt_stage5 (s4o, s5o, fo5);

	intt_stage6 (s5e, s6e, fe6);
	intt_stage6 (s5o, s6o, fo6);

	intt_stage7 (s6e, s7e, fe7);
	intt_stage7 (s6o, s7o, fo7);

	write_outputs(s7e, s7o, output);
}

// Unpacks Nt double-width words into two coefficient streams: the upper
// coeff_t::width bits of each word go to `input1`, the lower coeff_t::width
// bits go to `input2`. The incoming `last` flag is ignored; `last` is
// regenerated here and is 1 only on the Nt-th item of each output stream.
void stream_split (hls::stream<coeff_t_stream_big> &input,
				   hls::stream<coeff_t_stream> &input1,
				   hls::stream<coeff_t_stream> &input2)
{
	coeff_t_stream_big packed;
	double_coeff_t word;
	coeff_t_stream upper_beat, lower_beat;
	coeff_t upper, lower;

	for (int idx = 0; idx < Nt; idx++)
	{
		#pragma HLS pipeline II = 1
		packed = input.read();
		word = packed.value;

		// Bit-slice the packed word into its two coefficients.
		upper = word(double_coeff_t::width - 1, coeff_t::width);
		lower = word(coeff_t::width - 1, 0);

		const bool final_item = (idx == Nt-1);
		upper_beat.last = final_item ? 1 : 0;
		lower_beat.last = final_item ? 1 : 0;

		upper_beat.value = upper;
		lower_beat.value = lower;
		input1.write(upper_beat);
		input2.write(lower_beat);
	}
}

// Pair-wise product of two transformed streams. Each of the N iterations reads
// two consecutive items from input1 (ae, ao) and two from input2 (be, bo) and
// writes two results:
//   ce = ae*be + (ao*bo)*pm_factors[i]   (mod q)
//   co = ae*bo + ao*be                   (mod q)
// i.e. the product of (ae + ao*X) and (be + bo*X) with X^2 replaced by
// pm_factors[i]. `last` is 1 only on the final `co` item.
// NOTE(review): pm_factors are all below 3329 and look like Kyber constants,
// while q in this file is the Dilithium modulus -- confirm before reusing.
void point_wise_mult (hls::stream<coeff_t_stream> &input1,
					  hls::stream<coeff_t_stream> &input2,
		   	   	   	  hls::stream<coeff_t_stream> &output)
{
	coeff_t_stream xe, xo, ye, yo, z;
	coeff_t ae, be, ce, ao, bo, co, c1, c2, c2s, c3, c4;
	int i;

	// One factor per coefficient pair.
	coeff_t pm_factors[128] = {17, 3312, 2761, 568, 583, 2746, 2649, 680,
							   1637, 1692, 723, 2606, 2288, 1041, 1100, 2229,
							   1409, 1920, 2662, 667, 3281, 48, 233, 3096,
							   756, 2573, 2156, 1173, 3015, 314, 3050, 279,
							   1703, 1626, 1651, 1678, 2789, 540, 1789, 1540,
							   1847, 1482, 952, 2377, 1461, 1868, 2687, 642,
							   939, 2390, 2308, 1021, 2437, 892, 2388, 941,
							   733, 2596, 2337, 992, 268, 3061, 641, 2688,
							   1584, 1745, 2298, 1031, 2037, 1292, 3220, 109,
							   375, 2954, 2549, 780, 2090, 1239, 1645, 1684,
							   1063, 2266, 319, 3010, 2773, 556, 757, 2572,
							   2099, 1230, 561, 2768, 2466, 863, 2594, 735,
							   2804, 525, 1092, 2237, 403, 2926, 1026, 2303,
							   1143, 2186, 2150, 1179, 2775, 554, 886, 2443,
							   1722, 1607, 1212, 2117, 1874, 1455, 1029, 2300,
							   2110, 1219, 2935, 394, 885, 2444, 2154, 1175};

	z.last = 0;
	for (i=0; i<N; i++)
	{
		#pragma HLS pipeline II = 1
		// Two items per stream per iteration: even-position then odd-position.
		xe = input1.read();
		xo = input1.read();
		ye = input2.read();
		yo = input2.read();
		ao = xo.value;
		bo = yo.value;
		ae = xe.value;
		be = ye.value;

		c1 = mod (ae * be);
		c2 = mod (ao * bo);
		c2s = mod (c2 * pm_factors[i]);
		c3 = mod (ae * bo);
		c4 = mod (ao * be);

		ce = modadd (c1, c2s);
		co = modadd (c3, c4);

		z.value = ce;
		output.write(z);
		// Raise `last` only for the second item of the final pair.
		if (i == N-1)
			z.last = 1;
		z.value = co;
		output.write(z);
	}
}

// -----------------------------------------------------------------------------
// AXI4-Stream <-> internal stream conversion helpers (only at top level)
// -----------------------------------------------------------------------------

// Converts one AXI4-Stream packet into the internal double-width stream.
//   axis_in : incoming AXI4-Stream beats (data + TLAST)
//   int_in  : internal stream; exactly Nt items are always written
//
// At most Nt beats are read from axis_in. If TLAST arrives before the Nt-th
// beat, no further beats are read and the remaining items are written as
// zero-valued padding, with `last` set on the Nt-th item. The consumer in this
// file (stream_split) unconditionally reads Nt items, so forwarding fewer
// would leave it waiting on an empty stream.
static void axis_to_internal_input(hls::stream<coeff_axis_big_t> &axis_in,
                                   hls::stream<coeff_t_stream_big> &int_in)
{
	coeff_axis_big_t  a;
	coeff_t_stream_big x;
	bool packet_done = false; // set once TLAST has been seen on axis_in

	for (int i = 0; i < Nt; i++)
	{
		#pragma HLS pipeline II = 1
		if (!packet_done)
		{
			a = axis_in.read();

			x.value = (double_coeff_t)a.data;
			x.last  = a.last;

			// Stop consuming input after TLAST; the rest of the frame is padding.
			if (a.last)
				packet_done = true;
		}
		else
		{
			x.value = 0;
			if (i == Nt-1)
				x.last = 1;
			else
				x.last = 0;
		}

		int_in.write(x);
	}
}

// Converts the internal result stream into AXI4-Stream beats.
//   int_out  : internal items (value + last)
//   axis_out : outgoing beats; data is the value cast to 32 bits, keep/strb are
//              all ones, last is copied from the internal item
// At most Nt items are forwarded; forwarding stops early after an item whose
// `last` flag is set.
static void internal_to_axis_output(hls::stream<coeff_t_stream> &int_out,
                                    hls::stream<coeff_axis_t> &axis_out)
{
	coeff_t_stream item;
	coeff_axis_t   beat;

	for (int idx = 0; idx < Nt; idx++)
	{
		#pragma HLS pipeline II = 1
		item = int_out.read();

		// All byte lanes are flagged valid on every beat.
		beat.keep = -1;
		beat.strb = -1;

		beat.data = (ap_uint<32>)item.value;
		beat.last = item.last;

		axis_out.write(beat);

		// End of packet reached: nothing more to forward.
		if (item.last)
			break;
	}
}

// -----------------------------------------------------------------------------
// Top-level function with AXI4-Stream ports (for DMA) and internal NTT pipeline
// -----------------------------------------------------------------------------
/*
int poly_mult_dil (hls::stream<coeff_axis_big_t> &input,
	       hls::stream<coeff_axis_t>     &output)
{
	#pragma HLS INTERFACE axis register port=input
	#pragma HLS INTERFACE axis register port=output
	#pragma HLS INTERFACE s_axilite port=return bundle=CTRL_BUS
	#pragma HLS dataflow

	// Internal streams using the original coeff_t_stream{,_big} types
	hls::stream<coeff_t_stream_big> in_internal("in_internal");
	hls::stream<coeff_t_stream>     input1("input1"), input2("input2");
	hls::stream<coeff_t_stream>     middle1("middle1"), middle2("middle2");
	hls::stream<coeff_t_stream>     middle3("middle3"), out_internal("out_internal");

	axis_to_internal_input(input, in_internal);
	stream_split(in_internal, input1, input2);
	ct_ntt(input1, middle1);
	ct_ntt(input2, middle2);
	point_wise_mult(middle1, middle2, middle3);
	gs_intt(middle3, out_internal);
	internal_to_axis_output(out_internal, output);

	return 0;
}
*/

// For Dilithium
// -----------------------------------------------------------------------------
// Top-level function: now uses array-based 512-NTT core instead of Kyber NTT
// -----------------------------------------------------------------------------

// Top-level HLS function: multiplies two polynomials delivered on one
// AXI4-Stream and streams the product back.
//   input  : Nt beats; each beat carries two packed coefficients (stream_split
//            sends the upper half to polynomial a and the lower half to b)
//   output : Nt beats, one product coefficient each, `last` set on the final beat
//   return : always 0 (exposed on the CTRL_BUS AXI4-Lite bundle)
// NOTE(review): this dataflow region contains two plain loops and local arrays
// in addition to function calls; confirm the tool accepts this form without
// dataflow warnings, or move the loops into helper functions.
int poly_mult_dil(hls::stream<coeff_axis_big_t> &input,
                  hls::stream<coeff_axis_t>     &output)
{
    #pragma HLS INTERFACE axis register port=input
    #pragma HLS INTERFACE axis register port=output
    #pragma HLS INTERFACE s_axilite port=return bundle=CTRL_BUS
    #pragma HLS dataflow

    // Internal streams (same as before)
    hls::stream<coeff_t_stream_big> in_internal("in_internal");
    hls::stream<coeff_t_stream>     input1("input1"), input2("input2");
    hls::stream<coeff_t_stream>     out_internal("out_internal");

    // AXI beats -> packed internal words -> two coefficient streams
    axis_to_internal_input(input, in_internal);
    stream_split(in_internal, input1, input2);

    // Local polynomial buffers
    coeff_t poly_a[Nt];
    coeff_t poly_b[Nt];
    coeff_t poly_c[Nt];

    // Read Nt coefficients for each polynomial from the two internal streams
    // (the `last` flags produced by stream_split are not used here)
    for (int i = 0; i < Nt; ++i) {
        #pragma HLS pipeline II=1
        coeff_t_stream x1 = input1.read();
        coeff_t_stream x2 = input2.read();
        poly_a[i] = x1.value;
        poly_b[i] = x2.value;
    }

    // Core negacyclic multiplication via 512-point NTT
    poly_mult_dil_core(poly_c, poly_a, poly_b);

    // Stream result back out as coeff_t_stream (value + last)
    for (int i = 0; i < Nt; ++i) {
        #pragma HLS pipeline II=1
        coeff_t_stream y;
        y.value = poly_c[i];
        // Only the final coefficient carries last = 1.
        y.last  = (i == Nt - 1) ? (bit)1 : (bit)0;
        out_internal.write(y);
    }

    // Convert the internal stream to AXI4-Stream beats on the output port
    internal_to_axis_output(out_internal, output);

    return 0;
}