From 6a0f2fa466527e2a9dd7ef86feca0700fc850784 Mon Sep 17 00:00:00 2001 From: aottaviano Date: Sat, 25 Jun 2022 09:48:20 +0200 Subject: [PATCH] treewide: Change the way the iDMA APIs are handled * Instead of a switch, we duplicate the API and mark the copies as 'fc' or 'cl' * This is due to an error when we wait for the DMA to finish its transactions. I didn't investigate it for lack of time, so let's do it like this (not very code-reuse friendly, but at least it works) --- include/hal/chips/control-pulp/pulp.h | 3 +- include/hal/dma/{idma_v1.h => idma_v1_cl.h} | 173 +++++----- include/hal/dma/idma_v1_fc.h | 351 ++++++++++++++++++++ 3 files changed, 427 insertions(+), 100 deletions(-) rename include/hal/dma/{idma_v1.h => idma_v1_cl.h} (62%) create mode 100644 include/hal/dma/idma_v1_fc.h diff --git a/include/hal/chips/control-pulp/pulp.h b/include/hal/chips/control-pulp/pulp.h index bcb57f4..3b851a2 100644 --- a/include/hal/chips/control-pulp/pulp.h +++ b/include/hal/chips/control-pulp/pulp.h @@ -26,7 +26,8 @@ #include "hal/dma/mchan_v7.h" #endif #if IDMA_VERSION == 1 -#include "hal/dma/idma_v1.h" +#include "hal/dma/idma_v1_cl.h" +#include "hal/dma/idma_v1_fc.h" #endif #include "hal/timer/timer_v2.h" #include "hal/soc_eu/soc_eu_v2.h" diff --git a/include/hal/dma/idma_v1.h b/include/hal/dma/idma_v1_cl.h similarity index 62% rename from include/hal/dma/idma_v1.h rename to include/hal/dma/idma_v1_cl.h index 62a722a..a1a10a5 100644 --- a/include/hal/dma/idma_v1.h +++ b/include/hal/dma/idma_v1_cl.h @@ -14,8 +14,8 @@ * limitations under the License. */ -#ifndef __HAL_IDMA_V1_H__ -#define __HAL_IDMA_V1_H__ +#ifndef __HAL_CL_IDMA_V1_H__ +#define __HAL_CL_IDMA_V1_H__ #include #include "hal/pulp.h" @@ -55,7 +55,7 @@ typedef unsigned int dma_loc_t; \param ext2loc If 1, the transfer is loading data from external memory and storing to cluster memory. If 0, it is the contrary \return The identifier of the transfer. This can be used with plp_dma_wait to wait for the completion of this transfer.
*/ -static inline int plp_dma_memcpy(unsigned int base, dma_ext_t ext, unsigned int loc, unsigned int size, int ext2loc); +static inline int plp_cl_dma_memcpy(dma_ext_t ext, unsigned int loc, unsigned int size, int ext2loc); /** Cluster memory to external memory transfer with event-based completion. * @@ -64,7 +64,7 @@ static inline int plp_dma_memcpy(unsigned int base, dma_ext_t ext, unsigned int \param size Number of bytes to be transfered. The only restriction is that this size must fit 16 bits, i.e. must be inferior to 65536. \return The identifier of the transfer. This can be used with plp_dma_wait to wait for the completion of this transfer. */ -static inline int plp_dma_l1ToExt(dma_ext_t ext, unsigned int loc, unsigned short size); +static inline int plp_cl_dma_l1ToExt(dma_ext_t ext, unsigned int loc, unsigned short size); /** External memory to cluster memory transfer with event-based completion. * @@ -73,7 +73,7 @@ static inline int plp_dma_l1ToExt(dma_ext_t ext, unsigned int loc, unsigned shor \param size Number of bytes to be transfered. The only restriction is that this size must fit 16 bits, i.e. must be inferior to 65536. \return The identifier of the transfer. This can be used with plp_dma_wait to wait for the completion of this transfer. */ -static inline int plp_dma_extToL1(unsigned int loc, dma_ext_t ext, unsigned short size); +static inline int plp_cl_dma_extToL1(unsigned int loc, dma_ext_t ext, unsigned short size); /** 2-dimensional memory transfer with event-based completion. * @@ -85,7 +85,7 @@ static inline int plp_dma_extToL1(unsigned int loc, dma_ext_t ext, unsigned shor \param ext2loc If 1, the transfer is loading data from external memory and storing to cluster memory. If 0, it is the contrary \return The identifier of the transfer. This can be used with plp_dma_wait to wait for the completion of this transfer. 
*/ -static inline int plp_dma_memcpy_2d(unsigned int base, dma_ext_t ext, unsigned int loc, unsigned int size, unsigned int stride, unsigned int length, int ext2loc); +static inline int plp_cl_dma_memcpy_2d(dma_ext_t ext, unsigned int loc, unsigned int size, unsigned int stride, unsigned int length, int ext2loc); /** Cluster memory to external memory 2-dimensional transfer with event-based completion. * @@ -96,7 +96,7 @@ static inline int plp_dma_memcpy_2d(unsigned int base, dma_ext_t ext, unsigned i \param length 2D length, which is the number of transfered bytes after which the DMA will switch to the next line. Must fit 16 bits, i.e. must be inferior to 65536. This applies only to the external memory. \return The identifier of the transfer. This can be used with plp_dma_wait to wait for the completion of this transfer. */ -static inline int plp_dma_l1ToExt_2d(dma_ext_t ext, unsigned int loc, unsigned short size, unsigned short stride, unsigned short length); +static inline int plp_cl_dma_l1ToExt_2d(dma_ext_t ext, unsigned int loc, unsigned short size, unsigned short stride, unsigned short length); /** External memory to cluster memory 2-dimensional transfer with event-based completion. * @@ -107,7 +107,7 @@ static inline int plp_dma_l1ToExt_2d(dma_ext_t ext, unsigned int loc, unsigned s \param length 2D length, which is the number of transfered bytes after which the DMA will switch to the next line. Must fit 16 bits, i.e. must be inferior to 65536. This applies only to the external memory. \return The identifier of the transfer. 
This can be used with plp_dma_wait to wait for the completion of this transfer */ -static inline int plp_dma_extToL1_2d(unsigned int loc, dma_ext_t ext, unsigned short size, unsigned short stride, unsigned short length); +static inline int plp_cl_dma_extToL1_2d(unsigned int loc, dma_ext_t ext, unsigned short size, unsigned short stride, unsigned short length); //!@} @@ -117,14 +117,14 @@ static inline int plp_dma_extToL1_2d(unsigned int loc, dma_ext_t ext, unsigned s /** DMA barrier. * This blocks the core until no transfer is on-going in the DMA. */ -static inline void plp_dma_barrier(unsigned int base); +static inline void plp_cl_dma_barrier(); /** DMA wait. * This blocks the core until the specified transfer is finished. * \param counter The counter ID identifying the transfer. This has been returned from an enqueued transfer (e.g. plp_dma_extToL1_2d) */ -static inline void plp_dma_wait(unsigned int base, unsigned int dma_tx_id); +static inline void plp_cl_dma_wait(unsigned int dma_tx_id); //!@} @@ -147,7 +147,7 @@ static inline void plp_dma_wait(unsigned int base, unsigned int dma_tx_id); \param twod if set, the DMA will execute a 2D transfer. \return The generated configuration */ -static inline unsigned int pulp_idma_get_conf(unsigned int decouple, unsigned int deburst, unsigned int serialize, unsigned int twod); +static inline unsigned int pulp_cl_idma_get_conf(unsigned int decouple, unsigned int deburst, unsigned int serialize, unsigned int twod); /** * iDMA transfer status @@ -155,7 +155,7 @@ static inline unsigned int pulp_idma_get_conf(unsigned int decouple, unsigned in \param dma_tx_id The dma transfer identifier \return transfer status. 1 if complete, 0 if still ongoing or waiting. 
*/ -static inline unsigned int pulp_idma_tx_cplt(unsigned int base, unsigned int dma_tx_id); +static inline unsigned int pulp_cl_idma_tx_cplt(unsigned int dma_tx_id); /** * iDMA memory transfer @@ -166,7 +166,7 @@ static inline unsigned int pulp_idma_tx_cplt(unsigned int base, unsigned int dma \param num_bytes The number bytes \return The dma transfer identifier */ -static inline unsigned int pulp_idma_memcpy(unsigned int base, unsigned int const dst_addr, unsigned int const src_addr, unsigned int num_bytes); +static inline unsigned int pulp_cl_idma_memcpy(unsigned int const dst_addr, unsigned int const src_addr, unsigned int num_bytes); /** * iDMA 2D memory transfer @@ -180,7 +180,7 @@ static inline unsigned int pulp_idma_memcpy(unsigned int base, unsigned int cons \param num_reps The number of repetitions \return The dma transfer identifier */ -static inline unsigned int pulp_idma_memcpy_2d(unsigned int base, unsigned int const dst_addr, unsigned int const src_addr, unsigned int num_bytes, unsigned int dst_stride, unsigned int src_stride, unsigned int num_reps); +static inline unsigned int pulp_cl_idma_memcpy_2d(unsigned int const dst_addr, unsigned int const src_addr, unsigned int num_bytes, unsigned int dst_stride, unsigned int src_stride, unsigned int num_reps); /** @@ -203,13 +203,13 @@ static inline unsigned int pulp_idma_memcpy_2d(unsigned int base, unsigned int c \param num_reps if 2D, the number of repetitions \return The dma trasfer identifier */ -static inline unsigned int pulp_idma_memcpy_advanced(unsigned int base, unsigned int const dst_addr, unsigned int const src_addr, unsigned int num_bytes, unsigned int decouple, unsigned int deburst, unsigned int serialize, unsigned int twod, unsigned int dst_stride, unsigned int src_stride, unsigned int num_reps); +static inline unsigned int pulp_cl_idma_memcpy_advanced(unsigned int const dst_addr, unsigned int const src_addr, unsigned int num_bytes, unsigned int decouple, unsigned int deburst, unsigned int 
serialize, unsigned int twod, unsigned int dst_stride, unsigned int src_stride, unsigned int num_reps); /** Return the DMA status. * \return DMA status. 1 means there are still on-going transfers, 0 means nothing is on-going. */ -static inline unsigned int plp_dma_status(unsigned int base); +static inline unsigned int plp_cl_dma_status(); //!@} @@ -218,36 +218,19 @@ static inline unsigned int plp_dma_status(unsigned int base); /// @cond IMPLEM #if ARCHI_HAS_DMA_DEMUX -#define DMA_ADDR_CL ARCHI_IDMA_DEMUX_ADDR +#define DMA_ADDR ARCHI_IDMA_DEMUX_ADDR #else -#define DMA_ADDR_CL ARCHI_IDMA_EXT_ADDR +#define DMA_ADDR ARCHI_IDMA_EXT_ADDR +#endif +#if defined(__riscv__) && !defined(RV_ISA_RV32) && !defined(__LLVM__) +#define DMA_WRITE(value, offset) __builtin_pulp_OffsetedWrite((value), (int *)DMA_ADDR, (offset)) +#define DMA_READ(offset) __builtin_pulp_OffsetedRead((int *)DMA_ADDR, (offset)) +#else +#define DMA_WRITE(value, offset) pulp_write32(DMA_ADDR + (offset), (value)) +#define DMA_READ(offset) pulp_read32(DMA_ADDR + (offset)) #endif -#define DMA_ADDR_FC ARCHI_SDMA_ADDR - -typedef enum{ - DMA_FC = 0, - DMA_CL = 1 -} dma_e; - -unsigned int inline get_dma_base_addr(dma_e dma_loc) { - switch (dma_loc) { - case DMA_FC: - return DMA_ADDR_FC; - case DMA_CL: - return DMA_ADDR_CL; - } -} - -uint32_t inline pulp_idma_write(unsigned int base, unsigned int value, unsigned offset) { - pulp_write32((base) + (offset), (value)); - } - -uint32_t inline pulp_idma_read(unsigned int base, unsigned int offset) { - pulp_read32((base) + (offset)); - } - -static inline unsigned int pulp_idma_get_conf(unsigned int decouple, unsigned int deburst, unsigned int serialize, unsigned int twod) { +static inline unsigned int pulp_cl_idma_get_conf(unsigned int decouple, unsigned int deburst, unsigned int serialize, unsigned int twod) { unsigned int conf; #if defined(__riscv__) conf = __builtin_bitinsert(0, decouple, 1, IDMA_REG32_2D_FRONTEND_CONF_DECOUPLE_BIT); @@ -260,8 +243,8 @@ static inline 
unsigned int pulp_idma_get_conf(unsigned int decouple, unsigned in return conf; } -static inline unsigned int pulp_idma_tx_cplt(unsigned int base, unsigned int dma_tx_id) { - unsigned int done_id = pulp_idma_read(base, IDMA_REG32_2D_FRONTEND_DONE_REG_OFFSET); +static inline unsigned int pulp_cl_idma_tx_cplt(unsigned int dma_tx_id) { + unsigned int done_id = DMA_READ(IDMA_REG32_2D_FRONTEND_DONE_REG_OFFSET); unsigned int my_id = dma_tx_id & IDMA_ID_MASK; if (done_id >> (IDMA_ID_COUNTER_WIDTH-1) == my_id >> (IDMA_ID_COUNTER_WIDTH-1)) { return my_id <= done_id; @@ -271,110 +254,102 @@ static inline unsigned int pulp_idma_tx_cplt(unsigned int base, unsigned int dma } -static inline unsigned int pulp_idma_memcpy(unsigned int base, unsigned int const dst_addr, unsigned int const src_addr, unsigned int num_bytes) { - pulp_idma_write(base, src_addr, IDMA_REG32_2D_FRONTEND_SRC_ADDR_REG_OFFSET); - pulp_idma_write(base, dst_addr, IDMA_REG32_2D_FRONTEND_DST_ADDR_REG_OFFSET); - pulp_idma_write(base, num_bytes, IDMA_REG32_2D_FRONTEND_NUM_BYTES_REG_OFFSET); - pulp_idma_write(base, IDMA_DEFAULT_CONFIG, IDMA_REG32_2D_FRONTEND_CONF_REG_OFFSET); +static inline unsigned int pulp_cl_idma_memcpy(unsigned int const dst_addr, unsigned int const src_addr, unsigned int num_bytes) { + DMA_WRITE(src_addr, IDMA_REG32_2D_FRONTEND_SRC_ADDR_REG_OFFSET); + DMA_WRITE(dst_addr, IDMA_REG32_2D_FRONTEND_DST_ADDR_REG_OFFSET); + DMA_WRITE(num_bytes, IDMA_REG32_2D_FRONTEND_NUM_BYTES_REG_OFFSET); + DMA_WRITE(IDMA_DEFAULT_CONFIG, IDMA_REG32_2D_FRONTEND_CONF_REG_OFFSET); asm volatile("" : : : "memory"); // Launch TX - unsigned int dma_tx_id = pulp_idma_read(base, IDMA_REG32_2D_FRONTEND_NEXT_ID_REG_OFFSET); + unsigned int dma_tx_id = DMA_READ(IDMA_REG32_2D_FRONTEND_NEXT_ID_REG_OFFSET); return dma_tx_id; } -static inline unsigned int pulp_idma_memcpy_2d(unsigned int base, unsigned int const dst_addr, unsigned int const src_addr, unsigned int num_bytes, unsigned int dst_stride, unsigned int src_stride, unsigned 
int num_reps) { - pulp_idma_write(base, src_addr, IDMA_REG32_2D_FRONTEND_SRC_ADDR_REG_OFFSET); - pulp_idma_write(base, dst_addr, IDMA_REG32_2D_FRONTEND_DST_ADDR_REG_OFFSET); - pulp_idma_write(base, num_bytes, IDMA_REG32_2D_FRONTEND_NUM_BYTES_REG_OFFSET); - pulp_idma_write(base, IDMA_DEFAULT_CONFIG_2D, IDMA_REG32_2D_FRONTEND_CONF_REG_OFFSET); - pulp_idma_write(base, src_stride, IDMA_REG32_2D_FRONTEND_STRIDE_SRC_REG_OFFSET); - pulp_idma_write(base, dst_stride, IDMA_REG32_2D_FRONTEND_STRIDE_DST_REG_OFFSET); - pulp_idma_write(base, num_reps, IDMA_REG32_2D_FRONTEND_NUM_REPETITIONS_REG_OFFSET); +static inline unsigned int pulp_cl_idma_memcpy_2d(unsigned int const dst_addr, unsigned int const src_addr, unsigned int num_bytes, unsigned int dst_stride, unsigned int src_stride, unsigned int num_reps) { + DMA_WRITE(src_addr, IDMA_REG32_2D_FRONTEND_SRC_ADDR_REG_OFFSET); + DMA_WRITE(dst_addr, IDMA_REG32_2D_FRONTEND_DST_ADDR_REG_OFFSET); + DMA_WRITE(num_bytes, IDMA_REG32_2D_FRONTEND_NUM_BYTES_REG_OFFSET); + DMA_WRITE(IDMA_DEFAULT_CONFIG_2D, IDMA_REG32_2D_FRONTEND_CONF_REG_OFFSET); + DMA_WRITE(src_stride, IDMA_REG32_2D_FRONTEND_STRIDE_SRC_REG_OFFSET); + DMA_WRITE(dst_stride, IDMA_REG32_2D_FRONTEND_STRIDE_DST_REG_OFFSET); + DMA_WRITE(num_reps, IDMA_REG32_2D_FRONTEND_NUM_REPETITIONS_REG_OFFSET); asm volatile("" : : : "memory"); // Launch TX - unsigned int dma_tx_id = pulp_idma_read(base, IDMA_REG32_2D_FRONTEND_NEXT_ID_REG_OFFSET); + unsigned int dma_tx_id = DMA_READ(IDMA_REG32_2D_FRONTEND_NEXT_ID_REG_OFFSET); return dma_tx_id; } -static inline unsigned int pulp_idma_memcpy_advanced(unsigned int base, unsigned int const dst_addr, unsigned int const src_addr, unsigned int num_bytes, unsigned int decouple, unsigned int deburst, unsigned int serialize, unsigned int twod, unsigned int dst_stride, unsigned int src_stride, unsigned int num_reps) { - pulp_idma_write(base, src_addr, IDMA_REG32_2D_FRONTEND_SRC_ADDR_REG_OFFSET); - pulp_idma_write(base, dst_addr, 
IDMA_REG32_2D_FRONTEND_DST_ADDR_REG_OFFSET); - pulp_idma_write(base, num_bytes, IDMA_REG32_2D_FRONTEND_NUM_BYTES_REG_OFFSET); - unsigned int conf = pulp_idma_get_conf(decouple, deburst, serialize, twod); - pulp_idma_write(base, conf, IDMA_REG32_2D_FRONTEND_CONF_REG_OFFSET); +static inline unsigned int pulp_cl_idma_memcpy_advanced(unsigned int const dst_addr, unsigned int const src_addr, unsigned int num_bytes, unsigned int decouple, unsigned int deburst, unsigned int serialize, unsigned int twod, unsigned int dst_stride, unsigned int src_stride, unsigned int num_reps) { + DMA_WRITE(src_addr, IDMA_REG32_2D_FRONTEND_SRC_ADDR_REG_OFFSET); + DMA_WRITE(dst_addr, IDMA_REG32_2D_FRONTEND_DST_ADDR_REG_OFFSET); + DMA_WRITE(num_bytes, IDMA_REG32_2D_FRONTEND_NUM_BYTES_REG_OFFSET); + unsigned int conf = pulp_cl_idma_get_conf(decouple, deburst, serialize, twod); + DMA_WRITE(conf, IDMA_REG32_2D_FRONTEND_CONF_REG_OFFSET); if (twod) { - pulp_idma_write(base, src_stride, IDMA_REG32_2D_FRONTEND_STRIDE_SRC_REG_OFFSET); - pulp_idma_write(base, dst_stride, IDMA_REG32_2D_FRONTEND_STRIDE_DST_REG_OFFSET); - pulp_idma_write(base, num_reps, IDMA_REG32_2D_FRONTEND_NUM_REPETITIONS_REG_OFFSET); + DMA_WRITE(src_stride, IDMA_REG32_2D_FRONTEND_STRIDE_SRC_REG_OFFSET); + DMA_WRITE(dst_stride, IDMA_REG32_2D_FRONTEND_STRIDE_DST_REG_OFFSET); + DMA_WRITE(num_reps, IDMA_REG32_2D_FRONTEND_NUM_REPETITIONS_REG_OFFSET); } asm volatile("" : : : "memory"); // Launch TX - unsigned int dma_tx_id = pulp_idma_read(base, IDMA_REG32_2D_FRONTEND_NEXT_ID_REG_OFFSET); + unsigned int dma_tx_id = DMA_READ(IDMA_REG32_2D_FRONTEND_NEXT_ID_REG_OFFSET); return dma_tx_id; } -static inline unsigned int plp_dma_status(unsigned int base) { - return pulp_idma_read(base, IDMA_REG32_2D_FRONTEND_STATUS_REG_OFFSET); +static inline unsigned int plp_cl_dma_status() { + return DMA_READ(IDMA_REG32_2D_FRONTEND_STATUS_REG_OFFSET); } -static inline void plp_dma_wait(unsigned int base, unsigned int dma_tx_id) { - 
while(!pulp_idma_tx_cplt(base, dma_tx_id)) { +static inline void plp_cl_dma_wait(unsigned int dma_tx_id) { + while(!pulp_cl_idma_tx_cplt(dma_tx_id)) { eu_evt_maskWaitAndClr(1 << IDMA_EVENT); } return; } -static inline int plp_dma_memcpy(unsigned int base, dma_ext_t ext, unsigned int loc, unsigned int size, int ext2loc) { +static inline int plp_cl_dma_memcpy(dma_ext_t ext, unsigned int loc, unsigned int size, int ext2loc) { if (ext2loc) { - return pulp_idma_memcpy(base, loc, ext, size); + return pulp_cl_idma_memcpy(loc, ext, size); } else { - return pulp_idma_memcpy(base, ext, loc, size); + return pulp_cl_idma_memcpy(ext, loc, size); } } -static inline int plp_dma_l1ToExt(dma_ext_t ext, unsigned int loc, unsigned short size) { - dma_e dma_loc = DMA_CL; - unsigned int base = get_dma_base_addr(dma_loc); - return pulp_idma_memcpy(base, ext, loc, size); +static inline int plp_cl_dma_l1ToExt(dma_ext_t ext, unsigned int loc, unsigned short size) { + return pulp_cl_idma_memcpy(ext, loc, size); } -static inline int plp_dma_extToL1(unsigned int loc, dma_ext_t ext, unsigned short size) { - dma_e dma_loc = DMA_CL; - unsigned int base = get_dma_base_addr(dma_loc); - return pulp_idma_memcpy(base, loc, ext, size); +static inline int plp_cl_dma_extToL1(unsigned int loc, dma_ext_t ext, unsigned short size) { + return pulp_cl_idma_memcpy(loc, ext, size); } -static inline int plp_dma_memcpy_2d(unsigned int base, dma_ext_t ext, unsigned int loc, unsigned int size, unsigned int stride, unsigned int length, int ext2loc) { +static inline int plp_cl_dma_memcpy_2d(dma_ext_t ext, unsigned int loc, unsigned int size, unsigned int stride, unsigned int length, int ext2loc) { if (ext2loc) { - return pulp_idma_memcpy_2d(base, loc, ext, length, length, stride, size/length); + return pulp_cl_idma_memcpy_2d(loc, ext, length, length, stride, size/length); } else { - return pulp_idma_memcpy_2d(base, ext, loc, length, stride, length, size/length); + return pulp_cl_idma_memcpy_2d(ext, loc, length, 
stride, length, size/length); } } -static inline int plp_dma_l1ToExt_2d(dma_ext_t ext, unsigned int loc, unsigned short size, unsigned short stride, unsigned short length) { - dma_e dma_loc = DMA_CL; - unsigned int base = get_dma_base_addr(dma_loc); - return pulp_idma_memcpy_2d(base, ext, loc, length, stride, length, size/length); +static inline int plp_cl_dma_l1ToExt_2d(dma_ext_t ext, unsigned int loc, unsigned short size, unsigned short stride, unsigned short length) { + return pulp_cl_idma_memcpy_2d(ext, loc, length, stride, length, size/length); } -static inline int plp_dma_extToL1_2d(unsigned int loc, dma_ext_t ext, unsigned short size, unsigned short stride, unsigned short length) { - dma_e dma_loc = DMA_CL; - unsigned int base = get_dma_base_addr(dma_loc); - return pulp_idma_memcpy_2d(base, loc, ext, length, length, stride, size/length); +static inline int plp_cl_dma_extToL1_2d(unsigned int loc, dma_ext_t ext, unsigned short size, unsigned short stride, unsigned short length) { + return pulp_cl_idma_memcpy_2d(loc, ext, length, length, stride, size/length); } -static inline void plp_dma_barrier(unsigned int base) { - while(plp_dma_status(base)) { +static inline void plp_cl_dma_barrier() { + while(plp_cl_dma_status()) { eu_evt_maskWaitAndClr(1 << IDMA_EVENT); } } -#endif // __HAL_IDMA_V1_H__ +#endif // __HAL_CL_IDMA_V1_H__ diff --git a/include/hal/dma/idma_v1_fc.h b/include/hal/dma/idma_v1_fc.h new file mode 100644 index 0000000..98b3ba1 --- /dev/null +++ b/include/hal/dma/idma_v1_fc.h @@ -0,0 +1,351 @@ +/* + * Copyright (C) 2021 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __HAL_FC_IDMA_V1_H__ +#define __HAL_FC_IDMA_V1_H__ + +#include +#include "hal/pulp.h" + +#define PLP_DMA_LOC2EXT 0 +#define PLP_DMA_EXT2LOC 1 + +#define PLP_DMA_1D 0 +#define PLP_DMA_2D 1 + +#define IDMA_EVENT 8 // all iDMA tx_cplt events are broadcast +#define IDMA_ID_COUNTER_WIDTH 28 +#define IDMA_ID_MASK 0x0fffffff + +#define IDMA_DEFAULT_CONFIG 0x0 +#define IDMA_DEFAULT_CONFIG_2D 0x8 + +typedef unsigned int dma_ext_t; +typedef unsigned int dma_loc_t; + +/** @name High-level DMA memory copy functions + * The following functions can be used to trigger DMA transfers to copy data between the cluster memory (L1) and another memory outside the cluster (another cluster L1 or L2). + * The DMA supports the following features: + * - Transfers are event-based. With event-based transfers the core can call a wait function to block execution until the transfer is done. + * - The DMA supports 2D transfers which allows transfering a 2D tile in one command. Additional information must then be given to specify the width of the tile and the number of bytes between 2 lines of the tile. + * - The event sent at the end of the transfer is broadcasted to all cluster cores. + * - To identify specific transfers, the DMA provides a transfer identifier. + * - Multiple transfers can be launched simultaneously, with them being executed 2-4 in parallel, with more waiting in a queue. + */ +/**@{*/ + +/** Memory transfer with event-based completion. + * + \param ext Address in the external memory where to access the data. 
There is no restriction on memory alignment. + \param loc Address in the cluster memory where to access the data. There is no restriction on memory alignment. + \param size Number of bytes to be transfered. The only restriction is that this size must fit 16 bits, i.e. must be inferior to 65536. + \param ext2loc If 1, the transfer is loading data from external memory and storing to cluster memory. If 0, it is the contrary + \return The identifier of the transfer. This can be used with plp_dma_wait to wait for the completion of this transfer. + */ +static inline int plp_fc_dma_memcpy(dma_ext_t ext, unsigned int loc, unsigned int size, int ext2loc); + +/** Cluster memory to external memory transfer with event-based completion. + * + \param ext Address in the external memory where to store the data. There is no restriction on memory alignment. + \param loc Address in the cluster memory where to load the data. There is no restriction on memory alignment. + \param size Number of bytes to be transfered. The only restriction is that this size must fit 16 bits, i.e. must be inferior to 65536. + \return The identifier of the transfer. This can be used with plp_dma_wait to wait for the completion of this transfer. + */ +static inline int plp_fc_dma_l1ToExt(dma_ext_t ext, unsigned int loc, unsigned short size); + +/** External memory to cluster memory transfer with event-based completion. + * + \param loc Address in the cluster memory where to store the data. There is no restriction on memory alignment. + \param ext Address in the external memory where to load the data. There is no restriction on memory alignment. + \param size Number of bytes to be transfered. The only restriction is that this size must fit 16 bits, i.e. must be inferior to 65536. + \return The identifier of the transfer. This can be used with plp_dma_wait to wait for the completion of this transfer. 
+ */ +static inline int plp_fc_dma_extToL1(unsigned int loc, dma_ext_t ext, unsigned short size); + +/** 2-dimensional memory transfer with event-based completion. + * + \param ext Address in the external memory where to access the data. There is no restriction on memory alignment. + \param loc Address in the cluster memory where to access the data. There is no restriction on memory alignment. + \param size Number of bytes to be transfered. The only restriction is that this size must fit 16 bits, i.e. must be inferior to 65536. + \param stride 2D stride, which is the number of bytes which are added to the beginning of the current line to switch to the next one. Must fit 16 bits, i.e. must be inferior to 65536. + \param length 2D length, which is the number of transfered bytes after which the DMA will switch to the next line. Must fit 16 bits, i.e. must be inferior to 65536. + \param ext2loc If 1, the transfer is loading data from external memory and storing to cluster memory. If 0, it is the contrary + \return The identifier of the transfer. This can be used with plp_dma_wait to wait for the completion of this transfer. + */ +static inline int plp_fc_dma_memcpy_2d(dma_ext_t ext, unsigned int loc, unsigned int size, unsigned int stride, unsigned int length, int ext2loc); + +/** Cluster memory to external memory 2-dimensional transfer with event-based completion. + * + \param ext Address in the external memory where to store the data. There is no restriction on memory alignment. + \param loc Address in the cluster memory where to load the data. There is no restriction on memory alignment. + \param size Number of bytes to be transfered. The only restriction is that this size must fit 16 bits, i.e. must be inferior to 65536. + \param stride 2D stride, which is the number of bytes which are added to the beginning of the current line to switch to the next one. Must fit 16 bits, i.e. must be inferior to 65536. This applies only to the external memory. 
+ \param length 2D length, which is the number of transfered bytes after which the DMA will switch to the next line. Must fit 16 bits, i.e. must be inferior to 65536. This applies only to the external memory. + \return The identifier of the transfer. This can be used with plp_dma_wait to wait for the completion of this transfer. + */ +static inline int plp_fc_dma_l1ToExt_2d(dma_ext_t ext, unsigned int loc, unsigned short size, unsigned short stride, unsigned short length); + +/** External memory to cluster memory 2-dimensional transfer with event-based completion. + * + \param loc Address in the cluster memory where to store the data. There is no restriction on memory alignment. + \param ext Address in the external memory where to load the data. There is no restriction on memory alignment. + \param size Number of bytes to be transfered. The only restriction is that this size must fit 16 bits, i.e. must be inferior to 65536. + \param stride 2D stride, which is the number of bytes which are added to the beginning of the current line to switch to the next one. Must fit 16 bits, i.e. must be inferior to 65536. This applies only to the external memory. + \param length 2D length, which is the number of transfered bytes after which the DMA will switch to the next line. Must fit 16 bits, i.e. must be inferior to 65536. This applies only to the external memory. + \return The identifier of the transfer. This can be used with plp_dma_wait to wait for the completion of this transfer + */ +static inline int plp_fc_dma_extToL1_2d(unsigned int loc, dma_ext_t ext, unsigned short size, unsigned short stride, unsigned short length); + +//!@} + +/** @name DMA wait functions + */ + +/** DMA barrier. + * This blocks the core until no transfer is on-going in the DMA. + */ +static inline void plp_fc_dma_barrier(); + +/** DMA wait. + * This blocks the core until the specified transfer is finished. + * + \param counter The counter ID identifying the transfer. 
This has been returned from an enqueued transfer (e.g. plp_dma_extToL1_2d) + */ +static inline void plp_fc_dma_wait(unsigned int dma_tx_id); + +//!@} + + +/** @name iDMA low-level functions. + * This can be used instead of the high-level ones in order to have more control over the DMA features. + */ + +/** + * iDMA configuration generation + * A standard memcpy will set all of these values to 0. + * + \param decouple if set to true, there is no longer exactly one AXI write_request issued for + every read request. This mode can improve performance of unaligned transfers when crossing + the AXI page boundaries. + \param deburst if set, the DMA will split all bursts in single transfers + \param serialize if set, the DMA will only send AX belonging to a given Arbitrary 1D burst request + at a time. This is default behavior to prevent deadlocks. Setting `serialize` to + zero violates the AXI4+ATOP specification. + \param twod if set, the DMA will execute a 2D transfer. + \return The generated configuration + */ +static inline unsigned int pulp_fc_idma_get_conf(unsigned int decouple, unsigned int deburst, unsigned int serialize, unsigned int twod); + +/** + * iDMA transfer status + * + \param dma_tx_id The dma transfer identifier + \return transfer status. 1 if complete, 0 if still ongoing or waiting. 
+ */
+static inline unsigned int pulp_fc_idma_tx_cplt(unsigned int dma_tx_id);
+
+/**
+ * iDMA memory transfer
+ * Launches a standard 1D memory transfer
+ *
+  \param   dst_addr   The destination address
+  \param   src_addr   The source address
+  \param   num_bytes  The number of bytes
+  \return             The dma transfer identifier
+ */
+static inline unsigned int pulp_fc_idma_memcpy(unsigned int const dst_addr, unsigned int const src_addr, unsigned int num_bytes);
+
+/**
+ * iDMA 2D memory transfer
+ * Launches a standard 2D memory transfer
+ *
+  \param   dst_addr   The destination address
+  \param   src_addr   The source address
+  \param   num_bytes  The number of bytes (per stride)
+  \param   dst_stride The stride at the destination
+  \param   src_stride The stride at the source
+  \param   num_reps   The number of repetitions
+  \return             The dma transfer identifier
+ */
+static inline unsigned int pulp_fc_idma_memcpy_2d(unsigned int const dst_addr, unsigned int const src_addr, unsigned int num_bytes, unsigned int dst_stride, unsigned int src_stride, unsigned int num_reps);
+
+
+/**
+ * iDMA advanced memory transfer
+ * Launches a memory transfer (1D, or 2D when `twod` is set) with special configuration options
+ *
+  \param   dst_addr   The destination address
+  \param   src_addr   The source address
+  \param   num_bytes  The number of bytes
+  \param   decouple   if set to true, there is no longer exactly one AXI write_request issued for
+                      every read request. This mode can improve performance of unaligned transfers when crossing
+                      the AXI page boundaries.
+  \param   deburst    if set, the DMA will split all bursts in single transfers
+  \param   serialize  if set, the DMA will only send AX belonging to a given Arbitrary 1D burst request
+                      at a time. This is default behavior to prevent deadlocks. Setting `serialize` to
+                      zero violates the AXI4+ATOP specification.
+  \param   twod       if set, the DMA will execute a 2D transfer
+  \param   dst_stride if 2D, the stride at the destination
+  \param   src_stride if 2D, the stride at the source
+  \param   num_reps   if 2D, the number of repetitions
+  \return             The dma transfer identifier
+ */
+static inline unsigned int pulp_fc_idma_memcpy_advanced(unsigned int const dst_addr, unsigned int const src_addr, unsigned int num_bytes, unsigned int decouple, unsigned int deburst, unsigned int serialize, unsigned int twod, unsigned int dst_stride, unsigned int src_stride, unsigned int num_reps);
+
+/** Return the DMA status.
+ *
+  \return             DMA status. 1 means there are still on-going transfers, 0 means nothing is on-going.
+  */
+static inline unsigned int plp_fc_dma_status(void);
+
+
+//!@}
+
+
+/// @cond IMPLEM
+
+// Register access helpers: every access is an offset from the ARCHI_SDMA_ADDR base.
+// NOTE(review): pulp.h includes idma_v1_cl.h together with this header; if that
+// header also defines DMA_ADDR / DMA_WRITE / DMA_READ these generic names will
+// clash -- confirm and consider FC-specific names.
+#define DMA_ADDR ARCHI_SDMA_ADDR
+#if defined(__riscv__) && !defined(RV_ISA_RV32) && !defined(__LLVM__)
+#define DMA_WRITE(value, offset) __builtin_pulp_OffsetedWrite((value), (int *)DMA_ADDR, (offset))
+#define DMA_READ(offset) __builtin_pulp_OffsetedRead((int *)DMA_ADDR, (offset))
+#else
+#define DMA_WRITE(value, offset) pulp_write32(DMA_ADDR + (offset), (value))
+#define DMA_READ(offset) pulp_read32(DMA_ADDR + (offset))
+#endif
+
+/**
+ * Pack the four single-bit transfer options into a value for the CONF register.
+ * Only bit 0 of each argument is used.
+ */
+static inline unsigned int pulp_fc_idma_get_conf(unsigned int decouple, unsigned int deburst, unsigned int serialize, unsigned int twod) {
+  unsigned int conf;
+#if defined(__riscv__)
+  conf = __builtin_bitinsert(0,    decouple,  1, IDMA_REG32_2D_FRONTEND_CONF_DECOUPLE_BIT);
+  conf = __builtin_bitinsert(conf, deburst,   1, IDMA_REG32_2D_FRONTEND_CONF_DEBURST_BIT);
+  conf = __builtin_bitinsert(conf, serialize, 1, IDMA_REG32_2D_FRONTEND_CONF_SERIALIZE_BIT);
+  conf = __builtin_bitinsert(conf, twod,      1, IDMA_REG32_2D_FRONTEND_CONF_TWOD_BIT);
+#else
+  // NOTE(review): this statement was truncated after "(decouple & 0x1)<" in the
+  // mangled patch text; the remainder is reconstructed to mirror the bit
+  // positions used in the __riscv__ branch above -- confirm against the original.
+  conf = (((decouple & 0x1)<<IDMA_REG32_2D_FRONTEND_CONF_DECOUPLE_BIT) | ((deburst & 0x1)<<IDMA_REG32_2D_FRONTEND_CONF_DEBURST_BIT) | ((serialize & 0x1)<<IDMA_REG32_2D_FRONTEND_CONF_SERIALIZE_BIT) | ((twod & 0x1)<<IDMA_REG32_2D_FRONTEND_CONF_TWOD_BIT));
+#endif
+  return conf;
+}
+
+/**
+ * Check whether the transfer identified by dma_tx_id has completed.
+ * The identifier is compared against the DMA's done-ID counter; the counter
+ * is IDMA_ID_COUNTER_WIDTH bits wide, so the comparison has to cope with
+ * wrap-around.
+ *
+  \param   dma_tx_id  The dma transfer identifier
+  \return             Non-zero if the transfer is complete, 0 otherwise
+ */
+static inline unsigned int pulp_fc_idma_tx_cplt(unsigned int dma_tx_id) {
+  // NOTE(review): the two declarations below were lost in the mangled patch
+  // text and are reconstructed from how done_id / my_id are used further
+  // down -- confirm the register offset name against the iDMA register header.
+  unsigned int done_id = DMA_READ(IDMA_REG32_2D_FRONTEND_DONE_REG_OFFSET);
+  unsigned int my_id = dma_tx_id & IDMA_ID_MASK;
+  if (done_id >> (IDMA_ID_COUNTER_WIDTH-1) == my_id >> (IDMA_ID_COUNTER_WIDTH-1)) {
+    // Same half of the counter range: plain ordering decides.
+    return my_id <= done_id;
+  } else {
+    // Different halves: one of the two counters has wrapped. Strip the top
+    // counter bit from done_id and test whether it sits in the lower quarter.
+    // The mask must be parenthesized together with done_id: '<' binds tighter
+    // than '&', and without the parentheses the expression degenerates to
+    // done_id & (constant < constant).
+    return ((done_id & (IDMA_ID_MASK - (1<<(IDMA_ID_COUNTER_WIDTH-1)))) < (1<<(IDMA_ID_COUNTER_WIDTH-2)));
+  }
+}
+
+
+/* 1D transfer: program source, destination, length and the default
+ * configuration, then read NEXT_ID to launch and obtain the transfer id. */
+static inline unsigned int pulp_fc_idma_memcpy(unsigned int const dst_addr, unsigned int const src_addr, unsigned int num_bytes) {
+  DMA_WRITE(src_addr, IDMA_REG32_2D_FRONTEND_SRC_ADDR_REG_OFFSET);
+  DMA_WRITE(dst_addr, IDMA_REG32_2D_FRONTEND_DST_ADDR_REG_OFFSET);
+  DMA_WRITE(num_bytes, IDMA_REG32_2D_FRONTEND_NUM_BYTES_REG_OFFSET);
+  DMA_WRITE(IDMA_DEFAULT_CONFIG, IDMA_REG32_2D_FRONTEND_CONF_REG_OFFSET);
+  // Compiler barrier: keep the configuration writes ahead of the launching read.
+  asm volatile("" : : : "memory");
+
+  // Launch TX
+  unsigned int dma_tx_id = DMA_READ(IDMA_REG32_2D_FRONTEND_NEXT_ID_REG_OFFSET);
+
+  return dma_tx_id;
+}
+
+/* 2D transfer: as the 1D variant, plus both strides and the repetition
+ * count, using the default 2D configuration. */
+static inline unsigned int pulp_fc_idma_memcpy_2d(unsigned int const dst_addr, unsigned int const src_addr, unsigned int num_bytes, unsigned int dst_stride, unsigned int src_stride, unsigned int num_reps) {
+  DMA_WRITE(src_addr, IDMA_REG32_2D_FRONTEND_SRC_ADDR_REG_OFFSET);
+  DMA_WRITE(dst_addr, IDMA_REG32_2D_FRONTEND_DST_ADDR_REG_OFFSET);
+  DMA_WRITE(num_bytes, IDMA_REG32_2D_FRONTEND_NUM_BYTES_REG_OFFSET);
+  DMA_WRITE(IDMA_DEFAULT_CONFIG_2D, IDMA_REG32_2D_FRONTEND_CONF_REG_OFFSET);
+  DMA_WRITE(src_stride, IDMA_REG32_2D_FRONTEND_STRIDE_SRC_REG_OFFSET);
+  DMA_WRITE(dst_stride, IDMA_REG32_2D_FRONTEND_STRIDE_DST_REG_OFFSET);
+  DMA_WRITE(num_reps, IDMA_REG32_2D_FRONTEND_NUM_REPETITIONS_REG_OFFSET);
+  // Compiler barrier: keep the configuration writes ahead of the launching read.
+  asm volatile("" : : : "memory");
+
+  // Launch TX
+  unsigned int dma_tx_id = DMA_READ(IDMA_REG32_2D_FRONTEND_NEXT_ID_REG_OFFSET);
+
+  return dma_tx_id;
+}
+
+
+/* Transfer with caller-selected configuration bits. The stride and
+ * repetition registers are only written when twod is non-zero. */
+static inline unsigned int pulp_fc_idma_memcpy_advanced(unsigned int const dst_addr, unsigned int const src_addr, unsigned int num_bytes, unsigned int decouple, unsigned int deburst, unsigned int serialize, unsigned int twod, unsigned int dst_stride, unsigned int src_stride, unsigned int num_reps) {
+  DMA_WRITE(src_addr, IDMA_REG32_2D_FRONTEND_SRC_ADDR_REG_OFFSET);
+  DMA_WRITE(dst_addr, IDMA_REG32_2D_FRONTEND_DST_ADDR_REG_OFFSET);
+  DMA_WRITE(num_bytes, IDMA_REG32_2D_FRONTEND_NUM_BYTES_REG_OFFSET);
+  unsigned int conf = pulp_fc_idma_get_conf(decouple, deburst, serialize, twod);
+  DMA_WRITE(conf, IDMA_REG32_2D_FRONTEND_CONF_REG_OFFSET);
+  if (twod) {
+    DMA_WRITE(src_stride, IDMA_REG32_2D_FRONTEND_STRIDE_SRC_REG_OFFSET);
+    DMA_WRITE(dst_stride, IDMA_REG32_2D_FRONTEND_STRIDE_DST_REG_OFFSET);
+    DMA_WRITE(num_reps, IDMA_REG32_2D_FRONTEND_NUM_REPETITIONS_REG_OFFSET);
+  }
+  // Compiler barrier: keep the configuration writes ahead of the launching read.
+  asm volatile("" : : : "memory");
+
+  // Launch TX
+  unsigned int dma_tx_id = DMA_READ(IDMA_REG32_2D_FRONTEND_NEXT_ID_REG_OFFSET);
+
+  return dma_tx_id;
+}
+
+/* Raw read of the STATUS register. */
+static inline unsigned int plp_fc_dma_status(void) {
+  return DMA_READ(IDMA_REG32_2D_FRONTEND_STATUS_REG_OFFSET);
+}
+
+/* Block until transfer dma_tx_id is reported complete, waiting on the
+ * IDMA_EVENT event between polls. */
+static inline void plp_fc_dma_wait(unsigned int dma_tx_id) {
+  while(!pulp_fc_idma_tx_cplt(dma_tx_id)) {
+    eu_evt_maskWaitAndClr(1 << IDMA_EVENT);
+  }
+  return;
+}
+
+/* 1D copy of size bytes; ext2loc != 0 copies ext -> loc, otherwise loc -> ext. */
+static inline int plp_fc_dma_memcpy(dma_ext_t ext, unsigned int loc, unsigned int size, int ext2loc) {
+  if (ext2loc) {
+    return pulp_fc_idma_memcpy(loc, ext, size);
+  } else {
+    return pulp_fc_idma_memcpy(ext, loc, size);
+  }
+}
+
+/* 1D copy of size bytes from loc to ext. */
+static inline int plp_fc_dma_l1ToExt(dma_ext_t ext, unsigned int loc, unsigned short size) {
+  return pulp_fc_idma_memcpy(ext, loc, size);
+}
+
+/* 1D copy of size bytes from ext to loc. */
+static inline int plp_fc_dma_extToL1(unsigned int loc, dma_ext_t ext, unsigned short size) {
+  return pulp_fc_idma_memcpy(loc, ext, size);
+}
+
+/* 2D copy: size/length rows of length bytes each. The stride argument is
+ * applied on the ext side; the loc side advances by length per row.
+ * length must be non-zero (it is used as a divisor). */
+static inline int plp_fc_dma_memcpy_2d(dma_ext_t ext, unsigned int loc, unsigned int size, unsigned int stride, unsigned int length, int ext2loc) {
+  if (ext2loc) {
+    return pulp_fc_idma_memcpy_2d(loc, ext, length, length, stride, size/length);
+  } else {
+    return pulp_fc_idma_memcpy_2d(ext, loc, length, stride, length, size/length);
+  }
+}
+
+/* 2D copy from loc to ext; see plp_fc_dma_memcpy_2d. length must be non-zero. */
+static inline int plp_fc_dma_l1ToExt_2d(dma_ext_t ext, unsigned int loc, unsigned short size, unsigned short stride, unsigned short length) {
+  return pulp_fc_idma_memcpy_2d(ext, loc, length, stride, length, size/length);
+}
+
+/* 2D copy from ext to loc; see plp_fc_dma_memcpy_2d. length must be non-zero. */
+static inline int plp_fc_dma_extToL1_2d(unsigned int loc, dma_ext_t ext, unsigned short size, unsigned short stride, unsigned short length) {
+  return pulp_fc_idma_memcpy_2d(loc, ext, length, length, stride, size/length);
+}
+
+/* Block until the STATUS register reads zero, waiting on the IDMA_EVENT
+ * event between polls. */
+static inline void plp_fc_dma_barrier(void) {
+  while(plp_fc_dma_status()) {
+    eu_evt_maskWaitAndClr(1 << IDMA_EVENT);
+  }
+}
+
+/// @endcond
+
+#endif // __HAL_FC_IDMA_V1_H__