target: Add pulpissimo with cv32e40p

2026-05-14 20:48:09 +00:00 · 2021-05-18 19:12:40 +02:00 · 2021-05-18 19:12:40 +02:00 · 9eba7de59a
commit 9eba7de59a
parent 9bcaac54a6
8 changed files with 721 additions and 8 deletions
--- a/configs/pulpissimo_cv32e40p.sh
+++ b/configs/pulpissimo_cv32e40p.sh
@ -0,0 +1,16 @@
+#!/bin/bash -e
+
+# Environment for building/running on PULPissimo with the CV32E40P core.
+# USE_CV32E40P selects the cv32e40p branch in rules/pulpos/targets/pulpissimo.mk.
+export PULPRT_TARGET=pulpissimo
+export PULPRUN_TARGET=pulpissimo
+export USE_CV32E40P=1
+
+# Locate the directory holding this script. zsh and bash expose the path of
+# the file being read through different mechanisms.
+if [ -n "${ZSH_VERSION:-}" ]; then
+    DIR="$(readlink -f -- "${(%):-%x}")"
+    scriptDir="$(dirname "$DIR")"
+else
+    scriptDir="$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")"
+fi
+
+# Quoted so that a checkout path containing spaces still works.
+source "$scriptDir/common.sh"
--- a/include/archi/chips/pulpissimo/pulp.h
+++ b/include/archi/chips/pulpissimo/pulp.h
@ -26,7 +26,9 @@
 #include "archi/riscv/priv_1_10.h"
 #ifdef __ibex__
 #include "archi/ibex/mhpm.h"
-#else // __ibex__
+#elif defined(__cv32e40p__)
+#include "archi/cv32e40p/cv32e40p.h"
+#else
 #include "archi/riscv/pcer_v2.h"
 #endif // __ibex__

@ -40,4 +42,4 @@
 #include "archi/udma/uart/udma_uart_v1.h"
 #include "archi/udma/udma_v3.h"

-#endif
+#endif
--- a/include/archi/cv32e40p/cv32e40p.h
+++ b/include/archi/cv32e40p/cv32e40p.h
@ -0,0 +1,64 @@
+/*
+ * Copyright (C) 2018 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/*
+ * Performance-counter event IDs.
+ *
+ * Each ID is a bit position: CSR_PCER_EVENT_MASK() below turns an ID into the
+ * mask bit that selects the event, and several masks can be ORed together.
+ * NOTE(review): the descriptions of events 11-15 were rewritten from the event
+ * names; confirm the exact wording against the CV32E40P user manual.
+ */
+#define CSR_PCER_CYCLES        0  /* Count the number of cycles the core was running */
+#define CSR_PCER_INSTR         1  /* Count the number of instructions executed */
+#define CSR_PCER_LD_STALL      2  /* Number of load use hazards */
+#define CSR_PCER_JMP_STALL     3  /* Number of jump register hazards */
+#define CSR_PCER_IMISS         4  /* Cycles waiting for instruction fetches. i.e. the number of instructions wasted due to non-ideal caches */
+#define CSR_PCER_LD            5  /* Number of memory loads executed. Misaligned accesses are counted twice */
+#define CSR_PCER_ST            6  /* Number of memory stores executed. Misaligned accesses are counted twice */
+#define CSR_PCER_JUMP          7  /* Number of jump instructions seen, i.e. j, jr, jal, jalr */
+#define CSR_PCER_BRANCH        8  /* Number of conditional branch instructions seen */
+#define CSR_PCER_TAKEN_BRANCH  9  /* Number of taken conditional branch instructions seen */
+#define CSR_PCER_COMP_INSTR   10  /* Number of compressed instructions */
+#define CSR_PCER_PIPE_STALL   11  /* Cycles lost to pipeline stalls -- TODO confirm */
+#define CSR_PCER_APU_TYPE     12  /* APU type conflicts -- TODO confirm */
+#define CSR_PCER_APU_CONT     13  /* APU contentions -- TODO confirm */
+#define CSR_PCER_APU_DEP      14  /* APU dependency stalls -- TODO confirm */
+#define CSR_PCER_APU_WB       15  /* APU write-backs -- TODO confirm */
+
+
+// Gives from the event ID, the HW mask that can be stored (with an OR with other events mask) to the PCER
+#define CSR_PCER_EVENT_MASK(eventId)  (1<<(eventId))
+// Mask with every bit set; only event IDs 0-15 are named above
+#define CSR_PCER_ALL_EVENTS_MASK  0xffffffff
+
+#define CSR_PCMR_ACTIVE           0x1 /* Activate counting */
+#define CSR_PCMR_SATURATE         0x2 /* Activate saturation */
+
+/*
+ * Printable name of a performance-counter event ID; "NA" for IDs outside 0-15.
+ * The argument is parenthesized at every use so that expressions such as
+ * CSR_PCER_NAME(a | b) compare the whole expression. It is evaluated several
+ * times, so it must not have side effects.
+ */
+#define CSR_PCER_NAME(id) \
+    ( (id) == 0 ? "Cycles" : \
+      (id) == 1 ? "Instructions" : \
+      (id) == 2 ? "LD_Stall" : \
+      (id) == 3 ? "Jmp_Stall" : \
+      (id) == 4 ? "IMISS" : \
+      (id) == 5 ? "LD" : \
+      (id) == 6 ? "ST" : \
+      (id) == 7 ? "JUMP" : \
+      (id) == 8 ? "BRANCH" : \
+      (id) == 9 ? "TAKEN_BRANCH" : \
+      (id) == 10 ? "COMP_INSTR" : \
+      (id) == 11 ? "PIPE_STALL" : \
+      (id) == 12 ? "APU_TYPE" : \
+      (id) == 13 ? "APU_CONT" : \
+      (id) == 14 ? "APU_DEP" : \
+      (id) == 15 ? "APU_WB" : \
+      "NA")
--- a/include/bench/bench.h
+++ b/include/bench/bench.h
@ -145,6 +145,8 @@ static inline void perf_start(void) {
  cpu_perf_conf(CSR_PCMR_ACTIVE | CSR_PCMR_SATURATE);
 #elif defined(__ibex__)
  cpu_perf_start();
+#elif defined(__cv32e40p__)
+  cpu_perf_start();
 #else
  cpu_perf_conf_events(SPR_PCER_ALL_EVENTS_MASK);
  cpu_perf_conf(SPR_PCMR_ACTIVE | SPR_PCMR_SATURATE);
@ -194,6 +196,8 @@ static inline void perf_enable_id( int eventid){
  cpu_perf_conf(CSR_PCMR_ACTIVE | CSR_PCMR_SATURATE);
 #elif defined(__ibex__)
  cpu_perf_conf_events(CSR_PCER_EVENT_MASK(eventid));
+#elif defined(__cv32e40p__)
+  cpu_perf_conf_events(1<<eventid);
 #else
  cpu_perf_conf_events(SPR_PCER_EVENT_MASK(eventid));
  cpu_perf_conf(SPR_PCMR_ACTIVE | SPR_PCMR_SATURATE);
--- a/include/hal/chips/pulpissimo/pulp.h
+++ b/include/hal/chips/pulpissimo/pulp.h
@ -19,7 +19,9 @@

 #ifdef __ibex__
 #include "hal/ibex/ibex.h"
-#else // __ibex__
+#elif defined(__cv32e40p__)
+#include "hal/cv32e40p/cv32e40p.h"
+#else
 #include "hal/riscv/riscv_v5.h"
 #endif // __ibex__
 #include "hal/itc/itc_v1.h"
@ -37,4 +39,4 @@
 #include "hal/udma/spim/udma_spim_v3.h"
 #include "hal/udma/uart/udma_uart_v1.h"

-#endif
+#endif
--- a/include/hal/cv32e40p/cv32e40p.h
+++ b/include/hal/cv32e40p/cv32e40p.h
@ -0,0 +1,601 @@
+/*
+ * Copyright (C) 2018 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE(review): this guard is named after hal/riscv/riscv_v5.h rather than this
+ * file (hal/cv32e40p/cv32e40p.h). If that header uses the same guard, whichever
+ * of the two is included second is skipped entirely -- confirm this is intended.
+ */
+#ifndef __HAL_RISCV_RISCV_V5_H__
+#define __HAL_RISCV_RISCV_V5_H__
+
+#include "archi/pulp.h"
+
+#include "hal/riscv/types.h"
+#include "archi/riscv/builtins_v2.h"
+#include "archi/riscv/builtins_v2_emu.h"
+
+/* Same value as CSR_PCMR_ACTIVE in archi/cv32e40p/cv32e40p.h */
+#define CSR_PCMR_ACTIVE 0x1
+
+/* CSR address 0x305; going by the name, the machine trap-vector register */
+#define SR_MTVEC  0x305
+
+
+
+
+
+
+/*
+ * CSR ("SPR") access helpers.
+ *
+ * When __OPTIMIZE__ and CORE_PULP_BUILTINS are defined and __LLVM__ is not,
+ * the accessors are functions wrapping the PULP compiler builtins. Otherwise,
+ * for non-LLVM compilers, they are statement-expression macros that emit the
+ * CSR instructions directly; `reg` must then be a compile-time constant
+ * (it is passed with the "I" constraint). Nothing is defined for LLVM builds.
+ */
+#if defined(__OPTIMIZE__) && defined(CORE_PULP_BUILTINS) && !defined(__LLVM__)
+
+/* Read CSR `reg`, clear the bits set in `val`, and return the value read. */
+static inline unsigned int hal_spr_read_then_clr(unsigned int reg, unsigned int val)
+{
+  return __builtin_pulp_read_then_spr_bit_clr(reg, val);
+}
+
+/* Read CSR `reg`, set the bits set in `val`, and return the value read. */
+static inline unsigned int hal_spr_read_then_set(unsigned int reg, unsigned int val)
+{
+  return __builtin_pulp_read_then_spr_bit_set(reg, val);
+}
+
+/* Write `val` to CSR `reg`. */
+static inline void hal_spr_write(unsigned int reg, unsigned int val)
+{
+  __builtin_pulp_spr_write(reg, val);
+}
+
+/* Return the current value of CSR `reg`. */
+static inline unsigned int hal_spr_read(unsigned int reg)
+{
+  return __builtin_pulp_spr_read(reg);
+}
+
+#else
+
+#if defined(__LLVM__)
+
+/* No generic CSR accessors are provided for LLVM builds. */
+
+#else
+
+/* Read CSR `reg` and clear the bits of the immediate `val`; yields the value read. */
+#define hal_spr_read_then_clr(reg,val) \
+  ({ \
+    int state; \
+    asm volatile ("csrrc %0, %1, %2" :  "=r" (state) : "I" (reg), "I" (val) ); \
+    state; \
+  })
+
+/* Read CSR `reg` and set the bits of the immediate `val`; yields the value read.
+ * (This macro used to be defined twice with identical text; one copy is kept.) */
+#define hal_spr_read_then_set(reg,val) \
+  ({ \
+    int state; \
+    asm volatile ("csrrs %0, %1, %2" :  "=r" (state) : "I" (reg), "I" (val) ); \
+    state; \
+  })
+
+/* Same as hal_spr_read_then_clr, with the bit mask `val` taken from a register. */
+#define hal_spr_read_then_clr_from_reg(reg,val) \
+  ({ \
+    int state; \
+    asm volatile ("csrrc %0, %1, %2" :  "=r" (state) : "I" (reg), "r" (val) ); \
+    state; \
+  })
+
+/* Same as hal_spr_read_then_set, with the bit mask `val` taken from a register. */
+#define hal_spr_read_then_set_from_reg(reg,val) \
+  ({ \
+    int state; \
+    asm volatile ("csrrs %0, %1, %2" :  "=r" (state) : "I" (reg), "r" (val) ); \
+    state; \
+  })
+
+/* Write `val` to CSR `reg`. */
+#define hal_spr_write(reg,val) \
+do { \
+  asm volatile ("csrw %0, %1" :  : "I" (reg), "r" (val) ); \
+} while(0)
+
+/* Yields the current value of CSR `reg`. */
+#define hal_spr_read(reg) \
+({ \
+  int result; \
+  asm volatile ("csrr %0, %1" : "=r" (result) : "I" (reg) ); \
+  result; \
+})
+
+#endif
+
+#endif
+
+
+
+
+
+/*
+ * hal_mepc_read() yields the value of the MEPC CSR.
+ * LLVM builds have no hal_spr_read(), so a dedicated csr_read() macro is used
+ * with the literal address 0x341; other builds go through hal_spr_read() with
+ * RV_CSR_MEPC (defined outside this file).
+ */
+#if defined(__LLVM__)
+
+/* Yields the value of CSR `csr`; the argument is pasted textually into the
+ * instruction, so it must be a literal CSR number or name. */
+#define csr_read(csr)           \
+({                \
+  register unsigned int __v;       \
+  __asm__ __volatile__ ("csrr %0, " #csr      \
+            : "=r" (__v));      \
+  __v;              \
+})
+
+#define hal_mepc_read() csr_read(0x341)
+
+#else
+#define hal_mepc_read() hal_spr_read(RV_CSR_MEPC)
+#endif
+
+/* Core index of the calling hart: the low 5 bits of CSR 0xF14. */
+static inline unsigned int core_id() {
+  int mhartid;
+  // in PULP the hart id is {22'b0, cluster_id, core_id}
+  asm("csrr %0, 0xF14" : "=r" (mhartid) : );
+  return mhartid & 0x01f;
+}
+
+/* Cluster index of the calling hart: bits 10:5 of CSR 0xF14. */
+static inline unsigned int cluster_id() {
+  int mhartid;
+  // in PULP the hart id is {22'b0, cluster_id, core_id}
+  asm("csrr %0, 0xF14" : "=r" (mhartid) : );
+  return (mhartid >> 5) & 0x3f;
+}
+
+/*
+ * hal_core_id / hal_cluster_id / hal_has_fc / hal_is_fc.
+ * Both branches behave the same; with PLP_NO_BUILTIN the hart id is read
+ * inline and every function is forced inline.
+ */
+#ifndef PLP_NO_BUILTIN
+
+/* Core index of the caller, obtained through core_id(). */
+static inline unsigned int hal_core_id() {
+  //return __builtin_pulp_CoreId();
+  return core_id();
+}
+
+/* Cluster index of the caller, obtained through cluster_id(). */
+static inline unsigned int hal_cluster_id() {
+  //return __builtin_pulp_ClusterId();
+  return cluster_id();
+}
+
+// TODO replace by compiler builtin
+/* 1 when the build defines ARCHI_HAS_FC, 0 otherwise. */
+static inline __attribute__((always_inline)) unsigned int hal_has_fc() {
+#ifndef ARCHI_HAS_FC
+  return 0;
+#else
+  return 1;
+#endif
+}
+
+/* 1 when an FC exists and the caller's cluster id equals ARCHI_FC_CID, else 0. */
+static inline __attribute__((always_inline)) unsigned int hal_is_fc() {
+#ifdef ARCHI_HAS_FC
+  return hal_has_fc() ? (hal_cluster_id() == ARCHI_FC_CID) : 0;
+#else
+  return 0;
+#endif
+}
+
+#else
+
+/* Core index of the caller: the low 5 bits of CSR 0xF14. */
+static inline __attribute__((always_inline)) unsigned int hal_core_id() {
+  int mhartid;
+  // in PULP the hart id is {22'b0, cluster_id, core_id}
+  asm("csrr %0, 0xF14" : "=r" (mhartid) : );
+  return mhartid & 0x01f;
+}
+
+/* Cluster index of the caller: bits 10:5 of CSR 0xF14. */
+static inline __attribute__((always_inline)) unsigned int hal_cluster_id() {
+  int mhartid;
+  // in PULP the hart id is {22'b0, cluster_id, core_id}
+  asm("csrr %0, 0xF14" : "=r" (mhartid) : );
+  return (mhartid >> 5) & 0x3f;
+}
+
+/* 1 when the build defines ARCHI_HAS_FC, 0 otherwise. */
+static inline __attribute__((always_inline)) unsigned int hal_has_fc() {
+#ifndef ARCHI_HAS_FC
+  return 0;
+#else
+  return 1;
+#endif
+}
+
+/* 1 when an FC exists and the caller's cluster id equals ARCHI_FC_CID, else 0. */
+static inline __attribute__((always_inline)) unsigned int hal_is_fc() {
+#ifdef ARCHI_HAS_FC
+  return hal_has_fc() ? (hal_cluster_id() == ARCHI_FC_CID) : 0;
+#else
+  return 0;
+#endif
+}
+
+#endif
+
+
+
+/*
+ * Global interrupt enable/disable.
+ * LLVM builds get empty stubs (hal_irq_disable() returns 0); other builds
+ * clear/set bit 3 of CSR 0x300 (mstatus.MIE in the RISC-V privileged spec).
+ */
+#if defined(__LLVM__)
+
+/* Stub: does nothing and reports a previous state of 0. */
+static inline int hal_irq_disable()
+{
+  return 0;
+}
+
+/* Stub: `state` is ignored. */
+static inline void hal_irq_restore(int state)
+{
+}
+
+/* Stub: does nothing. */
+static inline void hal_irq_enable()
+{
+}
+
+#else
+
+/* Clears bit 3 of CSR 0x300 and returns the CSR value read before the clear,
+ * to be handed back to hal_irq_restore(). */
+static inline int hal_irq_disable()
+{
+  int irq = hal_spr_read_then_clr(0x300, 0x1<<3);
+  // Compiler-only barrier: keeps memory accesses of the critical section from being moved before the disable
+  __asm__ __volatile__ ("" : : : "memory");
+  return irq;
+}
+
+/* Writes `state` (a value returned by hal_irq_disable()) back to CSR 0x300;
+ * the whole register is overwritten, not just bit 3. */
+static inline void hal_irq_restore(int state)
+{
+  // Compiler-only barrier: keeps memory accesses of the critical section from being moved after the restore
+  __asm__ __volatile__ ("" : : : "memory");
+  hal_spr_write(0x300, state);
+}
+
+/* Sets bit 3 of CSR 0x300; the value read by the instruction is discarded. */
+static inline void hal_irq_enable()
+{
+  // Compiler-only barrier: keeps memory accesses of the critical section from being moved after the enable
+  __asm__ __volatile__ ("" : : : "memory");
+  hal_spr_read_then_set(0x300, 0x1<<3);
+}
+
+#endif
+
+/*
+ * PERFORMANCE COUNTERS
+ *
+ * API for accessing the performance counter registers.
+ * See the CV32E40P specification for details.
+ * Two non-configurable counters are implemented, MCYCLE and MINSTRET, plus one configurable
+ * counter on which several events (out of 16 different events) can be ORed together. A basic
+ * example can be found in the regression tests repository under the perf counters test.
+ * Instantiated configurable counter : 0xB03
+ * Register to set the counter event : 0x323
+ * Register to enable the counters (whether they are instantiated or not) : 0x320
+ *   (the code below clears its bits to start counting and sets them to stop)
+ */
+
+
+/* Select the events counted by the configurable counter: writes eventMask
+ * (an OR of event mask bits) to CSR 0x323. Compiles to nothing when
+ * PLP_NO_PERF_COUNTERS is defined. */
+static inline void cpu_perf_conf_events(unsigned int eventMask)
+{
+#ifndef PLP_NO_PERF_COUNTERS
+  asm volatile("csrw 0x323, %0" : : "r"(eventMask));
+#endif
+}
+
+/* Return the current event selection, i.e. the value of CSR 0x323.
+ * Always returns 0 when PLP_NO_PERF_COUNTERS is defined. */
+static inline unsigned int cpu_perf_conf_events_get()
+{
+#ifndef PLP_NO_PERF_COUNTERS
+  unsigned int result;
+  asm volatile ("csrr %0, 0x323" : "=r" (result));
+  return result;
+#else
+  return 0;
+#endif
+}
+
+/* No-op on this core: confMask is ignored. Presumably kept so that callers
+ * written against the other cores' API still compile -- TODO confirm. */
+static inline void cpu_perf_conf(unsigned int confMask)
+{
+}
+
+/* Start all counters by clearing every bit of CSR 0x320.
+ * Compiles to nothing when PLP_NO_PERF_COUNTERS is defined. */
+static inline void cpu_perf_start() {
+#ifndef PLP_NO_PERF_COUNTERS
+  asm volatile("csrc 0x320, %0" : : "r"(0xffffffff));
+#endif
+}
+
+/* Stop all counters by setting every bit of CSR 0x320.
+ * Compiles to nothing when PLP_NO_PERF_COUNTERS is defined. */
+static inline void cpu_perf_stop() {
+#ifndef PLP_NO_PERF_COUNTERS
+ asm volatile("csrs 0x320, %0" : : "r"(0xffffffff));
+#endif
+}
+
+/* Intended to set the specified counter to the specified value.
+ * NOTE(review): currently an empty stub -- both arguments are ignored and no
+ * counter is written. */
+static inline void cpu_perf_set(unsigned int counterId, unsigned int value) {
+  
+}
+
+/* Intended to set all counters to the specified value.
+ * NOTE(review): currently an empty stub -- `value` is ignored and no counter
+ * is written, so calling it does not reset the counters. */
+static inline void cpu_perf_setall(unsigned int value) {
+#ifndef PLP_NO_PERF_COUNTERS
+  
+#endif
+}
+
+/* Return the value of the specified counter.
+ * counterId N (0-15, except 1) reads CSR 0xB00+N; counterId 1 and any other
+ * value return 0 without reading a CSR. Always returns 0 when
+ * PLP_NO_PERF_COUNTERS is defined. */
+static inline unsigned int cpu_perf_get(const unsigned int counterId) {
+#ifndef PLP_NO_PERF_COUNTERS
+  unsigned int value = 0;
+
+  // One case per counter: the CSR address is written literally into each
+  // csrr instruction, so it cannot be selected with a runtime value.
+  switch(counterId) {
+   case  0: asm volatile ("csrr %0, 0xB00" : "=r" (value)); break;
+   case  1: break;
+   case  2: asm volatile ("csrr %0, 0xB02" : "=r" (value)); break;
+   case  3: asm volatile ("csrr %0, 0xB03" : "=r" (value)); break;
+   case  4: asm volatile ("csrr %0, 0xB04" : "=r" (value)); break;
+   case  5: asm volatile ("csrr %0, 0xB05" : "=r" (value)); break;
+   case  6: asm volatile ("csrr %0, 0xB06" : "=r" (value)); break;
+   case  7: asm volatile ("csrr %0, 0xB07" : "=r" (value)); break;
+   case  8: asm volatile ("csrr %0, 0xB08" : "=r" (value)); break;
+   case  9: asm volatile ("csrr %0, 0xB09" : "=r" (value)); break;
+   case 10: asm volatile ("csrr %0, 0xB0A" : "=r" (value)); break;
+   case 11: asm volatile ("csrr %0, 0xB0B" : "=r" (value)); break;
+   case 12: asm volatile ("csrr %0, 0xB0C" : "=r" (value)); break;
+   case 13: asm volatile ("csrr %0, 0xB0D" : "=r" (value)); break;
+   case 14: asm volatile ("csrr %0, 0xB0E" : "=r" (value)); break;
+   case 15: asm volatile ("csrr %0, 0xB0F" : "=r" (value)); break;
+  }
+  return value;
+#else
+  return 0;
+#endif
+}
+
+/*
+ * Return a printable name for a performance-counter event ID.
+ *
+ * event: event ID, 0-15.
+ * Returns a pointer to a string literal; "NA" for any other ID.
+ * The function previously had an empty body, so using its result was
+ * undefined behaviour. The names match CSR_PCER_NAME() in
+ * archi/cv32e40p/cv32e40p.h.
+ */
+static inline const char *cpu_perf_name(int event) {
+  switch (event) {
+    case  0: return "Cycles";
+    case  1: return "Instructions";
+    case  2: return "LD_Stall";
+    case  3: return "Jmp_Stall";
+    case  4: return "IMISS";
+    case  5: return "LD";
+    case  6: return "ST";
+    case  7: return "JUMP";
+    case  8: return "BRANCH";
+    case  9: return "TAKEN_BRANCH";
+    case 10: return "COMP_INSTR";
+    case 11: return "PIPE_STALL";
+    case 12: return "APU_TYPE";
+    case 13: return "APU_CONT";
+    case 14: return "APU_DEP";
+    case 15: return "APU_WB";
+    default: return "NA";
+  }
+}
+
+
+
+/*
+ * Stack checking
+ *
+ * NOTE(review): CSRs 0x7D0-0x7D2 are used as control/base/end registers here;
+ * confirm that this core implements them.
+ */
+
+/* Program the stack-check range: writes 0 to CSR 0x7D0, then `base` to 0x7D1
+ * and `end` to 0x7D2, and finally writes 1 to 0x7D0. */
+static inline void cpu_stack_check_enable(unsigned int base, unsigned int end)
+{
+  asm volatile ("csrwi 0x7D0, 0" :: );
+  asm volatile ("csrw  0x7D1, %0" :: "r" (base));
+  asm volatile ("csrw  0x7D2, %0" :: "r" (end));
+  asm volatile ("csrwi 0x7D0, 1" :: );
+}
+
+/* Writes 0 to CSR 0x7D0, the same write cpu_stack_check_enable() uses before
+ * reprogramming the range. */
+static inline void cpu_stack_check_disable()
+{
+  asm volatile ("csrwi 0x7D0, 0" :: );
+}
+
+
+
+#if !defined(RV_ISA_RV32)
+
+/*
+ * Generic __builtin_* names mapped one-to-one onto the PULP compiler builtins.
+ */
+
+/* Packing of scalars into vectors */
+#define __builtin_pack2(x, y)    __builtin_pulp_pack2((signed short)   (x), (signed short)   (y))
+#define __builtin_packu2(x, y)   __builtin_pulp_pack2((unsigned short) (x), (unsigned short) (y))
+
+#define __builtin_pack4(x, y, z, t)    __builtin_pulp_pack4((signed char)   (x), (signed char)   (y), (signed char)   (z), (signed char)   (t))
+#define __builtin_packu4(x, y, z, t)   __builtin_pulp_pack4((unsigned char) (x), (unsigned char) (y), (unsigned char) (z), (unsigned char) (t))
+
+/* Max */
+#define __builtin_max2(x, y)     __builtin_pulp_max2((x), (y))
+#define __builtin_max4(x, y)     __builtin_pulp_max4((x), (y))
+
+#define __builtin_maxu2(x, y)    __builtin_pulp_maxu2((x), (y))
+#define __builtin_maxu4(x, y)    __builtin_pulp_maxu4((x), (y))
+
+/* Min */
+#define __builtin_min2(x, y)     __builtin_pulp_min2((x), (y))
+#define __builtin_min4(x, y)     __builtin_pulp_min4((x), (y))
+
+#define __builtin_minu2(x, y)    __builtin_pulp_minu2((x), (y))
+#define __builtin_minu4(x, y)    __builtin_pulp_minu4((x), (y))
+
+/* Clip x to [-(2^precision), 2^precision - 1] (signed) or [0, 2^precision - 1] (unsigned).
+ * `precision` is parenthesized at every use so that expression arguments
+ * (e.g. a + 1) are shifted as a whole. */
+#define __builtin_clip(x, precision)   __builtin_pulp_clip((x), -(1<<(precision)), (1<<(precision))-1)
+#define __builtin_clipu(x, precision)  __builtin_pulp_clipu((x), 0, (1<<(precision))-1)
+
+/* Abs */
+#define __builtin_abs2(x)      __builtin_pulp_abs2((x))
+#define __builtin_abs4(x)      __builtin_pulp_abs4((x))
+
+/* Mac: multiply-accumulate wrappers. The accumulator comes first in the
+ * generic name and last (before shift/rounding operands) in the PULP builtin. */
+#define __builtin_macs(Acc, x, y)    __builtin_pulp_macs((x), (y), (Acc))
+#define __builtin_machhs(Acc, x, y)    __builtin_pulp_machhs((x), (y), (Acc))
+#define __builtin_macu(Acc, x, y)    __builtin_pulp_macu((x), (y), (Acc))
+#define __builtin_machhu(Acc, x, y)    __builtin_pulp_machhu((x), (y), (Acc))
+
+/* ...N: result shifted right by n; ...RN: 2^(n-1) added before the shift (rounding) */
+#define __builtin_macsN(Acc, x, y, n)  __builtin_pulp_macsN((x), (y), (Acc), (n))
+#define __builtin_macuN(Acc, x, y, n)  __builtin_pulp_macuN((x), (y), (Acc), (n))
+#define __builtin_macsRN(Acc, x, y, n) __builtin_pulp_macsRN((x), (y), (Acc), (n), (1<<((n)-1)))
+#define __builtin_macuRN(Acc, x, y, n) __builtin_pulp_macuRN((x), (y), (Acc), (n), (1<<((n)-1)))
+
+#define __builtin_machhsN(Acc, x, y, n)  __builtin_pulp_machhsN((x), (y), (Acc), (n))
+#define __builtin_machhuN(Acc, x, y, n)  __builtin_pulp_machhuN((x), (y), (Acc), (n))
+/* The rounding variants must call the ...RN builtins, like macsRN/macuRN above;
+ * they used to pass the rounding operand to the 4-operand ...N builtins. */
+#define __builtin_machhsRN(Acc, x, y, n) __builtin_pulp_machhsRN((x), (y), (Acc), (n), (1<<((n)-1)))
+#define __builtin_machhuRN(Acc, x, y, n) __builtin_pulp_machhuRN((x), (y), (Acc), (n), (1<<((n)-1)))
+
+/* Multiplications: ...N passes the shift amount n; ...RN additionally passes the rounding constant 2^(n-1) */
+#define __builtin_mulsN(x, y, n)   __builtin_pulp_mulsN((x), (y), (n))
+#define __builtin_mulsRN(x, y, n)    __builtin_pulp_mulsRN((x), (y), (n), (1<<((n)-1)))
+#define __builtin_muluN(x, y, n)   __builtin_pulp_muluN((x), (y), (n))
+#define __builtin_muluRN(x, y, n)    __builtin_pulp_muluRN((x), (y), (n), (1<<((n)-1)))
+
+#define __builtin_mulhhsN(x, y, n)   __builtin_pulp_mulhhsN((x), (y), (n))
+#define __builtin_mulhhsRN(x, y, n)    __builtin_pulp_mulhhsRN((x), (y), (n), (1<<((n)-1)))
+#define __builtin_mulhhuN(x, y, n)   __builtin_pulp_mulhhuN((x), (y), (n))
+#define __builtin_mulhhuRN(x, y, n)    __builtin_pulp_mulhhuRN((x), (y), (n), (1<<((n)-1)))
+
+/* Vectorial product and sum of products */
+/* Note the naming shift: dotp -> dotsp, dotpu -> dotup, dotpus -> dotusp; sumdotp* -> sdot*p* */
+#define __builtin_dotp2(x, y)        __builtin_pulp_dotsp2((x), (y))
+#define __builtin_dotpu2(x, y)         __builtin_pulp_dotup2((x), (y))
+#define __builtin_dotpus2(x, y)        __builtin_pulp_dotusp2((x), (y))
+
+#define __builtin_sumdotp2(x, y, z)    __builtin_pulp_sdotsp2((x), (y), (z))
+#define __builtin_sumdotpu2(x, y, z)   __builtin_pulp_sdotup2((x), (y), (z))
+#define __builtin_sumdotpus2(x, y, z)  __builtin_pulp_sdotusp2((x), (y), (z))
+
+#define __builtin_dotp4(x, y)        __builtin_pulp_dotsp4((x), (y))
+#define __builtin_dotpu4(x, y)         __builtin_pulp_dotup4((x), (y))
+#define __builtin_dotpus4(x, y)        __builtin_pulp_dotusp4((x), (y))
+
+#define __builtin_sumdotp4(x, y, z)      __builtin_pulp_sdotsp4((x), (y), (z))
+#define __builtin_sumdotpu4(x, y, z)       __builtin_pulp_sdotup4((x), (y), (z))
+#define __builtin_sumdotpus4(x, y, z)      __builtin_pulp_sdotusp4((x), (y), (z))
+
+
+/* Position of the most significant bit of x */
+#define __builtin_fl1(x)     __builtin_pulp_fl1((x))
+
+/* Number of sign bits */
+#define __builtin_clb(x)     __builtin_pulp_clb((x))
+
+/* Bit Extraction */
+#define __builtin_bitextract(x, size, off) __builtin_pulp_bextract((x), (size), (off))
+#define __builtin_bitextractu(x, size, off)  __builtin_pulp_bextractu((x), (size), (off))
+
+/* Bit insertion */
+/* Packs (Size-1), shifted left by 5, together with Offset into one word;
+ * used as the control operand of __builtin_pulp_binsert_r below. */
+static inline unsigned int bi_ExtInsMaskFast(unsigned int Size, unsigned int Offset) { return ((((Size-1))<<5)|(Offset)); }
+#define __builtin_bitinsert(dst, src, size, off) __builtin_pulp_binsert((dst), ~(((1<<(size))-1)<<(off)), (src), (((1<<(size))-1)<<(off)), (off))
+#define __builtin_bitinsert_r(dst, src, size, off)   __builtin_pulp_binsert_r((dst), (src), bi_ExtInsMaskFast((size), (off)))
+
+/* 1 bit rotation to the right, 32 bits input */
+#define __builtin_rotr(x)      __builtin_pulp_rotr((x))
+
+/* Add with normalization and rounding */
+#define __builtin_addroundnormu(x, y, scale) __builtin_pulp_adduRN((x), (y), (scale), (1<<((scale)-1)))
+#define __builtin_addroundnorm(x, y, scale)  __builtin_pulp_addRN((x), (y), (scale), (1<<((scale)-1)))
+
+/* Normalization and rounding */
+/* Implemented as the add-with-rounding builtins with a zero second operand */
+#define __builtin_roundnormu(x, scale) __builtin_pulp_adduRN((x), 0, (scale), (1<<((scale)-1)))
+#define __builtin_roundnorm(x, scale)  __builtin_pulp_addRN((x), 0, (scale), (1<<((scale)-1)))
+
+#else
+
+/*
+ * Plain-C fallbacks used when RV_ISA_RV32 is defined.
+ * The v2s/v2u/v4s/v4u vector types are not defined in this file (presumably
+ * they come from hal/riscv/types.h). Arguments are evaluated more than once,
+ * so they must not have side effects.
+ */
+
+/* Packing of scalars into vectors */
+#define __builtin_pack2(x, y)    ((v2s) {(signed short)   (x), (signed short)   (y)})
+#define __builtin_packu2(x, y)   ((v2u) {(unsigned short) (x), (unsigned short) (y)})
+
+#define __builtin_pack4(x, y, z, t)    ((v4s) {(signed char)   (x), (signed char)   (y), (signed char)   (z), (signed char)   (t)})
+#define __builtin_packu4(x, y, z, t)   ((v4u) {(unsigned char) (x), (unsigned char) (y), (unsigned char) (z), (unsigned char) (t)})
+
+/* Max */
+
+/* Element-wise maximum of two vectors */
+#define __builtin_max2(x, y)     ((v2s) {((signed short)(x)[0]>(signed short)(y)[0])?((signed short)(x)[0]):((signed short)(y)[0]), \
+            ((signed short)(x)[1]>(signed short)(y)[1])?((signed short)(x)[1]):((signed short)(y)[1])})
+#define __builtin_max4(x, y)     ((v4s) {((signed char)(x)[0]>(signed char)(y)[0])?(signed char)(x)[0]:(signed char)(y)[0], \
+            ((signed char)(x)[1]>(signed char)(y)[1])?(signed char)(x)[1]:(signed char)(y)[1], \
+            ((signed char)(x)[2]>(signed char)(y)[2])?(signed char)(x)[2]:(signed char)(y)[2], \
+            ((signed char)(x)[3]>(signed char)(y)[3])?(signed char)(x)[3]:(signed char)(y)[3]})
+
+#define __builtin_maxu2(x, y)    ((v2u) {((unsigned short)(x)[0]>(unsigned short)(y)[0])?(unsigned short)(x)[0]:(unsigned short)(y)[0], \
+            ((unsigned short)(x)[1]>(unsigned short)(y)[1])?(unsigned short)(x)[1]:(unsigned short)(y)[1]})
+#define __builtin_maxu4(x, y)    ((v4u) {((unsigned char)(x)[0]>(unsigned char)(y)[0])?(unsigned char)(x)[0]:(unsigned char)(y)[0], \
+            ((unsigned char)(x)[1]>(unsigned char)(y)[1])?(unsigned char)(x)[1]:(unsigned char)(y)[1], \
+            ((unsigned char)(x)[2]>(unsigned char)(y)[2])?(unsigned char)(x)[2]:(unsigned char)(y)[2], \
+            ((unsigned char)(x)[3]>(unsigned char)(y)[3])?(unsigned char)(x)[3]:(unsigned char)(y)[3]})
+
+/* Min */
+/* Element-wise minimum of two vectors */
+#define __builtin_min2(x, y)     ((v2s) {((signed short)(x)[0]<(signed short)(y)[0])?((signed short)(x)[0]):((signed short)(y)[0]), \
+            ((signed short)(x)[1]<(signed short)(y)[1])?((signed short)(x)[1]):((signed short)(y)[1])})
+#define __builtin_min4(x, y)     ((v4s) {((signed char)(x)[0]<(signed char)(y)[0])?(signed char)(x)[0]:(signed char)(y)[0], \
+            ((signed char)(x)[1]<(signed char)(y)[1])?(signed char)(x)[1]:(signed char)(y)[1], \
+            ((signed char)(x)[2]<(signed char)(y)[2])?(signed char)(x)[2]:(signed char)(y)[2], \
+            ((signed char)(x)[3]<(signed char)(y)[3])?(signed char)(x)[3]:(signed char)(y)[3]})
+
+#define __builtin_minu2(x, y)    ((v2u) {((unsigned short)(x)[0]<(unsigned short)(y)[0])?(unsigned short)(x)[0]:(unsigned short)(y)[0], \
+            ((unsigned short)(x)[1]<(unsigned short)(y)[1])?(unsigned short)(x)[1]:(unsigned short)(y)[1]})
+#define __builtin_minu4(x, y)    ((v4u) {((unsigned char)(x)[0]<(unsigned char)(y)[0])?(unsigned char)(x)[0]:(unsigned char)(y)[0], \
+            ((unsigned char)(x)[1]<(unsigned char)(y)[1])?(unsigned char)(x)[1]:(unsigned char)(y)[1], \
+            ((unsigned char)(x)[2]<(unsigned char)(y)[2])?(unsigned char)(x)[2]:(unsigned char)(y)[2], \
+            ((unsigned char)(x)[3]<(unsigned char)(y)[3])?(unsigned char)(x)[3]:(unsigned char)(y)[3]})
+
+/* Clip x to [-(2^precision), 2^precision - 1] (signed) or [0, 2^precision - 1] (unsigned).
+ * __builtin_clipu is wrapped in outer parentheses so it can be used inside a
+ * larger expression (e.g. `1 + __builtin_clipu(v, 8)`) without the conditional
+ * operator capturing the surrounding operands. */
+#define __builtin_clip(x, precision)   ((x)<(-(1<<(precision)))?(-(1<<(precision))):(((x)>((1<<(precision))-1))?((1<<(precision))-1):(x)))
+#define __builtin_clipu(x, precision)  (((x)<0)?0:(((x)>((1<<(precision))-1))?((1<<(precision))-1):(x)))
+
+/* Abs */
+/* Element-wise absolute value */
+#define __builtin_abs2(x)      ((v2s) {((x)[0]<0)?-(x)[0]:(x)[0], ((x)[1]<0)?-(x)[1]:(x)[1]})
+#define __builtin_abs4(x)      ((v4s) {((x)[0]<0)?-(x)[0]:(x)[0], ((x)[1]<0)?-(x)[1]:(x)[1], \
+            ((x)[2]<0)?-(x)[2]:(x)[2], ((x)[3]<0)?-(x)[3]:(x)[3]})
+
+/* Mac: Acc + x*y on the low 16-bit halves; the "hh" variants use the high halves (>>16) */
+#define __builtin_macs(Acc, x, y)    ((Acc) + ((short int) (x) * (short int) (y)))
+#define __builtin_machhs(Acc, x, y)    ((Acc) + ((short int) ((x)>>16) * (short int) ((y)>>16)))
+#define __builtin_macu(Acc, x, y)    ((Acc) + ((unsigned short int) (x) * (unsigned short int) (y)))
+#define __builtin_machhu(Acc, x, y)    ((Acc) + ((unsigned short int) ((x)>>16) * (unsigned short int) ((y)>>16)))
+
+/* ...N: result shifted right by n; ...RN: 2^(n-1) added before the shift (rounding) */
+#define __builtin_macsN(Acc, x, y, n)  (((Acc) + ((short int) (x) * (short int) (y)))>>(n))
+#define __builtin_macuN(Acc, x, y, n)  (((Acc) + ((unsigned short int) (x) * (unsigned short int) (y)))>>(n))
+#define __builtin_macsRN(Acc, x, y, n) ((((Acc) + ((short int) (x) * (short int) (y))) + (1<<((n)-1))) >> (n))
+#define __builtin_macuRN(Acc, x, y, n) ((((Acc) + ((unsigned short int) (x) * (unsigned short int) (y))) + (1<<((n)-1))) >> (n))
+
+#define __builtin_machhsN(Acc, x, y, n)  (((Acc) + ((short int) ((x)>>16) * (short int) ((y)>>16))) >> (n))
+#define __builtin_machhuN(Acc, x, y, n)  (((Acc) + ((unsigned short int) ((x)>>16) * (unsigned short int) ((y)>>16))) >> (n))
+#define __builtin_machhsRN(Acc, x, y, n) ((((Acc) + ((short int) ((x)>>16) * (short int) ((y)>>16))) + (1<<((n)-1))) >> (n))
+/* Rounds and shifts like the three other ...RN macros; it used to add n itself and never shift */
+#define __builtin_machhuRN(Acc, x, y, n) ((((Acc) + ((unsigned short int) ((x)>>16) * (unsigned short int) ((y)>>16))) + (1<<((n)-1))) >> (n))
+
+/* Multiplications: 16x16 product shifted right by n; ...RN adds 2^(n-1) before the shift */
+/* NOTE(review): the mulhh* variants defined in the builtin-backed branch have no fallback here */
+#define __builtin_mulsN(x, y, n)   (((short int) (x) * (short int) (y))>>(n))
+#define __builtin_mulsRN(x, y, n)    ((((short int) (x) * (short int) (y)) + (1<<((n)-1)))>>(n))
+#define __builtin_muluN(x, y, n)   (((unsigned short int) (x) * (unsigned short int) (y))>>(n))
+#define __builtin_muluRN(x, y, n)    ((((unsigned short int) (x) * (unsigned short int) (y)) + (1<<((n)-1)))>>(n))
+
+/* Vectorial product and sum of products */
+/* The signed, unsigned and mixed variants expand to the same expression here;
+ * signedness is whatever the element types of x and y are. */
+#define __builtin_dotp2(x, y)    (    (x)[0]*(y)[0] + (x)[1]*(y)[1])
+#define __builtin_dotpu2(x, y)   (    (x)[0]*(y)[0] + (x)[1]*(y)[1])
+#define __builtin_dotpus2(x, y)    (    (x)[0]*(y)[0] + (x)[1]*(y)[1])
+
+#define __builtin_sumdotp2(x, y, z)    ((z)+(x)[0]*(y)[0] + (x)[1]*(y)[1])
+#define __builtin_sumdotpu2(x, y, z)   ((z)+(x)[0]*(y)[0] + (x)[1]*(y)[1])
+#define __builtin_sumdotpus2(x, y, z)  ((z)+(x)[0]*(y)[0] + (x)[1]*(y)[1])
+
+#define __builtin_dotp4(x, y)    (    (x)[0]*(y)[0] + (x)[1]*(y)[1] + (x)[2]*(y)[2] + (x)[3]*(y)[3])
+#define __builtin_dotpu4(x, y)   (    (x)[0]*(y)[0] + (x)[1]*(y)[1] + (x)[2]*(y)[2] + (x)[3]*(y)[3])
+#define __builtin_dotpus4(x, y)    (    (x)[0]*(y)[0] + (x)[1]*(y)[1] + (x)[2]*(y)[2] + (x)[3]*(y)[3])
+
+#define __builtin_sumdotp4(x, y, z)    ((z)+(x)[0]*(y)[0] + (x)[1]*(y)[1] + (x)[2]*(y)[2] + (x)[3]*(y)[3])
+#define __builtin_sumdotpu4(x, y, z)   ((z)+(x)[0]*(y)[0] + (x)[1]*(y)[1] + (x)[2]*(y)[2] + (x)[3]*(y)[3])
+#define __builtin_sumdotpus4(x, y, z)  ((z)+(x)[0]*(y)[0] + (x)[1]*(y)[1] + (x)[2]*(y)[2] + (x)[3]*(y)[3])
+
+
+/* Position of the most significant bit of x */
+#define __FL1(x)     (31 - __builtin_clz((x)))
+
+/* Number of sign bits */
+/* NOTE(review): this loop counts the bits that are set in x (a population
+ * count), which does not match the "number of sign bits" description above --
+ * confirm which behaviour callers expect. */
+static inline unsigned int __builtin_clb(unsigned int x) {
+  int result = 0;
+  while (x) {
+    if (x & 1) result++;
+    x >>= 1;
+  }
+  return result;
+}
+
+/* Bit Extraction */
+/* Extracts `size` bits of x starting at bit `off`. The first form then shifts
+ * the field up to bit 31 and back down; whether that sign-extends depends on
+ * the signedness of the resulting expression -- TODO confirm. */
+#define __builtin_bitextract(x, size, off)   (((((x)>>(off))&((unsigned int)(1<<(size))-1))<<(32-(size)))>>(32-(size)))
+#define __builtin_bitextractu(x, size, off)  (((x)>>(off))&((unsigned int)(1<<(size))-1))
+
+/* Bit insertion */
+/* Replaces the `size`-bit field of dst at bit `off` with the low `size` bits of src; both forms are identical here */
+#define __builtin_bitinsert(dst, src, size, off) (((dst) & ~(((1<<(size))-1)<<(off))) | (((src) & ((1<<(size))-1))<<(off)))
+#define __builtin_bitinsert_r(dst, src, size, off)   (((dst) & ~(((1<<(size))-1)<<(off))) | (((src) & ((1<<(size))-1))<<(off)))
+
+/* 1 bit rotation to the right, 32 bits input */
+#define __builtin_rotr(x)      ((((x)>>1)&0x7FFFFFFF) | ((x)<<31))
+
+/* Add with normalization and rounding */
+/* (x + y + 2^(scale-1)) >> scale; the cast applies to the sum before the shift */
+#define __builtin_addroundnormu(x, y, scale) ((unsigned int)((x) + (y) + (1<<((scale)-1)))>>(scale))
+#define __builtin_addroundnorm(x, y, scale)  ((int)((x) + (y) + (1<<((scale)-1)))>>(scale))
+
+/* Normalization and rounding */
+/* (x + 2^(scale-1)) >> scale */
+#define __builtin_roundnormu(x, scale) ((unsigned int)((x) + (1<<((scale)-1)))>>(scale))
+#define __builtin_roundnorm(x, scale)  ((int)((x) + (1<<((scale)-1)))>>(scale))
+
+#endif
+
+#endif
--- a/kernel/bench.c
+++ b/kernel/bench.c
@ -163,6 +163,24 @@ void perf_print_all(void) {
  printf("Perf ST EXT CYC: %d\n",  cpu_perf_get(CSR_PCER_ST_EXT_CYC));
  printf("Perf TCDM CONT: %d\n",   cpu_perf_get(CSR_PCER_TCDM_CONT));
  printf("Perf CSR HAZARD: [Not Implemented]\n");
+#elif defined(__cv32e40p__)
+  // Unimplemented counters read as 0. Only counters 0 (MCYCLE), 2 (MINSTR) and 3 (configurable) are implemented.
+  printf("MCYCLE     :%d\n",    cpu_perf_get(0));
+  printf("Perf reg 1 : not implemented\n")     ;
+  printf("MINSTR     :%d\n",    cpu_perf_get(2));
+  printf("Perf reg 3 :%d\n",    cpu_perf_get(3));
+  printf("Perf reg 4 :%d\n",    cpu_perf_get(4));
+  printf("Perf reg 5 :%d\n",    cpu_perf_get(5));
+  printf("Perf reg 6 :%d\n",    cpu_perf_get(6));
+  printf("Perf reg 7 :%d\n",    cpu_perf_get(7));
+  printf("Perf reg 8 :%d\n",    cpu_perf_get(8));
+  printf("Perf reg 9 :%d\n",    cpu_perf_get(9));
+  printf("Perf reg 10:%d\n",    cpu_perf_get(10));
+  printf("Perf reg 11:%d\n",    cpu_perf_get(11));
+  printf("Perf reg 12:%d\n",    cpu_perf_get(12));
+  printf("Perf reg 13:%d\n",    cpu_perf_get(13));
+  printf("Perf reg 14:%d\n",    cpu_perf_get(14));
+  printf("Perf reg 15:%d\n",    cpu_perf_get(15));
 #elif defined( __riscv__ )
  printf("Perf CYCLES: %d\n",      cpu_perf_get(0));
  printf("Perf INSTR: %d\n",       cpu_perf_get(1));
@ -229,7 +247,7 @@ void illegal_insn_handler_c(void)
 {
 #ifndef __ariane__
  unsigned int exception_address, insn;
-#if defined( __riscv__ ) || defined( __ibex__)
+#if defined( __riscv__ ) || defined( __ibex__) || defined(__cv32e40p__)
  asm("csrr %0, 0x341" : "=r" (exception_address) : );
 #else
  exception_address = hal_spr_read(SPR_EPCR_BASE);
--- a/rules/pulpos/targets/pulpissimo.mk
+++ b/rules/pulpos/targets/pulpissimo.mk
@ -4,12 +4,18 @@ PULP_CFLAGS += -D__ibex__ -U__riscv__ -UARCHI_CORE_HAS_PULPV2 -DRV_ISA_RV32
 PULP_ARCH_CFLAGS ?= -march=rv32imc
 PULP_ARCH_LDFLAGS ?= -march=rv32imc
 PULP_ARCH_OBJDFLAGS ?= -Mmarch=rv32imc
+else ifdef USE_CV32E40P
+PULP_LDFLAGS += 
+PULP_CFLAGS += -D__cv32e40p__ -U__riscv__ -UARCHI_CORE_HAS_PULPV2 
+PULP_ARCH_CFLAGS ?=  -march=rv32imcxgap9 
+PULP_ARCH_LDFLAGS ?=  -march=rv32imcxgap9 
+PULP_ARCH_OBJDFLAGS ?= -Mmarch=rv32imcxgap9 
 else
 PULP_LDFLAGS      += 
 PULP_CFLAGS       +=  -D__riscv__
-PULP_ARCH_CFLAGS ?=  -march=rv32imcxgap9
+PULP_ARCH_CFLAGS ?=  -march=rv32imcxgap9 
 PULP_ARCH_LDFLAGS ?=  -march=rv32imcxgap9
-PULP_ARCH_OBJDFLAGS ?= -Mmarch=rv32imcxgap9
+PULP_ARCH_OBJDFLAGS ?= -Mmarch=rv32imcxgap9 
 endif

 PULP_CFLAGS    += -fdata-sections -ffunction-sections -include chips/pulpissimo/config.h -I$(PULPRT_HOME)/include/chips/pulpissimo
@ -54,4 +60,4 @@ ifeq '$(platform)' 'fpga'
 CONFIG_IO_UART=1
 endif

-include $(PULPRT_HOME)/rules/pulpos/default_rules.mk
+include $(PULPRT_HOME)/rules/pulpos/default_rules.mk