mirror of
https://github.com/saymrwulf/pytorch.git
synced 2026-05-15 21:00:47 +00:00
**Motivation** Enable SVE vectorization with `torch.compile`. Extends PR: #119571 * This PR enables vectorization in the codegen part using SVE-256 (vector length) * The changes can be extended to other SVE vector lengths. I've done some comparisons against the existing NEON implementation, with the SVE vectorization route enabled for `torch.compile`. Test results are for 8 cores on an ARM Neoverse_V1. <img width="359" alt="Screenshot 2024-08-28 at 16 02 07" src="https://github.com/user-attachments/assets/6961fbea-8285-4ca3-b92e-934a2db50ee2"> It's worth mentioning that for the standalone `SiLU` op there's a `~1.8x` speedup with `torch.compile`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/134672 Approved by: https://github.com/jgong5, https://github.com/malfet
22 lines
865 B
C++
22 lines
865 B
C++
#include <ATen/cpu/Utils.h>
|
|
#include <torch/csrc/cpu/Module.h>
|
|
#include <torch/csrc/utils/pybind.h>
|
|
|
|
namespace torch::cpu {
|
|
|
|
void initModule(PyObject* module) {
|
|
auto m = py::handle(module).cast<py::module>();
|
|
|
|
auto cpu = m.def_submodule("_cpu", "cpu related pybind.");
|
|
cpu.def("_is_avx2_supported", at::cpu::is_avx2_supported);
|
|
cpu.def("_is_avx512_supported", at::cpu::is_avx512_supported);
|
|
cpu.def("_is_avx512_vnni_supported", at::cpu::is_avx512_vnni_supported);
|
|
cpu.def("_is_avx512_bf16_supported", at::cpu::is_avx512_bf16_supported);
|
|
cpu.def("_is_amx_tile_supported", at::cpu::is_amx_tile_supported);
|
|
cpu.def("_init_amx", at::cpu::init_amx);
|
|
cpu.def("_is_arm_sve_supported", at::cpu::is_arm_sve_supported);
|
|
cpu.def("_L1d_cache_size", at::cpu::L1d_cache_size);
|
|
cpu.def("_L2_cache_size", at::cpu::L2_cache_size);
|
|
}
|
|
|
|
} // namespace torch::cpu
|