// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "onnxruntime_pybind_mlvalue.h"
#include "python/onnxruntime_pybind_state_common.h"
#include "pybind11/numpy.h"

#define NO_IMPORT_ARRAY
#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
#define PY_ARRAY_UNIQUE_SYMBOL onnxruntime_python_ARRAY_API
#include <numpy/arrayobject.h>

#include "core/framework/tensor_shape.h"
#include "core/framework/tensor.h"
#include "core/framework/sparse_tensor.h"
#include "core/framework/allocator.h"
#include "core/framework/data_types.h"

#include "core/framework/data_types_internal.h"
#include "core/providers/get_execution_providers.h"
#include "core/framework/kernel_registry.h"
#include "core/framework/provider_options_utils.h"
#include "core/session/provider_bridge_ort.h"

namespace onnxruntime {
namespace python {

namespace py = pybind11;
using namespace onnxruntime::logging;

#if !defined(DISABLE_SPARSE_TENSORS)

namespace {
// Create a pybind11:dtype numpy instance using ONNX Tensor Element Type
template <typename T>
struct MakeDType {
  py::dtype operator()() const {
    return py::dtype::of<T>();
  }
};

/// <summary>
/// The function creates a numpy array that points to
/// data stored within the corresponing tensor. Parent object
/// holds a reference to the object that owns the data so it
/// does not disappear.
/// </summary>
/// <returns>numpy array</returns>
py::array MakeNumpyArrayFromIndices(const Tensor& indices, const py::object& parent) {
  // See https://github.com/pybind/pybind11/issues/2271 for more information on parent
  py::array result(indices.Shape().GetDims(), indices.Data<int64_t>(), parent);
  assert(!result.owndata());
  // Set a read-only flag
  PyArray_CLEARFLAGS(reinterpret_cast<PyArrayObject*>(result.ptr()), NPY_ARRAY_WRITEABLE);
  return result;
}

}  // namespace

class PySparseCooView : public SparseTensor::CooView {
  py::object parent_;

 public:
  PySparseCooView(const SparseTensor::CooView& view, const py::object& parent) noexcept
      : SparseTensor::CooView(view), parent_(parent) {}
};

class PySparseCsrView : public SparseTensor::CsrView {
  py::object parent_;

 public:
  PySparseCsrView(const SparseTensor::CsrView& view, const py::object& parent) noexcept
      : SparseTensor::CsrView(view), parent_(parent) {}
};

class PySparseBlockSparseView : public SparseTensor::BlockSparseView {
  py::object parent_;

 public:
  PySparseBlockSparseView(const SparseTensor::BlockSparseView& view, const py::object& parent) noexcept
      : SparseTensor::BlockSparseView(view), parent_(parent) {}
};

#endif  // !defined(DISABLE_SPARSE_TENSORS)

void addSparseTensorMethods(pybind11::module& m) {
  // this is exported via __init__.py so has to exist
  py::enum_<OrtSparseFormat>(m, "OrtSparseFormat")
      .value("ORT_SPARSE_UNDEFINED", OrtSparseFormat::ORT_SPARSE_UNDEFINED)
      .value("ORT_SPARSE_COO", OrtSparseFormat::ORT_SPARSE_COO)
      .value("ORT_SPARSE_CSRC", OrtSparseFormat::ORT_SPARSE_CSRC)
      .value("ORT_SPARSE_BLOCK_SPARSE", OrtSparseFormat::ORT_SPARSE_BLOCK_SPARSE);

#if !defined(DISABLE_SPARSE_TENSORS)
  py::class_<PySparseCooView>(m, "SparseCooView")
      // Returns a numpy array of COO indices backed by Sparse Tensor memory
      // be aware that indices may reside on GPU if Sparse Tensor is on GPU
      .def("indices", [](const PySparseCooView* view) -> py::array {
        const auto& indices = view->Indices();
        return MakeNumpyArrayFromIndices(indices, py::cast(*view));
      });

  py::class_<PySparseCsrView>(m, "SparseCsrView")
      .def("inner", [](const PySparseCsrView* view) -> py::array {
        const auto& indices = view->Inner();
        return MakeNumpyArrayFromIndices(indices, py::cast(*view));
      })
      .def("outer", [](const PySparseCsrView* view) -> py::array {
        const auto& indices = view->Outer();
        return MakeNumpyArrayFromIndices(indices, py::cast(*view));
      });

  py::class_<PySparseBlockSparseView>(m, "SparseBlockSparseView")
      .def("indices", [](const PySparseBlockSparseView* view) -> py::array {
        const auto& indices = view->Indices();
        return MakeNumpyArrayFromIndices(indices, py::cast(*view));
      });

  py::class_<PySparseTensor> sparse_bind(m, "SparseTensor");
  // Factory method to create a COO Sparse Tensor from numpy arrays acting as backing storage.
  // Numeric arrays memory is used as is with reference count increment. All other supported
  // types are copied and supported only on CPU.
  // Use numpy.ascontiguousarray() to obtain contiguous array of values and indices if necessary
  // py_dense_shape - numpy dense shape of the sparse tensor
  // py_values - contiguous and homogeneous numpy array of values
  // py_indices - contiguous numpy array of int64_t indices
  // ort_device - where the value and indices buffers are allocated. For non-primitive types,
  //              only cpu device is supported. There is not a way to verify that ort_device
  //              accurately describes the memory that is backing values and indices.
  sparse_bind
      .def_static("sparse_coo_from_numpy",
                  [](const std::vector<int64_t>& py_dense_shape,
                     const py::array& py_values,
                     const py::array_t<int64_t>& py_indices,
                     const OrtDevice& ort_device) -> std::unique_ptr<PySparseTensor> {
                    if (1 != py_values.ndim()) {
                      ORT_THROW("Expecting values 1-D numpy values array for COO format. Got dims: ", py_values.ndim());
                    }

                    TensorShape dense_shape(py_dense_shape);
                    auto values_type = GetNumpyArrayType(py_values);
                    auto ml_type = NumpyToOnnxRuntimeTensorType(values_type);

                    std::unique_ptr<PySparseTensor> result;
                    if (IsNumericNumpyType(values_type)) {
                      if (!PyArray_ISCONTIGUOUS(reinterpret_cast<PyArrayObject*>(py_values.ptr()))) {
                        throw std::runtime_error("Require contiguous numpy array of values");
                      }

                      if (!PyArray_ISCONTIGUOUS(reinterpret_cast<PyArrayObject*>(py_indices.ptr()))) {
                        throw std::runtime_error("Require contiguous numpy array of indices");
                      }

                      // create references to make sure storage does not disappear
                      std::vector<py::object> reference_holders = {py_values, py_indices};
                      OrtMemoryInfo mem_info = GetMemoryInfoPerDeviceType(ort_device);
                      TensorShape values_shape{py_values.size()};
                      auto sparse_tensor = std::make_unique<SparseTensor>(ml_type, dense_shape, values_shape,
                                                                          const_cast<void*>(py_values.data()), mem_info);
                      auto index_span = gsl::make_span(const_cast<int64_t*>(py_indices.data()), py_indices.size());
                      ORT_THROW_IF_ERROR(sparse_tensor->UseCooIndices(index_span));
                      result = std::make_unique<PySparseTensor>(std::move(sparse_tensor), std::move(reference_holders));
                    } else if (values_type == NPY_UNICODE || values_type == NPY_STRING) {
                      if (ort_device.Type() != OrtDevice::CPU) {
                        throw std::runtime_error("Only CPU based devices are supported for non-numeric datatypes");
                      }
                      auto sparse_tensor = std::make_unique<SparseTensor>(ml_type, dense_shape, GetAllocator());
                      auto mutator = sparse_tensor->MakeCooData(py_values.size(), py_indices.size());
                      CopyDataToTensor(py_values, values_type, mutator.Values());
                      CopyDataToTensor(py_indices, GetNumpyArrayType(py_indices), mutator.Indices());
                      result = std::make_unique<PySparseTensor>(std::move(sparse_tensor));
                    } else {
                      ORT_THROW("Unsupported values data type: ", values_type);
                    }
                    return result;
                  })
      // Factory method to create a CSR Sparse Tensor from numpy arrays acting as backing storage.
      // Numeric arrays memory is used as is with reference count increment. All other supported
      // types are copied and supported only on CPU.
      // Use numpy.ascontiguousarray() to obtain contiguous array of values and indices if necessary
      // py_dense_shape - numpy dense shape of the sparse tensor
      // py_values - contiguous and homogeneous numpy array of values
      // py_inner_indices - contiguous numpy array of int64_t indices
      // py_outer_indices - contiguous numpy array of int64_t indices
      // ort_device - where the value and indices buffers are allocated. For non-primitive types,
      //              only cpu device is supported. There is not a way to verify that ort_device
      //              accurately describes the memory that is backing values and indices.
      .def_static(
          "sparse_csr_from_numpy",
          [](const std::vector<int64_t>& py_dense_shape,
             const py::array& py_values,
             const py::array_t<int64_t>& py_inner_indices,
             const py::array_t<int64_t>& py_outer_indices,
             const OrtDevice& ort_device) -> std::unique_ptr<PySparseTensor> {
            if (1 != py_values.ndim() || 1 != py_inner_indices.ndim() || 1 != py_outer_indices.ndim()) {
              ORT_THROW("Expecting all data to be 1-D numpy arrays for CSR format.");
            }

            TensorShape dense_shape(py_dense_shape);
            auto values_type = GetNumpyArrayType(py_values);
            auto ml_type = NumpyToOnnxRuntimeTensorType(values_type);

            std::unique_ptr<PySparseTensor> result;
            if (IsNumericNumpyType(values_type)) {
              if (!PyArray_ISCONTIGUOUS(reinterpret_cast<PyArrayObject*>(py_values.ptr()))) {
                throw std::runtime_error("Require contiguous numpy array of values");
              }

              if (!PyArray_ISCONTIGUOUS(reinterpret_cast<PyArrayObject*>(py_inner_indices.ptr()))) {
                throw std::runtime_error("Require contiguous numpy array of indices");
              }

              if (!PyArray_ISCONTIGUOUS(reinterpret_cast<PyArrayObject*>(py_outer_indices.ptr()))) {
                throw std::runtime_error("Require contiguous numpy array of indices");
              }

              // go ahead and create references to make sure storage does not disappear
              std::vector<py::object> reference_holders = {py_values, py_inner_indices, py_outer_indices};
              OrtMemoryInfo mem_info = GetMemoryInfoPerDeviceType(ort_device);
              TensorShape values_shape{py_values.size()};
              auto sparse_tensor = std::make_unique<SparseTensor>(ml_type, dense_shape, values_shape,
                                                                  const_cast<void*>(py_values.data()), mem_info);
              auto inner_span = gsl::make_span<int64_t>(const_cast<int64_t*>(py_inner_indices.data()), py_inner_indices.size());
              auto outer_span = gsl::make_span<int64_t>(const_cast<int64_t*>(py_outer_indices.data()), py_outer_indices.size());
              ORT_THROW_IF_ERROR(sparse_tensor->UseCsrIndices(inner_span, outer_span));
              result = std::make_unique<PySparseTensor>(std::move(sparse_tensor), std::move(reference_holders));
            } else if (values_type == NPY_UNICODE || values_type == NPY_STRING) {
              if (ort_device.Type() != OrtDevice::CPU) {
                throw std::runtime_error("Only CPU based devices are supported for non-numeric datatypes");
              }
              auto sparse_tensor = std::make_unique<SparseTensor>(ml_type, dense_shape, GetAllocator());
              auto mutator = sparse_tensor->MakeCsrData(py_values.size(), py_inner_indices.size(), py_outer_indices.size());
              CopyDataToTensor(py_values, values_type, mutator.Values());
              CopyDataToTensor(py_inner_indices, GetNumpyArrayType(py_inner_indices), mutator.Inner());
              CopyDataToTensor(py_outer_indices, GetNumpyArrayType(py_outer_indices), mutator.Outer());
              result = std::make_unique<PySparseTensor>(std::move(sparse_tensor));
            } else {
              ORT_THROW("Unsupported values data type: ", values_type);
            }

            return result;
          })
      // Factory method to create a BlockSparse Tensor from numpy arrays acting as backing storage.
      // Numeric arrays memory is used as is with reference count increment. All other supported
      // types are copied and supported only on CPU.
      // Use numpy.ascontiguousarray() to obtain contiguous array of values and indices if necessary
      // py_dense_shape - numpy dense shape of the sparse tensor
      // ort_device - desribes the allocation. Only primitive types allocations can be mapped to
      // py_values - contiguous and homogeneous numpy array of values
      // py_indices - contiguous numpy array of int32_t indices
      // ort_device - where the value and indices buffers are allocated. For non-primitive types,
      //              only cpu device is supported. There is not a way to verify that ort_device
      //              accurately describes the memory that is backing values and indices.
      .def_static(
          "blocksparse_from_numpy",
          [](const std::vector<int64_t>& py_dense_shape,
             const py::array& py_values,
             const py::array_t<int32_t>& py_indices,
             const OrtDevice& ort_device) -> std::unique_ptr<PySparseTensor> {
            TensorShape dense_shape(py_dense_shape);
            TensorShape values_shape = GetShape(py_values);
            TensorShape index_shape = GetShape(py_indices);
            auto values_type = GetNumpyArrayType(py_values);
            auto ml_type = NumpyToOnnxRuntimeTensorType(values_type);

            std::unique_ptr<PySparseTensor> result;
            if (IsNumericNumpyType(values_type)) {
              if (!PyArray_ISCONTIGUOUS(reinterpret_cast<PyArrayObject*>(py_values.ptr()))) {
                throw std::runtime_error("Require contiguous numpy array of values");
              }

              if (!PyArray_ISCONTIGUOUS(reinterpret_cast<PyArrayObject*>(py_indices.ptr()))) {
                throw std::runtime_error("Require contiguous numpy array of indices");
              }

              // create references to make sure storage does not disappear
              std::vector<py::object> reference_holders = {py_values, py_indices};
              OrtMemoryInfo mem_info = GetMemoryInfoPerDeviceType(ort_device);
              auto sparse_tensor = std::make_unique<SparseTensor>(ml_type, dense_shape, values_shape,
                                                                  const_cast<void*>(py_values.data()), mem_info);
              ORT_THROW_IF_ERROR(sparse_tensor->UseBlockSparseIndices(index_shape, const_cast<int32_t*>(py_indices.data())));
              result = std::make_unique<PySparseTensor>(std::move(sparse_tensor), std::move(reference_holders));
            } else if (values_type == NPY_UNICODE || values_type == NPY_STRING) {
              if (ort_device.Type() != OrtDevice::CPU) {
                throw std::runtime_error("Only CPU based devices are supported for non-numeric datatypes");
              }
              auto sparse_tensor = std::make_unique<SparseTensor>(ml_type, dense_shape, GetAllocator());
              auto mutator = sparse_tensor->MakeBlockSparseData(values_shape, index_shape);
              CopyDataToTensor(py_values, values_type, mutator.Values());
              CopyDataToTensor(py_indices, GetNumpyArrayType(py_indices), mutator.Indices());
              result = std::make_unique<PySparseTensor>(std::move(sparse_tensor));
            } else {
              ORT_THROW("Unsupported values data type: ", values_type);
            }
            return result;
          })
      // Returns a numpy array that is backed by SparseTensor values memory
      // be aware that it may be on GPU
      .def("values", [](const PySparseTensor* py_tensor) -> py::array {
        const SparseTensor& sparse_tensor = py_tensor->Instance();
        if (sparse_tensor.Format() == SparseFormat::kUndefined) {
          ORT_THROW("This sparse tensor instance does not contain data");
        }
        if (sparse_tensor.IsDataTypeString()) {
          // Strings can not be on GPU and require conversion UTF-8 to Python UNICODE
          // We need to create a copy.
          const int numpy_type = OnnxRuntimeTensorToNumpyType(DataTypeImpl::GetType<std::string>());
          ORT_ENFORCE(NPY_OBJECT == numpy_type, "We are expecting to map strings to NPY_OBJECT type");
          const auto& values_shape = sparse_tensor.Values().Shape();
          py::dtype dtype("object");
          py::array result(dtype, values_shape.GetDims(), {});
          auto* out_ptr = static_cast<py::object*>(
              PyArray_DATA(reinterpret_cast<PyArrayObject*>(result.ptr())));
          const std::string* src = sparse_tensor.Values().Data<std::string>();
          for (int64_t i = 0, size = values_shape.Size(); i < size; ++i, src++) {
            out_ptr[i] = py::cast(*src);
          }
          return result;
        } else {
          utils::MLTypeCallDispatcher<float, double, int8_t, uint8_t, int16_t, uint16_t, int32_t, uint32_t, int64_t, uint64_t>
              t_disp(sparse_tensor.GetElementType());
          auto dtype = t_disp.InvokeRet<py::dtype, MakeDType>();
          const auto& values = sparse_tensor.Values();
          // See https://github.com/pybind/pybind11/issues/2271
          py::array result(dtype, values.Shape().GetDims(), values.DataRaw(), py::cast(*py_tensor));
          assert(!result.owndata());
          // Set a read-only flag
          PyArray_CLEARFLAGS(reinterpret_cast<PyArrayObject*>(result.ptr()), NPY_ARRAY_WRITEABLE);
          return result;
        }
      })
      // Returns a Coo view of data
      .def("get_coo_data", [](const PySparseTensor* py_tensor) -> std::unique_ptr<PySparseCooView> {
        const SparseTensor& sparse_tensor = py_tensor->Instance();
        if (sparse_tensor.Format() != SparseFormat::kCoo) {
          ORT_THROW("This sparse tensor does not contain COO format");
        }
        return std::make_unique<PySparseCooView>(sparse_tensor.AsCoo(), py::cast(*py_tensor));
      })
      // Returns a CSR view of data
      .def("get_csrc_data", [](const PySparseTensor* py_tensor) -> std::unique_ptr<PySparseCsrView> {
        const SparseTensor& sparse_tensor = py_tensor->Instance();
        if (sparse_tensor.Format() != SparseFormat::kCsrc) {
          ORT_THROW("This sparse tensor does not contain CSR(C) format");
        }
        return std::make_unique<PySparseCsrView>(sparse_tensor.AsCsr(), py::cast(*py_tensor));
      })
      // Returns a blocksparse view of data
      .def("get_blocksparse_data", [](const PySparseTensor* py_tensor) -> std::unique_ptr<PySparseBlockSparseView> {
        const SparseTensor& sparse_tensor = py_tensor->Instance();
        if (sparse_tensor.Format() != SparseFormat::kBlockSparse) {
          ORT_THROW("This sparse tensor does not contain BlockSparse format");
        }
        return std::make_unique<PySparseBlockSparseView>(sparse_tensor.AsBlockSparse(), py::cast(*py_tensor));
      })

  /// This will copy SparseTensor into a new instance on a specified CUDA device or throw:
  /// - if this sparse tensor contains strings
  /// - if this sparse tensor is already on GPU
  /// - if CUDA is not present in this build
  /// - if the specified device is not valid
#ifdef USE_CUDA
      .def("to_cuda", [](const PySparseTensor* py_tensor, const OrtDevice& ort_device) -> std::unique_ptr<PySparseTensor> {
        const SparseTensor& sparse_tensor = py_tensor->Instance();
        if (sparse_tensor.IsDataTypeString()) {
          ORT_THROW("Can not copy string tensor to GPU devices.");
        }
        if (sparse_tensor.Location().device.Type() == OrtDevice::GPU) {
          ORT_THROW("This sparse_tensor is already allocated on cuda. Cross device copy not supported.");
        }
        if (!IsCudaDeviceIdValid(logging::LoggingManager::DefaultLogger(), ort_device.Id())) {
          ORT_THROW("The provided device id doesn't match any available GPUs on the machine: ", ort_device.Id());
        }
        auto cuda_allocator = GetCudaAllocator(ort_device.Id());
        auto gpu_transfer = GetGPUDataTransfer();
        auto dest_tensor = std::make_unique<SparseTensor>(sparse_tensor.DataType(), sparse_tensor.DenseShape(),
                                                          std::move(cuda_allocator));
        ORT_THROW_IF_ERROR(sparse_tensor.Copy(*gpu_transfer, *dest_tensor, 0));
        auto result = std::make_unique<PySparseTensor>(std::move(dest_tensor));
        return result;
#else
      .def("to_cuda", [](const PySparseTensor*, const OrtDevice&) {
        ORT_THROW("Cuda is not available in this build");
#endif  // USE_CUDA
      })
      .def("dense_shape", [](const PySparseTensor* py_tensor) -> py::list {
        const SparseTensor& st = py_tensor->Instance();
        const auto& dims = st.DenseShape().GetDims();
        // We create a copy of dimensions, it is small
        py::list py_dims;
        for (auto d : dims) {
          py_dims.append(d);
        }
        return py_dims;
      })
      .def("device_name", [](const PySparseTensor* py_tensor) -> std::string {
        return std::string(GetDeviceName(py_tensor->Instance().Location().device));
      })
      .def("data_type", [](const PySparseTensor* py_tensor) -> std::string {
        const SparseTensor& tensor = py_tensor->Instance();
        const auto elem_type = tensor.GetElementType();
        const auto* type_proto = DataTypeImpl::SparseTensorTypeFromONNXEnum(elem_type)->GetTypeProto();
        if (type_proto == nullptr) {
          ORT_THROW("Unknown type of SparseTensor: ", tensor.DataType());
        }
        return *ONNX_NAMESPACE::Utils::DataTypeUtils::ToType(*type_proto);
      })
      // pybind apparently has a bug with returning enums from def_property_readonly or methods
      // returning a method object instead of the enumeration value
      // so we are using def_property and throw on a potential modificaiton
      .def_property(
          "format", [](const PySparseTensor* py_tensor) -> OrtSparseFormat {
        const SparseTensor& tensor = py_tensor->Instance();
        auto retval = OrtSparseFormat::ORT_SPARSE_UNDEFINED;
        switch (tensor.Format()) {
          case SparseFormat::kUndefined:
            break;
          case SparseFormat::kCoo:
            retval = OrtSparseFormat::ORT_SPARSE_COO;
            break;
          case SparseFormat::kCsrc:
            retval = OrtSparseFormat::ORT_SPARSE_CSRC;
            break;
          case SparseFormat::kBlockSparse:
            retval = OrtSparseFormat::ORT_SPARSE_BLOCK_SPARSE;
            break;
          default:
            throw std::runtime_error("Can't switch on FormatFlags()");
        }
        return retval; }, [](PySparseTensor*, OrtSparseFormat) -> void { throw std::runtime_error("This is a readonly property"); });
#endif  // !defined(DISABLED_SPARSE_TENSORS)
}

}  // namespace python
}  // namespace onnxruntime