pytorch/torch/csrc/jit/codegen/cuda/executor.cpp

#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/codegen.h>
#include <torch/csrc/jit/codegen/cuda/executor_kernel_arg.h>
#include <torch/csrc/jit/codegen/cuda/executor_utils.h>
#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
#include <torch/csrc/jit/codegen/cuda/utils.h>
#include <ATen/core/LegacyTypeDispatch.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/llvm_jit_strings.h>
#include <ATen/cuda/nvrtc_stub/ATenNVRTC.h>
#include <c10/core/DeviceGuard.h>
#include <c10/cuda/CUDAFunctions.h>
#include <c10/cuda/CUDAStream.h>
#include <c10/util/irange.h>
#include <fstream>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
int FusionExecutor::fusion_id_counter_ = 0; // NOLINT
namespace {
static const char* defineIndexMode(KernelIndexMode index_mode) {
switch (index_mode) {
case KernelIndexMode::INT32:
return "typedef int nvfuser_index_t;\n";
case KernelIndexMode::INT64:
return "typedef int64_t nvfuser_index_t;\n";
default:
break;
}
TORCH_INTERNAL_ASSERT(false, "unknow indexing mode");
return "";
}
static const char* defineIntegerTypes() {
return R"(
typedef signed char int8_t;
typedef unsigned char uint8_t;
typedef short int int16_t;
typedef unsigned short int uint16_t;
typedef int int32_t;
typedef unsigned int uint32_t;
typedef long long int int64_t;
typedef unsigned long long int uint64_t;
)";
}
static const std::string& defineComplexTypes() {
static std::string result = std::string(R"ESCAPE(
#define POS_INFINITY __int_as_float(0x7f800000)
#define INFINITY POS_INFINITY
#define NEG_INFINITY __int_as_float(0xff800000)
#define NAN __int_as_float(0x7fffffff)
)ESCAPE") +
at::cuda::get_traits_string() + at::cuda::get_complex_body_string() +
at::cuda::get_cmath_string() + at::cuda::get_complex_math_string();
return result;
}
} // namespace
std::string FusionExecutor::getStructuredCode(const std::string& kernel) {
// generating cuda code;
std::string code = "";
#ifdef USE_ROCM
#if ROCM_VERSION < 40200
code += std::string("#include <hip/hip_runtime.h>\n") +
std::string("#include <hip/hip_bf16.h>\n") +
std::string("#include <hip/hip_fp16.h>\n");
#endif
code += std::string("#pragma clang force_cuda_host_device begin\n");
#endif
code += std::string("namespace ") + FusionExecutor::kernelNamespace() +
" {\n" + defineIntegerTypes() + defineIndexMode(options_.index_mode) +
defineComplexTypes() + executor_utils::kernelPreamble() + kernel + "}\n";
#ifdef USE_ROCM
code += std::string("#pragma clang force_cuda_host_device end\n");
#endif
if (isDebugDumpEnabled(DebugDumpOption::CudaKernel)) {
std::cout << "\n======= Codegen output for kernel: " << kernelName()
<< " =======\n\n"
<< kernel << "\n======================================\n\n";
} else if (isDebugDumpEnabled(DebugDumpOption::CudaFull)) {
std::cout << "\n======= Codegen output for kernel: " << kernelName()
<< " =======\n\n"
<< code << "\n======================================\n\n";
}
if (isDebugDumpEnabled(DebugDumpOption::CudaToFile) ||
isDebugDumpEnabled(DebugDumpOption::DebugInfo)) {
std::stringstream file_name;
file_name << "__tmp_kernel" << fusion_id_ << ".cu";
std::cout << "PRINTING: " << file_name.str() << std::endl;
std::ofstream out(file_name.str());
out << code << std::endl;
out.close();
}
return code;
}
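// For reference, a rough sketch (illustrative, not verbatim output) of the
// string assembled above; the preamble and kernel body come from
// executor_utils::kernelPreamble() and codegen::generateCudaKernel(), and the
// namespace name from kernelNamespace():
//
//   namespace <kernelNamespace()> {
//   typedef signed char int8_t;
//   ... // remaining integer typedefs from defineIntegerTypes()
//   typedef int nvfuser_index_t; // or int64_t, per KernelIndexMode
//   <complex/math helper strings from ATen>
//   <kernel preamble>
//   <generated __global__ kernel>
//   } // namespace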
// TODO: come up with a more user friendly interface
void FusionExecutor::debugCompileFusionFromStr(
Fusion* fusion,
const std::string& code,
const std::string& name,
int id,
CompileOptions options) {
options_ = options;
if (isDebugDumpEnabled(DebugDumpOption::FusionIr)) {
fusion->print();
} else if (isDebugDumpEnabled(DebugDumpOption::FusionIrMath)) {
fusion->printMath();
}
if (isDebugDumpEnabled(DebugDumpOption::CudaFull)) {
std::cout << "\n==== codegen output for kernel: " << kernelName()
<< " ====" << std::endl
<< code << std::endl
<< "======================================\n"
<< std::endl;
}
lowered_ = std::make_unique<GpuLower>(fusion);
const auto kernel = lowered_->kernel();
fusion_ = lowered_->kernel();
fusion_id_ = id;
setUsedTVs();
if (isDebugDumpEnabled(DebugDumpOption::KernelIr)) {
kernel->print();
}
const auto& kernel_summary = kernel->summary();
if (!kernel_summary.static_smem_allocations.empty()) {
kir::ExpressionEvaluator static_evaluator;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
const auto static_smem_size = computeSharedMemory(
static_evaluator, kernel_summary.static_smem_allocations);
TORCH_INTERNAL_ASSERT(
static_smem_size < max_static_smem_,
"The static shared memory allocation is larger than available memory.");
}
std::tie(compiled_kernel_, last_compiler_log_) =
executor_utils::nvrtcCompile(code, name, fusion_id_);
TORCH_INTERNAL_ASSERT(
fusion_id_ > 0, "assign a fusion_id_ <= 0 is not accepted.");
}
void FusionExecutor::compileFusion(
Fusion* fusion,
const KernelArgumentHolder& args,
const LaunchParams& launch_constraints) {
FUSER_PERF_SCOPE("compileFusion");
TORCH_INTERNAL_ASSERT(
!fusion->outputs().empty(), "No output found for this kernel, aborting.");
for (auto out : fusion->outputs()) {
TORCH_INTERNAL_ASSERT(
out->getValType() == ValType::TensorView,
"Output types from fusions that are not tensors are not supported at this point.");
const auto maybe_rfactor_domain =
out->as<TensorView>()->getMaybeRFactorDomain();
// Walk through outputs to see if output shapes are dependent on
// non-tensor inputs. In that case, output allocation should be disabled,
// since the caching id only looks at tensor shapes.
// See issue https://github.com/csarofeen/pytorch/issues/2002
std::vector<Val*> output_extents;
for (const auto id : maybe_rfactor_domain) {
Val* extent = nullptr;
if (id->isReduction() || id->isStride()) {
continue;
} else if (id->isBroadcast() && id->hasExpandedExtent()) {
extent = id->expandedExtent();
} else {
extent = id->extent();
}
output_extents.emplace_back(extent);
}
auto dependencies = InputsOf::outputs(fusion, output_extents);
if (std::any_of(dependencies.begin(), dependencies.end(), [](Val* val) {
return val->isFusionInput();
})) {
// TODO: the parameter cache is too big a hammer here. We should consider
// separating the caching logic of output sizes & launch params, since
// an output size dependency should only invalidate the output sizes.
disable_parameter_cache_ = true;
break;
}
}
if (isDebugDumpEnabled(DebugDumpOption::FusionIr)) {
fusion->print();
} else if (isDebugDumpEnabled(DebugDumpOption::FusionIrMath)) {
fusion->printMath();
}
// TODO: refactor the options_ passed through
options_.device = c10::Device(c10::DeviceType::CUDA, args.getDeviceIndex());
options_.index_mode = args.getIndexMode();
c10::DeviceGuard dg(options_.device);
TORCH_INTERNAL_ASSERT(
options_.device.is_cuda(), "Provided device to CUDA fuser is the CPU.");
auto properties = at::cuda::getDeviceProperties(options_.device.index());
configured_device_smem_ = properties->sharedMemPerBlock;
#ifndef USE_ROCM
device_smem_limit_ = properties->sharedMemPerBlockOptin;
#else
// don't know if ROCm supports opt-in shared memory reconfiguration
device_smem_limit_ = properties->sharedMemPerBlock;
#endif
warp_size_ = properties->warpSize;
lowered_ = std::make_unique<GpuLower>(
fusion,
options_.index_mode == KernelIndexMode::INT64 ? DataType::Int
: DataType::Int32);
const auto kernel = lowered_->kernel();
fusion_ = lowered_->kernel()->as<Fusion>();
fusion_id_ = ++fusion_id_counter_;
setUsedTVs();
if (isDebugDumpEnabled(DebugDumpOption::KernelIr)) {
kernel->print();
}
kernel_code_ = codegen::generateCudaKernel(kernel, kernelName());
const auto structured_code = getStructuredCode(kernel_code_);
const auto& kernel_summary = kernel->summary();
// We currently shouldn't allocate any more shared mem
// tensors statically but could keep this path if
// needed in later development.
if (!kernel_summary.static_smem_allocations.empty()) {
kir::ExpressionEvaluator static_evaluator;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
const auto static_smem_size = computeSharedMemory(
static_evaluator, kernel_summary.static_smem_allocations);
TORCH_INTERNAL_ASSERT(
static_smem_size < max_static_smem_,
"The static shared memory allocation is larger than available memory.");
}
if (kernel_summary.has_dynamic_local_memory_allocations) {
std::stringstream ss;
ss << "Allocations must be based on constant integers for local memory. However, found: ";
for (auto alloc : kernel_summary.dynamic_lmem_allocations) {
ss << alloc->buffer()->toString() << ", ";
}
ss << " have dynamic allocations but are placed in local memory.";
TORCH_INTERNAL_ASSERT(false, ss.str());
}
// TODO: pass block_size here;
c10::optional<int> block_size = c10::nullopt;
if (!args.empty()) {
auto expr_eval = executor_utils::bindKernelInputs(args, kernel);
auto launch_params =
computeLaunchParams(launch_constraints, expr_eval, warp_size_);
block_size = launch_params.nThreads();
TORCH_INTERNAL_ASSERT(
block_size > 0, "launch param inferred block size < 0");
}
// TODO: high water mark should be computed via occupancy API after
// compilation.
// Basically, setting the high water mark to 1 when we don't provide args for
// compilation just generates a kernel that gets ditched at the first
// run - not great. We should have better heuristics.
block_size_high_water_mark = std::max<int64_t>(
(block_size.has_value() ? block_size.value() : 1),
block_size_high_water_mark);
std::tie(compiled_kernel_, last_compiler_log_) = executor_utils::nvrtcCompile(
structured_code,
(kernelNamespace() + "::" + kernelName()).c_str(),
fusion_id_,
block_size);
TORCH_INTERNAL_ASSERT(
fusion_id_ > 0, "failed to assign a fusion_id_ after compilation.");
#ifndef USE_ROCM
// The driver API call requires an int argument.
int max_dynamic_smem = 0;
AT_CUDA_DRIVER_CHECK(at::globalContext().getNVRTC().cuFuncGetAttribute(
&max_dynamic_smem,
CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
compiled_kernel_.function));
maybe_available_dynamic_smem_ = max_dynamic_smem;
#endif
}
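// A minimal usage sketch for the compile path above (assumptions: `fusion` is
// a fully scheduled Fusion and `args` already holds the input tensors; neither
// is constructed in this file):
//
//   FusionExecutor fe;
//   fe.compileFusion(fusion, args, LaunchParams());
//   auto outputs = fe.runFusion(args, LaunchParams(), {});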
namespace {
at::Tensor inferAndAlloc(
const TensorView* tv,
const std::vector<Val*>& sizes,
kir::ExpressionEvaluator& expr_eval,
// Map from dim -> expanded size of TV if any expanded broadcast dimensions
// exist
std::unordered_map<int, Val*> expanded_map,
const CompileOptions& options,
bool zero_init = false) {
FUSER_PERF_SCOPE("inferAndAlloc");
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
// Going to infer all the sizes of the TensorView
std::vector<int64_t> inferred_sizes;
// expanded_sizes is at most the same length as inferred_sizes, as you could
// have a fully broadcast tensor that's being expanded
std::vector<int64_t> expanded_sizes;
bool expanded_dim = false;
for (const auto size : sizes) {
const auto inferred_val = expr_eval.evaluate(size);
TORCH_INTERNAL_ASSERT(
inferred_val.has_value(),
"Could not launch kernel as program could not infer ",
size->toString(),
"(",
size->name(),
") for the buffer ",
tv->toString());
inferred_sizes.push_back(inferred_val->as<int64_t>());
if (expanded_map.count(expanded_sizes.size())) {
auto expanded_size = expanded_map.at(expanded_sizes.size());
const auto inferred_expanded_size = expr_eval.evaluate(expanded_size);
TORCH_INTERNAL_ASSERT(
inferred_expanded_size.has_value(),
"Could not launch kernel as program could not infer the expanded extent ",
expanded_size->toString(),
"(",
expanded_size->name(),
") for the buffer ",
tv->toString());
if (inferred_val.value() != 1) {
TORCH_INTERNAL_ASSERT(
inferred_val.value() == inferred_expanded_size.value(),
"Attempted an expand on a non-broadcasted dimension,",
" but the expand doesn't match the dimensions size.");
} else {
expanded_dim = true;
}
expanded_sizes.push_back(inferred_expanded_size->as<int64_t>());
} else {
expanded_sizes.push_back(inferred_val->as<int64_t>());
}
}
const auto at_type = data_type_to_aten(tv->dtype());
const auto tensor_options =
at::TensorOptions().dtype(at_type).device(options.device);
c10::IntArrayRef isizes(inferred_sizes);
if (zero_init) {
auto zeros = at::zeros(isizes, tensor_options);
if (expanded_dim) {
return zeros.expand(expanded_sizes);
}
return zeros;
} else {
// Non Variable type guard for empty_cuda call
at::AutoDispatchBelowADInplaceOrView non_variable_type_mode;
auto empty = at::empty(isizes, tensor_options);
if (expanded_dim) {
return empty.expand(expanded_sizes);
}
return empty;
}
}
at::Tensor inferAndAllocOutput(
const TensorView* tv,
kir::ExpressionEvaluator& expr_eval,
const CompileOptions& options,
bool zero_init = false) {
const auto domain = tv->domain();
const auto maybe_rfactor_domain = domain->hasRFactor()
? domain->getRFactorDomain()
: domain->getRootDomain();
std::vector<Val*> sizes;
std::unordered_map<int, Val*> expand_map;
for (const auto id : maybe_rfactor_domain) {
if (id->isReduction() || id->isStride()) {
continue;
}
sizes.push_back(id->extent());
if (id->isBroadcast() && id->hasExpandedExtent()) {
expand_map[sizes.size() - 1] = id->expandedExtent();
}
}
return inferAndAlloc(tv, sizes, expr_eval, expand_map, options, zero_init);
}
} // namespace
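// A self-contained illustration of the expand path in inferAndAlloc above
// (assumes only ATen and an available CUDA device; not part of this file's
// logic): a broadcast dimension is materialized with extent 1 and then
// expanded, so no extra memory is allocated for it.
//
//   at::Tensor base = at::zeros(
//       {4, 1}, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA));
//   at::Tensor expanded = base.expand({4, 8});
//   // expanded.sizes() -> [4, 8], expanded.strides() -> [1, 0]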
uint64_t FusionExecutor::computeSharedMemory(
kir::ExpressionEvaluator& expr_eval,
const std::vector<const kir::Allocate*>& buffers,
bool align_padding,
uint64_t total) {
FUSER_PERF_SCOPE("computeSharedMemory");
for (auto smem_alloc : buffers) {
// If this buffer aliases another buffer,
// then do not allocate memory for this buffer.
if (smem_alloc->alias() == nullptr) {
const auto inferred_val = expr_eval.evaluate(smem_alloc->size());
if (inferred_val.has_value()) {
const uint64_t data_size = dataTypeSize(smem_alloc->buffer()->dtype());
// Add padding to align dynamic shared memory
if (align_padding) {
#ifndef USE_ROCM
const int align_size = 16; // always align to 16B/128b.
#else
const int align_size = 8; // see codegen.cpp for HIP
#endif
total = ceilDiv(total, align_size) * align_size;
}
total += inferred_val->as<int64_t>() * data_size;
} else {
TORCH_INTERNAL_ASSERT(
false,
"Failed to evaluate the size ",
smem_alloc->size(),
" of shared memory buffer - T",
smem_alloc->buffer()->name());
}
}
}
return total;
}
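// Worked example of the alignment padding above (illustrative numbers): with
// two dynamic buffers of 5 floats and 2 doubles and align_size = 16,
//   total = 0
//   total = ceilDiv(0, 16) * 16 = 0;   total += 5 * 4 = 20
//   total = ceilDiv(20, 16) * 16 = 32; total += 2 * 8 = 48
// so the second buffer starts at a 16-byte-aligned offset and the reported
// dynamic shared memory requirement is 48 bytes.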
LaunchParams FusionExecutor::computeLaunchParams(
const LaunchParams& launch_constraints,
kir::ExpressionEvaluator& expr_eval,
const int warp_size) {
FUSER_PERF_SCOPE("FusionExecutor::ComputeLaunchParams");
TORCH_INTERNAL_ASSERT(warp_size > 0, "WARP_SIZE should be larger than 0");
LaunchParams launch_params;
auto data_cache = compileTimeDataCache();
auto lower = lowered_.get();
auto& used_tvs = getUsedTVs();
auto parallel_binding_ids_entry =
executor_utils::caching::ExecutorCompileTimeEntry<
executor_utils::caching::ParallelBindingIterDomains>(
data_cache, [&used_tvs, &lower]() {
return std::make_unique<std::vector<IterDomain*>>(
executor_utils::getParallelBindingsIterDomains(
lower, used_tvs));
});
auto& parallel_binding_ids = parallel_binding_ids_entry.get();
auto parallel_iter_extent_entry =
executor_utils::caching::ExecutorCompileTimeEntry<
executor_utils::caching::ParallelIterExtentMap>(
data_cache, [&parallel_binding_ids]() {
return executor_utils::getParallelIterExtents(parallel_binding_ids);
});
auto& parallel_iter_extents = parallel_iter_extent_entry.get();
auto simplified_parallel_iter_extent_entry =
executor_utils::caching::ExecutorCompileTimeEntry<
executor_utils::caching::SimplifiedParallelIterExtentMap>(
data_cache, [&parallel_binding_ids, &lower]() {
return executor_utils::getSimplifiedParallelIterExtents(
lower, parallel_binding_ids);
});
auto& simplified_parallel_iter_extents =
simplified_parallel_iter_extent_entry.get();
auto warp_padded_parallel_entry =
executor_utils::caching::ExecutorCompileTimeEntry<
executor_utils::caching::WarpPaddedParallelExtents>(
data_cache, [&parallel_binding_ids, &lower]() {
return executor_utils::getWarpPaddedExtentsInfo(
lower->kernel(), parallel_binding_ids);
});
auto& warp_padded_extent_set =
warp_padded_parallel_entry.get().warp_padded_extent_set;
auto& warp_padded_constant =
warp_padded_parallel_entry.get().warp_padded_constant;
// TODO: Need to redesign this part a bit to
// find the right place to trigger evaluate
if (expr_eval.precomputedValues()) {
expr_eval.precomputedValues()->bindParallelExtents(
parallel_iter_extents, launch_constraints);
expr_eval.precomputedValues()->evaluate();
}
// If any dimension was set in launch constraints we need to run through
// IterDomains that have been parallelized, and bind those values. Or make
// sure if they could be inferred the inference matches what was set.
for (auto& entry : parallel_iter_extents) {
auto p_type = entry.first;
if (launch_constraints.hasDim(p_type)) {
auto parallel_extents = entry.second;
for (auto extent : parallel_extents) {
auto inferred_val = expr_eval.evaluate(extent);
if (inferred_val.has_value()) {
// This value could have been inferred, make sure it was set right.
bool valid =
inferred_val.value() == launch_constraints.getDim(p_type) ||
launch_constraints.getRawVal(p_type) == -1;
if (!useFallback() && !valid) {
TORCH_WARN_ONCE(
"Cannot validate parallelization scheme, "
"this may be due to mixed broadcast axes that are parallelized.");
}
} else if (!expr_eval.precomputedValues()) {
expr_eval.bind(extent, launch_constraints.getDim(p_type));
}
if (!launch_params.hasDim(p_type)) {
// Bind the launch constraint into our evaluation context
launch_params.bind(launch_constraints.getDim(p_type), p_type);
// Makes sure the p-types bound to evaluators are the
// final values that will become the actual launch
// param size to ensure accurate smem buffer size
// computation.
expr_eval.bind(p_type, launch_constraints.getDim(p_type));
}
}
}
}
// Run through the rest of the parallel IterDomains and infer their size
for (auto& entry : simplified_parallel_iter_extents) {
FUSER_PERF_SCOPE("FusionExecutor::ParallelBindingResolution");
auto p_type = entry.first;
auto parallel_extents = entry.second;
// Select the maximum value out of all the parallel extents
int64_t maximum_value = std::numeric_limits<int64_t>::min();
for (auto extent : parallel_extents) {
auto val = expr_eval.evaluate(extent);
TORCH_INTERNAL_ASSERT(
val.has_value(),
"Tried to evaluate the extent, ",
extent->toInlineString(),
" for the ptype: ",
p_type,
" to set launch bounds but could not.");
// apply padding to the extent if needed
if (warp_padded_extent_set.count(extent)) {
// Check if the extent has const value
auto padded_constant_it = warp_padded_constant.find(extent);
if (padded_constant_it != warp_padded_constant.end()) {
// If padding to a constant is already specified, check that the
// runtime value does not exceed the constant bound
TORCH_INTERNAL_ASSERT(*val <= padded_constant_it->second);
*val = padded_constant_it->second;
} else {
// If no constant is specified, pad to the smallest multiple of the
// warp size above the value.
auto padded_number_of_warps = (*val + warp_size - 1) / warp_size;
*val = warp_size * padded_number_of_warps;
}
TORCH_INTERNAL_ASSERT(
*val <= 1024, "padded dimension larger than max block size");
}
maximum_value = std::max(maximum_value, val->as<int64_t>());
}
// Protect against size-0 tensors; they still have a value, so we would
// prefer to bind nothing rather than 0.
if (maximum_value > 0) {
expr_eval.bind(p_type, maximum_value);
launch_params.bind(maximum_value, p_type);
}
}
// Re-run the integer machine with all
// the thread sizes now determined.
if (expr_eval.precomputedValues()) {
expr_eval.precomputedValues()->evaluate();
}
const auto kernel = lowered_->kernel();
const auto& kernel_summary = kernel->summary();
// Calculate Dynamic Shared Memory Size
// Add workspace for reduction and broadcast
uint64_t reduction_broadcast_workspace = 0;
const bool has_workspace = kernel_summary.has_block_reductions ||
kernel_summary.has_grid_reductions ||
kernel_summary.has_block_broadcasts || kernel_summary.has_grid_broadcasts;
if (has_workspace &&
kernel_summary.largest_smem_data_type != DataType::Null) {
// Not using nThreads here since it does not handle uninitialized value
// TODO: here is an optimization opportunity since welford uses int64_t for
// N while the data type is not necessarily double. But it may need more
// work on the alignment
const int welford_factor =
kernel_summary.has_block_welford || kernel_summary.has_grid_welford ? 3
: 1;
reduction_broadcast_workspace =
dataTypeSize(kernel_summary.largest_smem_data_type) * welford_factor *
launch_params.bdimx() * launch_params.bdimy() * launch_params.bdimz();
}
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
const uint64_t dynamic_smem_size = computeSharedMemory(
expr_eval,
kernel_summary.dynamic_smem_allocations,
true,
reduction_broadcast_workspace);
// Check that requested smem size can be dynamically allocated.
// This check is only done once a kernel has been compiled, since
// maybe_available_dynamic_smem_ needs to be evaluated on
// a compiled kernel.
if (maybe_available_dynamic_smem_.has_value()) {
// Dynamic shared memory space that we can allocate without
// carving more space from L1.
const uint64_t available_dynamic_smem_without_reconfiguration =
maybe_available_dynamic_smem_.value();
// Maximum additional shared memory size we could request
// if we do re-configuration.
const uint64_t additional_dynamic_smem_available_through_reconfiguration =
device_smem_limit_ - configured_device_smem_;
TORCH_INTERNAL_ASSERT(
(dynamic_smem_size) <
(available_dynamic_smem_without_reconfiguration +
additional_dynamic_smem_available_through_reconfiguration),
"The total shared memory allocation is larger than available memory.",
" Dynamic size: ",
dynamic_smem_size,
". Available size: ",
maybe_available_dynamic_smem_.value(),
". Configured smem size: ",
configured_device_smem_,
". Device limit size: ",
device_smem_limit_);
}
launch_params.setSmem(dynamic_smem_size);
return launch_params;
}
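// Worked example (illustrative numbers): with warp_size = 32, a warp-padded
// TIDx extent of 70 rounds up to ((70 + 31) / 32) * 32 = 96. If the kernel has
// a block reduction on float data with block dims (96, 2, 1) and no welford op
// (welford_factor = 1), the reduction/broadcast workspace above is
//   4 * 1 * 96 * 2 * 1 = 768 bytes,
// which then seeds `total` in the computeSharedMemory call.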
FusionExecutor::GlobalBuffers FusionExecutor::allocGlobalVals(
kir::ExpressionEvaluator& expr_eval) {
FUSER_PERF_SCOPE("FusionExecutor::AllocGlobalVals");
GlobalBuffers global_buffers;
const auto kernel = lowered_->kernel();
const auto& kernel_summary = kernel->summary();
for (auto alloc : kernel_summary.global_allocations) {
TORCH_INTERNAL_ASSERT(
alloc->buffer()->isA<TensorView>(),
"Cannot allocate global buffers that are not tensors.");
auto tv = alloc->buffer()->as<TensorView>();
if (tv->isFusionOutput()) {
continue;
}
if (alloc->zeroInit()) {
global_buffers.buffers.push_back(
inferAndAlloc(tv, alloc->shape(), expr_eval, {}, options_, true));
global_buffers.zero_init.push_back(true);
} else {
global_buffers.buffers.push_back(
inferAndAlloc(tv, alloc->shape(), expr_eval, {}, options_, false));
global_buffers.zero_init.push_back(false);
}
// Remember the tensor buffer used for storing kernel profile
if (isOptionEnabled(EnableOption::KernelProfile) &&
tv == kernel->profile().getBuffer()) {
global_buffers.profile_buffer = global_buffers.buffers.back();
}
}
return global_buffers;
}
std::vector<at::Tensor> FusionExecutor::allocOutputs(
kir::ExpressionEvaluator& expr_eval,
const std::unordered_set<int>& alias_indices) {
FUSER_PERF_SCOPE("FusionExecutor::AllocOutputs");
const auto kernel = lowered_->kernel();
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
std::vector<at::Tensor> outputs;
for (const auto out_i : c10::irange(kernel->outputs().size())) {
// TODO: FIX this short-cut where we trivially forward inputs to outputs
if (kernel->outputs()[out_i]->isFusionInput()) {
TORCH_INTERNAL_ASSERT(false, "trivial input forwarding NOT IMPLEMENTED");
// for (auto inp_i : c10::irange(kernel->inputs().size())) {
// if (kernel->inputs()[inp_i] == kernel->outputs()[out_i]) {
// TORCH_INTERNAL_ASSERT(
// inp_i < inputs.size(),
// "Issue with an input showing up as output, couldn't find
// input.");
// TORCH_INTERNAL_ASSERT(
// inputs[inp_i].isTensor(),
// "Cannot register a scalar as an output in a fusion.");
// outputs.push_back(inputs[inp_i].toTensor());
// break;
// }
// }
} else {
TORCH_INTERNAL_ASSERT(
kernel->outputs()[out_i]->isA<TensorView>(),
"Cannot allocate outputs that are not tensors.");
auto output = kernel->outputs()[out_i]->as<TensorView>();
if (alias_indices.count(out_i) != 0) {
// aliasing to inputs, no need to allocate real output, just push empty
// tensor here.
outputs.emplace_back();
} else {
outputs.push_back(
inferAndAllocOutput(output, expr_eval, options_, false));
}
}
}
return outputs;
}
void FusionExecutor::setUsedTVs() {
auto used_vals = fusion_->usedMathVals();
auto used_tvs = ir_utils::filterByType<TensorView>(used_vals);
used_tvs_.clear();
used_tvs_.insert(used_tvs_.begin(), used_tvs.begin(), used_tvs.end());
}
KernelArgumentHolder FusionExecutor::evaluateOutputSizes(
const KernelArgumentHolder& args,
kir::ExpressionEvaluator& expr_eval,
const std::unordered_set<int>& alias_indices) {
FUSER_PERF_SCOPE("FusionExecutor::AllocOutputs");
const auto kernel = lowered_->kernel();
KernelArgumentHolder ret(args.getIndexMode());
ret.setDeviceIndex(args.getDeviceIndex());
CompileOptions meta_options = options_;
meta_options.device = c10::Device(DeviceType::Meta, 0);
for (const auto out_i : c10::irange(kernel->outputs().size())) {
// If the output is just trivially the input, just "copy" it over.
if (kernel->outputs()[out_i]->isFusionInput()) {
for (auto inp_i : c10::irange(kernel->inputs().size())) {
if (kernel->inputs()[inp_i] == kernel->outputs()[out_i]) {
TORCH_INTERNAL_ASSERT(
inp_i < args.size(),
"Issue with an input showing up as output, couldn't find input.");
auto tensor_arg_abstract =
dynamic_cast<const TensorArgAbstract*>(args[inp_i]);
TORCH_INTERNAL_ASSERT(
tensor_arg_abstract,
"Cannot register a scalar as an output in a fusion.");
ret.push(tensor_arg_abstract);
break;
}
}
} else {
TORCH_INTERNAL_ASSERT(
kernel->outputs()[out_i]->isA<TensorView>(),
"Cannot allocate outputs that are not tensors.");
auto output = kernel->outputs()[out_i]->as<TensorView>();
if (alias_indices.count(out_i) != 0) {
// aliasing to inputs, no need to allocate real output
// but we still need to push an entry here.
ret.push(int64_t(0));
} else {
// TODO: we are using meta here, which is bad since it doesn't account
// for devices. Switch to fake tensor instead
ret.push(inferAndAllocOutput(output, expr_eval, meta_options, false));
}
}
}
return ret;
}
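// A small standalone illustration of the meta-device trick above (just ATen,
// independent of this executor): meta tensors carry sizes/strides/dtype but
// allocate no device memory, so output shapes can be inferred without
// touching the GPU.
//
//   at::Tensor shape_only = at::empty(
//       {128, 256},
//       at::TensorOptions()
//           .dtype(at::kFloat)
//           .device(c10::Device(c10::DeviceType::Meta, 0)));
//   // shape_only.sizes() -> [128, 256]; no device allocation happens.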
KernelArgumentHolder FusionExecutor::inferOutputSizes(
const KernelArgumentHolder& args,
const LaunchParams& launch_constraints) {
FUSER_PERF_SCOPE("FusionExecutor::RunFusion");
ExecutorEntry* executor_entry = nullptr;
c10::optional<size_t> opt_code = args.getCacheId();
if (opt_code.has_value()) {
executor_entry = &executor_entry_lookup_[*opt_code];
}
executor_utils::initializeCudaContext();
TORCH_INTERNAL_ASSERT(lowered_);
TORCH_INTERNAL_ASSERT(
!executor_entry || !executor_entry->init,
"compile kernel shouldn't hit a pre-existing cache");
FUSER_PERF_SCOPE("ExecutorRunFusion::ValidateAndInitialize");
// TODO: validateKernelInputs currently won't be happy, since our fusion
// args are mapped to `meta` tensors instead of `cuda` tensors; check if this
// would be resolved with FakeTensor
// executor_utils::validateKernelInputs(fusion_, args, options_.device);
if (!evaluator_precomputed_values_) {
evaluator_precomputed_values_ =
std::make_unique<KernelPrecomputedValues>(lowered_->kernel());
}
kir::ExpressionEvaluator expr_eval;
evaluator_precomputed_values_->bindKernelInputs(lowered_->kernel(), args);
expr_eval.precomputedValues() = evaluator_precomputed_values_.get();
// I think this binds something to expr_eval, so even though we are not using
// launch_params_, we still need this in order to infer output shapes.
launch_params_ =
computeLaunchParams(launch_constraints, expr_eval, warp_size_);
executor_utils::validateVectorizedTensors(
lowered_.get()->kernel(), args, {}, compileTimeDataCache(), expr_eval);
auto alias_indices_entry = executor_utils::caching::ExecutorCompileTimeEntry<
executor_utils::caching::InputAliasIndices>(
compileTimeDataCache(), [&]() {
return std::make_unique<std::vector<std::pair<int, int>>>(
fusion_->getInputAliasIndices());
});
auto& alias_indices = alias_indices_entry.get();
// NOLINTNEXTLINE(bugprone-branch-clone)
auto output_alias_indices_entry =
executor_utils::caching::ExecutorCompileTimeEntry<
executor_utils::caching::OutputAliasIndices>(
compileTimeDataCache(), [&]() {
return std::make_unique<std::unordered_set<int>>(
fusion_->getOutputAliasIndices());
});
auto& output_alias_indices = output_alias_indices_entry.get();
auto ret = evaluateOutputSizes(args, expr_eval, output_alias_indices);
for (const auto& entry : alias_indices) {
auto aliased_output_index = entry.first;
auto aliased_input_index = entry.second;
TORCH_INTERNAL_ASSERT(
args[aliased_input_index]->isType(ArgType::Tensor),
"alias io only supports tensor");
ret.swap(aliased_output_index, args[aliased_input_index]);
}
return ret;
}
std::vector<at::Tensor> FusionExecutor::runFusion(
KernelArgumentHolder& args,
const LaunchParams& launch_constraints,
const std::vector<at::Tensor>& outputs) {
FUSER_PERF_SCOPE("FusionExecutor::RunFusion");
TORCH_INTERNAL_ASSERT(compiled());
TORCH_INTERNAL_ASSERT(
fusion_id_ > 0, "Cannot run fusion, it was not compiled.");
TORCH_INTERNAL_ASSERT(
!args.getCacheId().has_value() || outputs.empty(),
"short cut input cache is not compatible with pre-allocated output");
if (isDebugDumpEnabled(DebugDumpOption::FusionArgs)) {
std::cout << "Arguments for fusion" << fusion_id_ << ":" << std::endl
<< "Inputs:" << std::endl;
for (auto i : c10::irange(args.size())) {
args[i]->print();
}
std::cout << "Outputs:" << std::endl;
for (const auto& output : outputs) {
std::cout << " " << output.scalar_type() << " " << output.sizes()
<< " (strides = " << output.strides() << ")" << std::endl;
}
std::cout << launch_constraints.toString();
}
ExecutorEntry* executor_entry = nullptr;
if (args.getCacheId().has_value()) {
executor_entry = &executor_entry_lookup_[*args.getCacheId()];
}
c10::DeviceGuard dg(options_.device);
auto stream = at::cuda::getCurrentCUDAStream();
executor_utils::initializeCudaContext();
TORCH_INTERNAL_ASSERT(lowered_);
launch_params_ = LaunchParams();
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
std::vector<at::Tensor> allocated_outputs;
GlobalBuffers global_buffers;
uint64_t rand_offset = 0;
if (executor_entry && executor_entry->init && !disable_parameter_cache_) {
{
// context manager to disable auto grad for `empty_cuda` calls later
at::AutoDispatchBelowADInplaceOrView non_variable_type_mode;
// take the short-cut for launch if we see a recorded input set again
launch_params_ = executor_entry->launch_params;
// only allocate outputs when not given
if (outputs.empty()) {
FUSER_PERF_SCOPE("ExecutorRunFusion::OutputAlloc");
for (const auto i : c10::irange(executor_entry->output_sizes.size())) {
allocated_outputs.push_back(at::native::empty_strided_cuda(
executor_entry->output_sizes[i],
executor_entry->output_strides[i],
executor_entry->output_types[i],
c10::nullopt,
options_.device,
c10::nullopt));
}
// Note: aliased outputs are not returned as outputs. But we still need them
// for kernel execution, so we need to push them to args.
for (const auto& entry : executor_entry->io_alias_indices) {
auto aliased_output_index = entry.first;
auto aliased_input_index = entry.second;
auto tensor_arg_abstract =
dynamic_cast<const TensorArgAbstract*>(args[aliased_input_index]);
TORCH_INTERNAL_ASSERT(
tensor_arg_abstract, "alias io only supports tensor");
allocated_outputs[aliased_output_index] =
tensor_arg_abstract->getTensor();
}
args.push(allocated_outputs);
} else {
TORCH_INTERNAL_ASSERT(
outputs.size() == fusion_->outputs().size(),
__func__,
" provided number of outputs does match fusion output");
allocated_outputs = outputs;
args.push(outputs);
}
{
FUSER_PERF_SCOPE("ExecutorRunFusion::IntermediateBufferAlloc");
for (const auto i : c10::irange(executor_entry->buffer_sizes.size())) {
if (executor_entry->buffer_zero_init[i]) {
global_buffers.buffers.push_back(at::zeros(
executor_entry->buffer_sizes[i],
at::TensorOptions()
.dtype(executor_entry->buffer_types[i])
.device(options_.device)));
global_buffers.zero_init.push_back(true);
} else {
global_buffers.buffers.push_back(at::native::empty_cuda(
executor_entry->buffer_sizes[i],
executor_entry->buffer_types[i],
c10::nullopt,
options_.device,
c10::nullopt));
global_buffers.zero_init.push_back(false);
}
}
}
}
rand_offset = executor_entry->rand_offset;
} else {
FUSER_PERF_SCOPE("ExecutorRunFusion::ValidateAndInitialize");
// code path to take when either:
// 1. no opt_code is provided or
// 2. `executor_entry` is not initialized
executor_utils::validateKernelInputs(fusion_, args, options_.device);
if (!evaluator_precomputed_values_) {
evaluator_precomputed_values_ =
std::make_unique<KernelPrecomputedValues>(lowered_->kernel());
}
kir::ExpressionEvaluator expr_eval;
evaluator_precomputed_values_->bindKernelInputs(lowered_->kernel(), args);
expr_eval.precomputedValues() = evaluator_precomputed_values_.get();
launch_params_ =
computeLaunchParams(launch_constraints, expr_eval, warp_size_);
// Recompile the kernel if the number of threads in the block has increased
if (launch_params_.nThreads() > block_size_high_water_mark) {
const auto kernel = lowered_->kernel();
kernel_code_ = codegen::generateCudaKernel(kernel, kernelName());
const auto structured_code = getStructuredCode(kernel_code_);
block_size_high_water_mark = launch_params_.nThreads();
std::tie(compiled_kernel_, last_compiler_log_) =
executor_utils::nvrtcCompile(
structured_code,
(kernelNamespace() + "::" + kernelName()).c_str(),
fusion_id_,
block_size_high_water_mark);
}
if (kernel()->summary().has_cooperative_grid_reduction) {
#ifndef USE_ROCM
int num_blocks_per_SM = -1;
at::globalContext().getNVRTC().cuOccupancyMaxActiveBlocksPerMultiprocessor(
&num_blocks_per_SM,
compiled_kernel_.function,
(int)(launch_params_.bdimx() * launch_params_.bdimy() * launch_params_.bdimz()),
(size_t)launch_params_.smem());
TORCH_INTERNAL_ASSERT(
(int64_t)(
num_blocks_per_SM *
at::cuda::getDeviceProperties(options_.device.index())
->multiProcessorCount) >= launch_params_.gdimx() *
launch_params_.gdimy() * launch_params_.gdimz(),
"Wanted to launch a cooperative kernel, however the number of blocks is greater than ",
"what can be resident on the GPU at once. Need: ",
launch_params_.gdimx() * launch_params_.gdimy() *
launch_params_.gdimz(),
" (",
launch_params_.gdimx(),
" * ",
launch_params_.gdimy(),
" * ",
launch_params_.gdimz(),
") but limited to ",
num_blocks_per_SM,
" * ",
at::cuda::getDeviceProperties(options_.device.index())
->multiProcessorCount);
#else
TORCH_INTERNAL_ASSERT(
false, "Cross grid communication not supported with HIP.");
#endif
}
executor_utils::validateVectorizedTensors(
lowered_.get()->kernel(),
args,
outputs,
compileTimeDataCache(),
expr_eval);
auto alias_indices_entry =
executor_utils::caching::ExecutorCompileTimeEntry<
executor_utils::caching::InputAliasIndices>(
compileTimeDataCache(), [&]() {
return std::make_unique<std::vector<std::pair<int, int>>>(
fusion_->getInputAliasIndices());
});
auto& alias_indices = alias_indices_entry.get();
// NOLINTNEXTLINE(bugprone-branch-clone)
if (outputs.empty()) {
auto output_alias_indices_entry =
executor_utils::caching::ExecutorCompileTimeEntry<
executor_utils::caching::OutputAliasIndices>(
compileTimeDataCache(), [&]() {
return std::make_unique<std::unordered_set<int>>(
fusion_->getOutputAliasIndices());
});
auto& output_alias_indices = output_alias_indices_entry.get();
allocated_outputs = allocOutputs(expr_eval, output_alias_indices);
for (const auto& entry : alias_indices) {
auto aliased_output_index = entry.first;
auto aliased_input_index = entry.second;
auto tensor_arg_abstract =
dynamic_cast<const TensorArgAbstract*>(args[aliased_input_index]);
TORCH_INTERNAL_ASSERT(
tensor_arg_abstract, "alias io only supports tensor");
allocated_outputs[aliased_output_index] =
tensor_arg_abstract->getTensor();
}
args.push(allocated_outputs);
} else {
allocated_outputs = outputs;
args.push(outputs);
executor_utils::validateKernelOutputs(
fusion_, allocated_outputs, options_.device);
}
global_buffers = allocGlobalVals(expr_eval);
if (kernel()->summary().max_rng_offsets >= 0) {
// NOTE: this is how we map the offset to pointwise kernels in order to have
// an identical random number generator that matches native PyTorch results.
// But it doesn't really work, as it makes assumptions about how threads are
// bound, which is not generally how the scheduler handles it.
// Refer to `Philox` in the generated kernel to understand how the mapping
// works.
rand_offset = (kernel()->summary().max_rng_offsets + 1) * 4;
}
// This is the entry when we have provided `opt_code` but the entry has not
// been initialized yet.
if (executor_entry) {
FUSER_PERF_SCOPE("ExecutorRunFusion::FillCacheEntry");
// record the short-cut executor entry for the given input set;
executor_entry->launch_params = launch_params_;
executor_entry->io_alias_indices = alias_indices;
for (const auto& output : allocated_outputs) {
executor_entry->output_sizes.push_back(output.sizes().vec());
executor_entry->output_strides.push_back(output.strides().vec());
executor_entry->output_types.push_back(output.scalar_type());
}
for (const auto& i : c10::irange(global_buffers.buffers.size())) {
executor_entry->buffer_sizes.push_back(
global_buffers.buffers[i].sizes().vec());
executor_entry->buffer_types.push_back(
global_buffers.buffers[i].scalar_type());
executor_entry->buffer_zero_init.push_back(global_buffers.zero_init[i]);
}
executor_entry->rand_offset = rand_offset;
executor_entry->init = true;
}
}
// push back global buffers
args.push(global_buffers.buffers);
// push back RNG state if needed
if (lowered_->kernel()->summary().max_rng_offsets >= 0) {
args.appendPhiloxRNGSeed(rand_offset);
}
if (isDebugDumpEnabled(DebugDumpOption::LaunchParam)) {
launch_params_.print();
}
if (isDebugDumpEnabled(DebugDumpOption::KernelArgs)) {
std::cout << "Arguments for kernel" << fusion_id_ << ":" << std::endl
<< "Inputs:" << std::endl;
for (auto i : c10::irange(args.size())) {
args[i]->print();
}
std::cout << "Outputs:" << std::endl;
// note: add aliased outputs here.
for (const auto& output : allocated_outputs) {
std::cout << " " << output.scalar_type() << " " << output.sizes()
<< " (strides = " << output.strides()
<< ", address = " << output.data_ptr() << ")" << std::endl;
}
std::cout << "Reduction and semaphore buffers:" << std::endl;
TORCH_INTERNAL_ASSERT(
global_buffers.buffers.size() == global_buffers.zero_init.size(),
"global_buffer buffer & zero_init container should have identical sizes");
for (const auto i : c10::irange(global_buffers.buffers.size())) {
const auto& buffer = global_buffers.buffers[i];
const auto& zero_init = global_buffers.zero_init[i];
std::cout << " " << buffer.scalar_type() << " " << buffer.sizes()
<< " is_zero_initialized: " << zero_init << std::endl;
}
}
cudaEvent_t start_event = {};
cudaEvent_t finish_event = {};
if (measure_kernel_time_ ||
isDebugDumpEnabled(DebugDumpOption::EffectiveBandwidth) ||
isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) {
C10_CUDA_CHECK(cudaEventCreate(&start_event));
C10_CUDA_CHECK(cudaEventCreate(&finish_event));
C10_CUDA_CHECK(cudaEventRecord(start_event));
}
if (execute_kernel_) {
if (maybe_available_dynamic_smem_.has_value() &&
launch_params_.smem() > maybe_available_dynamic_smem_.value()) {
#ifndef USE_ROCM
// Increase limit of dynamic shared memory if needed.
AT_CUDA_DRIVER_CHECK(at::globalContext().getNVRTC().cuFuncSetAttribute(
compiled_kernel_.function,
CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
launch_params_.smem()));
#else
TORCH_INTERNAL_ASSERT(
false, "cuFuncSetAttribute not supported with HIP.");
#endif
}
if (!kernel()->summary().has_cooperative_grid_reduction) {
FUSER_PERF_SCOPE("ExecutorRunFusion::cuLaunchKernel");
AT_CUDA_DRIVER_CHECK(at::globalContext().getNVRTC().cuLaunchKernel(
compiled_kernel_.function,
launch_params_.gdimx(),
launch_params_.gdimy(),
launch_params_.gdimz(),
launch_params_.bdimx(),
launch_params_.bdimy(),
launch_params_.bdimz(),
launch_params_.smem(),
stream,
args.getBuffer(),
nullptr));
} else {
#ifndef USE_ROCM
FUSER_PERF_SCOPE("ExecutorRunFusion::cuLaunchCooperativeKernel");
AT_CUDA_DRIVER_CHECK(
at::globalContext().getNVRTC().cuLaunchCooperativeKernel(
compiled_kernel_.function,
launch_params_.gdimx(),
launch_params_.gdimy(),
launch_params_.gdimz(),
launch_params_.bdimx(),
launch_params_.bdimy(),
launch_params_.bdimz(),
launch_params_.smem(),
stream,
args.getBuffer()));
#else
TORCH_INTERNAL_ASSERT(
false, "Cross grid communication not supported with HIP.");
#endif
}
}
if (measure_kernel_time_ ||
isDebugDumpEnabled(DebugDumpOption::EffectiveBandwidth) ||
isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) {
C10_CUDA_CHECK(cudaEventRecord(finish_event));
C10_CUDA_CHECK(cudaEventSynchronize(start_event));
C10_CUDA_CHECK(cudaEventSynchronize(finish_event));
C10_CUDA_CHECK(
cudaEventElapsedTime(&kernel_time_ms_, start_event, finish_event));
C10_CUDA_CHECK(cudaEventDestroy(start_event));
C10_CUDA_CHECK(cudaEventDestroy(finish_event));
bytes_processed_ = 0;
// Figure out how many bytes the inputs, outputs, and temporary buffers occupy
for (auto i : c10::irange(args.size())) {
if (auto tensor_arg_abstract =
dynamic_cast<const TensorArgAbstract*>(args[i])) {
bytes_processed_ += tensor_arg_abstract->numel() *
dataTypeSize(tensor_arg_abstract->getDataType());
}
}
for (const auto& output : allocated_outputs) {
bytes_processed_ += output.numel() *
dataTypeSize(aten_to_data_type(output.scalar_type()));
}
if (isDebugDumpEnabled(DebugDumpOption::EffectiveBandwidth)) {
double gb_per_s =
((double)bytes_processed_ / ((double)kernel_time_ms_ / 1000)) /
(double)1.0e9;
std::cout << "kernel" << fusion_id_ << " run in " << kernel_time_ms_
<< " ms, achieved: " << gb_per_s << " GB/s" << std::endl;
}
}
if (isOptionEnabled(EnableOption::KernelProfile)) {
std::cout << kernel()->profile().toString(global_buffers.profile_buffer);
}
return allocated_outputs;
}
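// The timing/bandwidth measurement above follows the standard CUDA event
// pattern; a minimal standalone sketch (illustrative, recorded on the current
// stream) looks like:
//
//   cudaEvent_t start, stop;
//   C10_CUDA_CHECK(cudaEventCreate(&start));
//   C10_CUDA_CHECK(cudaEventCreate(&stop));
//   C10_CUDA_CHECK(cudaEventRecord(start));
//   // ... launch kernel ...
//   C10_CUDA_CHECK(cudaEventRecord(stop));
//   C10_CUDA_CHECK(cudaEventSynchronize(stop));
//   float ms = 0.f;
//   C10_CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));
//   C10_CUDA_CHECK(cudaEventDestroy(start));
//   C10_CUDA_CHECK(cudaEventDestroy(stop));
//
// Effective bandwidth, as computed above, is bytes moved / seconds / 1e9;
// e.g. 1 GiB in 2 ms gives (1073741824 / 0.002) / 1e9 ~= 536.9 GB/s.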
void FusionExecutor::compileRtc(
const std::string& code,
const std::string& name,
bool structured,
CompileOptions options) {
FUSER_PERF_SCOPE("ExecutorRunFusion::compileRtc");
std::string scode;
if (!structured) {
scode = getStructuredCode(code);
} else {
scode = code;
}
fusion_id_ = 1;
options_ = options;
std::tie(compiled_kernel_, last_compiler_log_) =
executor_utils::nvrtcCompile(scode, name, fusion_id_);
}
void FusionExecutor::runRtc(
const LaunchParams& launch_params,
const std::vector<at::Tensor>& args) {
FUSER_PERF_SCOPE("runFusion");
c10::DeviceGuard dg(options_.device);
auto stream = at::cuda::getCurrentCUDAStream();
KernelArgumentHolder kernel_arguments(options_.index_mode);
kernel_arguments.push(args);
AT_CUDA_DRIVER_CHECK(at::globalContext().getNVRTC().cuLaunchKernel(
compiled_kernel_.function,
launch_params.gdimx(),
launch_params.gdimy(),
launch_params.gdimz(),
launch_params.bdimx(),
launch_params.bdimy(),
launch_params.bdimz(),
launch_params.smem(),
stream,
kernel_arguments.getBuffer(),
nullptr));
}
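// A minimal usage sketch for the RTC path above (everything here is
// hypothetical and not defined in this file; with structured = true the
// string is compiled as-is, so the caller is responsible for providing a
// source that nvrtcCompile can resolve by `name`):
//
//   FusionExecutor fe;
//   std::string src = R"(
//   __global__ void my_kernel(float* out) { out[threadIdx.x] = 1.f; }
//   )";
//   fe.compileRtc(src, "my_kernel", /*structured=*/true, CompileOptions());
//   LaunchParams lp; // caller sets grid/block dims and smem
//   fe.runRtc(lp, {out_tensor}); // out_tensor: a pre-allocated CUDA tensor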
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch