#include <torch/csrc/jit/codegen/cuda/executor_utils.h>
#include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
#include <torch/csrc/jit/codegen/cuda/lower_utils.h>

#include <ATen/cuda/CUDAContext.h>

#include <gtest/gtest.h>

namespace torch {
namespace jit {
namespace fuser {
namespace cuda {

inline bool deviceMajorMinorCheck(int major, int minor = 0) {
  auto dev_prop = at::cuda::getDeviceProperties(0);
  if (dev_prop->major < major ||
      (dev_prop->major == major && dev_prop->minor < minor)) {
    return false;
  }
  return true;
}

class NVFuserTest : public ::testing::Test {
 protected:
  void SetUp() override {
    // requires PASCAL or newer
    if (!deviceMajorMinorCheck(6)) {
      GTEST_SKIP() << "skipping tests on pre-PASCAL GPUs";
    }
  }
};

struct ValidationConstants {
  // Tolerances generated from randn + add + sum fusion
  // compared against double precision
  std::array<std::array<double, 2>, 20> sum_tolerances_float = {
      {{4, 1.51992e-06},      {8, 2.23704e-06},      {16, 2.95788e-06},
       {32, 4.4778e-06},      {64, 6.75395e-06},     {128, 8.57934e-06},
       {256, 1.30594e-05},    {512, 2.19122e-05},    {1024, 3.3451e-05},
       {2048, 5.78476e-05},   {4096, 0.000108292},   {8192, 0.00012207},
       {16384, 0.000136882},  {32768, 0.000248561},  {65536, 0.000407594},
       {131072, 0.000500901}, {262144, 0.000923019}, {524288, 0.00156909},
       {1048576, 0.00223107}, {2097152, 0.00343043}}};

  // Tolerances generated from randn + add + sum fusion
  // compared against double precision
  std::array<std::array<double, 2>, 20> sum_tolerances_half = {
      {{4, 0.00390625},    {8, 0.0078125},    {16, 0.0078125},
       {32, 0.0155334},    {64, 0.0156269},   {128, 0.0312042},
       {256, 0.0312548},   {512, 0.0619979},  {1024, 0.0625103},
       {2048, 0.124686},   {4096, 0.12501},   {8192, 0.24945},
       {16384, 0.250049},  {32768, 0.498946}, {65536, 0.500071},
       {131072, 0.985087}, {262144, 1.00006}, {524288, 1.99234},
       {1048576, 2.00032}, {2097152, 3.99073}}};

  double base_half_abs_tol = -1;
  double base_half_rel_tol = -1;
  double base_float_abs_tol = -1;
  double base_float_rel_tol = -1;
};
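// Worked example of how the tables above are consulted (a sketch; the actual
// lookup is getTolerance below). Each row is {reduction_size, max abs error
// observed against double precision}. For a float sum over 1000 elements the
// search stops at the row {1024, 3.3451e-05}; the next row up,
// {2048, 5.78476e-05}, is taken as the absolute tolerance to leave some
// margin, and the relative tolerance is 1% of that:
//
//   ValidationConstants tolerances;
//   auto tol = getTolerance(DataType::Float, 1000, tolerances);
//   // tol.first  == 5.78476e-05  (absolute tolerance)
//   // tol.second == 5.78476e-07  (relative tolerance, 1% of absolute)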
namespace {

// Returns abs and relative values to use for validation
std::pair<double, double> getTolerance(
    DataType dtype,
    int64_t reduction_size,
    const ValidationConstants& tolerances) {
  switch (dtype) {
    case DataType::Float:
    // TODO: Pull new tolerances for Double, for now we will just use float
    // tolerances as it should be no worse.
    case DataType::Double: {
      const auto& sum_tolerance_entry = tolerances.sum_tolerances_float;
      const auto& base_abs = tolerances.base_float_abs_tol;
      const auto& base_rel = tolerances.base_float_rel_tol;

      if (reduction_size <= 1) {
        // No reduction case
        if (base_abs == -1 || base_rel == -1) {
          return {sum_tolerance_entry[0][1], sum_tolerance_entry[1][1]};
        } else {
          return {base_abs, base_rel};
        }
      } else {
        // Reduction case
        size_t entry = 0;
        while (entry < sum_tolerance_entry.size() &&
               sum_tolerance_entry[entry][0] < reduction_size) {
          entry++;
        }
        double abs_tol = 0.0;
        if (entry + 1 < sum_tolerance_entry.size()) {
          // Grab the next entry up so we have some margin
          abs_tol = sum_tolerance_entry[entry + 1][1];
        } else {
          // If we hit the end of the list, return twice the max error we
          // measured
          abs_tol = sum_tolerance_entry[sum_tolerance_entry.size() - 1][1] * 2.;
        }
        // Relative tol we're going to set to 1% of abs tol just for
        // a small margin of rel error.
        return {abs_tol, abs_tol * 0.01};
      }
    }
    case DataType::Half: {
      // Copied from float case
      const auto& sum_tolerance_entry = tolerances.sum_tolerances_half;
      const auto& base_abs = tolerances.base_half_abs_tol;
      const auto& base_rel = tolerances.base_half_rel_tol;

      if (reduction_size <= 1) {
        // No reduction case
        if (base_abs == -1 || base_rel == -1) {
          return {sum_tolerance_entry[0][1], sum_tolerance_entry[1][1]};
        } else {
          return {base_abs, base_rel};
        }
      } else {
        // Reduction case
        size_t entry = 0;
        while (entry < sum_tolerance_entry.size() &&
               sum_tolerance_entry[entry][0] < reduction_size) {
          entry++;
        }
        double abs_tol = 0.0;
        if (entry + 1 < sum_tolerance_entry.size()) {
          // Grab the next entry up so we have some margin
          abs_tol = sum_tolerance_entry[entry + 1][1];
        } else {
          // If we hit the end of the list, return twice the max error we
          // measured
          abs_tol = sum_tolerance_entry[sum_tolerance_entry.size() - 1][1] * 2.;
        }
        // Relative tol we're going to set to 1% of abs tol just for
        // a small margin of rel error.
        return {abs_tol, abs_tol * 0.01};
      }
    }
    case DataType::BFloat16: {
      // Copied from float case
      const auto& sum_tolerance_entry = tolerances.sum_tolerances_half;
      const auto& base_abs = tolerances.base_half_abs_tol;
      const auto& base_rel = tolerances.base_half_rel_tol;

      if (reduction_size <= 1) {
        // No reduction case
        if (base_abs == -1 || base_rel == -1) {
          return {sum_tolerance_entry[0][1], sum_tolerance_entry[1][1]};
        } else {
          return {base_abs * 10.0, base_rel * 10.0};
        }
      } else {
        // Reduction case
        size_t entry = 0;
        while (entry < sum_tolerance_entry.size() &&
               sum_tolerance_entry[entry][0] < reduction_size) {
          entry++;
        }
        double abs_tol = 0.0;
        if (entry + 1 < sum_tolerance_entry.size()) {
          // Grab the next entry up so we have some margin
          abs_tol = sum_tolerance_entry[entry + 1][1];
        } else {
          // If we hit the end of the list, return twice the max error we
          // measured
          abs_tol = sum_tolerance_entry[sum_tolerance_entry.size() - 1][1] * 2.;
        }
        // Relative tol we're going to set to 1% of abs tol just for
        // a small margin of rel error.
        return {abs_tol * 10.0, abs_tol * 0.01 * 10.0};
      }
    }
    case DataType::Int:
      return {0.0, 0.0};
    case DataType::Int32:
      return {0.0, 0.0};
    case DataType::Bool:
      return {0.0, 0.0};
    default:
      TORCH_INTERNAL_ASSERT(
          false, "Do not have tolerance computation for type ", dtype, ".");
  }
}
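// A sketch of the lookup at the far end of the range: reductions larger than
// the last table row (2097152 elements) fall off the table, so the tolerance
// becomes twice the largest measured error:
//
//   ValidationConstants tolerances;
//   // Half-precision sum over 4194304 elements, past the end of the table:
//   auto tol = getTolerance(DataType::Half, 4194304, tolerances);
//   // tol.first  == 2 * 3.99073 == 7.98146
//   // tol.second == 0.0798146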
class ReductionSizeMapper : private IterVisitor {
 public:
  //! Runs through the fusion and determines how many reductions were performed
  //! to compute each tensorview.
  static std::unordered_map<TensorView*, int64_t> computeReductionSizes(
      Fusion* fusion,
      ExpressionEvaluator& expr_eval) {
    ReductionSizeMapper mapper(fusion, expr_eval);
    return mapper.reduction_map;
  }

 private:
  ReductionSizeMapper(Fusion* fusion, ExpressionEvaluator& expr_eval)
      : expr_eval_(expr_eval) {
    // Initialize input values
    for (auto inp : fusion->inputs()) {
      if (inp->isA<TensorView>()) {
        auto tv = inp->as<TensorView>();
        // Shouldn't have any reductions, but run it through analysis anyways.
        reduction_map[tv] = getReductionSize(tv);
      }
    }

    IterVisitor::traverse(fusion);

    // catch up with dangling outputs
    for (auto out : fusion->outputs()) {
      if (out->isA<TensorView>()) {
        auto tv = out->as<TensorView>();
        // possible that we have a dangling output that's not generated by any
        // expression. e.g. 0 workspace or null tensor
        if (reduction_map.count(tv) == 0) {
          // Shouldn't have any reductions, but run it through analysis
          // anyways.
          reduction_map[tv] = getReductionSize(tv);
        }
      }
    }
  }

  int64_t getReductionSize(const TensorView* tv) {
    int64_t reduction_elements = 1;
    for (auto id : tv->getMaybeRFactorDomain()) {
      if (id->isReduction()) {
        auto inferred_extent = expr_eval_.evaluate(id->extent());
        TORCH_INTERNAL_ASSERT(
            inferred_extent.has_value(),
            "Couldn't figure out what the dimensions of a tensorview is in evaluation for validation. ",
            id,
            " in ",
            tv);
        reduction_elements = reduction_elements * inferred_extent.value();
      }
    }
    return reduction_elements;
  }

  void handle(Expr* expr) override {
    if (!ir_utils::isTvOp(expr)) {
      return;
    }

    int64_t inp_reduction_elements = 1;
    for (auto inp : expr->inputs()) {
      if (inp->isA<TensorView>()) {
        if (auto tv = inp->as<TensorView>()) {
          inp_reduction_elements =
              std::max(inp_reduction_elements, reduction_map.at(tv));
        }
      }
    }

    for (auto out : expr->outputs()) {
      if (out->isA<TensorView>()) {
        auto tv = out->as<TensorView>();
        reduction_map[tv] = getReductionSize(tv) * inp_reduction_elements;
      }
    }
  }

 private:
  using IterVisitor::handle;

  std::unordered_map<TensorView*, int64_t> reduction_map;
  ExpressionEvaluator& expr_eval_;
};

ExpressionEvaluator bindInputsAndLaunchParams(
    Fusion* fusion,
    const at::ArrayRef<IValue>& aten_inputs,
    const LaunchParams& launch_constraints) {
  auto expr_eval = executor_utils::bindFusionInputs(aten_inputs, fusion);
  for (auto val : fusion->vals()) {
    if (!val->isA<TensorView>()) {
      continue;
    }

    // Roughly taken from executor.cpp/computeLaunchParams
    auto tv = val->as<TensorView>();
    for (auto id : tv->domain()->domain()) {
      if (!(id->isThread() && id->extent()->definition() == nullptr)) {
        continue;
      }

      if (id->isBroadcast()) {
        continue;
      }

      auto extent = id->extent();
      auto inferred_extent = expr_eval.evaluate(extent);
      auto p_type = id->getParallelType();

      if (inferred_extent.has_value()) {
        // This value could have been inferred, make sure it was set right.
        TORCH_CHECK(
            inferred_extent.value() == launch_constraints.getDim(p_type) ||
                launch_constraints.getRawVal(p_type) == -1,
            "inferred that ",
            p_type,
            " should be set to ",
            inferred_extent.value(),
            " but launch constraints specified ",
            launch_constraints.getRawVal(p_type));
      } else {
        // Bind the launch constraint into our evaluation context
        if (launch_constraints.hasDim(id->getParallelType())) {
          expr_eval.bind(extent, launch_constraints.getDim(p_type));
        }
      }
    }
  }
  return expr_eval;
}

} // namespace
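// How reduction sizes propagate, in outline: a pointwise op carries through
// the largest reduction size among its inputs, and a reduction op multiplies
// that by the number of elements it reduces. For a hypothetical fusion
//
//   auto tv2 = add(tv0, tv1); // reduction size 1
//   auto tv3 = sum(tv2, {1}); // dim 1 has extent 1024
//
// ReductionSizeMapper::computeReductionSizes reports 1024 for tv3, so
// getTolerance(DataType::Float, 1024, ...) lands on the {1024, 3.3451e-05}
// row and returns the next row's tolerance, 5.78476e-05.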
// Validation will look through the fusion and figure out how many elements
// were reduced to create each output. It will then compute a tolerance to use
// for allclose based on experimental results. The experimental results were
// based on adding two tensors then summing them. This of course has an
// assumption that we're always summing values between -2 and 2. If we start
// summing values larger than that this approach might not hold.
inline void testValidate(
    Fusion* fusion,
    const std::vector<at::Tensor>& fusion_outputs,
    const at::ArrayRef<IValue>& aten_inputs,
    const std::vector<at::Tensor>& aten_outputs,
    int line_number,
    const char* file_name,
    std::string err_msg = "",
    const LaunchParams& lparams = LaunchParams(),
    const ValidationConstants& tolerances = ValidationConstants()) {
  FusionGuard fg(fusion);
  auto expr_eval = bindInputsAndLaunchParams(fusion, aten_inputs, lparams);

  auto reduction_sizes =
      ReductionSizeMapper::computeReductionSizes(fusion, expr_eval);

  TORCH_INTERNAL_ASSERT(
      fusion_outputs.size() == aten_outputs.size() &&
          aten_outputs.size() == fusion->outputs().size(),
      "Number of outputs doesn't match.");

  TORCH_INTERNAL_ASSERT(
      fusion->inputs().size() == aten_inputs.size(),
      "Number of inputs doesn't match.");

  for (size_t i = 0; i < fusion->inputs().size(); i++) {
    if (fusion->inputs()[i]->isA<TensorView>()) {
      TORCH_INTERNAL_ASSERT(
          aten_inputs[i].isTensor(), "Mismatch of tensor inputs.");

      auto fusion_input_tv = fusion->inputs()[i]->as<TensorView>();
      auto at_tensor = aten_inputs[i].toTensor();

      TORCH_INTERNAL_ASSERT(
          at_tensor.dim() ==
              TensorDomain::noReductions(
                  fusion_input_tv->getMaybeRFactorDomain())
                  .size(),
          "Dimensionality mismatch in inputs.");
    }
  }

  for (size_t i = 0; i < fusion->outputs().size(); i++) {
    TORCH_INTERNAL_ASSERT(
        fusion->outputs()[i]->isA<TensorView>(), "Mismatch of tensor outputs.");

    auto fusion_output_tensor = fusion_outputs[i];
    auto fusion_output_tv = fusion->outputs()[i]->as<TensorView>();
    auto aten_output_tensor = aten_outputs[i];

    TORCH_INTERNAL_ASSERT(
        reduction_sizes.count(fusion_output_tv),
        "Missed reduction size count on fusion output at index: ",
        i);

    int64_t reduction_size = reduction_sizes.at(fusion_output_tv);

    TORCH_INTERNAL_ASSERT(
        aten_output_tensor.dim() == fusion_output_tensor.dim() &&
            fusion_outputs[i].dim() ==
                TensorDomain::noReductions(
                    fusion_output_tv->getMaybeRFactorDomain())
                    .size(),
        "Dimensionality mismatch in outputs.");

    auto tolerance_values = getTolerance(
        fusion_output_tv->getDataType().value(), reduction_size, tolerances);

    if (aten_output_tensor.is_floating_point()) {
      TORCH_INTERNAL_ASSERT(
          aten_output_tensor.allclose(
              fusion_output_tensor.to(aten_output_tensor.dtype()),
              tolerance_values.second,
              tolerance_values.first),
          "\n",
          err_msg,
          "\nValidation error in output ",
          i,
          " on line ",
          line_number,
          " in file ",
          file_name,
          ".\n  Detected abs error of: ",
          aten_output_tensor.sub(fusion_output_tensor)
              .abs()
              .max()
              .item()
              .to<double>(),
          "\n  absolute tolerance was set to ",
          tolerance_values.first,
          "\n  and relative tolerance set to ",
          tolerance_values.second);
    } else {
      TORCH_INTERNAL_ASSERT(
          aten_output_tensor.equal(
              fusion_output_tensor.to(aten_output_tensor.dtype())),
          "\n",
          err_msg,
          ".\n  Validation error in output ",
          i,
          " on line ",
          line_number,
          " in file ",
          file_name,
          ".\n  Values are not equal and are not a floating type.");
    }
  }
}

} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch
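// Intended call pattern from a test body (a sketch; the test name, tensors,
// and the way cg_outputs/aten_output are produced are made up for
// illustration):
//
//   TEST_F(NVFuserTest, FusionAddSum_CUDA) {
//     Fusion fusion;
//     FusionGuard fg(&fusion);
//     // ... build the fusion, execute it to get cg_outputs, and compute the
//     // reference aten_output with plain ATen ops, then:
//     testValidate(
//         &fusion, cg_outputs, {t0, t1}, {aten_output}, __LINE__, __FILE__);
//   }
//
// Passing __LINE__ and __FILE__ lets the assert messages above point back at
// the failing call site.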