[c10] Move profiler clock to libc10 for timestamps (#111972)

Summary:
Move the profiler's Approximate Clock from libtorch to libc10. The main reason is to allow c10 features to get time.

The clock uses the TSC when available, for performance. The CUDA Caching Allocator's memory-snapshot implementation will add timestamps to memory events using this same clock in a subsequent diff.

Test Plan: CI

Differential Revision: D50601935

Pulled By: aaronenyeshi

Pull Request resolved: https://github.com/pytorch/pytorch/pull/111972
Approved by: https://github.com/davidberard98
This commit is contained in:
Aaron Enye Shi 2023-10-27 16:18:40 +00:00 committed by PyTorch MergeBot
parent fdbb73fa4e
commit 63c089b09d
16 changed files with 283 additions and 242 deletions

View file

@ -0,0 +1,79 @@
#include <c10/util/ApproximateClock.h>
#include <c10/util/ArrayRef.h>
#include <c10/util/irange.h>
#include <fmt/format.h>
namespace c10 {
// Capture the initial batch of (unix time, approximate time) measurement
// pairs at construction; makeConverter() later takes a second batch and uses
// the two to estimate the approximate clock's rate and offset.
ApproximateClockToUnixTimeConverter::ApproximateClockToUnixTimeConverter()
: start_times_(measurePairs()) {}
ApproximateClockToUnixTimeConverter::UnixAndApproximateTimePair
ApproximateClockToUnixTimeConverter::measurePair() {
  // Bracket the wall-clock read with two approximate-clock reads so the
  // midpoint estimate is not biased toward either side of the system call.
  const auto approx_before = getApproximateTime();
  const auto wall_now = std::chrono::system_clock::now();
  const auto approx_after = getApproximateTime();
  TORCH_INTERNAL_ASSERT(approx_after >= approx_before, "getCount is non-monotonic.");
  const auto wall_ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
      wall_now.time_since_epoch());
  // Midpoint via `x + (y - x) / 2`, which is more numerically stable than
  // `(x + y) / 2` for large tick counts.
  return {wall_ns.count(), approx_before + (approx_after - approx_before) / 2};
}
ApproximateClockToUnixTimeConverter::time_pairs
ApproximateClockToUnixTimeConverter::measurePairs() {
  // Warm up both clocks (instruction caches, any lazy clock initialization)
  // before taking the measurements that will actually be used.
  static constexpr auto n_warmup = 5;
  for (C10_UNUSED const auto _ : c10::irange(n_warmup)) {
    getApproximateTime();
    static_cast<void>(steady_clock_t::now());
  }
  // Fill every replicate slot with a fresh measurement pair.
  time_pairs out;
  for (auto& pair : out) {
    pair = measurePair();
  }
  return out;
}
// Build a callable that maps approximate-clock readings (e.g. TSC ticks) to
// nanoseconds since the unix epoch, using the start-time pairs captured at
// construction and a fresh batch of end-time pairs taken here.
std::function<time_t(approx_time_t)> ApproximateClockToUnixTimeConverter::
makeConverter() {
  auto end_times = measurePairs();
  // Compute the real time that passes for each tick of the approximate clock.
  std::array<long double, replicates> scale_factors{};
  for (const auto i : c10::irange(replicates)) {
    auto delta_ns = end_times[i].t_ - start_times_[i].t_;
    auto delta_approx = end_times[i].approx_t_ - start_times_[i].approx_t_;
    // Divide in long double to match the storage type instead of computing in
    // double and silently widening afterwards.
    scale_factors[i] = (long double)delta_ns / (long double)delta_approx;
  }
  std::sort(scale_factors.begin(), scale_factors.end());
  // `replicates` (1001) is odd, so the median of the sorted array is the
  // middle element at index `replicates / 2` (the earlier `+ 1` was off by
  // one and selected the element just above the median).
  long double scale_factor = scale_factors[replicates / 2];
  // We shift all times by `t0` for better numerics. Double precision only has
  // 16 decimal digits of accuracy, so if we blindly multiply times by
  // `scale_factor` we may suffer from precision loss. The choice of `t0` is
  // mostly arbitrary; we just need a factor that is the correct order of
  // magnitude to bring the intermediate values closer to zero. We are not,
  // however, guaranteed that `t0_approx` is *exactly* the getApproximateTime
  // equivalent of `t0`; it is only an estimate that we have to fine tune.
  auto t0 = start_times_[0].t_;
  auto t0_approx = start_times_[0].approx_t_;
  std::array<double, replicates> t0_correction{};
  for (const auto i : c10::irange(replicates)) {
    auto dt = start_times_[i].t_ - t0;
    auto dt_approx =
        (double)(start_times_[i].approx_t_ - t0_approx) * scale_factor;
    t0_correction[i] = dt - (time_t)dt_approx; // NOLINT
  }
  // Fine-tune t0 with the median correction (middle index of an odd-length
  // array, same fix as above).
  t0 += t0_correction[t0_correction.size() / 2]; // NOLINT
  return [=](approx_time_t t_approx) {
    // See above for why this is more stable than `A * t_approx + B`.
    return (time_t)((double)(t_approx - t0_approx) * scale_factor) + t0;
  };
}
} // namespace c10

121
c10/util/ApproximateClock.h Normal file
View file

@ -0,0 +1,121 @@
// Copyright 2023-present Facebook. All Rights Reserved.
#pragma once
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <list>
#include <string>
#include <unordered_map>
#include <vector>
#include <c10/macros/Macros.h>
#include <c10/util/Optional.h>
#include <c10/util/hash.h>
#ifndef _WIN32
#include <ctime>
#endif
#if defined(C10_IOS) && defined(C10_MOBILE)
#include <sys/time.h> // for gettimeofday()
#endif
#if defined(__i386__) || defined(__x86_64__) || defined(__amd64__)
#define C10_RDTSC
#if defined(_MSC_VER)
#include <intrin.h>
#elif defined(__CUDACC__) || defined(__HIPCC__)
#undef C10_RDTSC
#elif defined(__clang__)
// `__rdtsc` is available by default.
// NB: This has to be first, because Clang will also define `__GNUC__`
#elif defined(__GNUC__)
#include <x86intrin.h>
#else
#undef C10_RDTSC
#endif
#endif
namespace c10 {
// Nanoseconds since some epoch, stored as a signed 64-bit count. Shadows
// ::time_t inside namespace c10 by design.
using time_t = int64_t;
// Prefer high_resolution_clock, but only when it is actually steady;
// otherwise fall back to steady_clock so readings never move backwards.
using steady_clock_t = std::conditional<
std::chrono::high_resolution_clock::is_steady,
std::chrono::high_resolution_clock,
std::chrono::steady_clock>::type;
// Wall-clock time as nanoseconds since the unix epoch.
inline time_t getTimeSinceEpoch() {
  using std::chrono::duration_cast;
  using std::chrono::nanoseconds;
  const auto since_epoch = std::chrono::system_clock::now().time_since_epoch();
  return duration_cast<nanoseconds>(since_epoch).count();
}
// Current time in nanoseconds. By default this is wall-clock (realtime);
// passing `allow_monotonic = true` permits CLOCK_MONOTONIC, but note that the
// flag only has an effect on the Linux/clock_gettime branch below — the iOS
// and Windows/macOS branches ignore it.
inline time_t getTime(bool allow_monotonic = false) {
#if defined(C10_IOS) && defined(C10_MOBILE)
// clock_gettime is only available on iOS 10.0 or newer. Unlike OS X, iOS
// can't rely on CLOCK_REALTIME, as it is defined no matter if clock_gettime
// is implemented or not
struct timeval now;
gettimeofday(&now, NULL);
// gettimeofday has microsecond resolution; scale both fields to nanoseconds.
return static_cast<time_t>(now.tv_sec) * 1000000000 +
static_cast<time_t>(now.tv_usec) * 1000;
#elif defined(_WIN32) || defined(__MACH__)
return std::chrono::duration_cast<std::chrono::nanoseconds>(
steady_clock_t::now().time_since_epoch())
.count();
#else
// clock_gettime is *much* faster than std::chrono implementation on Linux
struct timespec t {};
auto mode = CLOCK_REALTIME;
if (allow_monotonic) {
mode = CLOCK_MONOTONIC;
}
clock_gettime(mode, &t);
return static_cast<time_t>(t.tv_sec) * 1000000000 +
static_cast<time_t>(t.tv_nsec);
#endif
}
// We often do not need to capture true wall times. If a fast mechanism such
// as TSC is available we can use that instead and convert back to epoch time
// during post processing. This greatly reduces the clock's contribution to
// profiling.
//   http://btorpey.github.io/blog/2014/02/18/clock-sources-in-linux/
//   https://quick-bench.com/q/r8opkkGZSJMu9wM_XTbDouq-0Io
// TODO: We should use
// `https://github.com/google/benchmark/blob/main/src/cycleclock.h`
inline auto getApproximateTime() {
#if defined(C10_RDTSC)
// Raw TSC tick count — NOT nanoseconds. Map back to unix time with
// ApproximateClockToUnixTimeConverter.
return static_cast<uint64_t>(__rdtsc());
#else
// No usable TSC: fall back to a real nanosecond timestamp.
return getTime();
#endif
}
// The return type varies with the platform branch taken above, so it is
// captured here and pinned down by the static_assert.
using approx_time_t = decltype(getApproximateTime());
static_assert(
std::is_same<approx_time_t, int64_t>::value ||
std::is_same<approx_time_t, uint64_t>::value,
"Expected either int64_t (`getTime`) or uint64_t (some TSC reads).");
// Convert `getApproximateTime` readings to nanoseconds since the unix epoch.
// Construction records a batch of paired (wall, approximate) measurements;
// makeConverter() records a second batch and returns a closure that maps an
// approx_time_t reading to unix-epoch nanoseconds.
class C10_API ApproximateClockToUnixTimeConverter final {
public:
ApproximateClockToUnixTimeConverter();
// Returns the calibrated conversion function. Should be called after enough
// wall time has elapsed since construction for the rate estimate to be
// meaningful.
std::function<time_t(approx_time_t)> makeConverter();
// One simultaneous reading of both clocks.
struct UnixAndApproximateTimePair {
time_t t_;
approx_time_t approx_t_;
};
static UnixAndApproximateTimePair measurePair();
private:
// Number of measurement pairs taken per batch; the median across replicates
// is used to reject outliers.
static constexpr size_t replicates = 1001;
using time_pairs = std::array<UnixAndApproximateTimePair, replicates>;
time_pairs measurePairs();
// Pairs captured at construction time.
time_pairs start_times_;
};
} // namespace c10

View file

@ -1,5 +1,6 @@
#include <ATen/Utils.h>
#include <c10/core/TensorImpl.h>
#include <c10/util/ApproximateClock.h>
#include <torch/csrc/jit/backends/backend.h>
#include <torch/csrc/jit/backends/backend_exception.h>
@ -112,14 +113,14 @@ class BackendWithCompiler : public PyTorchBackendInterface {
c10::List<at::Tensor> output_list;
#ifndef NO_PROFILING
auto start_us = torch::profiler::impl::getTime() / 1000;
auto start_us = c10::getTime() / 1000;
#endif
for (const auto& token : handle.toList()) {
IValue val = token;
auto instruction = val.toTupleRef().elements()[0].toStringRef();
auto debug_handle = val.toTupleRef().elements()[1].toInt();
#ifndef NO_PROFILING
auto start_time_us = torch::profiler::impl::getTime() / 1000;
auto start_time_us = c10::getTime() / 1000;
#endif
try {
if (instruction.rfind("prim::Constant", 0) == 0) {
@ -171,7 +172,7 @@ class BackendWithCompiler : public PyTorchBackendInterface {
TORCH_DELEGATED_BACKEND_THROW(false, e.what(), debug_handle);
}
#ifndef NO_PROFILING
auto end_time_us = torch::profiler::impl::getTime() / 1000;
auto end_time_us = c10::getTime() / 1000;
auto duration = end_time_us - start_time_us;
op_runtimes_us.emplace_back(duration, debug_handle, instruction);
#endif

View file

@ -5,6 +5,7 @@
#include <gtest/gtest.h>
#include <c10/util/ApproximateClock.h>
#include <c10/util/irange.h>
#include <torch/csrc/profiler/containers.h>
#include <torch/csrc/profiler/util.h>
@ -48,13 +49,12 @@ TEST(ProfilerTest, AppendOnlyList_ref) {
// Test that we can convert TSC measurements back to wall clock time.
TEST(ProfilerTest, clock_converter) {
const int n = 10001;
torch::profiler::impl::ApproximateClockToUnixTimeConverter converter;
std::vector<torch::profiler::impl::ApproximateClockToUnixTimeConverter::
UnixAndApproximateTimePair>
c10::ApproximateClockToUnixTimeConverter converter;
std::vector<
c10::ApproximateClockToUnixTimeConverter::UnixAndApproximateTimePair>
pairs;
for (const auto i : c10::irange(n)) {
pairs.push_back(torch::profiler::impl::ApproximateClockToUnixTimeConverter::
measurePair());
pairs.push_back(c10::ApproximateClockToUnixTimeConverter::measurePair());
}
auto count_to_ns = converter.makeConverter();
std::vector<int64_t> deltas;

View file

@ -3,6 +3,7 @@
#include <torch/csrc/autograd/profiler_kineto.h>
#include <c10/macros/Export.h>
#include <c10/util/ApproximateClock.h>
#include <c10/util/Exception.h>
#include <c10/util/flat_hash_map.h>
#include <c10/util/irange.h>
@ -56,7 +57,7 @@ inline int64_t getTimeUs() {
#ifdef USE_KINETO
return libkineto::timeSinceEpoch(std::chrono::system_clock::now());
#else
return torch::profiler::impl::getTime() / 1000;
return c10::getTime() / 1000;
#endif // USE_KINETO
}
@ -321,7 +322,7 @@ struct KinetoThreadLocalState : public ProfilerStateBase {
void reportVulkanEventToProfiler(torch::profiler::impl::vulkan_id_t id) {
if (!config_.disabled()) {
record_queue_.getSubqueue()->emplace_vulkan_event(
torch::profiler::impl::getApproximateTime(), id);
c10::getApproximateTime(), id);
}
}
@ -333,7 +334,7 @@ struct KinetoThreadLocalState : public ProfilerStateBase {
c10::Device device) override {
if (config_.profile_memory && !config_.disabled()) {
record_queue_.getSubqueue()->emplace_allocation_event(
torch::profiler::impl::getApproximateTime(),
c10::getApproximateTime(),
ptr,
alloc_size,
total_allocated,
@ -350,7 +351,7 @@ struct KinetoThreadLocalState : public ProfilerStateBase {
c10::Device device) override {
if (config_.profile_memory && !config_.disabled()) {
record_queue_.getSubqueue()->emplace_ooms_event(
torch::profiler::impl::getApproximateTime(),
c10::getApproximateTime(),
alloc_size,
total_allocated,
total_reserved,
@ -421,7 +422,7 @@ struct KinetoThreadLocalState : public ProfilerStateBase {
}
uint64_t start_time_;
torch::profiler::impl::ApproximateClockToUnixTimeConverter clock_converter_;
c10::ApproximateClockToUnixTimeConverter clock_converter_;
torch::profiler::impl::RecordQueue record_queue_;
std::vector<KinetoEvent> kineto_events_;
std::vector<experimental_event_t> event_tree_;
@ -452,8 +453,7 @@ void onFunctionExit(
auto* kineto_ctx_ptr =
static_cast<torch::profiler::impl::KinetoObserverContext*>(ctx_ptr);
TORCH_INTERNAL_ASSERT(kineto_ctx_ptr != nullptr);
kineto_ctx_ptr->event_->end_time_ =
torch::profiler::impl::getApproximateTime();
kineto_ctx_ptr->event_->end_time_ = c10::getApproximateTime();
if (!config.experimental_config.performance_events.empty()) {
state_ptr->record_queue_.getSubqueue()->disable_perf_profiler(
*kineto_ctx_ptr->event_->counters_);

View file

@ -18,6 +18,7 @@
#include <ATen/record_function.h>
#include <c10/core/Allocator.h>
#include <c10/util/ApproximateClock.h>
#include <c10/util/ThreadLocalDebugInfo.h>
#include <c10/util/irange.h>
@ -477,7 +478,7 @@ void LegacyEvent::record(bool record_cuda) {
torch::profiler::impl::cudaStubs()->record(&device_, &cuda_event, &cpu_ns_);
return;
}
cpu_ns_ = torch::profiler::impl::getTime();
cpu_ns_ = c10::getTime();
}
/* static */ LegacyEvent LegacyEvent::fromIValue(

View file

@ -16,6 +16,7 @@
#include <ATen/core/TensorBase.h>
#include <c10/macros/Macros.h>
#include <c10/util/ApproximateClock.h>
#include <c10/util/Exception.h>
#include <c10/util/Logging.h>
#include <c10/util/Optional.h>
@ -293,7 +294,7 @@ class ValueCache {
auto caller = load<CallType::PyCall>(callsite.caller_);
TORCH_INTERNAL_ASSERT(!caller.module_info_.has_value());
return ExtraFields<Config<C>::event_type>{
/*end_time_ns=*/std::numeric_limits<time_t>::min(),
/*end_time_ns=*/std::numeric_limits<c10::time_t>::min(),
python_tid,
caller.frame_state_,
load<C>(callsite.value_)};
@ -666,8 +667,8 @@ struct ThreadLocalResults {
ValueCache* value_cache_;
PythonTracer* active_tracer_;
CallTypeHelper<TraceKeyCacheState>::tuple_type trace_keys_;
AppendOnlyList<approx_time_t, BLOCK_SIZE> exit_times_;
AppendOnlyList<approx_time_t, BLOCK_SIZE> c_exit_times_;
AppendOnlyList<c10::approx_time_t, BLOCK_SIZE> exit_times_;
AppendOnlyList<c10::approx_time_t, BLOCK_SIZE> c_exit_times_;
};
// ============================================================================
@ -687,13 +688,13 @@ class PythonTracer final : public python_tracer::PythonTracerBase {
void stop() override;
std::vector<std::shared_ptr<Result>> getEvents(
std::function<time_t(approx_time_t)> time_converter,
std::function<c10::time_t(c10::approx_time_t)> time_converter,
std::vector<python_tracer::CompressedEvent>& enters,
time_t end_time_ns) override;
c10::time_t end_time_ns) override;
struct StartFrame {
TraceKey trace_key_;
approx_time_t start_time{};
c10::approx_time_t start_time{};
};
private:
@ -863,7 +864,7 @@ void PythonTracer::recordPyCall(
return tls.intern<CallType::PyCall, E>(no_ephemeral_t(), frame, f_back);
}
}();
const auto time = getApproximateTime();
const auto time = c10::getApproximateTime();
is_startup_frame ? start_frames_.push_back({key, time})
: queue_->getSubqueue()->emplace_py_call(key, time);
}
@ -879,7 +880,7 @@ void PythonTracer::recordCCall(
// `frame->f_back`.
auto key = tls.intern<CallType::PyCCall, EventType::PyCCall>(
arg, (void*)(fn->m_ml), frame);
queue_->getSubqueue()->emplace_py_call(key, getApproximateTime());
queue_->getSubqueue()->emplace_py_call(key, c10::getApproximateTime());
}
// ============================================================================
@ -890,17 +891,17 @@ struct Exit {
return t_ > other.t_;
}
time_t t_;
c10::time_t t_;
size_t python_tid_;
};
class PostProcess {
public:
PostProcess(
std::function<time_t(approx_time_t)> time_converter,
std::function<c10::time_t(c10::approx_time_t)> time_converter,
std::deque<ThreadLocalResults>& tls,
const ValueCache& value_cache,
time_t end_time_ns)
c10::time_t end_time_ns)
: end_time_{end_time_ns}, time_converter_{std::move(time_converter)} {
for (size_t python_tid : c10::irange(tls.size())) {
CallTypeHelper<TraceKeyCacheState>::map(
@ -936,7 +937,9 @@ class PostProcess {
}
template <EventType E, size_t N>
void addExits(AppendOnlyList<approx_time_t, N>& exits, size_t python_tid) {
void addExits(
AppendOnlyList<c10::approx_time_t, N>& exits,
size_t python_tid) {
for (const auto i : exits) {
get_state<E>().exits_.push({time_converter_(i), python_tid});
}
@ -961,7 +964,7 @@ class PostProcess {
std::vector<std::shared_ptr<Result>>& out) {
using stack_t = std::vector<std::shared_ptr<Result>>;
const auto initial_size = out.size();
auto pop = [](stack_t& stack, time_t t) {
auto pop = [](stack_t& stack, c10::time_t t) {
TORCH_INTERNAL_ASSERT(!stack.empty(), "Python replay stack is empty.");
std::get<ExtraFields<E>>(stack.back()->extra_fields_).end_time_ns_ = t;
stack.pop_back();
@ -1026,8 +1029,8 @@ class PostProcess {
return std::get < E == EventType::PyCall ? 0 : 1 > (state_);
}
time_t end_time_;
std::function<time_t(approx_time_t)> time_converter_;
c10::time_t end_time_;
std::function<c10::time_t(c10::approx_time_t)> time_converter_;
std::tuple<State<EventType::PyCall>, State<EventType::PyCCall>> state_;
};
@ -1054,9 +1057,9 @@ struct PythonIDVisitor {
};
std::vector<std::shared_ptr<Result>> PythonTracer::getEvents(
std::function<time_t(approx_time_t)> time_converter,
std::function<c10::time_t(c10::approx_time_t)> time_converter,
std::vector<python_tracer::CompressedEvent>& enters,
time_t end_time_ns) {
c10::time_t end_time_ns) {
value_cache_.trimPrefixes();
PostProcess post_process(
std::move(time_converter),
@ -1099,12 +1102,12 @@ int PythonTracer::pyProfileFn(
case PyTrace_EXCEPTION:
case PyTrace_RETURN:
local_results.exit_times_.emplace_back(getApproximateTime());
local_results.exit_times_.emplace_back(c10::getApproximateTime());
break;
case PyTrace_C_EXCEPTION:
case PyTrace_C_RETURN:
local_results.c_exit_times_.emplace_back(getApproximateTime());
local_results.c_exit_times_.emplace_back(c10::getApproximateTime());
break;
}
return 0;

View file

@ -1,11 +1,12 @@
#pragma once
#include <c10/util/ApproximateClock.h>
#include <torch/csrc/autograd/profiler.h>
namespace c10d {
constexpr int kUnsetTime = -1;
inline int64_t current_time_in_nanos() {
return torch::profiler::impl::getTime();
return c10::getTime();
}
class TORCH_API Timer {

View file

@ -2,6 +2,7 @@
#include <torch/csrc/jit/runtime/register_ops_utils.h>
#include <ATen/core/ivalue.h>
#include <c10/util/ApproximateClock.h>
#include <c10/util/irange.h>
#include <torch/csrc/autograd/profiler.h>
#include <torch/csrc/jit/frontend/tracer.h>
@ -370,8 +371,7 @@ RegisterOperators logging_operators(
tracer::recordSourceLocation(node);
graph->insertNode(node);
}
auto output =
torch::profiler::impl::getTime(/*allow_monotonic=*/true);
auto output = c10::getTime(/*allow_monotonic=*/true);
push(stack, output);
if (jit::tracer::isTracing()) {
jit::tracer::addOutput(node, output);

View file

@ -362,7 +362,7 @@ std::unique_ptr<KinetoObserverContext> ThreadLocalSubqueue::begin_op(
nullptr, &out->fallback_->device_event_start_, nullptr);
}
event->start_time_ = torch::profiler::impl::getApproximateTime();
event->start_time_ = c10::getApproximateTime();
event->allow_tf32_cublas_ = at::globalContext().allowTF32CuBLAS();
if (!config_.experimental_config.performance_events.empty()) {
const size_t n = config_.experimental_config.performance_events.size();
@ -402,7 +402,7 @@ struct StealOrDefault {
void ThreadLocalSubqueue::TorchOpStorage::materialize(
std::vector<std::shared_ptr<Result>>& out,
const std::function<time_t(approx_time_t)>& time_converter,
const std::function<c10::time_t(c10::approx_time_t)>& time_converter,
const uint64_t tid,
const kineto::DeviceAndResource& kineto_info) {
// Plumb Autograd info to the top level annotation.
@ -471,7 +471,7 @@ void materialize_vulkan(
std::vector<std::shared_ptr<Result>>& out,
AppendOnlyList<ExtraFields<EventType::Vulkan>::raw_event_t, BlockSize>&
raw_events,
const std::function<time_t(approx_time_t)>& time_converter,
const std::function<c10::time_t(c10::approx_time_t)>& time_converter,
const uint64_t tid,
const kineto::DeviceAndResource& kineto_info) {
for (const auto& i : raw_events) {
@ -530,7 +530,7 @@ int64_t torchOpEndNS(
const ExtraFields<EventType::TorchOp>& e,
const bool finished,
const std::weak_ptr<Result>& parent) {
if (finished && e.end_time_ns_ == std::numeric_limits<time_t>::min()) {
if (finished && e.end_time_ns_ == std::numeric_limits<c10::time_t>::min()) {
auto p = parent.lock();
if (p) {
return p->endTimeNS();
@ -1176,7 +1176,7 @@ void build_tree(std::vector<std::shared_ptr<Result>>& sorted_events) {
if (event->endTimeNS() > event->start_time_ns_) {
stacks[event->start_tid_] = event;
end_events_.push(event);
} else if (event->endTimeNS() == std::numeric_limits<time_t>::min()) {
} else if (event->endTimeNS() == std::numeric_limits<c10::time_t>::min()) {
// We use min time to indicate the lack of a termination event, so if we
// encounter such a case we don't push to `end_events_`.
stacks[event->start_tid_] = event;
@ -1350,12 +1350,12 @@ std::pair<
std::vector<std::shared_ptr<Result>>,
std::unique_ptr<torch::profiler::impl::kineto::ActivityTraceWrapper>>
RecordQueue::getRecords(
std::function<time_t(approx_time_t)> time_converter,
std::function<c10::time_t(c10::approx_time_t)> time_converter,
uint64_t start_time_us,
uint64_t end_time_us) {
auto converter = [&](approx_time_t t) {
return t == std::numeric_limits<approx_time_t>::min()
? std::numeric_limits<time_t>::min()
auto converter = [&](c10::approx_time_t t) {
return t == std::numeric_limits<c10::approx_time_t>::min()
? std::numeric_limits<c10::time_t>::min()
: time_converter(t);
};
std::vector<std::shared_ptr<Result>> out;
@ -1364,7 +1364,7 @@ RecordQueue::getRecords(
auto& queue = *subqueue_it.second;
auto materialize = [&](auto& events) {
for (auto& i : events) {
time_t start_time_ns = 0;
c10::time_t start_time_ns = 0;
if constexpr (std::is_same_v<
std::remove_reference_t<decltype(i)>,
ExtraFields<EventType::Backend>>) {

View file

@ -11,6 +11,7 @@
#include <c10/core/Device.h>
#include <c10/core/TensorImpl.h>
#include <c10/macros/Macros.h>
#include <c10/util/ApproximateClock.h>
#include <c10/util/flat_hash_map.h>
#include <c10/util/strong_type.h>
#include <torch/csrc/profiler/containers.h>
@ -126,7 +127,7 @@ struct ExtraFields<EventType::TorchOp> : TorchOpBasicFields {
ExtraFields(
TorchOpBasicFields&& f,
uint64_t correlation_id,
time_t end_time_ns,
c10::time_t end_time_ns,
std::vector<op_input_t>&& inputs,
std::vector<op_input_t>&& concrete_inputs,
jit_stack_t&& jit_stack,
@ -149,7 +150,7 @@ struct ExtraFields<EventType::TorchOp> : TorchOpBasicFields {
allow_tf32_cublas_{allow_tf32_cublas},
perf_event_counters_{std::move(perf_event_counters)} {}
uint64_t correlation_id_;
time_t end_time_ns_;
c10::time_t end_time_ns_;
std::vector<op_input_t> inputs_;
std::vector<op_input_t> concrete_inputs_;
jit_stack_t jit_stack_;
@ -175,7 +176,7 @@ struct ExtraFields<EventType::Backend> {
template <>
struct ExtraFields<EventType::Vulkan> {
using raw_event_t = std::pair<approx_time_t, vulkan_id_t>;
using raw_event_t = std::pair<c10::approx_time_t, vulkan_id_t>;
std::string name_;
int64_t duration_ns_{0};
// While building the event tree, we want to report a vulkan event's duration
@ -184,7 +185,7 @@ struct ExtraFields<EventType::Vulkan> {
};
struct RawAllocation {
torch::profiler::impl::approx_time_t start_time_;
c10::approx_time_t start_time_;
void* ptr_;
int64_t alloc_size_;
size_t total_allocated_;
@ -210,7 +211,7 @@ struct ExtraFields<EventType::Allocation> : RawAllocation {
template <>
struct ExtraFields<EventType::OutOfMemory> {
torch::profiler::impl::approx_time_t start_time_;
c10::approx_time_t start_time_;
int64_t alloc_size_;
size_t total_allocated_;
size_t total_reserved_;
@ -270,12 +271,15 @@ struct OptimizerInfo {
};
struct PyExtraFieldsBase {
PyExtraFieldsBase(time_t end_time_ns, size_t python_tid, PyFrameState caller)
PyExtraFieldsBase(
c10::time_t end_time_ns,
size_t python_tid,
PyFrameState caller)
: end_time_ns_{end_time_ns},
python_tid_{python_tid},
caller_{std::move(caller)} {}
time_t end_time_ns_;
c10::time_t end_time_ns_;
size_t python_tid_;
PyFrameState caller_;
@ -292,7 +296,7 @@ struct ExtraFields<EventType::PyCall> : public PyExtraFieldsBase {
};
ExtraFields(
time_t end_time_ns,
c10::time_t end_time_ns,
size_t python_tid,
PyFrameState caller,
args_t args)
@ -311,7 +315,7 @@ struct ExtraFields<EventType::PyCCall> : public PyExtraFieldsBase {
using args_t = at::StringView;
ExtraFields(
time_t end_time_ns,
c10::time_t end_time_ns,
size_t python_tid,
PyFrameState caller,
args_t args)
@ -421,10 +425,11 @@ struct TORCH_API Result : public std::enable_shared_from_this<Result> {
struct KinetoObserverContext : public at::ObserverContext {
struct Event {
TorchOpBasicFields basic_fields_;
approx_time_t start_time_;
c10::approx_time_t start_time_;
// Set in the exit callback.
approx_time_t end_time_{std::numeric_limits<approx_time_t>::min()};
c10::approx_time_t end_time_{
std::numeric_limits<c10::approx_time_t>::min()};
bool allow_tf32_cublas_;
std::unique_ptr<perf_counters_t> counters_;
@ -549,7 +554,7 @@ class TORCH_API ThreadLocalSubqueue {
// NB: This is a destructive operation.
void materialize(
std::vector<std::shared_ptr<Result>>& out,
const std::function<time_t(approx_time_t)>& time_converter,
const std::function<c10::time_t(c10::approx_time_t)>& time_converter,
const uint64_t tid,
const kineto::DeviceAndResource& kineto_info);
@ -605,7 +610,9 @@ class TORCH_API ThreadLocalSubqueue {
AppendOnlyList<ExtraFields<EventType::OutOfMemory>, BlockSize> ooms_;
// with_stack (Python)
AppendOnlyList<std::pair<python_tracer::TraceKey, approx_time_t>, BlockSize>
AppendOnlyList<
std::pair<python_tracer::TraceKey, c10::approx_time_t>,
BlockSize>
py_calls_;
};
@ -622,7 +629,7 @@ class TORCH_API RecordQueue {
std::vector<std::shared_ptr<Result>>,
std::unique_ptr<torch::profiler::impl::kineto::ActivityTraceWrapper>>
getRecords(
std::function<time_t(approx_time_t)> time_converter,
std::function<c10::time_t(c10::approx_time_t)> time_converter,
uint64_t start_time_us,
uint64_t end_time_us);

View file

@ -13,9 +13,9 @@ struct NoOpPythonTracer : public PythonTracerBase {
void stop() override {}
std::vector<std::shared_ptr<Result>> getEvents(
std::function<time_t(approx_time_t)>,
std::function<c10::time_t(c10::approx_time_t)>,
std::vector<CompressedEvent>&,
time_t) override {
c10::time_t) override {
return {};
}
};

View file

@ -5,6 +5,7 @@
#include <utility>
#include <vector>
#include <c10/util/ApproximateClock.h>
#include <c10/util/strong_type.h>
#include <torch/csrc/profiler/kineto_shim.h>
@ -29,7 +30,7 @@ struct CompressedEvent {
TraceKey key_;
uint64_t system_tid_;
kineto::DeviceAndResource kineto_info_;
time_t enter_t_;
c10::time_t enter_t_;
};
/*
@ -49,9 +50,9 @@ struct TORCH_API PythonTracerBase {
virtual void stop() = 0;
virtual std::vector<std::shared_ptr<Result>> getEvents(
std::function<time_t(approx_time_t)> time_converter,
std::function<c10::time_t(c10::approx_time_t)> time_converter,
std::vector<CompressedEvent>& enters,
time_t end_time_ns) = 0;
c10::time_t end_time_ns) = 0;
};
using MakeFn = std::unique_ptr<PythonTracerBase> (*)(RecordQueue*);

View file

@ -3,6 +3,7 @@
#include <nvToolsExt.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/util/ApproximateClock.h>
#include <c10/util/irange.h>
#include <torch/csrc/profiler/stubs/base.h>
#include <torch/csrc/profiler/util.h>
@ -49,7 +50,7 @@ struct CUDAMethods : public ProfilerStubs {
});
auto stream = at::cuda::getCurrentCUDAStream();
if (cpu_ns) {
*cpu_ns = torch::profiler::impl::getTime();
*cpu_ns = c10::getTime();
}
TORCH_CUDA_CHECK(cudaEventRecord(cuda_event_ptr, stream));
}

View file

@ -19,77 +19,6 @@ namespace torch {
namespace profiler {
namespace impl {
ApproximateClockToUnixTimeConverter::ApproximateClockToUnixTimeConverter()
: start_times_(measurePairs()) {}
ApproximateClockToUnixTimeConverter::UnixAndApproximateTimePair
ApproximateClockToUnixTimeConverter::measurePair() {
// Take a measurement on either side to avoid an ordering bias.
auto fast_0 = getApproximateTime();
auto wall = std::chrono::system_clock::now();
auto fast_1 = getApproximateTime();
TORCH_INTERNAL_ASSERT(fast_1 >= fast_0, "getCount is non-monotonic.");
auto t = std::chrono::duration_cast<std::chrono::nanoseconds>(
wall.time_since_epoch());
// `x + (y - x) / 2` is a more numerically stable average than `(x + y) / 2`.
return {t.count(), fast_0 + (fast_1 - fast_0) / 2};
}
ApproximateClockToUnixTimeConverter::time_pairs
ApproximateClockToUnixTimeConverter::measurePairs() {
static constexpr auto n_warmup = 5;
for (C10_UNUSED const auto _ : c10::irange(n_warmup)) {
getApproximateTime();
steady_clock_t::now();
}
time_pairs out;
for (const auto i : c10::irange(out.size())) {
out[i] = measurePair();
}
return out;
}
std::function<time_t(approx_time_t)> ApproximateClockToUnixTimeConverter::
makeConverter() {
auto end_times = measurePairs();
// Compute the real time that passes for each tick of the approximate clock.
std::array<long double, replicates> scale_factors{};
for (const auto i : c10::irange(replicates)) {
auto delta_ns = end_times[i].t_ - start_times_[i].t_;
auto delta_approx = end_times[i].approx_t_ - start_times_[i].approx_t_;
scale_factors[i] = (double)delta_ns / (double)delta_approx;
}
std::sort(scale_factors.begin(), scale_factors.end());
long double scale_factor = scale_factors[replicates / 2 + 1];
// We shift all times by `t0` for better numerics. Double precision only has
// 16 decimal digits of accuracy, so if we blindly multiply times by
// `scale_factor` we may suffer from precision loss. The choice of `t0` is
// mostly arbitrary; we just need a factor that is the correct order of
// magnitude to bring the intermediate values closer to zero. We are not,
// however, guaranteed that `t0_approx` is *exactly* the getApproximateTime
// equivilent of `t0`; it is only an estimate that we have to fine tune.
auto t0 = start_times_[0].t_;
auto t0_approx = start_times_[0].approx_t_;
std::array<double, replicates> t0_correction{};
for (const auto i : c10::irange(replicates)) {
auto dt = start_times_[i].t_ - t0;
auto dt_approx =
(double)(start_times_[i].approx_t_ - t0_approx) * scale_factor;
t0_correction[i] = dt - (time_t)dt_approx;
}
t0 += t0_correction[t0_correction.size() / 2 + 1];
return [=](approx_time_t t_approx) {
// See above for why this is more stable than `A * t_approx + B`.
return (time_t)((double)(t_approx - t0_approx) * scale_factor) + t0;
};
}
namespace {
c10::optional<bool> soft_assert_raises_;
} // namespace

View file

@ -14,29 +14,6 @@
#include <torch/csrc/Export.h>
#include <torch/csrc/jit/frontend/source_range.h>
#ifndef _WIN32
#include <ctime>
#endif
#if defined(C10_IOS) && defined(C10_MOBILE)
#include <sys/time.h> // for gettimeofday()
#endif
#if defined(__i386__) || defined(__x86_64__) || defined(__amd64__)
#define C10_RDTSC
#if defined(_MSC_VER)
#include <intrin.h>
#elif defined(__CUDACC__) || defined(__HIPCC__)
#undef C10_RDTSC
#elif defined(__clang__)
// `__rdtsc` is available by default.
// NB: This has to be first, because Clang will also define `__GNUC__`
#elif defined(__GNUC__)
#include <x86intrin.h>
#else
#undef C10_RDTSC
#endif
#endif
// TODO: replace with pytorch/rfcs#43 when it is ready.
#define SOFT_ASSERT(cond, ...) \
[&]() -> bool { \
@ -83,89 +60,10 @@ TORCH_API void logSoftAssert(
const char* cond,
const std::string& args);
using time_t = int64_t;
using steady_clock_t = std::conditional<
std::chrono::high_resolution_clock::is_steady,
std::chrono::high_resolution_clock,
std::chrono::steady_clock>::type;
using shape =
std::variant<std::vector<int64_t>, std::vector<std::vector<int64_t>>>;
constexpr int TENSOR_LIST_DISPLAY_LENGTH_LIMIT = 30;
inline time_t getTimeSinceEpoch() {
auto now = std::chrono::system_clock::now().time_since_epoch();
return std::chrono::duration_cast<std::chrono::nanoseconds>(now).count();
}
inline time_t getTime(bool allow_monotonic = false) {
#if defined(C10_IOS) && defined(C10_MOBILE)
// clock_gettime is only available on iOS 10.0 or newer. Unlike OS X, iOS
// can't rely on CLOCK_REALTIME, as it is defined no matter if clock_gettime
// is implemented or not
struct timeval now;
gettimeofday(&now, NULL);
return static_cast<time_t>(now.tv_sec) * 1000000000 +
static_cast<time_t>(now.tv_usec) * 1000;
#elif defined(_WIN32) || defined(__MACH__)
return std::chrono::duration_cast<std::chrono::nanoseconds>(
steady_clock_t::now().time_since_epoch())
.count();
#else
// clock_gettime is *much* faster than std::chrono implementation on Linux
struct timespec t {};
auto mode = CLOCK_REALTIME;
if (allow_monotonic) {
mode = CLOCK_MONOTONIC;
}
clock_gettime(mode, &t);
return static_cast<time_t>(t.tv_sec) * 1000000000 +
static_cast<time_t>(t.tv_nsec);
#endif
}
// We often do not need to capture true wall times. If a fast mechanism such
// as TSC is available we can use that instead and convert back to epoch time
// during post processing. This greatly reduce the clock's contribution to
// profiling.
// http://btorpey.github.io/blog/2014/02/18/clock-sources-in-linux/
// https://quick-bench.com/q/r8opkkGZSJMu9wM_XTbDouq-0Io
// TODO: We should use
// `https://github.com/google/benchmark/blob/main/src/cycleclock.h`
inline auto getApproximateTime() {
#if defined(C10_RDTSC)
return static_cast<uint64_t>(__rdtsc());
#else
return getTime();
#endif
}
using approx_time_t = decltype(getApproximateTime());
static_assert(
std::is_same<approx_time_t, int64_t>::value ||
std::is_same<approx_time_t, uint64_t>::value,
"Expected either int64_t (`getTime`) or uint64_t (some TSC reads).");
// Convert `getCount` results to Nanoseconds since unix epoch.
class ApproximateClockToUnixTimeConverter final {
public:
ApproximateClockToUnixTimeConverter();
std::function<time_t(approx_time_t)> makeConverter();
struct UnixAndApproximateTimePair {
time_t t_;
approx_time_t approx_t_;
};
static UnixAndApproximateTimePair measurePair();
private:
static constexpr size_t replicates = 1001;
using time_pairs = std::array<UnixAndApproximateTimePair, replicates>;
time_pairs measurePairs();
time_pairs start_times_;
};
std::string getNvtxStr(
const char* name,
int64_t sequence_nr,
@ -267,7 +165,6 @@ namespace torch {
namespace autograd {
namespace profiler {
using torch::profiler::impl::computeFlops;
using torch::profiler::impl::getTime;
} // namespace profiler
} // namespace autograd
} // namespace torch