From 63c089b09de42fff67f7ad7ae410669ae0880e1f Mon Sep 17 00:00:00 2001 From: Aaron Enye Shi Date: Fri, 27 Oct 2023 16:18:40 +0000 Subject: [PATCH] [c10] Move profiler clock to libc10 for timestamps (#111972) Summary: Move the profiler's Approximate Clock from libtorch to libc10. The main reason is to allow c10 features to get time. The clock is using TSC when available for performance. CUDA Caching Allocator's implementation of memory snapshot will add the timestamps to memory events with this same clock in subsequent diff. Test Plan: CI Differential Revision: D50601935 Pulled By: aaronenyeshi Pull Request resolved: https://github.com/pytorch/pytorch/pull/111972 Approved by: https://github.com/davidberard98 --- c10/util/ApproximateClock.cpp | 79 ++++++++++++ c10/util/ApproximateClock.h | 121 ++++++++++++++++++ test/cpp/jit/test_backend_compiler_lib.cpp | 7 +- test/cpp/profiler/containers.cpp | 10 +- torch/csrc/autograd/profiler_kineto.cpp | 14 +- torch/csrc/autograd/profiler_legacy.cpp | 3 +- torch/csrc/autograd/profiler_python.cpp | 41 +++--- torch/csrc/distributed/c10d/reducer_timer.hpp | 3 +- .../jit/runtime/register_prim_ops_fulljit.cpp | 4 +- torch/csrc/profiler/collection.cpp | 20 +-- torch/csrc/profiler/collection.h | 35 +++-- .../profiler/orchestration/python_tracer.cpp | 4 +- .../profiler/orchestration/python_tracer.h | 7 +- torch/csrc/profiler/stubs/cuda.cpp | 3 +- torch/csrc/profiler/util.cpp | 71 ---------- torch/csrc/profiler/util.h | 103 --------------- 16 files changed, 283 insertions(+), 242 deletions(-) create mode 100644 c10/util/ApproximateClock.cpp create mode 100644 c10/util/ApproximateClock.h diff --git a/c10/util/ApproximateClock.cpp b/c10/util/ApproximateClock.cpp new file mode 100644 index 00000000000..0bda220d83d --- /dev/null +++ b/c10/util/ApproximateClock.cpp @@ -0,0 +1,79 @@ +#include +#include +#include +#include + +namespace c10 { + +ApproximateClockToUnixTimeConverter::ApproximateClockToUnixTimeConverter() + : start_times_(measurePairs()) {} + +ApproximateClockToUnixTimeConverter::UnixAndApproximateTimePair +ApproximateClockToUnixTimeConverter::measurePair() { + // Take a measurement on either side to avoid an ordering bias. + auto fast_0 = getApproximateTime(); + auto wall = std::chrono::system_clock::now(); + auto fast_1 = getApproximateTime(); + + TORCH_INTERNAL_ASSERT(fast_1 >= fast_0, "getCount is non-monotonic."); + auto t = std::chrono::duration_cast( + wall.time_since_epoch()); + + // `x + (y - x) / 2` is a more numerically stable average than `(x + y) / 2`. + return {t.count(), fast_0 + (fast_1 - fast_0) / 2}; +} + +ApproximateClockToUnixTimeConverter::time_pairs +ApproximateClockToUnixTimeConverter::measurePairs() { + static constexpr auto n_warmup = 5; + for (C10_UNUSED const auto _ : c10::irange(n_warmup)) { + getApproximateTime(); + static_cast(steady_clock_t::now()); + } + + time_pairs out; + for (const auto i : c10::irange(out.size())) { + out[i] = measurePair(); + } + return out; +} + +std::function ApproximateClockToUnixTimeConverter:: + makeConverter() { + auto end_times = measurePairs(); + + // Compute the real time that passes for each tick of the approximate clock. + std::array scale_factors{}; + for (const auto i : c10::irange(replicates)) { + auto delta_ns = end_times[i].t_ - start_times_[i].t_; + auto delta_approx = end_times[i].approx_t_ - start_times_[i].approx_t_; + scale_factors[i] = (double)delta_ns / (double)delta_approx; + } + std::sort(scale_factors.begin(), scale_factors.end()); + long double scale_factor = scale_factors[replicates / 2 + 1]; + + // We shift all times by `t0` for better numerics. Double precision only has + // 16 decimal digits of accuracy, so if we blindly multiply times by + // `scale_factor` we may suffer from precision loss. The choice of `t0` is + // mostly arbitrary; we just need a factor that is the correct order of + // magnitude to bring the intermediate values closer to zero. We are not, + // however, guaranteed that `t0_approx` is *exactly* the getApproximateTime + // equivalent of `t0`; it is only an estimate that we have to fine tune. + auto t0 = start_times_[0].t_; + auto t0_approx = start_times_[0].approx_t_; + std::array t0_correction{}; + for (const auto i : c10::irange(replicates)) { + auto dt = start_times_[i].t_ - t0; + auto dt_approx = + (double)(start_times_[i].approx_t_ - t0_approx) * scale_factor; + t0_correction[i] = dt - (time_t)dt_approx; // NOLINT + } + t0 += t0_correction[t0_correction.size() / 2 + 1]; // NOLINT + + return [=](approx_time_t t_approx) { + // See above for why this is more stable than `A * t_approx + B`. + return (time_t)((double)(t_approx - t0_approx) * scale_factor) + t0; + }; +} + +} // namespace c10 diff --git a/c10/util/ApproximateClock.h b/c10/util/ApproximateClock.h new file mode 100644 index 00000000000..7de498cebed --- /dev/null +++ b/c10/util/ApproximateClock.h @@ -0,0 +1,121 @@ +// Copyright 2023-present Facebook. All Rights Reserved. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#ifndef _WIN32 +#include +#endif +#if defined(C10_IOS) && defined(C10_MOBILE) +#include // for gettimeofday() +#endif + +#if defined(__i386__) || defined(__x86_64__) || defined(__amd64__) +#define C10_RDTSC +#if defined(_MSC_VER) +#include +#elif defined(__CUDACC__) || defined(__HIPCC__) +#undef C10_RDTSC +#elif defined(__clang__) +// `__rdtsc` is available by default. +// NB: This has to be first, because Clang will also define `__GNUC__` +#elif defined(__GNUC__) +#include +#else +#undef C10_RDTSC +#endif +#endif + +namespace c10 { + +using time_t = int64_t; +using steady_clock_t = std::conditional< + std::chrono::high_resolution_clock::is_steady, + std::chrono::high_resolution_clock, + std::chrono::steady_clock>::type; + +inline time_t getTimeSinceEpoch() { + auto now = std::chrono::system_clock::now().time_since_epoch(); + return std::chrono::duration_cast(now).count(); +} + +inline time_t getTime(bool allow_monotonic = false) { +#if defined(C10_IOS) && defined(C10_MOBILE) + // clock_gettime is only available on iOS 10.0 or newer. Unlike OS X, iOS + // can't rely on CLOCK_REALTIME, as it is defined no matter if clock_gettime + // is implemented or not + struct timeval now; + gettimeofday(&now, NULL); + return static_cast(now.tv_sec) * 1000000000 + + static_cast(now.tv_usec) * 1000; +#elif defined(_WIN32) || defined(__MACH__) + return std::chrono::duration_cast( + steady_clock_t::now().time_since_epoch()) + .count(); +#else + // clock_gettime is *much* faster than std::chrono implementation on Linux + struct timespec t {}; + auto mode = CLOCK_REALTIME; + if (allow_monotonic) { + mode = CLOCK_MONOTONIC; + } + clock_gettime(mode, &t); + return static_cast(t.tv_sec) * 1000000000 + + static_cast(t.tv_nsec); +#endif +} + +// We often do not need to capture true wall times. If a fast mechanism such +// as TSC is available we can use that instead and convert back to epoch time +// during post processing. This greatly reduce the clock's contribution to +// profiling. +// http://btorpey.github.io/blog/2014/02/18/clock-sources-in-linux/ +// https://quick-bench.com/q/r8opkkGZSJMu9wM_XTbDouq-0Io +// TODO: We should use +// `https://github.com/google/benchmark/blob/main/src/cycleclock.h` +inline auto getApproximateTime() { +#if defined(C10_RDTSC) + return static_cast(__rdtsc()); +#else + return getTime(); +#endif +} + +using approx_time_t = decltype(getApproximateTime()); +static_assert( + std::is_same::value || + std::is_same::value, + "Expected either int64_t (`getTime`) or uint64_t (some TSC reads)."); + +// Convert `getCount` results to Nanoseconds since unix epoch. +class C10_API ApproximateClockToUnixTimeConverter final { + public: + ApproximateClockToUnixTimeConverter(); + std::function makeConverter(); + + struct UnixAndApproximateTimePair { + time_t t_; + approx_time_t approx_t_; + }; + static UnixAndApproximateTimePair measurePair(); + + private: + static constexpr size_t replicates = 1001; + using time_pairs = std::array; + time_pairs measurePairs(); + + time_pairs start_times_; +}; + +} // namespace c10 diff --git a/test/cpp/jit/test_backend_compiler_lib.cpp b/test/cpp/jit/test_backend_compiler_lib.cpp index 14ea7bcd2f0..33262efd1e2 100644 --- a/test/cpp/jit/test_backend_compiler_lib.cpp +++ b/test/cpp/jit/test_backend_compiler_lib.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include @@ -112,14 +113,14 @@ class BackendWithCompiler : public PyTorchBackendInterface { c10::List output_list; #ifndef NO_PROFILING - auto start_us = torch::profiler::impl::getTime() / 1000; + auto start_us = c10::getTime() / 1000; #endif for (const auto& token : handle.toList()) { IValue val = token; auto instruction = val.toTupleRef().elements()[0].toStringRef(); auto debug_handle = val.toTupleRef().elements()[1].toInt(); #ifndef NO_PROFILING - auto start_time_us = torch::profiler::impl::getTime() / 1000; + auto start_time_us = c10::getTime() / 1000; #endif try { if (instruction.rfind("prim::Constant", 0) == 0) { @@ -171,7 +172,7 @@ class BackendWithCompiler : public PyTorchBackendInterface { TORCH_DELEGATED_BACKEND_THROW(false, e.what(), debug_handle); } #ifndef NO_PROFILING - auto end_time_us = torch::profiler::impl::getTime() / 1000; + auto end_time_us = c10::getTime() / 1000; auto duration = end_time_us - start_time_us; op_runtimes_us.emplace_back(duration, debug_handle, instruction); #endif diff --git a/test/cpp/profiler/containers.cpp b/test/cpp/profiler/containers.cpp index cb417af1bba..d5870795d74 100644 --- a/test/cpp/profiler/containers.cpp +++ b/test/cpp/profiler/containers.cpp @@ -5,6 +5,7 @@ #include +#include #include #include #include @@ -48,13 +49,12 @@ TEST(ProfilerTest, AppendOnlyList_ref) { // Test that we can convert TSC measurements back to wall clock time. TEST(ProfilerTest, clock_converter) { const int n = 10001; - torch::profiler::impl::ApproximateClockToUnixTimeConverter converter; - std::vector + c10::ApproximateClockToUnixTimeConverter converter; + std::vector< + c10::ApproximateClockToUnixTimeConverter::UnixAndApproximateTimePair> pairs; for (const auto i : c10::irange(n)) { - pairs.push_back(torch::profiler::impl::ApproximateClockToUnixTimeConverter:: - measurePair()); + pairs.push_back(c10::ApproximateClockToUnixTimeConverter::measurePair()); } auto count_to_ns = converter.makeConverter(); std::vector deltas; diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp index 056f4b17df1..63367ecb8eb 100644 --- a/torch/csrc/autograd/profiler_kineto.cpp +++ b/torch/csrc/autograd/profiler_kineto.cpp @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -56,7 +57,7 @@ inline int64_t getTimeUs() { #ifdef USE_KINETO return libkineto::timeSinceEpoch(std::chrono::system_clock::now()); #else - return torch::profiler::impl::getTime() / 1000; + return c10::getTime() / 1000; #endif // USE_KINETO } @@ -321,7 +322,7 @@ struct KinetoThreadLocalState : public ProfilerStateBase { void reportVulkanEventToProfiler(torch::profiler::impl::vulkan_id_t id) { if (!config_.disabled()) { record_queue_.getSubqueue()->emplace_vulkan_event( - torch::profiler::impl::getApproximateTime(), id); + c10::getApproximateTime(), id); } } @@ -333,7 +334,7 @@ struct KinetoThreadLocalState : public ProfilerStateBase { c10::Device device) override { if (config_.profile_memory && !config_.disabled()) { record_queue_.getSubqueue()->emplace_allocation_event( - torch::profiler::impl::getApproximateTime(), + c10::getApproximateTime(), ptr, alloc_size, total_allocated, @@ -350,7 +351,7 @@ struct KinetoThreadLocalState : public ProfilerStateBase { c10::Device device) override { if (config_.profile_memory && !config_.disabled()) { record_queue_.getSubqueue()->emplace_ooms_event( - torch::profiler::impl::getApproximateTime(), + c10::getApproximateTime(), alloc_size, total_allocated, total_reserved, @@ -421,7 +422,7 @@ struct KinetoThreadLocalState : public ProfilerStateBase { } uint64_t start_time_; - torch::profiler::impl::ApproximateClockToUnixTimeConverter clock_converter_; + c10::ApproximateClockToUnixTimeConverter clock_converter_; torch::profiler::impl::RecordQueue record_queue_; std::vector kineto_events_; std::vector event_tree_; @@ -452,8 +453,7 @@ void onFunctionExit( auto* kineto_ctx_ptr = static_cast(ctx_ptr); TORCH_INTERNAL_ASSERT(kineto_ctx_ptr != nullptr); - kineto_ctx_ptr->event_->end_time_ = - torch::profiler::impl::getApproximateTime(); + kineto_ctx_ptr->event_->end_time_ = c10::getApproximateTime(); if (!config.experimental_config.performance_events.empty()) { state_ptr->record_queue_.getSubqueue()->disable_perf_profiler( *kineto_ctx_ptr->event_->counters_); diff --git a/torch/csrc/autograd/profiler_legacy.cpp b/torch/csrc/autograd/profiler_legacy.cpp index 388695957e4..5d7dd02312b 100644 --- a/torch/csrc/autograd/profiler_legacy.cpp +++ b/torch/csrc/autograd/profiler_legacy.cpp @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -477,7 +478,7 @@ void LegacyEvent::record(bool record_cuda) { torch::profiler::impl::cudaStubs()->record(&device_, &cuda_event, &cpu_ns_); return; } - cpu_ns_ = torch::profiler::impl::getTime(); + cpu_ns_ = c10::getTime(); } /* static */ LegacyEvent LegacyEvent::fromIValue( diff --git a/torch/csrc/autograd/profiler_python.cpp b/torch/csrc/autograd/profiler_python.cpp index b716693b0ca..9d827837bf4 100644 --- a/torch/csrc/autograd/profiler_python.cpp +++ b/torch/csrc/autograd/profiler_python.cpp @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -293,7 +294,7 @@ class ValueCache { auto caller = load(callsite.caller_); TORCH_INTERNAL_ASSERT(!caller.module_info_.has_value()); return ExtraFields::event_type>{ - /*end_time_ns=*/std::numeric_limits::min(), + /*end_time_ns=*/std::numeric_limits::min(), python_tid, caller.frame_state_, load(callsite.value_)}; @@ -666,8 +667,8 @@ struct ThreadLocalResults { ValueCache* value_cache_; PythonTracer* active_tracer_; CallTypeHelper::tuple_type trace_keys_; - AppendOnlyList exit_times_; - AppendOnlyList c_exit_times_; + AppendOnlyList exit_times_; + AppendOnlyList c_exit_times_; }; // ============================================================================ @@ -687,13 +688,13 @@ class PythonTracer final : public python_tracer::PythonTracerBase { void stop() override; std::vector> getEvents( - std::function time_converter, + std::function time_converter, std::vector& enters, - time_t end_time_ns) override; + c10::time_t end_time_ns) override; struct StartFrame { TraceKey trace_key_; - approx_time_t start_time{}; + c10::approx_time_t start_time{}; }; private: @@ -863,7 +864,7 @@ void PythonTracer::recordPyCall( return tls.intern(no_ephemeral_t(), frame, f_back); } }(); - const auto time = getApproximateTime(); + const auto time = c10::getApproximateTime(); is_startup_frame ? start_frames_.push_back({key, time}) : queue_->getSubqueue()->emplace_py_call(key, time); } @@ -879,7 +880,7 @@ void PythonTracer::recordCCall( // `frame->f_back`. auto key = tls.intern( arg, (void*)(fn->m_ml), frame); - queue_->getSubqueue()->emplace_py_call(key, getApproximateTime()); + queue_->getSubqueue()->emplace_py_call(key, c10::getApproximateTime()); } // ============================================================================ @@ -890,17 +891,17 @@ struct Exit { return t_ > other.t_; } - time_t t_; + c10::time_t t_; size_t python_tid_; }; class PostProcess { public: PostProcess( - std::function time_converter, + std::function time_converter, std::deque& tls, const ValueCache& value_cache, - time_t end_time_ns) + c10::time_t end_time_ns) : end_time_{end_time_ns}, time_converter_{std::move(time_converter)} { for (size_t python_tid : c10::irange(tls.size())) { CallTypeHelper::map( @@ -936,7 +937,9 @@ class PostProcess { } template - void addExits(AppendOnlyList& exits, size_t python_tid) { + void addExits( + AppendOnlyList& exits, + size_t python_tid) { for (const auto i : exits) { get_state().exits_.push({time_converter_(i), python_tid}); } @@ -961,7 +964,7 @@ class PostProcess { std::vector>& out) { using stack_t = std::vector>; const auto initial_size = out.size(); - auto pop = [](stack_t& stack, time_t t) { + auto pop = [](stack_t& stack, c10::time_t t) { TORCH_INTERNAL_ASSERT(!stack.empty(), "Python replay stack is empty."); std::get>(stack.back()->extra_fields_).end_time_ns_ = t; stack.pop_back(); @@ -1026,8 +1029,8 @@ class PostProcess { return std::get < E == EventType::PyCall ? 0 : 1 > (state_); } - time_t end_time_; - std::function time_converter_; + c10::time_t end_time_; + std::function time_converter_; std::tuple, State> state_; }; @@ -1054,9 +1057,9 @@ struct PythonIDVisitor { }; std::vector> PythonTracer::getEvents( - std::function time_converter, + std::function time_converter, std::vector& enters, - time_t end_time_ns) { + c10::time_t end_time_ns) { value_cache_.trimPrefixes(); PostProcess post_process( std::move(time_converter), @@ -1099,12 +1102,12 @@ int PythonTracer::pyProfileFn( case PyTrace_EXCEPTION: case PyTrace_RETURN: - local_results.exit_times_.emplace_back(getApproximateTime()); + local_results.exit_times_.emplace_back(c10::getApproximateTime()); break; case PyTrace_C_EXCEPTION: case PyTrace_C_RETURN: - local_results.c_exit_times_.emplace_back(getApproximateTime()); + local_results.c_exit_times_.emplace_back(c10::getApproximateTime()); break; } return 0; diff --git a/torch/csrc/distributed/c10d/reducer_timer.hpp b/torch/csrc/distributed/c10d/reducer_timer.hpp index ca8dd163eec..acd8975c4d2 100644 --- a/torch/csrc/distributed/c10d/reducer_timer.hpp +++ b/torch/csrc/distributed/c10d/reducer_timer.hpp @@ -1,11 +1,12 @@ #pragma once +#include #include namespace c10d { constexpr int kUnsetTime = -1; inline int64_t current_time_in_nanos() { - return torch::profiler::impl::getTime(); + return c10::getTime(); } class TORCH_API Timer { diff --git a/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp b/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp index 822db07fa70..ac2dd62e64c 100644 --- a/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp +++ b/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -370,8 +371,7 @@ RegisterOperators logging_operators( tracer::recordSourceLocation(node); graph->insertNode(node); } - auto output = - torch::profiler::impl::getTime(/*allow_monotonic=*/true); + auto output = c10::getTime(/*allow_monotonic=*/true); push(stack, output); if (jit::tracer::isTracing()) { jit::tracer::addOutput(node, output); diff --git a/torch/csrc/profiler/collection.cpp b/torch/csrc/profiler/collection.cpp index bb18304359a..440cabe94d2 100644 --- a/torch/csrc/profiler/collection.cpp +++ b/torch/csrc/profiler/collection.cpp @@ -362,7 +362,7 @@ std::unique_ptr ThreadLocalSubqueue::begin_op( nullptr, &out->fallback_->device_event_start_, nullptr); } - event->start_time_ = torch::profiler::impl::getApproximateTime(); + event->start_time_ = c10::getApproximateTime(); event->allow_tf32_cublas_ = at::globalContext().allowTF32CuBLAS(); if (!config_.experimental_config.performance_events.empty()) { const size_t n = config_.experimental_config.performance_events.size(); @@ -402,7 +402,7 @@ struct StealOrDefault { void ThreadLocalSubqueue::TorchOpStorage::materialize( std::vector>& out, - const std::function& time_converter, + const std::function& time_converter, const uint64_t tid, const kineto::DeviceAndResource& kineto_info) { // Plumb Autograd info to the top level annotation. @@ -471,7 +471,7 @@ void materialize_vulkan( std::vector>& out, AppendOnlyList::raw_event_t, BlockSize>& raw_events, - const std::function& time_converter, + const std::function& time_converter, const uint64_t tid, const kineto::DeviceAndResource& kineto_info) { for (const auto& i : raw_events) { @@ -530,7 +530,7 @@ int64_t torchOpEndNS( const ExtraFields& e, const bool finished, const std::weak_ptr& parent) { - if (finished && e.end_time_ns_ == std::numeric_limits::min()) { + if (finished && e.end_time_ns_ == std::numeric_limits::min()) { auto p = parent.lock(); if (p) { return p->endTimeNS(); @@ -1176,7 +1176,7 @@ void build_tree(std::vector>& sorted_events) { if (event->endTimeNS() > event->start_time_ns_) { stacks[event->start_tid_] = event; end_events_.push(event); - } else if (event->endTimeNS() == std::numeric_limits::min()) { + } else if (event->endTimeNS() == std::numeric_limits::min()) { // We use min time to indicate the lack of a termination event, so if we // encounter such a case we don't push to `end_events_`. stacks[event->start_tid_] = event; @@ -1350,12 +1350,12 @@ std::pair< std::vector>, std::unique_ptr> RecordQueue::getRecords( - std::function time_converter, + std::function time_converter, uint64_t start_time_us, uint64_t end_time_us) { - auto converter = [&](approx_time_t t) { - return t == std::numeric_limits::min() - ? std::numeric_limits::min() + auto converter = [&](c10::approx_time_t t) { + return t == std::numeric_limits::min() + ? std::numeric_limits::min() : time_converter(t); }; std::vector> out; @@ -1364,7 +1364,7 @@ RecordQueue::getRecords( auto& queue = *subqueue_it.second; auto materialize = [&](auto& events) { for (auto& i : events) { - time_t start_time_ns = 0; + c10::time_t start_time_ns = 0; if constexpr (std::is_same_v< std::remove_reference_t, ExtraFields>) { diff --git a/torch/csrc/profiler/collection.h b/torch/csrc/profiler/collection.h index b70def43c89..d4640eece1c 100644 --- a/torch/csrc/profiler/collection.h +++ b/torch/csrc/profiler/collection.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -126,7 +127,7 @@ struct ExtraFields : TorchOpBasicFields { ExtraFields( TorchOpBasicFields&& f, uint64_t correlation_id, - time_t end_time_ns, + c10::time_t end_time_ns, std::vector&& inputs, std::vector&& concrete_inputs, jit_stack_t&& jit_stack, @@ -149,7 +150,7 @@ struct ExtraFields : TorchOpBasicFields { allow_tf32_cublas_{allow_tf32_cublas}, perf_event_counters_{std::move(perf_event_counters)} {} uint64_t correlation_id_; - time_t end_time_ns_; + c10::time_t end_time_ns_; std::vector inputs_; std::vector concrete_inputs_; jit_stack_t jit_stack_; @@ -175,7 +176,7 @@ struct ExtraFields { template <> struct ExtraFields { - using raw_event_t = std::pair; + using raw_event_t = std::pair; std::string name_; int64_t duration_ns_{0}; // While building the event tree, we want to report a vulkan event's duration @@ -184,7 +185,7 @@ struct ExtraFields { }; struct RawAllocation { - torch::profiler::impl::approx_time_t start_time_; + c10::approx_time_t start_time_; void* ptr_; int64_t alloc_size_; size_t total_allocated_; @@ -210,7 +211,7 @@ struct ExtraFields : RawAllocation { template <> struct ExtraFields { - torch::profiler::impl::approx_time_t start_time_; + c10::approx_time_t start_time_; int64_t alloc_size_; size_t total_allocated_; size_t total_reserved_; @@ -270,12 +271,15 @@ struct OptimizerInfo { }; struct PyExtraFieldsBase { - PyExtraFieldsBase(time_t end_time_ns, size_t python_tid, PyFrameState caller) + PyExtraFieldsBase( + c10::time_t end_time_ns, + size_t python_tid, + PyFrameState caller) : end_time_ns_{end_time_ns}, python_tid_{python_tid}, caller_{std::move(caller)} {} - time_t end_time_ns_; + c10::time_t end_time_ns_; size_t python_tid_; PyFrameState caller_; @@ -292,7 +296,7 @@ struct ExtraFields : public PyExtraFieldsBase { }; ExtraFields( - time_t end_time_ns, + c10::time_t end_time_ns, size_t python_tid, PyFrameState caller, args_t args) @@ -311,7 +315,7 @@ struct ExtraFields : public PyExtraFieldsBase { using args_t = at::StringView; ExtraFields( - time_t end_time_ns, + c10::time_t end_time_ns, size_t python_tid, PyFrameState caller, args_t args) @@ -421,10 +425,11 @@ struct TORCH_API Result : public std::enable_shared_from_this { struct KinetoObserverContext : public at::ObserverContext { struct Event { TorchOpBasicFields basic_fields_; - approx_time_t start_time_; + c10::approx_time_t start_time_; // Set in the exit callback. - approx_time_t end_time_{std::numeric_limits::min()}; + c10::approx_time_t end_time_{ + std::numeric_limits::min()}; bool allow_tf32_cublas_; std::unique_ptr counters_; @@ -549,7 +554,7 @@ class TORCH_API ThreadLocalSubqueue { // NB: This is a destructive operation. void materialize( std::vector>& out, - const std::function& time_converter, + const std::function& time_converter, const uint64_t tid, const kineto::DeviceAndResource& kineto_info); @@ -605,7 +610,9 @@ class TORCH_API ThreadLocalSubqueue { AppendOnlyList, BlockSize> ooms_; // with_stack (Python) - AppendOnlyList, BlockSize> + AppendOnlyList< + std::pair, + BlockSize> py_calls_; }; @@ -622,7 +629,7 @@ class TORCH_API RecordQueue { std::vector>, std::unique_ptr> getRecords( - std::function time_converter, + std::function time_converter, uint64_t start_time_us, uint64_t end_time_us); diff --git a/torch/csrc/profiler/orchestration/python_tracer.cpp b/torch/csrc/profiler/orchestration/python_tracer.cpp index 64db126b25e..61773fe23ca 100644 --- a/torch/csrc/profiler/orchestration/python_tracer.cpp +++ b/torch/csrc/profiler/orchestration/python_tracer.cpp @@ -13,9 +13,9 @@ struct NoOpPythonTracer : public PythonTracerBase { void stop() override {} std::vector> getEvents( - std::function, + std::function, std::vector&, - time_t) override { + c10::time_t) override { return {}; } }; diff --git a/torch/csrc/profiler/orchestration/python_tracer.h b/torch/csrc/profiler/orchestration/python_tracer.h index 93becfee3cf..f05aefecfe1 100644 --- a/torch/csrc/profiler/orchestration/python_tracer.h +++ b/torch/csrc/profiler/orchestration/python_tracer.h @@ -5,6 +5,7 @@ #include #include +#include #include #include @@ -29,7 +30,7 @@ struct CompressedEvent { TraceKey key_; uint64_t system_tid_; kineto::DeviceAndResource kineto_info_; - time_t enter_t_; + c10::time_t enter_t_; }; /* @@ -49,9 +50,9 @@ struct TORCH_API PythonTracerBase { virtual void stop() = 0; virtual std::vector> getEvents( - std::function time_converter, + std::function time_converter, std::vector& enters, - time_t end_time_ns) = 0; + c10::time_t end_time_ns) = 0; }; using MakeFn = std::unique_ptr (*)(RecordQueue*); diff --git a/torch/csrc/profiler/stubs/cuda.cpp b/torch/csrc/profiler/stubs/cuda.cpp index dec87576f36..ff39df4eb16 100644 --- a/torch/csrc/profiler/stubs/cuda.cpp +++ b/torch/csrc/profiler/stubs/cuda.cpp @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -49,7 +50,7 @@ struct CUDAMethods : public ProfilerStubs { }); auto stream = at::cuda::getCurrentCUDAStream(); if (cpu_ns) { - *cpu_ns = torch::profiler::impl::getTime(); + *cpu_ns = c10::getTime(); } TORCH_CUDA_CHECK(cudaEventRecord(cuda_event_ptr, stream)); } diff --git a/torch/csrc/profiler/util.cpp b/torch/csrc/profiler/util.cpp index 65dc3b5a5fd..f29366bc195 100644 --- a/torch/csrc/profiler/util.cpp +++ b/torch/csrc/profiler/util.cpp @@ -19,77 +19,6 @@ namespace torch { namespace profiler { namespace impl { -ApproximateClockToUnixTimeConverter::ApproximateClockToUnixTimeConverter() - : start_times_(measurePairs()) {} - -ApproximateClockToUnixTimeConverter::UnixAndApproximateTimePair -ApproximateClockToUnixTimeConverter::measurePair() { - // Take a measurement on either side to avoid an ordering bias. - auto fast_0 = getApproximateTime(); - auto wall = std::chrono::system_clock::now(); - auto fast_1 = getApproximateTime(); - - TORCH_INTERNAL_ASSERT(fast_1 >= fast_0, "getCount is non-monotonic."); - auto t = std::chrono::duration_cast( - wall.time_since_epoch()); - - // `x + (y - x) / 2` is a more numerically stable average than `(x + y) / 2`. - return {t.count(), fast_0 + (fast_1 - fast_0) / 2}; -} - -ApproximateClockToUnixTimeConverter::time_pairs -ApproximateClockToUnixTimeConverter::measurePairs() { - static constexpr auto n_warmup = 5; - for (C10_UNUSED const auto _ : c10::irange(n_warmup)) { - getApproximateTime(); - steady_clock_t::now(); - } - - time_pairs out; - for (const auto i : c10::irange(out.size())) { - out[i] = measurePair(); - } - return out; -} - -std::function ApproximateClockToUnixTimeConverter:: - makeConverter() { - auto end_times = measurePairs(); - - // Compute the real time that passes for each tick of the approximate clock. - std::array scale_factors{}; - for (const auto i : c10::irange(replicates)) { - auto delta_ns = end_times[i].t_ - start_times_[i].t_; - auto delta_approx = end_times[i].approx_t_ - start_times_[i].approx_t_; - scale_factors[i] = (double)delta_ns / (double)delta_approx; - } - std::sort(scale_factors.begin(), scale_factors.end()); - long double scale_factor = scale_factors[replicates / 2 + 1]; - - // We shift all times by `t0` for better numerics. Double precision only has - // 16 decimal digits of accuracy, so if we blindly multiply times by - // `scale_factor` we may suffer from precision loss. The choice of `t0` is - // mostly arbitrary; we just need a factor that is the correct order of - // magnitude to bring the intermediate values closer to zero. We are not, - // however, guaranteed that `t0_approx` is *exactly* the getApproximateTime - // equivilent of `t0`; it is only an estimate that we have to fine tune. - auto t0 = start_times_[0].t_; - auto t0_approx = start_times_[0].approx_t_; - std::array t0_correction{}; - for (const auto i : c10::irange(replicates)) { - auto dt = start_times_[i].t_ - t0; - auto dt_approx = - (double)(start_times_[i].approx_t_ - t0_approx) * scale_factor; - t0_correction[i] = dt - (time_t)dt_approx; - } - t0 += t0_correction[t0_correction.size() / 2 + 1]; - - return [=](approx_time_t t_approx) { - // See above for why this is more stable than `A * t_approx + B`. - return (time_t)((double)(t_approx - t0_approx) * scale_factor) + t0; - }; -} - namespace { c10::optional soft_assert_raises_; } // namespace diff --git a/torch/csrc/profiler/util.h b/torch/csrc/profiler/util.h index dd7147374f0..4b565c691ca 100644 --- a/torch/csrc/profiler/util.h +++ b/torch/csrc/profiler/util.h @@ -14,29 +14,6 @@ #include #include -#ifndef _WIN32 -#include -#endif -#if defined(C10_IOS) && defined(C10_MOBILE) -#include // for gettimeofday() -#endif - -#if defined(__i386__) || defined(__x86_64__) || defined(__amd64__) -#define C10_RDTSC -#if defined(_MSC_VER) -#include -#elif defined(__CUDACC__) || defined(__HIPCC__) -#undef C10_RDTSC -#elif defined(__clang__) -// `__rdtsc` is available by default. -// NB: This has to be first, because Clang will also define `__GNUC__` -#elif defined(__GNUC__) -#include -#else -#undef C10_RDTSC -#endif -#endif - // TODO: replace with pytorch/rfcs#43 when it is ready. #define SOFT_ASSERT(cond, ...) \ [&]() -> bool { \ @@ -83,89 +60,10 @@ TORCH_API void logSoftAssert( const char* cond, const std::string& args); -using time_t = int64_t; -using steady_clock_t = std::conditional< - std::chrono::high_resolution_clock::is_steady, - std::chrono::high_resolution_clock, - std::chrono::steady_clock>::type; - using shape = std::variant, std::vector>>; constexpr int TENSOR_LIST_DISPLAY_LENGTH_LIMIT = 30; -inline time_t getTimeSinceEpoch() { - auto now = std::chrono::system_clock::now().time_since_epoch(); - return std::chrono::duration_cast(now).count(); -} - -inline time_t getTime(bool allow_monotonic = false) { -#if defined(C10_IOS) && defined(C10_MOBILE) - // clock_gettime is only available on iOS 10.0 or newer. Unlike OS X, iOS - // can't rely on CLOCK_REALTIME, as it is defined no matter if clock_gettime - // is implemented or not - struct timeval now; - gettimeofday(&now, NULL); - return static_cast(now.tv_sec) * 1000000000 + - static_cast(now.tv_usec) * 1000; -#elif defined(_WIN32) || defined(__MACH__) - return std::chrono::duration_cast( - steady_clock_t::now().time_since_epoch()) - .count(); -#else - // clock_gettime is *much* faster than std::chrono implementation on Linux - struct timespec t {}; - auto mode = CLOCK_REALTIME; - if (allow_monotonic) { - mode = CLOCK_MONOTONIC; - } - clock_gettime(mode, &t); - return static_cast(t.tv_sec) * 1000000000 + - static_cast(t.tv_nsec); -#endif -} - -// We often do not need to capture true wall times. If a fast mechanism such -// as TSC is available we can use that instead and convert back to epoch time -// during post processing. This greatly reduce the clock's contribution to -// profiling. -// http://btorpey.github.io/blog/2014/02/18/clock-sources-in-linux/ -// https://quick-bench.com/q/r8opkkGZSJMu9wM_XTbDouq-0Io -// TODO: We should use -// `https://github.com/google/benchmark/blob/main/src/cycleclock.h` -inline auto getApproximateTime() { -#if defined(C10_RDTSC) - return static_cast(__rdtsc()); -#else - return getTime(); -#endif -} - -using approx_time_t = decltype(getApproximateTime()); -static_assert( - std::is_same::value || - std::is_same::value, - "Expected either int64_t (`getTime`) or uint64_t (some TSC reads)."); - -// Convert `getCount` results to Nanoseconds since unix epoch. -class ApproximateClockToUnixTimeConverter final { - public: - ApproximateClockToUnixTimeConverter(); - std::function makeConverter(); - - struct UnixAndApproximateTimePair { - time_t t_; - approx_time_t approx_t_; - }; - static UnixAndApproximateTimePair measurePair(); - - private: - static constexpr size_t replicates = 1001; - using time_pairs = std::array; - time_pairs measurePairs(); - - time_pairs start_times_; -}; - std::string getNvtxStr( const char* name, int64_t sequence_nr, @@ -267,7 +165,6 @@ namespace torch { namespace autograd { namespace profiler { using torch::profiler::impl::computeFlops; -using torch::profiler::impl::getTime; } // namespace profiler } // namespace autograd } // namespace torch