mirror of
https://github.com/saymrwulf/pytorch.git
synced 2026-05-15 21:00:47 +00:00
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/68128
Reland of D31762735 (0cbfd466d2).
This diff was originally reverted due to failure in test_send_export_type_through_rpc_with_custom_pickler.
I updated rpc_pickler_test.py to prevent a race condition where processes were not registering their pickler before handling their rpc_sync calls.
Test Plan:
rpc_pickler_test file:
buck test mode/dev-nosan -c 'cxx.coverage_only=caffe2' //caffe2/torch/fb/training_toolkit/backend/metrics/tests:rpc_pickler_test //caffe2/torch/fb/training_toolkit/backend/metrics/collectors/fbdata_aggregator/tests:batch_collector_test -- --run-disabled --collect-coverage '--code-coverage-session=test_session' --force-tpx
rpc_pickler stress test:
buck test mode/dev-nosan -c 'cxx.coverage_only=caffe2' //caffe2/torch/fb/training_toolkit/backend/metrics/tests:rpc_pickler_test -- --exact 'caffe2/torch/fb/training_toolkit/backend/metrics/tests:rpc_pickler_test - test_send_export_type_through_rpc_with_custom_pickler (caffe2.torch.fb.training_toolkit.backend.metrics.tests.rpc_pickler_test.CythonTypeRpcSpawnTest)' --run-disabled --collect-coverage '--code-coverage-session=test_session' --force-tpx --jobs 18 --stress-runs 10 --record-results
Reviewed By: mrshenli
Differential Revision: D32316077
fbshipit-source-id: e58de2335fbaa3ab46d46fe222c659197633a5e4
63 lines
1.9 KiB
C++
#include <gtest/gtest.h>
|
|
|
|
#include "e2e_test_base.h"
|
|
|
|
#include <c10d/ProcessGroupGloo.hpp>
|
|
#include <torch/csrc/distributed/rpc/request_callback_no_python.h>
|
|
#include <torch/csrc/distributed/rpc/tensorpipe_agent.h>
|
|
#include <torch/torch.h>
|
|
|
|
namespace torch {
|
|
namespace distributed {
|
|
namespace rpc {
|
|
|
|
#ifdef USE_TENSORPIPE
|
|
|
|
class TestE2ETensorPipe : public TestE2EBase {
|
|
protected:
|
|
void buildRpcAgent() override {
|
|
auto options = c10d::ProcessGroupGloo::Options::create();
|
|
options->devices.push_back(
|
|
::c10d::ProcessGroupGloo::createDeviceForHostname(serverAddress));
|
|
float rpcTimeout = 30;
|
|
|
|
TensorPipeRpcBackendOptions opts(
|
|
/*numWorkerThreads=*/std::max(16U, std::thread::hardware_concurrency()),
|
|
/*transports=*/nullopt,
|
|
/*channels=*/nullopt,
|
|
/*rpc_timeout=*/rpcTimeout,
|
|
/*init_method=*/"unused");
|
|
|
|
rpcAgent = std::make_shared<TensorPipeAgent>(
|
|
store,
|
|
"worker",
|
|
0,
|
|
numWorkers,
|
|
opts,
|
|
std::unordered_map<std::string, DeviceMap>{},
|
|
std::vector<c10::Device>{},
|
|
std::make_unique<RequestCallbackNoPython>());
|
|
}
|
|
};
|
|
|
|
// End to end training loop test in C++ so that we can run LSAN on this test to
|
|
// catch memory leaks. Enabling LSAN with python multiprocessing has been
|
|
// challenging and we don't have a good solution yet.
|
|
TEST_F(TestE2ETensorPipe, TestTrainingLoop) {
|
|
runTrainingLoop();
|
|
// Ensure the tensorpipe internal state is cleared up.
|
|
auto tensorpipeAgent = std::static_pointer_cast<TensorPipeAgent>(rpcAgent);
|
|
|
|
// Shutdown RPC agent for all RPCs to clean up.
|
|
tensorpipeAgent->join();
|
|
tensorpipeAgent->shutdown();
|
|
ASSERT_EQ(0, tensorpipeAgent->numPendingResponses());
|
|
ASSERT_EQ(0, tensorpipeAgent->timeoutMapSize());
|
|
ASSERT_EQ(0, tensorpipeAgent->messageIdToTimeoutMapSize());
|
|
}
|
|
|
|
#endif
|
|
|
|
} // namespace rpc
|
|
} // namespace distributed
|
|
} // namespace torch
|