pytorch/test/cpp/rpc/test_e2e_tensorpipe.cpp
Min Si 1ad0048b64 Refactor distribuetd to use absolute header path (#85780)
Headers under torch/csrc/distributed may be referenced with relative path, e.g., "<c10d/...>". However, relative paths cannot be gracefully handled by the Meta internal build when the NCCL PG is hipified to support AMD/RCCL, because the "hipified" header files are generated in other directories. Moreover, using absolute paths for header inclusion is the state of the art in most components in PyTorch. Thus, this patch refactors all header paths in torch/csrc/distributed to be absolute.

See D39835774 for more details about the Meta internal compilation issue.

**How to test**: commit 9e5d199 removes -I./torch/csrc/distributed in compile options. Thus use it to verify we don't miss any relative path use of torch/csrc/distributed headers.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/85780
Approved by: https://github.com/kumpera, https://github.com/huydhn
2022-09-30 05:13:50 +00:00

63 lines
1.9 KiB
C++

#include <gtest/gtest.h>
#include "e2e_test_base.h"
#include <torch/csrc/distributed/c10d/ProcessGroupGloo.hpp>
#include <torch/csrc/distributed/rpc/request_callback_no_python.h>
#include <torch/csrc/distributed/rpc/tensorpipe_agent.h>
#include <torch/torch.h>
namespace torch {
namespace distributed {
namespace rpc {
#ifdef USE_TENSORPIPE
class TestE2ETensorPipe : public TestE2EBase {
protected:
void buildRpcAgent() override {
auto options = c10d::ProcessGroupGloo::Options::create();
options->devices.push_back(
::c10d::ProcessGroupGloo::createDeviceForHostname(serverAddress));
float rpcTimeout = 30;
TensorPipeRpcBackendOptions opts(
/*numWorkerThreads=*/std::max(16U, std::thread::hardware_concurrency()),
/*transports=*/nullopt,
/*channels=*/nullopt,
/*rpc_timeout=*/rpcTimeout,
/*init_method=*/"unused");
rpcAgent = std::make_shared<TensorPipeAgent>(
store,
"worker",
0,
numWorkers,
opts,
std::unordered_map<std::string, DeviceMap>{},
std::vector<c10::Device>{},
std::make_unique<RequestCallbackNoPython>());
}
};
// Runs a full training loop from C++ so LSAN can be applied to catch memory
// leaks; enabling LSAN under Python multiprocessing has proven difficult and
// no good solution exists yet.
TEST_F(TestE2ETensorPipe, TestTrainingLoop) {
  runTrainingLoop();

  // Downcast to the concrete agent so we can inspect TensorPipe-internal
  // bookkeeping after teardown.
  auto agent = std::static_pointer_cast<TensorPipeAgent>(rpcAgent);

  // Drain outstanding RPCs, then shut the agent down so its internal state
  // settles before we assert on it.
  agent->join();
  agent->shutdown();

  // All tracking maps must be empty once the agent has shut down cleanly.
  ASSERT_EQ(0, agent->numPendingResponses());
  ASSERT_EQ(0, agent->timeoutMapSize());
  ASSERT_EQ(0, agent->messageIdToTimeoutMapSize());
}
#endif
} // namespace rpc
} // namespace distributed
} // namespace torch