mirror of
https://github.com/saymrwulf/pytorch.git
synced 2026-05-15 21:00:47 +00:00
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45867 In most cases the lock ordering was to acquire a lock in local autograd first and then acquire a lock in DistAutogradContext. In the case of `set_exception_without_signal` the lock order was reversed, and as a result we saw potential deadlock issues in our TSAN tests. To fix this, I removed the lock and instead used a std::atomic exchange. In addition, I fixed TestE2E to ensure that it uses the appropriate timeout. TestE2EProcessGroup was flaky for these two reasons and is now fixed. ghstack-source-id: 113592709 Test Plan: waitforbuildbot. Reviewed By: albanD Differential Revision: D24120962 fbshipit-source-id: 12447b84ceae772b91e9a183c90d1e6340f44e66
46 lines
1.3 KiB
C++
46 lines
1.3 KiB
C++
#include <gtest/gtest.h>
|
|
|
|
#include "e2e_test_base.h"
|
|
|
|
#include <c10d/ProcessGroupGloo.hpp>
|
|
#include <torch/csrc/distributed/rpc/process_group_agent.h>
|
|
#include <torch/csrc/distributed/rpc/request_callback_no_python.h>
|
|
#include <torch/torch.h>
|
|
|
|
namespace torch {
|
|
namespace distributed {
|
|
namespace rpc {
|
|
|
|
|
|
class TestE2EProcessGroup : public TestE2EBase {
|
|
protected:
|
|
void buildRpcAgent() override {
|
|
c10d::ProcessGroupGloo::Options options;
|
|
options.devices.push_back(
|
|
::c10d::ProcessGroupGloo::createDeviceForHostname(serverAddress));
|
|
std::chrono::milliseconds rpcTimeout(30000);
|
|
options.timeout = rpcTimeout;
|
|
|
|
// Initialize server rpc agent.
|
|
auto pg =
|
|
std::make_shared<c10d::ProcessGroupGloo>(store, 0, numWorkers, options);
|
|
|
|
rpcAgent = std::make_shared<ProcessGroupAgent>(
|
|
"worker",
|
|
pg,
|
|
std::max(16U, std::thread::hardware_concurrency()),
|
|
rpcTimeout,
|
|
std::make_unique<RequestCallbackNoPython>());
|
|
}
|
|
};
|
|
|
|
// End to end training loop test in C++ so that we can run LSAN on this test to
|
|
// catch memory leaks. Enabling LSAN with python multiprocessing has been
|
|
// challenging and we don't have a good solution yet.
|
|
TEST_F(TestE2EProcessGroup, TestTrainingLoop) {
|
|
runTrainingLoop();
|
|
}
|
|
|
|
} // namespace rpc
|
|
} // namespace distributed
|
|
} // namespace torch
|