#include #include "e2e_test_base.h" #include #include #include #include namespace torch { namespace distributed { namespace rpc { class TestE2EProcessGroup : public TestE2EBase { protected: void buildRpcAgent() override { c10d::ProcessGroupGloo::Options options; options.devices.push_back( ::c10d::ProcessGroupGloo::createDeviceForHostname(serverAddress)); std::chrono::milliseconds rpcTimeout(30000); options.timeout = rpcTimeout; // Initialize server rpc agent. auto pg = c10::make_intrusive( store, 0, numWorkers, options); rpcAgent = std::make_shared( "worker", pg, std::max(16U, std::thread::hardware_concurrency()), rpcTimeout, std::make_unique()); } }; // End to end training loop test in C++ so that we can run LSAN on this test to // catch memory leaks. Enabling LSAN with python multiprocessing has been // challenging and we don't have a good solution yet. TEST_F(TestE2EProcessGroup, TestTrainingLoop) { runTrainingLoop(); } } // namespace rpc } // namespace distributed } // namespace torch