mirror of
https://github.com/saymrwulf/pytorch.git
synced 2026-05-14 20:57:59 +00:00
Summary: Quite common, hard-to-debug, performance bug for multi-GPU training has been that operators have been passed tensors that reside on different GPU than what the op runs on. Since we have peer access enabled, this works, but is just much slower. With data parallel model this problem arises rarely as it has static analysis of the operators, but if someone bypassed DPM or uses FeedBlob with incorrect device options, this problem can happen. To make debugging easier, I added device-field to tensor that stores the device information that allocated the memory. In addition, I added a function to go through operator inputs and outputs and compare their tensor device to the operator device. This check is run after first iteration with prof_dag only. Also renamed ShapeCall to TensorInfoFun, as it now returns so much other info than the shape. I think this is pretty safe diff, but do you find it problematic to add a new field to tensor? Reviewed By: dzhulgakov Differential Revision: D5335505 fbshipit-source-id: 511b6c122dff9a205f43951984868ffd40f7ac30
108 lines
3 KiB
C++
#include "caffe2/core/tensor.h"
|
|
|
|
#include "caffe2/core/blob_stats.h"
|
|
#include "caffe2/core/flags.h"
|
|
|
|
// Command-line flag: when a tensor shrinks, keep its existing allocation
// instead of freeing and re-allocating a smaller buffer.
CAFFE2_DEFINE_bool(
    caffe2_keep_on_shrink,
    true,
    "If set, keeps memory when a tensor is shrinking its size.");

// Command-line flag: upper bound (bytes) on how much surplus memory may be
// retained by the keep-on-shrink policy before the buffer is reset anyway.
// NOTE(review): presumably consulted by the tensor resize logic in tensor.h —
// confirm there; this file only defines the flags.
CAFFE2_DEFINE_int64(
    caffe2_max_keep_on_shrink_memory,
    LLONG_MAX,
    "The maximum memory in bytes to keep on shrink, if the difference between "
    "tensor sizes is bigger than this then tensor will be reset.");
|
|
|
|
namespace caffe2 {

// Register Tensor<CPUContext> with the caffe2 type-id system.
// Declared here instead of context.cc because tensor.h includes context.h,
// so this translation unit is the one that sees the full tensor type.
CAFFE_KNOWN_TYPE(Tensor<CPUContext>);
|
|
|
|
// Construct a printer for `tensor_name`. With a non-empty `file_name`,
// output goes to that file (truncated on open); otherwise it is printed on
// screen. A zero `limit` falls back to the class default element limit.
TensorPrinter::TensorPrinter(
    const std::string& tensor_name,
    const std::string& file_name,
    int limit)
    : to_file_(!file_name.empty()),
      limit_(limit == 0 ? k_limit_default_ : limit),
      tensor_name_(tensor_name) {
  if (!to_file_) {
    return;
  }
  // Each tensor gets its own dedicated output file.
  log_file_.reset(new std::ofstream(
      file_name, std::ofstream::out | std::ofstream::trunc));
  CAFFE_ENFORCE(
      log_file_->good(),
      "Failed to open TensorPrinter file ",
      file_name,
      ". rdstate() = ",
      log_file_->rdstate());
}
|
|
|
|
// Close the per-tensor log file, if the constructor opened one.
TensorPrinter::~TensorPrinter() {
  if (log_file_) {
    log_file_->close();
  }
}
|
|
|
|
// Build a one-line header describing the tensor, e.g.
// "Tensor <name> of type <type>. Dims: (2,3,): ".
std::string TensorPrinter::MetaStr(const Tensor<CPUContext>& tensor) {
  std::stringstream ss;
  ss << "Tensor " << tensor_name_ << " of type " << tensor.meta().name()
     << ". Dims: (";
  for (const auto d : tensor.dims()) {
    ss << d << ",";
  }
  ss << "): ";
  return ss.str();
}
|
|
|
|
// Registry mapping a type id to its TypeCall handler, pre-seeded with the
// CPU tensor entry; other types are added via RegisterTypeCallFunction().
static CaffeMap<CaffeTypeId, TypeCall> type_call_registry_ {
  {TypeMeta::Id<Tensor<CPUContext>>(), GetTensorType<Tensor<CPUContext>>}
};
|
|
|
|
// Look up the TypeCall registered for `id`; returns nullptr when none exists.
TypeCall GetTypeCallFunction(CaffeTypeId id) {
  const auto it = type_call_registry_.find(id);
  return it == type_call_registry_.end() ? nullptr : it->second;
}
|
|
|
|
// Register (or overwrite) the TypeCall handler for `id`.
void RegisterTypeCallFunction(CaffeTypeId id, TypeCall c) {
  auto& slot = type_call_registry_[id];
  slot = c;
}
|
|
|
|
// Registry mapping a type id to its TensorInfoCall handler, pre-seeded with
// the CPU tensor entry; other types are added via RegisterTensorInfoFunction().
static CaffeMap<CaffeTypeId, TensorInfoCall> tensor_info_call_registry_{
    {TypeMeta::Id<Tensor<CPUContext>>(), GetTensorInfo<Tensor<CPUContext>>}};
|
|
|
|
// Look up the TensorInfoCall registered for `id`; nullptr when none exists.
TensorInfoCall GetTensorInfoFunction(CaffeTypeId id) {
  const auto it = tensor_info_call_registry_.find(id);
  return it == tensor_info_call_registry_.end() ? nullptr : it->second;
}
|
|
|
|
// Register (or overwrite) the TensorInfoCall handler for `id`.
void RegisterTensorInfoFunction(CaffeTypeId id, TensorInfoCall c) {
  auto& slot = tensor_info_call_registry_[id];
  slot = c;
}
|
|
|
|
namespace {
|
|
|
|
struct TensorCPUStatGetter : BlobStatGetter {
|
|
size_t sizeBytes(const Blob& blob) const override {
|
|
const auto& tensor = blob.Get<TensorCPU>();
|
|
auto nbytes = tensor.nbytes();
|
|
if (nbytes > 0 && tensor.IsType<std::string>()) {
|
|
const auto* data = tensor.data<std::string>();
|
|
for (size_t i = 0; i < tensor.size(); ++i) {
|
|
nbytes += data[i].size();
|
|
}
|
|
}
|
|
return nbytes;
|
|
}
|
|
};
|
|
REGISTER_BLOB_STAT_GETTER(TensorCPU, TensorCPUStatGetter);
|
|
}
|
|
|
|
} // namespace caffe2
|