Changes for Windows build to pass.

Summary:
After this, we should have contbuild guarding the Windows build both with
and without CUDA.

This includes a series of changes that are needed to make Windows build,
specifically:

(1) Various flags that are needed in the cmake system, especially dealing
with /MD, /MT, CUDA, cuDNN, whole static linking, etc.
(2) Contbuild scripts based on AppVeyor.
(3) For Windows build, note that one will need to use "cmake --build" to
build stuff so that the build type is consistent between configuration and
actual build. See scripts\build_windows.bat for details.
(4) In logging.h, ERROR is already defined by Windows. I don't have a good
solution now, and as a result, LOG(ERROR) on Windows is going to be
LOG(INFO).
(5) Variable length arrays are not supported by MSVC (and they are not part
of the C++ standard). As a result I replaced them with vectors.
(6) sched.h is not available on Windows, so akyrola's awesome simple
async net might encounter some slowdown due to no affinity setting on
Windows.
(7) MSVC has a
Closes https://github.com/caffe2/caffe2/pull/183

Reviewed By: ajtulloch

Differential Revision: D4657831

Pulled By: Yangqing

fbshipit-source-id: 070ded372ed78a7e3e3919fdffa1d337640f146e
This commit is contained in:
Yangqing Jia 2017-03-06 19:51:33 -08:00 committed by Facebook Github Bot
parent 2333ccadfb
commit 7b8c7b11d2
32 changed files with 333 additions and 221 deletions

View file

@ -83,13 +83,19 @@ endif()
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "binaries")
# ---[ Build flags
if (${CMAKE_CXX_COMPILER_ID} STREQUAL "MSVC")
message(WARNING "Develop note: when all errors are addressed, turn on warning.")
message(STATUS "Adding no warning argument to the compiler")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /w")
else()
if(NOT MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -fPIC")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
else()
if (NOT ${BUILD_SHARED_LIBS})
foreach(flag_var
CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
if(${flag_var} MATCHES "/MD")
string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
endif(${flag_var} MATCHES "/MD")
endforeach(flag_var)
endif()
endif()
if (CAFFE2_CPU_FLAGS)

View file

@ -8,7 +8,9 @@ Caffe2 is released under the [BSD 2-Clause license](https://github.com/Yangqing/
## Building Caffe2
[![Build Status](https://travis-ci.org/caffe2/caffe2.svg?branch=master)](https://travis-ci.org/caffe2/caffe2)
[![Travis Build Status](https://travis-ci.org/caffe2/caffe2.svg?branch=master)](https://travis-ci.org/caffe2/caffe2)
[![Windows Build status](https://ci.appveyor.com/api/projects/status/kec4ta779stuyb83?svg=true)](https://ci.appveyor.com/project/Yangqing/caffe2)
git clone --recursive https://github.com/caffe2/caffe2.git
cd caffe2

30
appveyor.yml Normal file
View file

@ -0,0 +1,30 @@
version: '{build}'
clone_folder: c:\projects\caffe2
environment:
matrix:
- USE_CUDA: OFF
CMAKE_BUILD_TYPE: Release
- USE_CUDA: ON
CMAKE_BUILD_TYPE: Release
- USE_CUDA: OFF
CMAKE_BUILD_TYPE: Debug
# Currently, CUDA + Debug does not work due to a error of using
# std::_Debug_lt in device code. Not sure where this comes from yet,
# but it is probably safe to assume that very few are going to build
# debug mode with CUDA and Windows.
#- USE_CUDA: ON
# CMAKE_BUILD_TYPE: Debug
install:
- cmd: c:\projects\caffe2\scripts\appveyor\install.bat
build_script:
- cmd: >-
cd c:\projects\caffe2
git submodule update --init
call scripts\build_windows.bat

View file

@ -74,7 +74,7 @@ list(APPEND Caffe2_MAIN_LIBS_ORDER Caffe2_CPU)
if (BUILD_SHARED_LIBS)
list(APPEND Caffe2_MAIN_LIBS Caffe2_CPU)
else()
caffe_add_whole_archive_flag(Caffe2_CPU tmp)
caffe_add_whole_archive_flag(Caffe2_CPU tmp)
list(APPEND Caffe2_MAIN_LIBS ${tmp})
endif()

View file

@ -43,7 +43,8 @@ int main(int argc, char** argv) {
} else {
// float data not supported right now.
CAFFE_ENFORCE_EQ(datum.float_data_size(), 0);
char buffer[datum.data().size()];
std::vector<char> buffer_vec(datum.data().size());
char* buffer = buffer_vec.data();
// swap order from CHW to HWC
int channels = datum.channels();
int size = datum.height() * datum.width();

View file

@ -84,6 +84,29 @@ private: \
#define CAFFE2_ALIGNED(x) __attribute__((aligned(x)))
#endif
/**
* Macro for marking functions as having public visibility.
* Ported from folly/CPortability.h
*/
#ifndef __GNUC_PREREQ
# if defined __GNUC__ && defined __GNUC_MINOR__
# define __GNUC_PREREQ(maj, min) ((__GNUC__ << 16) + __GNUC_MINOR__ >= \
((maj) << 16) + (min))
# else
# define __GNUC_PREREQ(maj, min) 0
# endif
#endif
#if defined(__GNUC__)
# if __GNUC_PREREQ(4, 9)
# define CAFFE2_EXPORT [[gnu::visibility("default")]]
# else
# define CAFFE2_EXPORT __attribute__((__visibility__("default")))
# endif
#else
# define CAFFE2_EXPORT
#endif
// make_unique is a C++14 feature. If we don't have 14, we will emulate
// its behavior. This is copied from folly/Memory.h
#if __cplusplus >= 201402L || \

View file

@ -159,7 +159,6 @@ struct EnforceOK {};
class EnforceFailMessage {
public:
constexpr /* implicit */ EnforceFailMessage(EnforceOK) : msg_(nullptr) {}
EnforceFailMessage(EnforceFailMessage&&) = default;
EnforceFailMessage(const EnforceFailMessage&) = delete;
EnforceFailMessage& operator=(EnforceFailMessage&&) = delete;
@ -180,7 +179,7 @@ class EnforceFailMessage {
msg_ = new std::string(std::move(msg));
}
inline bool bad() const {
return msg_;
return msg_ != nullptr;
}
std::string get_message_and_free(std::string&& extra) const {
std::string r;

View file

@ -15,7 +15,13 @@
// Log severity level constants.
const int FATAL = 3;
#if !defined(_MSC_VER) || !defined(ERROR)
// Windows defines the ERROR macro already, and as a result we will
// simply use that one. The downside is that one will now mix LOG(INFO)
// and LOG(ERROR) because ERROR is defined to be zero. Anyway, the
// recommended way is to use glog so fixing this is a low-pri item.
const int ERROR = 2;
#endif
const int WARNING = 1;
const int INFO = 0;
const char CAFFE2_SEVERITY_PREFIX[] = "FEWIV";

View file

@ -4,7 +4,10 @@
#include <mutex>
#include <stack>
#if !defined(_MSC_VER)
#include <sched.h>
#endif
#include "caffe2/core/common_gpu.h"
#include "caffe2/core/flags.h"
#include "caffe2/core/operator.h"
@ -258,6 +261,10 @@ void GPUExecutor::Release(int gpu) {
}
void GPUExecutor::set_affinity() {
// TODO: find a Windows-compatible affinity setting approach.
// Currently, set_affinity has no effect in Windows. The code is still
// correct with possible slowdowns.
#if !defined(_MSC_VER)
/* Set CPU affinity */
int num_cores = std::thread::hardware_concurrency();
if (num_cores > 0) {
@ -269,6 +276,7 @@ void GPUExecutor::set_affinity() {
LOG(WARNING) << "Could not set CPU affinity";
}
}
#endif
}
// Worker that takes list of operators from the queue
@ -363,7 +371,9 @@ class SingleThreadAsyncNet : public SimpleNet {
}
bool RunAsync() {
LOG(FATAL) << "RunAsync() not implemented for singlethread_async net";
CAFFE_THROW("RunAsync() not implemented for singlethread_async net");
// Just to suppress compiler warning.
return false;
}
private:

View file

@ -169,7 +169,7 @@ class TypeMeta {
* is generated during run-time. Do NOT serialize the id for storage.
*/
template <typename T>
[[gnu::visibility("default")]] static CaffeTypeId Id();
CAFFE2_EXPORT static CaffeTypeId Id();
/**
* Returns the item size of the type. This is equivalent to sizeof(T).

View file

@ -48,22 +48,23 @@ class CudaRTCFunction {
if (compile_result != NVRTC_SUCCESS) {
size_t log_size;
NVRTC_CHECK(nvrtcGetProgramLogSize(prog, &log_size));
char nvrtc_log[log_size];
NVRTC_CHECK(nvrtcGetProgramLog(prog, nvrtc_log));
vector<char> nvrtc_log(log_size);
NVRTC_CHECK(nvrtcGetProgramLog(prog, nvrtc_log.data()));
LOG(FATAL) << "Compilation failure for nvrtc("
<< nvrtcGetErrorString(compile_result)
<< "): \n" << nvrtc_log;
<< "): \n" << nvrtc_log.data();
}
size_t ptx_size;
NVRTC_CHECK(nvrtcGetPTXSize(prog, &ptx_size));
char nvrtc_ptx[ptx_size];
NVRTC_CHECK(nvrtcGetPTX(prog, nvrtc_ptx));
vector<char> nvrtc_ptx(ptx_size);
NVRTC_CHECK(nvrtcGetPTX(prog, nvrtc_ptx.data()));
NVRTC_CHECK(nvrtcDestroyProgram(&prog));
// After compilation, load the module.
if (module_loaded_) {
CUDA_DRIVERAPI_ENFORCE(cuModuleUnload(module_));
}
CUDA_DRIVERAPI_ENFORCE(cuModuleLoadDataEx(&module_, nvrtc_ptx, 0, 0, 0));
CUDA_DRIVERAPI_ENFORCE(cuModuleLoadDataEx(
&module_, nvrtc_ptx.data(), 0, 0, 0));
module_loaded_ = true;
CUDA_DRIVERAPI_ENFORCE(
cuModuleGetFunction(&kernel_, module_, name.c_str()));

View file

@ -84,7 +84,8 @@ class ElementwiseRTCOp final : public Operator<CUDAContext> {
static_assert(sizeof(void*) == sizeof(size_t),
"The argbuffer relies on the assumption that void* and "
"size_t have the same size.");
size_t argBuffer[InputSize() + OutputSize() + 1];
vector<size_t> argBuffer_vec(InputSize() + OutputSize() + 1);
size_t* argBuffer = argBuffer_vec.data();
CAFFE_ENFORCE(
Input(0).size() < std::numeric_limits<int>::max(),
"The kernel function currently only supports int index.");

View file

@ -572,15 +572,18 @@ bool CudnnConvGradientOp<T>::RunOnDevice() {
auto* dX =
Output(no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD);
dX->ResizeLike(X);
const T* filter_data = filter.template data<T>();
const T* dYdata = dY.template data<T>();
T* dXdata = dX->template mutable_data<T>();
CUDNN_ENFORCE(cudnnFindConvolutionBackwardDataAlgorithmEx(
state->cudnn_handle(),
filter_desc_,
filter.template data<T>(),
filter_data,
top_desc_,
dY.template data<T>(),
dYdata,
conv_desc_,
bottom_desc_,
dX->template mutable_data<T>(),
dXdata,
kNUM_CUDNN_BWD_DATA_ALGS,
&returned_algo_count,
data_perf_stat.data(),

View file

@ -121,7 +121,7 @@ __global__ void InstanceNormGradientKernel(
output_grad_offset += dim_stride;
}
temp *= -std::pow(inv_stdev_data[i], 3.0) / dim;
temp *= -powf(inv_stdev_data[i], 3.0) / dim;
input_grad_offset = input_grad_data + n * N_stride + c * C_stride;
output_grad_offset = output_grad_data + n * N_stride + c * C_stride;

View file

@ -9,6 +9,30 @@ namespace {
class LpPool {};
} // namespace
namespace {
template <typename T>
inline __device__ T cuda_pow(T x, T y);
template <typename T>
inline __device__ T cuda_abs(T x);
template<>
inline __device__ float cuda_pow<float>(float x, float y) {
return powf(x, y);
}
template<>
inline __device__ double cuda_pow<double>(double x, double y) {
return pow(x, y);
}
template <>
inline __device__ float cuda_abs(float x) { return fabsf(x); }
template <>
inline __device__ double cuda_abs(double x) { return fabs(x); }
}
namespace {
template <typename T>
__global__ void LpPoolForwardNCHW(
@ -47,10 +71,10 @@ __global__ void LpPoolForwardNCHW(
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
top_data[index] +=
std::pow(std::abs(bottom_data[bottom_offset + h * width + w]), p);
cuda_pow<T>(cuda_abs(bottom_data[bottom_offset + h * width + w]), p);
}
}
top_data[index] = std::pow(top_data[index], 1.0 / p);
top_data[index] = cuda_pow<T>(top_data[index], 1.0 / p);
}
}
@ -87,12 +111,12 @@ __global__ void LpPoolForwardNHWC(
int bottom_offset = n * height * width * channels + c;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
output += std::pow(
std::abs(bottom_data[bottom_offset + (h * width + w) * channels]),
output += cuda_pow<T>(
cuda_abs(bottom_data[bottom_offset + (h * width + w) * channels]),
p);
}
}
top_data[index] = std::pow(output, 1.0 / p);
top_data[index] = cuda_pow<T>(output, 1.0 / p);
}
}
@ -143,8 +167,8 @@ __global__ void LpPoolBackwardNCHW(
hstart = max(hstart, 0);
wstart = max(wstart, 0);
gradient += top_diff_slice[ph * pooled_width + pw] *
bottom_data[index] * std::pow(std::abs(bottom_data[index]), p - 2) /
std::pow(top_data_slice[ph * pooled_width + pw], p - 1);
bottom_data[index] * cuda_pow<T>(cuda_abs(bottom_data[index]), p - 2) /
cuda_pow<T>(top_data_slice[ph * pooled_width + pw], p - 1);
}
}
bottom_diff[index] = gradient;
@ -197,8 +221,8 @@ __global__ void LpPoolBackwardNHWC(
hstart = max(hstart, 0);
wstart = max(wstart, 0);
gradient += top_diff_slice[(ph * pooled_width + pw) * channels] *
bottom_data[index] * std::pow(std::abs(bottom_data[index]), p - 2) /
std::pow(top_data_slice[(ph * pooled_width + pw) * channels],
bottom_data[index] * cuda_pow<T>(cuda_abs(bottom_data[index]), p - 2) /
cuda_pow<T>(top_data_slice[(ph * pooled_width + pw) * channels],
p - 1);
}
}

View file

@ -225,9 +225,12 @@ bool RecurrentOp<T>::RunOnDevice() {
&reserveNbytes_));
Output(RNN_SCRATCH)
->Resize(std::vector<int>{static_cast<int>(
reserveNbytes_ / 4 /* sizeof(T) - workaround clang bug */)});
reserveNbytes_ / 4)}); // sizeof(T) - workaround clang bug
Output(RNN_SCRATCH)->template mutable_data<T>();
auto InputData = [this](int i) { return this->Input(i).template data<T>(); };
auto OutputData = [this](int i) { return this->Output(i)->template mutable_data<T>(); };
if (OperatorBase::GetSingleArgument<int>("is_test", 0)) {
cudnn_wrapper_.with_cudnn_state(0, [&](CuDNNState* state) {
CUDNN_ENFORCE(cudnnRNNForwardInference(
@ -235,19 +238,19 @@ bool RecurrentOp<T>::RunOnDevice() {
rnnDesc_,
seqLength,
xDesc_->descs(),
Input(INPUT).template data<T>(),
InputData(INPUT),//.template data<T>(),
hxDesc_,
Input(HIDDEN_INPUT).template data<T>(),
InputData(HIDDEN_INPUT),//.template data<T>(),
cxDesc_,
Input(CELL_INPUT).template data<T>(),
InputData(CELL_INPUT),//.template data<T>(),
wDesc_,
Input(WEIGHT).template data<T>(),
InputData(WEIGHT),//.template data<T>(),
yDesc_->descs(),
Output(OUTPUT)->template mutable_data<T>(),
OutputData(OUTPUT),//->template mutable_data<T>(),
hyDesc_,
Output(HIDDEN_OUTPUT)->template mutable_data<T>(),
OutputData(HIDDEN_OUTPUT),//->template mutable_data<T>(),
cyDesc_,
Output(CELL_OUTPUT)->template mutable_data<T>(),
OutputData(CELL_OUTPUT),//->template mutable_data<T>(),
state->workspace().get(cudnnWsNbytes_),
cudnnWsNbytes_));
});
@ -258,22 +261,22 @@ bool RecurrentOp<T>::RunOnDevice() {
rnnDesc_,
seqLength,
xDesc_->descs(),
Input(INPUT).template data<T>(),
InputData(INPUT),//.template data<T>(),
hxDesc_,
Input(HIDDEN_INPUT).template data<T>(),
InputData(HIDDEN_INPUT),//.template data<T>(),
cxDesc_,
Input(CELL_INPUT).template data<T>(),
InputData(CELL_INPUT),//.template data<T>(),
wDesc_,
Input(WEIGHT).template data<T>(),
InputData(WEIGHT),//.template data<T>(),
yDesc_->descs(),
Output(OUTPUT)->template mutable_data<T>(),
OutputData(OUTPUT),//->template mutable_data<T>(),
hyDesc_,
Output(HIDDEN_OUTPUT)->template mutable_data<T>(),
OutputData(HIDDEN_OUTPUT),//->template mutable_data<T>(),
cyDesc_,
Output(CELL_OUTPUT)->template mutable_data<T>(),
OutputData(CELL_OUTPUT),//->template mutable_data<T>(),
state->workspace().get(cudnnWsNbytes_),
cudnnWsNbytes_,
Output(RNN_SCRATCH)->template mutable_data<T>(),
OutputData(RNN_SCRATCH),//->template mutable_data<T>(),
reserveNbytes_));
});
}
@ -311,31 +314,34 @@ bool RecurrentGradientOp<T>::RunOnDevice() {
#else
const auto * reserve = Output(RNN_SCRATCH_OUT)->template data<T>();
#endif
auto InputData = [this](int i) { return this->Input(i).template data<T>(); };
auto OutputData = [this](int i) { return this->Output(i)->template mutable_data<T>(); };
cudnn_wrapper_.with_cudnn_state(0, [&](CuDNNState* state) {
CUDNN_ENFORCE(cudnnRNNBackwardData(
state->cudnn_handle(),
rnnDesc_,
seqLength,
yDesc_->descs(),
Input(OUTPUT).template data<T>(),
InputData(OUTPUT), //Input(OUTPUT).template data<T>(),
yDesc_->descs(),
Input(GRAD_OUTPUT).template data<T>(),
InputData(GRAD_OUTPUT), //Input(GRAD_OUTPUT).template data<T>(),
hyDesc_,
Input(GRAD_HIDDEN_OUTPUT).template data<T>(),
InputData(GRAD_HIDDEN_OUTPUT), //Input(GRAD_HIDDEN_OUTPUT).template data<T>(),
cyDesc_,
Input(GRAD_CELL_OUTPUT).template data<T>(),
InputData(GRAD_CELL_OUTPUT), //Input(GRAD_CELL_OUTPUT).template data<T>(),
wDesc_,
Input(WEIGHT).template data<T>(),
InputData(WEIGHT), //Input(WEIGHT).template data<T>(),
hxDesc_,
Input(HIDDEN_INPUT).template data<T>(),
InputData(HIDDEN_INPUT), //Input(HIDDEN_INPUT).template data<T>(),
cxDesc_,
Input(CELL_INPUT).template data<T>(),
InputData(CELL_INPUT), //Input(CELL_INPUT).template data<T>(),
xDesc_->descs(),
Output(GRAD_INPUT)->template mutable_data<T>(),
OutputData(GRAD_INPUT), //Output(GRAD_INPUT)->template mutable_data<T>(),
hxDesc_,
Output(GRAD_HIDDEN_INPUT)->template mutable_data<T>(),
OutputData(GRAD_HIDDEN_INPUT), //Output(GRAD_HIDDEN_INPUT)->template mutable_data<T>(),
cxDesc_,
Output(GRAD_CELL_INPUT)->template mutable_data<T>(),
OutputData(GRAD_CELL_INPUT), //Output(GRAD_CELL_INPUT)->template mutable_data<T>(),
state->workspace().get(cudnnWsNbytes_),
cudnnWsNbytes_,
reserve,
@ -345,18 +351,19 @@ bool RecurrentGradientOp<T>::RunOnDevice() {
rnnDesc_,
seqLength,
xDesc_->descs(),
Input(INPUT).template data<T>(),
InputData(INPUT), //Input(INPUT).template data<T>(),
hxDesc_,
Input(HIDDEN_INPUT).template data<T>(),
InputData(HIDDEN_INPUT), //Input(HIDDEN_INPUT).template data<T>(),
yDesc_->descs(),
Input(OUTPUT).template data<T>(),
InputData(OUTPUT), //Input(OUTPUT).template data<T>(),
state->workspace().get(cudnnWsNbytes_),
cudnnWsNbytes_,
wDesc_,
Output(GRAD_WEIGHT)->template mutable_data<T>(),
OutputData(GRAD_WEIGHT), //Output(GRAD_WEIGHT)->template mutable_data<T>(),
reserve,
reserveNbytes_));
});
return true;
}
@ -371,7 +378,7 @@ bool RecurrentInitOp<T>::RunOnDevice() {
&weightsSize,
cudnnTypeWrapper<T>::type));
Output(WEIGHT)->Resize(std::vector<int>{(static_cast<int>(
weightsSize / 4 /* sizeof(T) - workaround clang bug */))});
weightsSize / 4 ))}); // sizeof(T) - workaround clang bug
math::RandUniform<T, CUDAContext>(
Output(WEIGHT)->size(),
-OperatorBase::GetSingleArgument<float>("scale", 0.01),
@ -413,6 +420,7 @@ bool RecurrentInitOp<T>::RunOnDevice() {
static_cast<T*>(bias),
&context_);
}
return true;
}

View file

@ -407,7 +407,7 @@ class SumReducer<T, CPUContext> : public BaseReducer {
template <int FixedSize>
void
process(const Meta& meta, const T* in, TIndex offset, CPUContext* context) {
math::Axpy<T, CPUContext, FixedSize>(meta.block_size, 1, in, out_, context);
math::AxpyFixedSize<T, CPUContext, FixedSize>(meta.block_size, 1, in, out_, context);
}
private:
@ -489,7 +489,7 @@ class WeightedSumReducer<T, CPUContext> : public BaseReducer {
template <int FixedSize>
void
process(const Meta& meta, const T* in, TIndex offset, CPUContext* context) {
math::Axpy<T, CPUContext, FixedSize>(
math::AxpyFixedSize<T, CPUContext, FixedSize>(
meta.block_size, meta.scalars[offset], in, out_, context);
}
@ -548,7 +548,7 @@ class WeightedSumReducerGradient : public BaseReducerGradient {
TIndex offset,
Context* context,
const int length) {
math::Scale<T, CPUContext, FixedSize>(
math::ScaleFixedSize<T, CPUContext, FixedSize>(
meta.block_size, meta.scalars[offset], s_grad_, data_grad, context);
}
@ -562,7 +562,7 @@ class WeightedSumReducerGradient : public BaseReducerGradient {
TIndex offset,
Context* context,
const int length) {
math::Scale<T, CPUContext, FixedSize>(
math::ScaleFixedSize<T, CPUContext, FixedSize>(
meta.block_size, meta.scalars[offset], s_grad_, data_grad, context);
math::Dot(
meta.block_size, s_grad_, data, meta.scalars_grad + offset, context);
@ -613,14 +613,15 @@ class MeanReducer<T, CPUContext> : public BaseReducer {
template <int FixedSize>
void
process(const Meta& meta, const T* in, TIndex offset, CPUContext* context) {
math::Axpy<T, CPUContext, FixedSize>(meta.block_size, 1, in, out_, context);
math::AxpyFixedSize<T, CPUContext, FixedSize>(
meta.block_size, 1, in, out_, context);
current_size_++;
}
template <int FixedSize>
void finish(const Meta& meta, CPUContext* context) {
if (current_size_ > 0) {
math::Scale<T, CPUContext, FixedSize>(
math::ScaleFixedSize<T, CPUContext, FixedSize>(
meta.block_size, 1.0 / current_size_, out_, out_, context);
}
}
@ -650,7 +651,7 @@ class MeanReducerGradient : public BaseReducerGradient {
Context* context,
const int length) {
CAFFE_ENFORCE_GT(length, 0, "Segment length must be > 0");
math::Scale<T, CPUContext, FixedSize>(
math::ScaleFixedSize<T, CPUContext, FixedSize>(
meta.block_size, 1.0 / length, s_grad_, data_grad, context);
}

View file

@ -396,7 +396,7 @@ class ScatterWeightedSumOp : public Operator<Context> {
Index idx = idxs[i];
DCHECK(0 <= idx && idx < N) << "Index out of bounds: " << idx
<< ", range 0 to " << N;
math::Scale<T, Context, FixedSize>(
math::ScaleFixedSize<T, Context, FixedSize>(
block_size,
w0,
data + block_size * idx,
@ -416,7 +416,7 @@ class ScatterWeightedSumOp : public Operator<Context> {
// double-checking the indices, but it's fine as it's DCHECK only
DCHECK(0 <= idx && idx < N) << "Index out of bounds: " << idx
<< ", range 0 to " << N;
math::Axpy<T, Context, FixedSize>(
math::AxpyFixedSize<T, Context, FixedSize>(
block_size,
w,
x_data + block_size * i,

View file

@ -2,6 +2,33 @@
namespace caffe2 {
template <>
void rmsprop_update<CPUContext>(
int N,
const float* g,
const float* ms,
const float* mom,
float* ng,
float* nms,
float* nmom,
float decay,
float momentum,
float epsilon,
const float* lr,
CPUContext* context) {
ConstEigenVectorArrayMap<float> gVec(g, N);
ConstEigenVectorArrayMap<float> msVec(ms, N);
ConstEigenVectorArrayMap<float> momVec(mom, N);
// Update new mean square estimate
EigenVectorArrayMap<float> nmsVec(nms, N);
nmsVec = msVec + (1.0f - decay) * (gVec * gVec - msVec);
// Update momentum estimate
EigenVectorArrayMap<float> nmomVec(nmom, N);
nmomVec = momVec * momentum + lr[0] * gVec / (epsilon + nmsVec).sqrt();
// New gradient is the momentum
EigenVectorArrayMap<float>(ng, N) = nmomVec;
}
namespace {
REGISTER_CPU_OPERATOR(RmsProp, RmsPropOp<float, CPUContext>);
OPERATOR_SCHEMA(RmsProp)

View file

@ -18,19 +18,7 @@ void rmsprop_update(
float momentum,
float epsilon,
const float* lr,
Context* context) {
ConstEigenVectorArrayMap<float> gVec(g, N);
ConstEigenVectorArrayMap<float> msVec(ms, N);
ConstEigenVectorArrayMap<float> momVec(mom, N);
// Update new mean square estimate
EigenVectorArrayMap<float> nmsVec(nms, N);
nmsVec = msVec + (1.0f - decay) * (gVec * gVec - msVec);
// Update momentum estimate
EigenVectorArrayMap<float> nmomVec(nmom, N);
nmomVec = momVec * momentum + lr[0] * gVec / (epsilon + nmsVec).sqrt();
// New gradient is the momentum
EigenVectorArrayMap<float>(ng, N) = nmomVec;
}
Context* context);
template <typename T, class Context>
class RmsPropOp final : public Operator<Context> {

View file

@ -7,29 +7,13 @@ class CPUContext;
namespace math {
namespace detail {
template <typename T, class Context>
void ScaleDynamic(
const int N,
const T alpha,
const T* x,
T* y,
Context* context);
template <typename T, class Context>
void AxpyDynamic(
const int N,
const T alpha,
const T* x,
T* y,
Context* context);
// proxy to a class because of partial specialization limitations for functions
template<typename T, class Context, int FixedSize>
struct ScaleImpl {
inline void
operator()(const int N, const T alpha, const T* x, T* y, Context* context) {
ScaleDynamic(N, alpha, x, y, context);
Scale(N, alpha, x, y, context);
}
};
@ -51,7 +35,7 @@ template<typename T, class Context, int FixedSize>
struct AxpyImpl {
inline void
operator()(const int N, const T alpha, const T* x, T* y, Context* context) {
AxpyDynamic(N, alpha, x, y, context);
Axpy(N, alpha, x, y, context);
}
};
@ -73,13 +57,13 @@ struct AxpyImpl<T, CPUContext, 1> {
} // namespace detail
template <typename T, class Context, int FixedSize>
void Scale(const int N, const T alpha, const T* x, T* y,
inline void ScaleFixedSize(const int N, const T alpha, const T* x, T* y,
Context* context) {
detail::ScaleImpl<T, Context, FixedSize>()(N, alpha, x, y, context);
}
template <typename T, class Context, int FixedSize>
void Axpy(const int N, const T alpha, const T* x, T* y,
inline void AxpyFixedSize(const int N, const T alpha, const T* x, T* y,
Context* context) {
detail::AxpyImpl<T, Context, FixedSize>()(N, alpha, x, y, context);
}

View file

@ -15,8 +15,11 @@ extern "C" {
#include "caffe2/core/common.h"
#include "caffe2/core/types.h"
#ifndef __CUDACC__
#include "Eigen/Core"
#include "Eigen/Dense"
#endif
namespace caffe2 {
@ -24,6 +27,7 @@ namespace caffe2 {
// engine specified.
class DefaultEngine {};
#ifndef __CUDACC__
// Common Eigen types that we will often use
template <typename T>
using EigenMatrixMap =
@ -47,6 +51,7 @@ using ConstEigenVectorMap =
template <typename T>
using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1> >;
#endif
namespace math {
@ -197,9 +202,7 @@ template <typename T, class Context>
void Select(const int N, const int D, const T* x, const int* idx, T* y,
Context* context);
// For small FixedValues (like FixedSize=1) the function might provide more
// efficent implementation hard-coded statically for this size.
template <typename T, class Context, int FixedSize = -1>
template <typename T, class Context>
void Scale(const int N, const T alpha, const T* x, T* y,
Context* context);
@ -210,10 +213,9 @@ template <typename T, class Context>
void Scale(const int N, const T* alpha, const T* x, T* y,
Context* context);
// For small FixedValues (like FixedSize=1) the function might provide more
// efficent implementation hard-coded statically for this size.
template <typename T, class Context, int FixedSize = -1>
void Axpy(const int N, const T alpha, const T* x, T* y, Context* context);
template <typename T, class Context>
void Axpy(const int N, const T alpha, const T* x, T* y,
Context* context);
// Different from the Axpy function above, if alpha is passed in
// as a pointer, we will assume that it lives on the Context device,

View file

@ -206,17 +206,11 @@ void Gemv<float, CPUContext>(
}
#define CAFFE2_SPECIALIZED_SCALE(T) \
namespace detail { \
template <> \
void ScaleDynamic<T, CPUContext>( \
const int n, \
const T alpha, \
const T* x, \
T* y, \
CPUContext* context) { \
void Scale<T, CPUContext>( \
const int n, const T alpha, const T* x, T* y, CPUContext* context) { \
EigenVectorMap<T>(y, n) = ConstEigenVectorMap<T>(x, n) * alpha; \
} \
} \
template <> \
void Scale<T, CPUContext>( \
const int n, const T* alpha, const T* x, T* y, CPUContext* context) { \
@ -238,17 +232,11 @@ CAFFE2_SPECIALIZED_DOT(double)
#undef CAFFE2_SPECIALIZED_DOT
#define CAFFE2_SPECIALIZED_AXPY(T) \
namespace detail { \
template <> \
void AxpyDynamic<T, CPUContext>( \
const int N, \
const T alpha, \
const T* x, \
T* Y, \
CPUContext* context) { \
void Axpy<T, CPUContext>( \
const int N, const T alpha, const T* x, T* Y, CPUContext* context) { \
EigenVectorMap<T>(Y, N) += ConstEigenVectorMap<T>(x, N) * alpha; \
} \
} \
template <> \
void Axpy<T, CPUContext>( \
const int N, const T* alpha, const T* x, T* Y, CPUContext* context) { \
@ -311,19 +299,13 @@ void Gemv<float, CPUContext>(
}
#define CAFFE2_SPECIALIZED_SCALE(T, prefix) \
namespace detail { \
template <> \
void ScaleDynamic<T, CPUContext>( \
const int n, \
const T alpha, \
const T* x, \
T* y, \
CPUContext* context) { \
void Scale<T, CPUContext>( \
const int n, const T alpha, const T* x, T* y, CPUContext* context) { \
if (y != x) \
cblas_##prefix##copy(n, x, 1, y, 1); \
cblas_##prefix##scal(n, alpha, y, 1); \
} \
} \
template <> \
void Scale<T, CPUContext>( \
const int n, const T* alpha, const T* x, T* y, CPUContext* context) { \
@ -347,17 +329,11 @@ CAFFE2_SPECIALIZED_DOT(double, d)
#undef CAFFE2_SPECIALIZED_DOT
#define CAFFE2_SPECIALIZED_AXPY(T, prefix) \
namespace detail { \
template <> \
void AxpyDynamic<T, CPUContext>( \
const int N, \
const T alpha, \
const T* x, \
T* y, \
CPUContext* context) { \
void Axpy<T, CPUContext>( \
const int N, const T alpha, const T* x, T* y, CPUContext* context) { \
cblas_##prefix##axpy(N, alpha, x, 1, y, 1); \
} \
} \
template <> \
void Axpy<T, CPUContext>( \
const int N, const T* alpha, const T* x, T* y, CPUContext* context) { \

View file

@ -425,48 +425,6 @@ void Axpy<double, CUDAContext>(
CUBLAS_ENFORCE(cublasDaxpy(context->cublas_handle(), N, &alpha, X, 1, Y, 1));
}
namespace detail {
template <>
void ScaleDynamic<float, CUDAContext>(
const int n,
const float alpha,
const float* x,
float* y,
CUDAContext* context) {
return math::Scale<float, CUDAContext>(n, alpha, x, y, context);
}
template <>
void ScaleDynamic<double, CUDAContext>(
const int n,
const double alpha,
const double* x,
double* y,
CUDAContext* context) {
return math::Scale<double, CUDAContext>(n, alpha, x, y, context);
}
template <>
void AxpyDynamic<float, CUDAContext>(
const int n,
const float alpha,
const float* x,
float* y,
CUDAContext* context) {
return math::Axpy<float, CUDAContext>(n, alpha, x, y, context);
}
template <>
void AxpyDynamic<double, CUDAContext>(
const int n,
const double alpha,
const double* x,
double* y,
CUDAContext* context) {
return math::Axpy<double, CUDAContext>(n, alpha, x, y, context);
}
}
namespace {
template <typename T>
__global__ void AxpyKernel(const int n, const T* a, const T* x, T* y) {

View file

@ -64,33 +64,33 @@ size_t pthreadpool_get_threads_count(pthreadpool_t threadpool);
* will be called once for each item.
*/
void pthreadpool_compute_1d(
pthreadpool_t threadpool,
pthreadpool_function_1d_t function,
void* argument,
size_t range);
pthreadpool_t threadpool,
pthreadpool_function_1d_t function,
void* argument,
size_t range);
void pthreadpool_compute_1d_tiled(
pthreadpool_t threadpool,
pthreadpool_function_1d_tiled_t function,
void* argument,
size_t range,
size_t tile);
pthreadpool_t threadpool,
pthreadpool_function_1d_tiled_t function,
void* argument,
size_t range,
size_t tile);
void pthreadpool_compute_2d(
pthreadpool_t threadpool,
pthreadpool_function_2d_t function,
void* argument,
size_t range_i,
size_t range_j);
pthreadpool_t threadpool,
pthreadpool_function_2d_t function,
void* argument,
size_t range_i,
size_t range_j);
void pthreadpool_compute_2d_tiled(
pthreadpool_t threadpool,
pthreadpool_function_2d_tiled_t function,
void* argument,
size_t range_i,
size_t range_j,
size_t tile_i,
size_t tile_j);
pthreadpool_t threadpool,
pthreadpool_function_2d_tiled_t function,
void* argument,
size_t range_i,
size_t range_j,
size_t tile_i,
size_t tile_j);
/**
* Terminates threads in the thread pool and releases associated resources.

View file

@ -160,7 +160,7 @@ endmacro()
# Special care for windows platform: we know that 32-bit windows does not support cuda.
if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
if(NOT (CMAKE_SIZEOF_VOID_P EQUAL 8))
message(WARNING
message(FATAL_ERROR
"CUDA support not available with 32-bit windows. Did you "
"forget to set Win64 in the generator target?")
return()
@ -180,7 +180,12 @@ if (${CUDA_VERSION} LESS 8.0)
set(Caffe2_known_gpu_archs ${Caffe2_known_gpu_archs7})
list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
else()
# CUDA 8 may complain that sm_20 is no longer supported. Suppress the
# warning for now.
list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets")
endif()
include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
list(APPEND Caffe2_DEPENDENCY_LIBS ${CUDA_CUDART_LIBRARY}
${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES})
@ -223,8 +228,35 @@ endforeach()
# Set C++11 support
set(CUDA_PROPAGATE_HOST_FLAGS OFF)
list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
if (NOT MSVC)
list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
endif()
# Debug and Release symbol support
if (MSVC)
if (${CMAKE_BUILD_TYPE} MATCHES "Release")
if (${BUILD_SHARED_LIBS})
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -MD")
else()
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -MT")
endif()
elseif(${CMAKE_BUILD_TYPE} MATCHES "Debug")
message(FATAL_ERROR
"Caffe2 currently does not support the combination of MSVC, Cuda "
"and Debug mode. Either set USE_CUDA=OFF or set the build type "
"to Release")
if (${BUILD_SHARED_LIBS})
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -MDd")
else()
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -MTd")
endif()
else()
message(FATAL_ERROR "Unknown cmake build type: " ${CMAKE_BUILD_TYPE})
endif()
endif()
if(OpenMP_FOUND)
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler ${OpenMP_CXX_FLAGS}")
endif()

View file

@@ -53,6 +53,15 @@ if(NOT CAFFE2_NEED_TO_TURN_OFF_DEPRECATION_WARNING AND NOT MSVC)
endif()
# ---[ If we are using msvc, set no warning flags
if (${CMAKE_CXX_COMPILER_ID} STREQUAL "MSVC")
message(STATUS "Adding no warning argument to the compiler")
if (MSVC)
  # /MP: compile translation units in parallel across processes.
  add_compile_options(/MP)
  # Warnings that are known-noisy in this code base, disabled for now.
  # Rough format: (warning level): Description
  set(caffe2_msvc_disabled_warnings
      /wd4018 # (3): Signed/unsigned mismatch
      /wd4244 # (2/3/4): Possible loss of precision
      /wd4267 # (3): Conversion of size_t to smaller type. Possible loss of data.
      /wd4800 # (3): Forcing non-boolean value to true or false.
      /wd4996 # (3): Use of a deprecated member
      /wd5030 # (?): Unrecognized C++ attribute
  )
  add_compile_options(${caffe2_msvc_disabled_warnings})
endif()

View file

@@ -14,12 +14,12 @@ include(FindPackageHandleStandardArgs)
set(CUDNN_ROOT_DIR "" CACHE PATH "Folder contains NVIDIA cuDNN")
find_path(CUDNN_INCLUDE_DIR cudnn.h
PATHS ${CUDNN_ROOT_DIR}
PATHS ${CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR}
PATH_SUFFIXES cuda/include include)
find_library(CUDNN_LIBRARY cudnn
PATHS ${CUDNN_ROOT_DIR}
PATH_SUFFIXES lib lib64 cuda/lib cuda/lib64)
PATHS ${CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR}
PATH_SUFFIXES lib lib64 cuda/lib cuda/lib64 lib/x64)
find_package_handle_standard_args(CUDNN DEFAULT_MSG CUDNN_INCLUDE_DIR CUDNN_LIBRARY)

View file

@@ -1,12 +1,13 @@
# Finds Google Protocol Buffers library and compilers and extends
# the standard cmake script with version and python generation support
function(custom_protobuf_find)
# For a custom protobuf build, we will always use static protobuf.
option(protobuf_BUILD_SHARED_LIBS "" OFF)
option(protobuf_BUILD_TESTS "" OFF)
option(protobuf_BUILD_EXAMPLES "" OFF)
# MSVC protobuf built with static library explicitly uses /MT and /MTd which
# makes things a bit tricky, so we set it off.
option(protobuf_MSVC_STATIC_RUNTIME "" OFF)
#option(protobuf_MSVC_STATIC_RUNTIME "" OFF)
if (APPLE)
# Protobuf generated files triggers a deprecated atomic operation warning
# so we turn it off here.

View file

@@ -383,7 +383,10 @@ endfunction()
function(caffe_add_whole_archive_flag lib output_var)
if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
set(${output_var} -Wl,-force_load,$<TARGET_FILE:${lib}> PARENT_SCOPE)
set(${output_var} -Wl,-force_load,$<TARGET_FILE:${lib}> PARENT_SCOPE)
elseif(MSVC)
# In MSVC, we will add whole archive in default.
set(${output_var} -WHOLEARCHIVE:$<TARGET_FILE:${lib}> PARENT_SCOPE)
else()
# Assume everything else is like gcc
set(${output_var} -Wl,--whole-archive ${lib} -Wl,--no-whole-archive PARENT_SCOPE)

View file

@@ -10,9 +10,16 @@ appveyor Downloadfile ^
-FileName cudnn-8.0-windows10-x64-v5.1.zip
@echo Installing CUDA toolkit 8 ...
cuda_8.0.44_windows.exe -s compiler_8.0 cublas_8.0 cublas_dev_8.0 cudart_8.0 curand_8.0 curand_dev_8.0
cuda_8.0.44_windows.exe -s compiler_8.0 cublas_8.0 cublas_dev_8.0 cudart_8.0 curand_8.0 curand_dev_8.0 nvrtc_8.0 nvrtc_dev_8.0
set PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v8.0\bin;%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v8.0\libnvvp;%PATH%
:: TODO: we will still need to figure out how to install cudnn.
7z x cudnn-8.0-windows10-x64-v5.1.zip
copy cuda\include\cudnn.h ^
"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0\include\"
copy cuda\lib\x64\cudnn.lib ^
"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0\lib\x64\"
copy cuda\bin\cudnn64_5.dll ^
"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0\bin\"
:: Make sure that nvcc is working correctly.
nvcc -V || exit /b

View file

@@ -4,7 +4,7 @@
:: This script shows how one can build a Caffe2 binary for windows.
@echo off
@echo on
SET ORIGINAL_DIR=%cd%
SET CAFFE2_ROOT=%~dp0%..
@@ -13,17 +13,27 @@ if not exist %CAFFE2_ROOT%\build_host_protoc\bin\protoc.exe call %CAFFE2_ROOT%\s
if not exist %CAFFE2_ROOT%\build mkdir %CAFFE2_ROOT%\build
cd %CAFFE2_ROOT%\build
if NOT DEFINED USE_CUDA (
set USE_CUDA=ON
)
if NOT DEFINED CMAKE_BUILD_TYPE (
set CMAKE_BUILD_TYPE=Release
)
:: Set up cmake. We will skip building the test files right now.
:: TODO: enable cuda support.
cmake .. ^
-G"Visual Studio 14 2015 Win64" ^
-DCMAKE_VERBOSE_MAKEFILE=1 ^
-DBUILD_TEST=OFF ^
-DUSE_CUDA=OFF ^
-DBUILD_SHARED_LIBS=OFF ^
-DCMAKE_BUILD_TYPE=%CMAKE_BUILD_TYPE% ^
-DUSE_CUDA=%USE_CUDA% ^
-DPROTOBUF_PROTOC_EXECUTABLE=%CAFFE2_ROOT%\build_host_protoc\bin\protoc.exe ^
|| exit /b
:: Actually run the build
msbuild ALL_BUILD.vcxproj || exit /b
cmake --build . --config %CMAKE_BUILD_TYPE% || exit /b
cd %ORIGINAL_DIR%