Changes for Windows build to pass.

Summary:
After this, we should have contbuild guarding the Windows build both with
and without CUDA.

This includes a series of changes that are needed to make Windows build,
specifically:

(1) Various flags that are needed in the cmake system, especially dealing
with /MD, /MT, CUDA, cuDNN, whole static linking, etc.
(2) Contbuild scripts based on AppVeyor.
(3) For Windows build, note that one will need to use "cmake --build" to
build stuff so that the build type is consistent between configuration and
actual build. See scripts\build_windows.bat for details.
(4) In logging.h, ERROR is already defined by Windows. I don't have a good
solution now, and as a result, LOG(ERROR) on Windows is going to be
LOG(INFO).
(5) Variable length arrays are not supported by MSVC (and they are not part
of the C++ standard). As a result I replaced them with vectors.
(6) sched.h is not available on Windows, so akyrola's awesome simple
async net might encounter some slowdown due to no affinity setting on
Windows.
(7) MSVC has a
Closes https://github.com/caffe2/caffe2/pull/183

Reviewed By: ajtulloch

Differential Revision: D4657831

Pulled By: Yangqing

fbshipit-source-id: 070ded372ed78a7e3e3919fdffa1d337640f146e
This commit is contained in:
Yangqing Jia 2017-03-06 19:51:33 -08:00 committed by Facebook Github Bot
parent 2333ccadfb
commit 7b8c7b11d2
32 changed files with 333 additions and 221 deletions

View file

@ -83,13 +83,19 @@ endif()
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "binaries")
# ---[ Build flags
if (${CMAKE_CXX_COMPILER_ID} STREQUAL "MSVC")
message(WARNING "Develop note: when all errors are addressed, turn on warning.")
message(STATUS "Adding no warning argument to the compiler")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /w")
else()
if(NOT MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -fPIC")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
else()
if (NOT ${BUILD_SHARED_LIBS})
foreach(flag_var
CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
if(${flag_var} MATCHES "/MD")
string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
endif(${flag_var} MATCHES "/MD")
endforeach(flag_var)
endif()
endif()
if (CAFFE2_CPU_FLAGS)

View file

@ -8,7 +8,9 @@ Caffe2 is released under the [BSD 2-Clause license](https://github.com/Yangqing/
## Building Caffe2
[![Build Status](https://travis-ci.org/caffe2/caffe2.svg?branch=master)](https://travis-ci.org/caffe2/caffe2)
[![Travis Build Status](https://travis-ci.org/caffe2/caffe2.svg?branch=master)](https://travis-ci.org/caffe2/caffe2)
[![Windows Build status](https://ci.appveyor.com/api/projects/status/kec4ta779stuyb83?svg=true)](https://ci.appveyor.com/project/Yangqing/caffe2)
git clone --recursive https://github.com/caffe2/caffe2.git
cd caffe2

30
appveyor.yml Normal file
View file

@ -0,0 +1,30 @@
version: '{build}'
clone_folder: c:\projects\caffe2
environment:
matrix:
- USE_CUDA: OFF
CMAKE_BUILD_TYPE: Release
- USE_CUDA: ON
CMAKE_BUILD_TYPE: Release
- USE_CUDA: OFF
CMAKE_BUILD_TYPE: Debug
# Currently, CUDA + Debug does not work due to a error of using
# std::_Debug_lt in device code. Not sure where this comes from yet,
# but it is probably safe to assume that very few are going to build
# debug mode with CUDA and Windows.
#- USE_CUDA: ON
# CMAKE_BUILD_TYPE: Debug
install:
- cmd: c:\projects\caffe2\scripts\appveyor\install.bat
build_script:
- cmd: >-
cd c:\projects\caffe2
git submodule update --init
call scripts\build_windows.bat

View file

@ -74,7 +74,7 @@ list(APPEND Caffe2_MAIN_LIBS_ORDER Caffe2_CPU)
if (BUILD_SHARED_LIBS)
list(APPEND Caffe2_MAIN_LIBS Caffe2_CPU)
else()
caffe_add_whole_archive_flag(Caffe2_CPU tmp)
caffe_add_whole_archive_flag(Caffe2_CPU tmp)
list(APPEND Caffe2_MAIN_LIBS ${tmp})
endif()

View file

@ -43,7 +43,8 @@ int main(int argc, char** argv) {
} else {
// float data not supported right now.
CAFFE_ENFORCE_EQ(datum.float_data_size(), 0);
char buffer[datum.data().size()];
std::vector<char> buffer_vec(datum.data().size());
char* buffer = buffer_vec.data();
// swap order from CHW to HWC
int channels = datum.channels();
int size = datum.height() * datum.width();

View file

@ -84,6 +84,29 @@ private: \
#define CAFFE2_ALIGNED(x) __attribute__((aligned(x)))
#endif
/**
* Macro for marking functions as having public visibility.
* Ported from folly/CPortability.h
*/
#ifndef __GNUC_PREREQ
# if defined __GNUC__ && defined __GNUC_MINOR__
# define __GNUC_PREREQ(maj, min) ((__GNUC__ << 16) + __GNUC_MINOR__ >= \
((maj) << 16) + (min))
# else
# define __GNUC_PREREQ(maj, min) 0
# endif
#endif
#if defined(__GNUC__)
# if __GNUC_PREREQ(4, 9)
# define CAFFE2_EXPORT [[gnu::visibility("default")]]
# else
# define CAFFE2_EXPORT __attribute__((__visibility__("default")))
# endif
#else
# define CAFFE2_EXPORT
#endif
// make_unique is a C++14 feature. If we don't have 14, we will emulate
// its behavior. This is copied from folly/Memory.h
#if __cplusplus >= 201402L || \

View file

@ -159,7 +159,6 @@ struct EnforceOK {};
class EnforceFailMessage {
public:
constexpr /* implicit */ EnforceFailMessage(EnforceOK) : msg_(nullptr) {}
EnforceFailMessage(EnforceFailMessage&&) = default;
EnforceFailMessage(const EnforceFailMessage&) = delete;
EnforceFailMessage& operator=(EnforceFailMessage&&) = delete;
@ -180,7 +179,7 @@ class EnforceFailMessage {
msg_ = new std::string(std::move(msg));
}
inline bool bad() const {
return msg_;
return msg_ != nullptr;
}
std::string get_message_and_free(std::string&& extra) const {
std::string r;

View file

@ -15,7 +15,13 @@
// Log severity level constants.
const int FATAL = 3;
#if !defined(_MSC_VER) || !defined(ERROR)
// Windows defines the ERROR macro already, and as a result we will
// simply use that one. The downside is that one will now mix LOG(INFO)
// and LOG(ERROR) because ERROR is defined to be zero. Anyway, the
// recommended way is to use glog so fixing this is a low-pri item.
const int ERROR = 2;
#endif
const int WARNING = 1;
const int INFO = 0;
const char CAFFE2_SEVERITY_PREFIX[] = "FEWIV";

View file

@ -4,7 +4,10 @@
#include <mutex>
#include <stack>
#if !defined(_MSC_VER)
#include <sched.h>
#endif
#include "caffe2/core/common_gpu.h"
#include "caffe2/core/flags.h"
#include "caffe2/core/operator.h"
@ -258,6 +261,10 @@ void GPUExecutor::Release(int gpu) {
}
void GPUExecutor::set_affinity() {
// TODO: find a Windows-compatible affinity setting approach.
// Currently, set_affinity has no effect in Windows. The code is still
// correct with possible slowdowns.
#if !defined(_MSC_VER)
/* Set CPU affinity */
int num_cores = std::thread::hardware_concurrency();
if (num_cores > 0) {
@ -269,6 +276,7 @@ void GPUExecutor::set_affinity() {
LOG(WARNING) << "Could not set CPU affinity";
}
}
#endif
}
// Worker that takes list of operators from the queue
@ -363,7 +371,9 @@ class SingleThreadAsyncNet : public SimpleNet {
}
bool RunAsync() {
LOG(FATAL) << "RunAsync() not implemented for singlethread_async net";
CAFFE_THROW("RunAsync() not implemented for singlethread_async net");
// Just to suppress compiler warning.
return false;
}
private:

View file

@ -169,7 +169,7 @@ class TypeMeta {
* is generated during run-time. Do NOT serialize the id for storage.
*/
template <typename T>
[[gnu::visibility("default")]] static CaffeTypeId Id();
CAFFE2_EXPORT static CaffeTypeId Id();
/**
* Returns the item size of the type. This is equivalent to sizeof(T).

View file

@ -48,22 +48,23 @@ class CudaRTCFunction {
if (compile_result != NVRTC_SUCCESS) {
size_t log_size;
NVRTC_CHECK(nvrtcGetProgramLogSize(prog, &log_size));
char nvrtc_log[log_size];
NVRTC_CHECK(nvrtcGetProgramLog(prog, nvrtc_log));
vector<char> nvrtc_log(log_size);
NVRTC_CHECK(nvrtcGetProgramLog(prog, nvrtc_log.data()));
LOG(FATAL) << "Compilation failure for nvrtc("
<< nvrtcGetErrorString(compile_result)
<< "): \n" << nvrtc_log;
<< "): \n" << nvrtc_log.data();
}
size_t ptx_size;
NVRTC_CHECK(nvrtcGetPTXSize(prog, &ptx_size));
char nvrtc_ptx[ptx_size];
NVRTC_CHECK(nvrtcGetPTX(prog, nvrtc_ptx));
vector<char> nvrtc_ptx(ptx_size);
NVRTC_CHECK(nvrtcGetPTX(prog, nvrtc_ptx.data()));
NVRTC_CHECK(nvrtcDestroyProgram(&prog));
// After compilation, load the module.
if (module_loaded_) {
CUDA_DRIVERAPI_ENFORCE(cuModuleUnload(module_));
}
CUDA_DRIVERAPI_ENFORCE(cuModuleLoadDataEx(&module_, nvrtc_ptx, 0, 0, 0));
CUDA_DRIVERAPI_ENFORCE(cuModuleLoadDataEx(
&module_, nvrtc_ptx.data(), 0, 0, 0));
module_loaded_ = true;
CUDA_DRIVERAPI_ENFORCE(
cuModuleGetFunction(&kernel_, module_, name.c_str()));

View file

@ -84,7 +84,8 @@ class ElementwiseRTCOp final : public Operator<CUDAContext> {
static_assert(sizeof(void*) == sizeof(size_t),
"The argbuffer relies on the assumption that void* and "
"size_t have the same size.");
size_t argBuffer[InputSize() + OutputSize() + 1];
vector<size_t> argBuffer_vec(InputSize() + OutputSize() + 1);
size_t* argBuffer = argBuffer_vec.data();
CAFFE_ENFORCE(
Input(0).size() < std::numeric_limits<int>::max(),
"The kernel function currently only supports int index.");

View file

@ -572,15 +572,18 @@ bool CudnnConvGradientOp<T>::RunOnDevice() {
auto* dX =
Output(no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD);
dX->ResizeLike(X);
const T* filter_data = filter.template data<T>();
const T* dYdata = dY.template data<T>();
T* dXdata = dX->template mutable_data<T>();
CUDNN_ENFORCE(cudnnFindConvolutionBackwardDataAlgorithmEx(
state->cudnn_handle(),
filter_desc_,
filter.template data<T>(),
filter_data,
top_desc_,
dY.template data<T>(),
dYdata,
conv_desc_,
bottom_desc_,
dX->template mutable_data<T>(),
dXdata,
kNUM_CUDNN_BWD_DATA_ALGS,
&returned_algo_count,
data_perf_stat.data(),

View file

@ -121,7 +121,7 @@ __global__ void InstanceNormGradientKernel(
output_grad_offset += dim_stride;
}
temp *= -std::pow(inv_stdev_data[i], 3.0) / dim;
temp *= -powf(inv_stdev_data[i], 3.0) / dim;
input_grad_offset = input_grad_data + n * N_stride + c * C_stride;
output_grad_offset = output_grad_data + n * N_stride + c * C_stride;

View file

@ -9,6 +9,30 @@ namespace {
class LpPool {};
} // namespace
namespace {
template <typename T>
inline __device__ T cuda_pow(T x, T y);
template <typename T>
inline __device__ T cuda_abs(T x);
template<>
inline __device__ float cuda_pow<float>(float x, float y) {
return powf(x, y);
}
template<>
inline __device__ double cuda_pow<double>(double x, double y) {
return pow(x, y);
}
template <>
inline __device__ float cuda_abs(float x) { return fabsf(x); }
template <>
inline __device__ double cuda_abs(double x) { return fabs(x); }
}
namespace {
template <typename T>
__global__ void LpPoolForwardNCHW(
@ -47,10 +71,10 @@ __global__ void LpPoolForwardNCHW(
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
top_data[index] +=
std::pow(std::abs(bottom_data[bottom_offset + h * width + w]), p);
cuda_pow<T>(cuda_abs(bottom_data[bottom_offset + h * width + w]), p);
}
}
top_data[index] = std::pow(top_data[index], 1.0 / p);
top_data[index] = cuda_pow<T>(top_data[index], 1.0 / p);
}
}
@ -87,12 +111,12 @@ __global__ void LpPoolForwardNHWC(
int bottom_offset = n * height * width * channels + c;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
output += std::pow(
std::abs(bottom_data[bottom_offset + (h * width + w) * channels]),
output += cuda_pow<T>(
cuda_abs(bottom_data[bottom_offset + (h * width + w) * channels]),
p);
}
}
top_data[index] = std::pow(output, 1.0 / p);
top_data[index] = cuda_pow<T>(output, 1.0 / p);
}
}
@ -143,8 +167,8 @@ __global__ void LpPoolBackwardNCHW(
hstart = max(hstart, 0);
wstart = max(wstart, 0);
gradient += top_diff_slice[ph * pooled_width + pw] *
bottom_data[index] * std::pow(std::abs(bottom_data[index]), p - 2) /
std::pow(top_data_slice[ph * pooled_width + pw], p - 1);
bottom_data[index] * cuda_pow<T>(cuda_abs(bottom_data[index]), p - 2) /
cuda_pow<T>(top_data_slice[ph * pooled_width + pw], p - 1);
}
}
bottom_diff[index] = gradient;
@ -197,8 +221,8 @@ __global__ void LpPoolBackwardNHWC(
hstart = max(hstart, 0);
wstart = max(wstart, 0);
gradient += top_diff_slice[(ph * pooled_width + pw) * channels] *
bottom_data[index] * std::pow(std::abs(bottom_data[index]), p - 2) /
std::pow(top_data_slice[(ph * pooled_width + pw) * channels],
bottom_data[index] * cuda_pow<T>(cuda_abs(bottom_data[index]), p - 2) /
cuda_pow<T>(top_data_slice[(ph * pooled_width + pw) * channels],
p - 1);
}
}

View file

@ -225,9 +225,12 @@ bool RecurrentOp<T>::RunOnDevice() {
&reserveNbytes_));
Output(RNN_SCRATCH)
->Resize(std::vector<int>{static_cast<int>(
reserveNbytes_ / 4 /* sizeof(T) - workaround clang bug */)});
reserveNbytes_ / 4)}); // sizeof(T) - workaround clang bug
Output(RNN_SCRATCH)->template mutable_data<T>();
auto InputData = [this](int i) { return this->Input(i).template data<T>(); };
auto OutputData = [this](int i) { return this->Output(i)->template mutable_data<T>(); };
if (OperatorBase::GetSingleArgument<int>("is_test", 0)) {
cudnn_wrapper_.with_cudnn_state(0, [&](CuDNNState* state) {
CUDNN_ENFORCE(cudnnRNNForwardInference(
@ -235,19 +238,19 @@ bool RecurrentOp<T>::RunOnDevice() {
rnnDesc_,
seqLength,
xDesc_->descs(),
Input(INPUT).template data<T>(),
InputData(INPUT),//.template data<T>(),
hxDesc_,
Input(HIDDEN_INPUT).template data<T>(),
InputData(HIDDEN_INPUT),//.template data<T>(),
cxDesc_,
Input(CELL_INPUT).template data<T>(),
InputData(CELL_INPUT),//.template data<T>(),
wDesc_,
Input(WEIGHT).template data<T>(),
InputData(WEIGHT),//.template data<T>(),
yDesc_->descs(),
Output(OUTPUT)->template mutable_data<T>(),
OutputData(OUTPUT),//->template mutable_data<T>(),
hyDesc_,
Output(HIDDEN_OUTPUT)->template mutable_data<T>(),
OutputData(HIDDEN_OUTPUT),//->template mutable_data<T>(),
cyDesc_,
Output(CELL_OUTPUT)->template mutable_data<T>(),
OutputData(CELL_OUTPUT),//->template mutable_data<T>(),
state->workspace().get(cudnnWsNbytes_),
cudnnWsNbytes_));
});
@ -258,22 +261,22 @@ bool RecurrentOp<T>::RunOnDevice() {
rnnDesc_,
seqLength,
xDesc_->descs(),
Input(INPUT).template data<T>(),
InputData(INPUT),//.template data<T>(),
hxDesc_,
Input(HIDDEN_INPUT).template data<T>(),
InputData(HIDDEN_INPUT),//.template data<T>(),
cxDesc_,
Input(CELL_INPUT).template data<T>(),
InputData(CELL_INPUT),//.template data<T>(),
wDesc_,
Input(WEIGHT).template data<T>(),
InputData(WEIGHT),//.template data<T>(),
yDesc_->descs(),
Output(OUTPUT)->template mutable_data<T>(),
OutputData(OUTPUT),//->template mutable_data<T>(),
hyDesc_,
Output(HIDDEN_OUTPUT)->template mutable_data<T>(),
OutputData(HIDDEN_OUTPUT),//->template mutable_data<T>(),
cyDesc_,
Output(CELL_OUTPUT)->template mutable_data<T>(),
OutputData(CELL_OUTPUT),//->template mutable_data<T>(),
state->workspace().get(cudnnWsNbytes_),
cudnnWsNbytes_,
Output(RNN_SCRATCH)->template mutable_data<T>(),
OutputData(RNN_SCRATCH),//->template mutable_data<T>(),
reserveNbytes_));
});
}
@ -311,31 +314,34 @@ bool RecurrentGradientOp<T>::RunOnDevice() {
#else
const auto * reserve = Output(RNN_SCRATCH_OUT)->template data<T>();
#endif
auto InputData = [this](int i) { return this->Input(i).template data<T>(); };
auto OutputData = [this](int i) { return this->Output(i)->template mutable_data<T>(); };
cudnn_wrapper_.with_cudnn_state(0, [&](CuDNNState* state) {
CUDNN_ENFORCE(cudnnRNNBackwardData(
state->cudnn_handle(),
rnnDesc_,
seqLength,
yDesc_->descs(),
Input(OUTPUT).template data<T>(),
InputData(OUTPUT), //Input(OUTPUT).template data<T>(),
yDesc_->descs(),
Input(GRAD_OUTPUT).template data<T>(),
InputData(GRAD_OUTPUT), //Input(GRAD_OUTPUT).template data<T>(),
hyDesc_,
Input(GRAD_HIDDEN_OUTPUT).template data<T>(),
InputData(GRAD_HIDDEN_OUTPUT), //Input(GRAD_HIDDEN_OUTPUT).template data<T>(),
cyDesc_,
Input(GRAD_CELL_OUTPUT).template data<T>(),
InputData(GRAD_CELL_OUTPUT), //Input(GRAD_CELL_OUTPUT).template data<T>(),
wDesc_,
Input(WEIGHT).template data<T>(),
InputData(WEIGHT), //Input(WEIGHT).template data<T>(),
hxDesc_,
Input(HIDDEN_INPUT).template data<T>(),
InputData(HIDDEN_INPUT), //Input(HIDDEN_INPUT).template data<T>(),
cxDesc_,
Input(CELL_INPUT).template data<T>(),
InputData(CELL_INPUT), //Input(CELL_INPUT).template data<T>(),
xDesc_->descs(),
Output(GRAD_INPUT)->template mutable_data<T>(),
OutputData(GRAD_INPUT), //Output(GRAD_INPUT)->template mutable_data<T>(),
hxDesc_,
Output(GRAD_HIDDEN_INPUT)->template mutable_data<T>(),
OutputData(GRAD_HIDDEN_INPUT), //Output(GRAD_HIDDEN_INPUT)->template mutable_data<T>(),
cxDesc_,
Output(GRAD_CELL_INPUT)->template mutable_data<T>(),
OutputData(GRAD_CELL_INPUT), //Output(GRAD_CELL_INPUT)->template mutable_data<T>(),
state->workspace().get(cudnnWsNbytes_),
cudnnWsNbytes_,
reserve,
@ -345,18 +351,19 @@ bool RecurrentGradientOp<T>::RunOnDevice() {
rnnDesc_,
seqLength,
xDesc_->descs(),
Input(INPUT).template data<T>(),
InputData(INPUT), //Input(INPUT).template data<T>(),
hxDesc_,
Input(HIDDEN_INPUT).template data<T>(),
InputData(HIDDEN_INPUT), //Input(HIDDEN_INPUT).template data<T>(),
yDesc_->descs(),
Input(OUTPUT).template data<T>(),
InputData(OUTPUT), //Input(OUTPUT).template data<T>(),
state->workspace().get(cudnnWsNbytes_),
cudnnWsNbytes_,
wDesc_,
Output(GRAD_WEIGHT)->template mutable_data<T>(),
OutputData(GRAD_WEIGHT), //Output(GRAD_WEIGHT)->template mutable_data<T>(),
reserve,
reserveNbytes_));
});
return true;
}
@ -371,7 +378,7 @@ bool RecurrentInitOp<T>::RunOnDevice() {
&weightsSize,
cudnnTypeWrapper<T>::type));
Output(WEIGHT)->Resize(std::vector<int>{(static_cast<int>(
weightsSize / 4 /* sizeof(T) - workaround clang bug */))});
weightsSize / 4 ))}); // sizeof(T) - workaround clang bug
math::RandUniform<T, CUDAContext>(
Output(WEIGHT)->size(),
-OperatorBase::GetSingleArgument<float>("scale", 0.01),
@ -413,6 +420,7 @@ bool RecurrentInitOp<T>::RunOnDevice() {
static_cast<T*>(bias),
&context_);
}
return true;
}

View file

@ -407,7 +407,7 @@ class SumReducer<T, CPUContext> : public BaseReducer {
template <int FixedSize>
void
process(const Meta& meta, const T* in, TIndex offset, CPUContext* context) {
math::Axpy<T, CPUContext, FixedSize>(meta.block_size, 1, in, out_, context);
math::AxpyFixedSize<T, CPUContext, FixedSize>(meta.block_size, 1, in, out_, context);
}
private:
@ -489,7 +489,7 @@ class WeightedSumReducer<T, CPUContext> : public BaseReducer {
template <int FixedSize>
void
process(const Meta& meta, const T* in, TIndex offset, CPUContext* context) {
math::Axpy<T, CPUContext, FixedSize>(
math::AxpyFixedSize<T, CPUContext, FixedSize>(
meta.block_size, meta.scalars[offset], in, out_, context);
}
@ -548,7 +548,7 @@ class WeightedSumReducerGradient : public BaseReducerGradient {
TIndex offset,
Context* context,
const int length) {
math::Scale<T, CPUContext, FixedSize>(
math::ScaleFixedSize<T, CPUContext, FixedSize>(
meta.block_size, meta.scalars[offset], s_grad_, data_grad, context);
}
@ -562,7 +562,7 @@ class WeightedSumReducerGradient : public BaseReducerGradient {
TIndex offset,
Context* context,
const int length) {
math::Scale<T, CPUContext, FixedSize>(
math::ScaleFixedSize<T, CPUContext, FixedSize>(
meta.block_size, meta.scalars[offset], s_grad_, data_grad, context);
math::Dot(
meta.block_size, s_grad_, data, meta.scalars_grad + offset, context);
@ -613,14 +613,15 @@ class MeanReducer<T, CPUContext> : public BaseReducer {
template <int FixedSize>
void
process(const Meta& meta, const T* in, TIndex offset, CPUContext* context) {
math::Axpy<T, CPUContext, FixedSize>(meta.block_size, 1, in, out_, context);
math::AxpyFixedSize<T, CPUContext, FixedSize>(
meta.block_size, 1, in, out_, context);
current_size_++;
}
template <int FixedSize>
void finish(const Meta& meta, CPUContext* context) {
if (current_size_ > 0) {
math::Scale<T, CPUContext, FixedSize>(
math::ScaleFixedSize<T, CPUContext, FixedSize>(
meta.block_size, 1.0 / current_size_, out_, out_, context);
}
}
@ -650,7 +651,7 @@ class MeanReducerGradient : public BaseReducerGradient {
Context* context,
const int length) {
CAFFE_ENFORCE_GT(length, 0, "Segment length must be > 0");
math::Scale<T, CPUContext, FixedSize>(
math::ScaleFixedSize<T, CPUContext, FixedSize>(
meta.block_size, 1.0 / length, s_grad_, data_grad, context);
}

View file

@ -396,7 +396,7 @@ class ScatterWeightedSumOp : public Operator<Context> {
Index idx = idxs[i];
DCHECK(0 <= idx && idx < N) << "Index out of bounds: " << idx
<< ", range 0 to " << N;
math::Scale<T, Context, FixedSize>(
math::ScaleFixedSize<T, Context, FixedSize>(
block_size,
w0,
data + block_size * idx,
@ -416,7 +416,7 @@ class ScatterWeightedSumOp : public Operator<Context> {
// double-checking the indices, but it's fine as it's DCHECK only
DCHECK(0 <= idx && idx < N) << "Index out of bounds: " << idx
<< ", range 0 to " << N;
math::Axpy<T, Context, FixedSize>(
math::AxpyFixedSize<T, Context, FixedSize>(
block_size,
w,
x_data + block_size * i,

View file

@ -2,6 +2,33 @@
namespace caffe2 {
template <>
void rmsprop_update<CPUContext>(
int N,
const float* g,
const float* ms,
const float* mom,
float* ng,
float* nms,
float* nmom,
float decay,
float momentum,
float epsilon,
const float* lr,
CPUContext* context) {
ConstEigenVectorArrayMap<float> gVec(g, N);
ConstEigenVectorArrayMap<float> msVec(ms, N);
ConstEigenVectorArrayMap<float> momVec(mom, N);
// Update new mean square estimate
EigenVectorArrayMap<float> nmsVec(nms, N);
nmsVec = msVec + (1.0f - decay) * (gVec * gVec - msVec);
// Update momentum estimate
EigenVectorArrayMap<float> nmomVec(nmom, N);
nmomVec = momVec * momentum + lr[0] * gVec / (epsilon + nmsVec).sqrt();
// New gradient is the momentum
EigenVectorArrayMap<float>(ng, N) = nmomVec;
}
namespace {
REGISTER_CPU_OPERATOR(RmsProp, RmsPropOp<float, CPUContext>);
OPERATOR_SCHEMA(RmsProp)

View file

@ -18,19 +18,7 @@ void rmsprop_update(
float momentum,
float epsilon,
const float* lr,
Context* context) {
ConstEigenVectorArrayMap<float> gVec(g, N);
ConstEigenVectorArrayMap<float> msVec(ms, N);
ConstEigenVectorArrayMap<float> momVec(mom, N);
// Update new mean square estimate
EigenVectorArrayMap<float> nmsVec(nms, N);
nmsVec = msVec + (1.0f - decay) * (gVec * gVec - msVec);
// Update momentum estimate
EigenVectorArrayMap<float> nmomVec(nmom, N);
nmomVec = momVec * momentum + lr[0] * gVec / (epsilon + nmsVec).sqrt();
// New gradient is the momentum
EigenVectorArrayMap<float>(ng, N) = nmomVec;
}
Context* context);
template <typename T, class Context>
class RmsPropOp final : public Operator<Context> {

View file

@ -7,29 +7,13 @@ class CPUContext;
namespace math {
namespace detail {
template <typename T, class Context>
void ScaleDynamic(
const int N,
const T alpha,
const T* x,
T* y,
Context* context);
template <typename T, class Context>
void AxpyDynamic(
const int N,
const T alpha,
const T* x,
T* y,
Context* context);
// proxy to a class because of partial specialization limitations for functions
template<typename T, class Context, int FixedSize>
struct ScaleImpl {
inline void
operator()(const int N, const T alpha, const T* x, T* y, Context* context) {
ScaleDynamic(N, alpha, x, y, context);
Scale(N, alpha, x, y, context);
}
};
@ -51,7 +35,7 @@ template<typename T, class Context, int FixedSize>
struct AxpyImpl {
inline void
operator()(const int N, const T alpha, const T* x, T* y, Context* context) {
AxpyDynamic(N, alpha, x, y, context);
Axpy(N, alpha, x, y, context);
}
};
@ -73,13 +57,13 @@ struct AxpyImpl<T, CPUContext, 1> {
} // namespace detail
template <typename T, class Context, int FixedSize>
void Scale(const int N, const T alpha, const T* x, T* y,
inline void ScaleFixedSize(const int N, const T alpha, const T* x, T* y,
Context* context) {
detail::ScaleImpl<T, Context, FixedSize>()(N, alpha, x, y, context);
}
template <typename T, class Context, int FixedSize>
void Axpy(const int N, const T alpha, const T* x, T* y,
inline void AxpyFixedSize(const int N, const T alpha, const T* x, T* y,
Context* context) {
detail::AxpyImpl<T, Context, FixedSize>()(N, alpha, x, y, context);
}

View file

@ -15,8 +15,11 @@ extern "C" {
#include "caffe2/core/common.h"
#include "caffe2/core/types.h"
#ifndef __CUDACC__
#include "Eigen/Core"
#include "Eigen/Dense"
#endif
namespace caffe2 {
@ -24,6 +27,7 @@ namespace caffe2 {
// engine specified.
class DefaultEngine {};
#ifndef __CUDACC__
// Common Eigen types that we will often use
template <typename T>
using EigenMatrixMap =
@ -47,6 +51,7 @@ using ConstEigenVectorMap =
template <typename T>
using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1> >;
#endif
namespace math {
@ -197,9 +202,7 @@ template <typename T, class Context>
void Select(const int N, const int D, const T* x, const int* idx, T* y,
Context* context);
// For small FixedValues (like FixedSize=1) the function might provide more
// efficent implementation hard-coded statically for this size.
template <typename T, class Context, int FixedSize = -1>
template <typename T, class Context>
void Scale(const int N, const T alpha, const T* x, T* y,
Context* context);
@ -210,10 +213,9 @@ template <typename T, class Context>
void Scale(const int N, const T* alpha, const T* x, T* y,
Context* context);
// For small FixedValues (like FixedSize=1) the function might provide more
// efficent implementation hard-coded statically for this size.
template <typename T, class Context, int FixedSize = -1>
void Axpy(const int N, const T alpha, const T* x, T* y, Context* context);
template <typename T, class Context>
void Axpy(const int N, const T alpha, const T* x, T* y,
Context* context);
// Different from the Axpy function above, if alpha is passed in
// as a pointer, we will assume that it lives on the Context device,

View file

@ -206,17 +206,11 @@ void Gemv<float, CPUContext>(
}
#define CAFFE2_SPECIALIZED_SCALE(T) \
namespace detail { \
template <> \
void ScaleDynamic<T, CPUContext>( \
const int n, \
const T alpha, \
const T* x, \
T* y, \
CPUContext* context) { \
void Scale<T, CPUContext>( \
const int n, const T alpha, const T* x, T* y, CPUContext* context) { \
EigenVectorMap<T>(y, n) = ConstEigenVectorMap<T>(x, n) * alpha; \
} \
} \
template <> \
void Scale<T, CPUContext>( \
const int n, const T* alpha, const T* x, T* y, CPUContext* context) { \
@ -238,17 +232,11 @@ CAFFE2_SPECIALIZED_DOT(double)
#undef CAFFE2_SPECIALIZED_DOT
#define CAFFE2_SPECIALIZED_AXPY(T) \
namespace detail { \
template <> \
void AxpyDynamic<T, CPUContext>( \
const int N, \
const T alpha, \
const T* x, \
T* Y, \
CPUContext* context) { \
void Axpy<T, CPUContext>( \
const int N, const T alpha, const T* x, T* Y, CPUContext* context) { \
EigenVectorMap<T>(Y, N) += ConstEigenVectorMap<T>(x, N) * alpha; \
} \
} \
template <> \
void Axpy<T, CPUContext>( \
const int N, const T* alpha, const T* x, T* Y, CPUContext* context) { \
@ -311,19 +299,13 @@ void Gemv<float, CPUContext>(
}
#define CAFFE2_SPECIALIZED_SCALE(T, prefix) \
namespace detail { \
template <> \
void ScaleDynamic<T, CPUContext>( \
const int n, \
const T alpha, \
const T* x, \
T* y, \
CPUContext* context) { \
void Scale<T, CPUContext>( \
const int n, const T alpha, const T* x, T* y, CPUContext* context) { \
if (y != x) \
cblas_##prefix##copy(n, x, 1, y, 1); \
cblas_##prefix##scal(n, alpha, y, 1); \
} \
} \
template <> \
void Scale<T, CPUContext>( \
const int n, const T* alpha, const T* x, T* y, CPUContext* context) { \
@ -347,17 +329,11 @@ CAFFE2_SPECIALIZED_DOT(double, d)
#undef CAFFE2_SPECIALIZED_DOT
#define CAFFE2_SPECIALIZED_AXPY(T, prefix) \
namespace detail { \
template <> \
void AxpyDynamic<T, CPUContext>( \
const int N, \
const T alpha, \
const T* x, \
T* y, \
CPUContext* context) { \
void Axpy<T, CPUContext>( \
const int N, const T alpha, const T* x, T* y, CPUContext* context) { \
cblas_##prefix##axpy(N, alpha, x, 1, y, 1); \
} \
} \
template <> \
void Axpy<T, CPUContext>( \
const int N, const T* alpha, const T* x, T* y, CPUContext* context) { \

View file

@ -425,48 +425,6 @@ void Axpy<double, CUDAContext>(
CUBLAS_ENFORCE(cublasDaxpy(context->cublas_handle(), N, &alpha, X, 1, Y, 1));
}
namespace detail {
template <>
void ScaleDynamic<float, CUDAContext>(
const int n,
const float alpha,
const float* x,
float* y,
CUDAContext* context) {
return math::Scale<float, CUDAContext>(n, alpha, x, y, context);
}
template <>
void ScaleDynamic<double, CUDAContext>(
const int n,
const double alpha,
const double* x,
double* y,
CUDAContext* context) {
return math::Scale<double, CUDAContext>(n, alpha, x, y, context);
}
template <>
void AxpyDynamic<float, CUDAContext>(
const int n,
const float alpha,
const float* x,
float* y,
CUDAContext* context) {
return math::Axpy<float, CUDAContext>(n, alpha, x, y, context);
}
template <>
void AxpyDynamic<double, CUDAContext>(
const int n,
const double alpha,
const double* x,
double* y,
CUDAContext* context) {
return math::Axpy<double, CUDAContext>(n, alpha, x, y, context);
}
}
namespace {
template <typename T>
__global__ void AxpyKernel(const int n, const T* a, const T* x, T* y) {

View file

@ -64,33 +64,33 @@ size_t pthreadpool_get_threads_count(pthreadpool_t threadpool);
* will be called once for each item.
*/
void pthreadpool_compute_1d(
pthreadpool_t threadpool,
pthreadpool_function_1d_t function,
void* argument,
size_t range);
pthreadpool_t threadpool,
pthreadpool_function_1d_t function,
void* argument,
size_t range);
void pthreadpool_compute_1d_tiled(
pthreadpool_t threadpool,
pthreadpool_function_1d_tiled_t function,
void* argument,
size_t range,
size_t tile);
pthreadpool_t threadpool,
pthreadpool_function_1d_tiled_t function,
void* argument,
size_t range,
size_t tile);
void pthreadpool_compute_2d(
pthreadpool_t threadpool,
pthreadpool_function_2d_t function,
void* argument,
size_t range_i,
size_t range_j);
pthreadpool_t threadpool,
pthreadpool_function_2d_t function,
void* argument,
size_t range_i,
size_t range_j);
void pthreadpool_compute_2d_tiled(
pthreadpool_t threadpool,
pthreadpool_function_2d_tiled_t function,
void* argument,
size_t range_i,
size_t range_j,
size_t tile_i,
size_t tile_j);
pthreadpool_t threadpool,
pthreadpool_function_2d_tiled_t function,
void* argument,
size_t range_i,
size_t range_j,
size_t tile_i,
size_t tile_j);
/**
* Terminates threads in the thread pool and releases associated resources.

View file

@ -160,7 +160,7 @@ endmacro()
# Special care for windows platform: we know that 32-bit windows does not support cuda.
if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
if(NOT (CMAKE_SIZEOF_VOID_P EQUAL 8))
message(WARNING
message(FATAL_ERROR
"CUDA support not available with 32-bit windows. Did you "
"forget to set Win64 in the generator target?")
return()
@ -180,7 +180,12 @@ if (${CUDA_VERSION} LESS 8.0)
set(Caffe2_known_gpu_archs ${Caffe2_known_gpu_archs7})
list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
else()
# CUDA 8 may complain that sm_20 is no longer supported. Suppress the
# warning for now.
list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets")
endif()
include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
list(APPEND Caffe2_DEPENDENCY_LIBS ${CUDA_CUDART_LIBRARY}
${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES})
@ -223,8 +228,35 @@ endforeach()
# Set C++11 support
set(CUDA_PROPAGATE_HOST_FLAGS OFF)
list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
if (NOT MSVC)
list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
endif()
# Debug and Release symbol support
if (MSVC)
if (${CMAKE_BUILD_TYPE} MATCHES "Release")
if (${BUILD_SHARED_LIBS})
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -MD")
else()
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -MT")
endif()
elseif(${CMAKE_BUILD_TYPE} MATCHES "Debug")
message(FATAL_ERROR
"Caffe2 currently does not support the combination of MSVC, Cuda "
"and Debug mode. Either set USE_CUDA=OFF or set the build type "
"to Release")
if (${BUILD_SHARED_LIBS})
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -MDd")
else()
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -MTd")
endif()
else()
message(FATAL_ERROR "Unknown cmake build type: " ${CMAKE_BUILD_TYPE})
endif()
endif()
if(OpenMP_FOUND)
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler ${OpenMP_CXX_FLAGS}")
endif()

View file

@@ -53,6 +53,15 @@ if(NOT CAFFE2_NEED_TO_TURN_OFF_DEPRECATION_WARNING AND NOT MSVC)
endif()
# ---[ If we are using msvc, set no warning flags
if (${CMAKE_CXX_COMPILER_ID} STREQUAL "MSVC")
message(STATUS "Adding no warning argument to the compiler")
if (MSVC)
  # /MP: compile translation units in parallel across processes.
  add_compile_options(/MP)
  # Warnings that are known-noisy in this code base, disabled for now.
  # Rough format: (warning level): Description
  set(caffe2_msvc_disabled_warnings
      /wd4018 # (3): Signed/unsigned mismatch
      /wd4244 # (2/3/4): Possible loss of precision
      /wd4267 # (3): Conversion of size_t to smaller type. Possible loss of data.
      /wd4800 # (3): Forcing non-boolean value to true or false.
      /wd4996 # (3): Use of a deprecated member
      /wd5030 # (?): Unrecognized C++ attribute
  )
  add_compile_options(${caffe2_msvc_disabled_warnings})
endif()

View file

@@ -14,12 +14,12 @@ include(FindPackageHandleStandardArgs)
set(CUDNN_ROOT_DIR "" CACHE PATH "Folder contains NVIDIA cuDNN")
find_path(CUDNN_INCLUDE_DIR cudnn.h
PATHS ${CUDNN_ROOT_DIR}
PATHS ${CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR}
PATH_SUFFIXES cuda/include include)
find_library(CUDNN_LIBRARY cudnn
PATHS ${CUDNN_ROOT_DIR}
PATH_SUFFIXES lib lib64 cuda/lib cuda/lib64)
PATHS ${CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR}
PATH_SUFFIXES lib lib64 cuda/lib cuda/lib64 lib/x64)
find_package_handle_standard_args(CUDNN DEFAULT_MSG CUDNN_INCLUDE_DIR CUDNN_LIBRARY)

View file

@@ -1,12 +1,13 @@
# Finds Google Protocol Buffers library and compilers and extends
# the standard cmake script with version and python generation support
function(custom_protobuf_find)
# For a custom protobuf build, we will always use static protobuf.
option(protobuf_BUILD_SHARED_LIBS "" OFF)
option(protobuf_BUILD_TESTS "" OFF)
option(protobuf_BUILD_EXAMPLES "" OFF)
# MSVC protobuf built with static library explicitly uses /MT and /MTd which
# makes things a bit tricky, so we set it off.
option(protobuf_MSVC_STATIC_RUNTIME "" OFF)
#option(protobuf_MSVC_STATIC_RUNTIME "" OFF)
if (APPLE)
# Protobuf generated files triggers a deprecated atomic operation warning
# so we turn it off here.

View file

@@ -383,7 +383,10 @@ endfunction()
function(caffe_add_whole_archive_flag lib output_var)
if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
set(${output_var} -Wl,-force_load,$<TARGET_FILE:${lib}> PARENT_SCOPE)
set(${output_var} -Wl,-force_load,$<TARGET_FILE:${lib}> PARENT_SCOPE)
elseif(MSVC)
# In MSVC, we will add whole archive in default.
set(${output_var} -WHOLEARCHIVE:$<TARGET_FILE:${lib}> PARENT_SCOPE)
else()
# Assume everything else is like gcc
set(${output_var} -Wl,--whole-archive ${lib} -Wl,--no-whole-archive PARENT_SCOPE)

View file

@@ -10,9 +10,16 @@ appveyor Downloadfile ^
-FileName cudnn-8.0-windows10-x64-v5.1.zip
@echo Installing CUDA toolkit 8 ...
cuda_8.0.44_windows.exe -s compiler_8.0 cublas_8.0 cublas_dev_8.0 cudart_8.0 curand_8.0 curand_dev_8.0
cuda_8.0.44_windows.exe -s compiler_8.0 cublas_8.0 cublas_dev_8.0 cudart_8.0 curand_8.0 curand_dev_8.0 nvrtc_8.0 nvrtc_dev_8.0
set PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v8.0\bin;%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v8.0\libnvvp;%PATH%
:: TODO: we will still need to figure out how to install cudnn.
7z x cudnn-8.0-windows10-x64-v5.1.zip
copy cuda\include\cudnn.h ^
"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0\include\"
copy cuda\lib\x64\cudnn.lib ^
"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0\lib\x64\"
copy cuda\bin\cudnn64_5.dll ^
"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0\bin\"
:: Make sure that nvcc is working correctly.
nvcc -V || exit /b

View file

@@ -4,7 +4,7 @@
:: This script shows how one can build a Caffe2 binary for windows.
@echo off
@echo on
SET ORIGINAL_DIR=%cd%
SET CAFFE2_ROOT=%~dp0%..
@@ -13,17 +13,27 @@ if not exist %CAFFE2_ROOT%\build_host_protoc\bin\protoc.exe call %CAFFE2_ROOT%\s
if not exist %CAFFE2_ROOT%\build mkdir %CAFFE2_ROOT%\build
cd %CAFFE2_ROOT%\build
if NOT DEFINED USE_CUDA (
set USE_CUDA=ON
)
if NOT DEFINED CMAKE_BUILD_TYPE (
set CMAKE_BUILD_TYPE=Release
)
:: Set up cmake. We will skip building the test files right now.
:: TODO: enable cuda support.
cmake .. ^
-G"Visual Studio 14 2015 Win64" ^
-DCMAKE_VERBOSE_MAKEFILE=1 ^
-DBUILD_TEST=OFF ^
-DUSE_CUDA=OFF ^
-DBUILD_SHARED_LIBS=OFF ^
-DCMAKE_BUILD_TYPE=%CMAKE_BUILD_TYPE% ^
-DUSE_CUDA=%USE_CUDA% ^
-DPROTOBUF_PROTOC_EXECUTABLE=%CAFFE2_ROOT%\build_host_protoc\bin\protoc.exe ^
|| exit /b
:: Actually run the build
msbuild ALL_BUILD.vcxproj || exit /b
cmake --build . --config %CMAKE_BUILD_TYPE% || exit /b
cd %ORIGINAL_DIR%