Make CentOS 6 CUDA build and run (#2159)

* Add manylinux1 source code changes * Disable a python test
2026-07-08 17:17:15 +00:00 · 2019-10-19 15:33:31 -07:00 · 2019-10-19 15:33:31 -07:00 · acec4b446f
commit acec4b446f
parent 96b33f4597
18 changed files with 161 additions and 86 deletions
--- a/onnxruntime/core/graph/record.h
+++ b/onnxruntime/core/graph/record.h
@ -32,7 +32,7 @@ class Record {
    values_ = other.values_;
  }

-  Status GetName(int index, const std::string** pp_name) const {
+  Status GetName(size_t index, const std::string** pp_name) const {
    if (nullptr == pp_name || index >= names_.size()) {
      return Status(ONNXRUNTIME, common::INVALID_ARGUMENT);
    }
--- a/onnxruntime/core/providers/cpu/math/matmul_helper.h
+++ b/onnxruntime/core/providers/cpu/math/matmul_helper.h
@ -239,7 +239,7 @@ class MatMulComputeHelper {
  template <typename T>
  static void OffsetToArrays(T* p, const std::vector<size_t>& offsets, gsl::span<T*> arrays) {
    auto len = offsets.size();
-    ORT_ENFORCE(arrays.size() == gsl::narrow_cast<ptrdiff_t>(len));
+    ORT_ENFORCE(arrays.size() == len);
    for (size_t i = 0; i < len; i++) {
      arrays[i] = p + offsets[i];
    }
@ -248,7 +248,7 @@ class MatMulComputeHelper {
  template <typename T>
  static void OffsetToArrays(const T* p, const std::vector<size_t>& offsets, gsl::span<const T*> arrays) {
    auto len = offsets.size();
-    ORT_ENFORCE(arrays.size() == gsl::narrow_cast<ptrdiff_t>(len));
+    ORT_ENFORCE(arrays.size() == len);
    for (size_t i = 0; i < len; i++) {
      arrays[i] = p + offsets[i];
    }
--- a/onnxruntime/core/providers/cuda/nn/conv.cc
+++ b/onnxruntime/core/providers/cuda/nn/conv.cc
@ -114,7 +114,7 @@ Status Conv<T>::ComputeInternal(OpKernelContext* context) const {
        std::vector<int64_t> b_dims(2 + kernel_shape.size());
        b_dims[0] = 1;           // N
        b_dims[1] = b_shape[0];  // C
-        for (int i = 0; i < kernel_shape.size(); i++)
+        for (size_t i = 0; i < kernel_shape.size(); i++)
          b_dims[2 + i] = 1;

        ORT_RETURN_IF_ERROR(s_.b_tensor.Set(b_dims, CudnnTensor::GetDataType<CudaT>()));
@ -212,7 +212,7 @@ Status CudnnConvolutionDescriptor::Set(
  std::vector<int> pad_dims(rank);
  std::vector<int> stride_dims(rank);
  std::vector<int> dilation_dims(rank);
-  for (int i = 0; i < rank; i++) {
+  for (size_t i = 0; i < rank; i++) {
    pad_dims[i] = gsl::narrow_cast<int>(pads[i]);
    stride_dims[i] = gsl::narrow_cast<int>(strides[i]);
    dilation_dims[i] = gsl::narrow_cast<int>(dilations[i]);
--- a/onnxruntime/core/providers/cuda/tensor/compress.cc
+++ b/onnxruntime/core/providers/cuda/tensor/compress.cc
@ -61,7 +61,7 @@ Status Compress::ComputeInternal(OpKernelContext* ctx) const {

  int64_t axis_right_stride = 1;
  if (has_axis_) {
-    for (int i = static_cast<int>(axis_ + 1); i < rank; ++i) {
+    for (auto i = static_cast<size_t>(axis_ + 1); i < rank; ++i) {
      axis_right_stride *= input_dimensions[i];
    }
  }
--- a/onnxruntime/core/providers/cuda/tensor/concat.cc
+++ b/onnxruntime/core/providers/cuda/tensor/concat.cc
@ -40,7 +40,7 @@ Status Concat::ComputeInternal(OpKernelContext* ctx) const {
    }
  }
  std::vector<int64_t> concat_sizes_range(concat_sizes);
-  for (int i = 1; i < concat_sizes_range.size(); ++i) {
+  for (size_t i = 1; i < concat_sizes_range.size(); ++i) {
    concat_sizes_range[i] += concat_sizes_range[i - 1];
  }

--- a/onnxruntime/core/providers/cuda/tensor/expand.cc
+++ b/onnxruntime/core/providers/cuda/tensor/expand.cc
@ -28,7 +28,7 @@ Status Expand::ComputeInternal(OpKernelContext* ctx) const {
  auto input_shape = input0.Shape().GetDims();

  // pad input_dims with 1 to make ranks match
-  for (int i = 0; i < rank - input_shape.size(); i++) {
+  for (size_t i = 0; i < rank - input_shape.size(); i++) {
    input_shape.insert(input_shape.begin(), 1);
  }

@ -41,7 +41,7 @@ Status Expand::ComputeInternal(OpKernelContext* ctx) const {
    auto out_span = fdm_output_dims.CpuSpan();
    auto sdm_span = fdm_output_subdim_size.CpuSpan();
    auto subdim_size = output_shape.Size();
-    for (auto i = 0; i < rank; i++) {
+    for (size_t i = 0; i < rank; i++) {
      in_span[i] = fast_divmod(static_cast<int>(input_shape[i]));
      out_span[i] = fast_divmod(static_cast<int>(output_shape[i]));
      // output_shape[i] won't be 0 here, it's covered in (0 == output_shape.Size())
--- a/onnxruntime/core/providers/cuda/tensor/slice.cc
+++ b/onnxruntime/core/providers/cuda/tensor/slice.cc
@ -70,14 +70,14 @@ Status Slice<Tind, dynamic>::ComputeInternal(OpKernelContext* ctx) const {
  }
  CudaAsyncBuffer<int64_t> starts_buffer(this, dimension_count);
  gsl::span<int64_t> starts_buffer_span = starts_buffer.CpuSpan();
-  for (int i = 0; i < dimension_count; ++i) {
+  for (size_t i = 0; i < dimension_count; ++i) {
    starts_buffer_span[i] = starts[i];
  }
  starts_buffer.CopyToGpu();

  CudaAsyncBuffer<int64_t> steps_buffer(this, dimension_count);
  gsl::span<int64_t> steps_buffer_span = steps_buffer.CpuSpan();
-  for (int i = 0; i < dimension_count; ++i) {
+  for (size_t i = 0; i < dimension_count; ++i) {
    steps_buffer_span[i] = steps[i];
  }
  steps_buffer.CopyToGpu();
@ -90,7 +90,7 @@ Status Slice<Tind, dynamic>::ComputeInternal(OpKernelContext* ctx) const {

  CudaAsyncBuffer<fast_divmod> div_strides(this, dimension_count);
  gsl::span<fast_divmod> div_strides_span = div_strides.CpuSpan();
-  for (int i = 0; i < dimension_count; ++i) {
+  for (size_t i = 0; i < dimension_count; ++i) {
    div_strides_span[i] = fast_divmod(gsl::narrow_cast<int>(output_pitches[i]));
  }
  div_strides.CopyToGpu();
--- a/onnxruntime/core/providers/cuda/tensor/split.cc
+++ b/onnxruntime/core/providers/cuda/tensor/split.cc
@ -63,7 +63,7 @@ Status Split::ComputeInternal(OpKernelContext* ctx) const {
  split_sizes_gpu.CopyToGpu();
  
  std::vector<int64_t> split_sizes_range(split_sizes);
-  for (int i = 1; i < split_sizes_range.size(); ++i) {
+  for (size_t i = 1; i < split_sizes_range.size(); ++i) {
    split_sizes_range[i] += split_sizes_range[i - 1];
  }
  CudaAsyncBuffer<int64_t> split_sizes_range_gpu(this, split_sizes_range);
--- a/onnxruntime/core/providers/cuda/tensor/tile.cc
+++ b/onnxruntime/core/providers/cuda/tensor/tile.cc
@ -36,7 +36,7 @@ Status Tile<T>::ComputeInternal(OpKernelContext* ctx) const {
  auto* repeats = repeats_tensor.template Data<int64_t>();
  const auto& input_shape = input_tensor.Shape().GetDims();
  std::vector<int64_t> output_dims(input_shape);
-  for (auto axis = 0; axis < rank; axis++)
+  for (size_t axis = 0; axis < rank; axis++)
    output_dims[axis] *= repeats[axis];
  TensorShape outputShape(output_dims);
  auto& output_tensor = *ctx->Output(0, outputShape);
--- a/onnxruntime/core/providers/cuda/tensor/upsample.cc
+++ b/onnxruntime/core/providers/cuda/tensor/upsample.cc
@ -70,7 +70,7 @@ Status Upsample<T>::BaseCompute(OpKernelContext* context, const std::vector<floa
  CudaAsyncBuffer<fast_divmod> output_div_pitches(this, rank);
  gsl::span<fast_divmod> div_strides_span = output_div_pitches.CpuSpan();

-  for (int i = 0; i < rank; ++i) {
+  for (size_t i = 0; i < rank; ++i) {
    input_stride_span[i] = input_pitches[i];
    div_strides_span[i] = fast_divmod(gsl::narrow_cast<int>(output_pitches[i]));
  }
@ -95,7 +95,7 @@ Status Upsample<T>::BaseCompute(OpKernelContext* context, const std::vector<floa
    CudaAsyncBuffer<fast_divmod> scales_div(this, rank);
    gsl::span<fast_divmod> scales_div_span = scales_div.CpuSpan();

-    for (int i = 0; i < rank; ++i) {
+    for (size_t i = 0; i < rank; ++i) {
      scales_div_span[i] = fast_divmod(gsl::narrow_cast<int>(ceil(scales[i])));
    }
    scales_div.CopyToGpu();
--- a/onnxruntime/test/ir/graph_test.cc
+++ b/onnxruntime/test/ir/graph_test.cc
@ -114,16 +114,17 @@ TEST(GraphTraversalTest, ReverseDFS) {
  Model model("graph_1");
  auto& graph = model.MainGraph();

-  // Case 1: A normal graph.
-  //                 SouceNode
-  //                 /       \
-            //  node_1 (Variable)      node_2 (Variable)
-  //                 \       /
-  //                 node_3 (Add)
-  //                     |
-  //                 node_4 (NoOp)
-  //                     |
-  //                  SinkNode
+  /* Case 1: A normal graph.
+   *                 SourceNode
+   *                 /       \
+   *  node_1 (Variable)      node_2 (Variable)
+   *                 \       /
+   *                 node_3 (Add)
+   *                     |
+   *                 node_4 (NoOp)
+   *                     |
+   *                  SinkNode
+  */
  std::vector<NodeArg*> inputs;
  std::vector<NodeArg*> outputs;

@ -267,16 +268,17 @@ TEST(ResolvingGraphTest, GraphConstruction_CheckIsAcyclic) {
  Model model("graph_1");
  auto& graph = model.MainGraph();

-  // A normal graph.
-  //                 SouceNode
-  //                 /       \
-            //    node_1 (Variable)  node_2 (Variable)
-  //                 \       /
-  //                 node_3 (Add)
-  //                     |
-  //                 node_4 (NoOp)
-  //                     |
-  //                  SinkNode
+  /* A normal graph.
+   *                 SourceNode
+   *                 /       \
+   *    node_1 (Variable)  node_2 (Variable)
+   *                 \       /
+   *                 node_3 (Add)
+   *                     |
+   *                 node_4 (NoOp)
+   *                     |
+   *                  SinkNode
+   */ 
  std::vector<NodeArg*> inputs;
  std::vector<NodeArg*> outputs;

@ -445,14 +447,15 @@ TEST(ResolvingGraphTest, GraphConstruction_CheckGraphInputOutputOrderMaintained)
    map.insert({std::to_string(i), i});
  }

-  //               |         |
-  //       b (Identity)  a (Identity)   values
-  //                \   /
-  //                  c (Merge)
-  //                  |
-  //                  d (Split)
-  //                /   \
-  //              1  ..  10
+  /*               |         |
+   *       b (Identity)  a (Identity)   values
+   *                \   /
+   *                  c (Merge)
+   *                  |
+   *                  d (Split)
+   *                /   \
+   *              1  ..  10
+   */ 
  TypeProto tensor_int32;
  tensor_int32.mutable_tensor_type()->set_elem_type(TensorProto_DataType_INT32);
  tensor_int32.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(1);
@ -653,14 +656,15 @@ TEST(ResolvingGraphTest, GraphConstruction_TypeInference) {
  Model model("graph_1");
  auto& graph = model.MainGraph();

-  // Case 1: A normal graph.
-  //                         SourceNode
-  //                   /         |         \
-            //  node_1 (Variable)  node_2 (Variable)  node_3 (Variable)
-  //                   \         |         / (it's all 3 nodes above outputs to the one input of node_4)
-  //                        node_4 (Max)
-  //                             |
-  //                          SinkNode
+  /* Case 1: A normal graph.
+   *                         SourceNode
+   *                   /         |         \
+   *  node_1 (Variable)  node_2 (Variable)  node_3 (Variable)
+   *                   \         |         / (it's all 3 nodes above outputs to the one input of node_4)
+   *                        node_4 (Max)
+   *                             |
+   *                          SinkNode
+  */
  std::vector<NodeArg*> inputs;
  std::vector<NodeArg*> outputs;

--- a/onnxruntime/test/providers/provider_test_utils.cc
+++ b/onnxruntime/test/providers/provider_test_utils.cc
@ -259,7 +259,7 @@ void Check<TensorSeq>(const OpTester::Data& expected_data, const TensorSeq& outp
  // now check the contents of the tensors
  auto null_deleter = [](void*) {};

-  for (int i = 0; i < output_num_tensors; ++i) {
+  for (size_t i = 0; i < output_num_tensors; ++i) {
    OrtValue temp_value;
    // Reason for null_deleter: we don't want the tensor destructor to be called as part of this OrtValue destructor
    // as we're creating this OrtValue only to reuse the Check functionality
--- a/onnxruntime/test/providers/provider_test_utils.h
+++ b/onnxruntime/test/providers/provider_test_utils.h
@ -481,7 +481,7 @@ class OpTester {
    ptr->dtype = DataTypeImpl::GetType<T>();
    auto num_tensors = seq_tensors.tensors.size();
    ptr->tensors.resize(num_tensors);
-    for (int i = 0; i < num_tensors; ++i) {
+    for (size_t i = 0; i < num_tensors; ++i) {
      TensorShape shape{seq_tensors.tensors[i].shape};
      auto values_count = static_cast<int64_t>(seq_tensors.tensors[i].data.size());
      ORT_ENFORCE(shape.Size() == values_count, values_count,
--- a/onnxruntime/test/python/onnxruntime_test_python.py
+++ b/onnxruntime/test/python/onnxruntime_test_python.py
@ -302,32 +302,6 @@ class TestInferenceSession(unittest.TestCase):
                         ['identity', 'test\x00\x00\x00\x00']], dtype=object)
        np.testing.assert_equal(expr, res[0])

-    def testConvAutoPad(self):
-        sess = onnxrt.InferenceSession(self.get_name("conv_autopad.onnx"))
-        x = np.array(25 * [1.0], dtype=np.float32).reshape((1, 1, 5, 5))
-
-        x_name = sess.get_inputs()[0].name
-        self.assertEqual(x_name, "Input4")
-        x_shape = sess.get_inputs()[0].shape
-        self.assertEqual(x_shape, [1, 1, 5, 5])
-        x_type = sess.get_inputs()[0].type
-        self.assertEqual(x_type, 'tensor(float)')
-
-        output_name = sess.get_outputs()[0].name
-        self.assertEqual(output_name, "Convolution5_Output_0")
-        output_shape = sess.get_outputs()[0].shape
-        self.assertEqual(output_shape, [1, 1, 5, 5])
-        output_type = sess.get_outputs()[0].type
-        self.assertEqual(output_type, 'tensor(float)')
-
-        res = sess.run([output_name], {x_name: x})
-        output_expected = np.array([[[[24., 33., 33., 33., 20.],
-                                      [27., 36., 36., 36., 21.],
-                                      [27., 36., 36., 36., 21.],
-                                      [27., 36., 36., 36., 21.],
-                                      [12., 15., 15., 15.,  8.]]]], dtype=np.float32)
-        np.testing.assert_allclose(output_expected, res[0])
-
    def testZipMapStringFloat(self):
        sess = onnxrt.InferenceSession(
            self.get_name("zipmap_stringfloat.onnx"))
--- a/tools/ci_build/github/linux/docker/Dockerfile.centos6_gpu
+++ b/tools/ci_build/github/linux/docker/Dockerfile.centos6_gpu
@ -0,0 +1,70 @@
+# FROM mcr.microsoft.com/dotnet-buildtools/prereqs:centos-7-50f0d02-20190918214028
+FROM centos:6
+
+ENV NVIDIA_VISIBLE_DEVICES all
+ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
+
+ARG PYTHON_VERSION
+ADD scripts /tmp/scripts
+RUN cd /tmp/scripts && /tmp/scripts/install_centos.sh
+# Prepend the CUDA and devtoolset-7 tool directories while keeping the inherited PATH.
+ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/bin:/usr/bin:/usr/local/bin:/opt/rh/devtoolset-7/root/usr/bin:${PATH}
+RUN /tmp/scripts/install_deps.sh -p $PYTHON_VERSION && rm -rf /tmp/scripts
+
+# The steps below are copied from https://gitlab.com/nvidia/container-images/cuda/tree/master/dist/centos6
+
+RUN NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \
+curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel6/x86_64/7fa2af80.pub | sed '/^Version/d' > /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA && \
+    echo "$NVIDIA_GPGKEY_SUM  /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA" | sha256sum -c -
+
+COPY cuda_manylinux2010.repo  /etc/yum.repos.d/cuda.repo
+
+ENV CUDA_VERSION 10.0.130
+
+ENV CUDA_PKG_VERSION 10-0-$CUDA_VERSION-1
+RUN yum install -y \
+cuda-cudart-$CUDA_PKG_VERSION \
+cuda-libraries-$CUDA_PKG_VERSION \
+cuda-nvtx-$CUDA_PKG_VERSION \
+cuda-nvml-dev-$CUDA_PKG_VERSION \
+cuda-command-line-tools-$CUDA_PKG_VERSION \
+cuda-libraries-dev-$CUDA_PKG_VERSION \
+cuda-minimal-build-$CUDA_PKG_VERSION \
+&& \
+    ln -s cuda-10.0 /usr/local/cuda && \
+    rpm -e --nodeps gcc gcc-c++ && \
+    rm -rf /var/cache/yum/*
+
+# nvidia-docker 1.0
+RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \
+    echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf
+
+ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH}
+ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64:${LD_LIBRARY_PATH}
+
+# nvidia-container-runtime
+ENV NVIDIA_VISIBLE_DEVICES all
+ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
+ENV NVIDIA_REQUIRE_CUDA "cuda>=10.0 brand=tesla,driver>=384,driver<385 brand=tesla,driver>=410,driver<411"
+
+# Link-time search path for the CUDA stub libraries (presumably so builds can link without a driver installed).
+ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs
+
+ENV CUDNN_VERSION 7.6.4.38
+LABEL com.nvidia.cudnn.version="${CUDNN_VERSION}"
+
+# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
+RUN CUDNN_DOWNLOAD_SUM=417bb5daf51377037eb2f5c87649000ca1b9cec0acb16cfe07cb1d3e9a961dbf && \
+    curl -fsSL http://developer.download.nvidia.com/compute/redist/cudnn/v7.6.4/cudnn-10.0-linux-x64-v7.6.4.38.tgz -O && \
+    echo "$CUDNN_DOWNLOAD_SUM  cudnn-10.0-linux-x64-v7.6.4.38.tgz" | sha256sum -c - && \
+    tar --no-same-owner -xzf cudnn-10.0-linux-x64-v7.6.4.38.tgz -C /usr/local && \
+    rm cudnn-10.0-linux-x64-v7.6.4.38.tgz && \
+    /sbin/ldconfig
+
+# Create the build user (UID and name overridable via build args) and run as that user from here on.
+ARG BUILD_UID=1000
+ARG BUILD_USER=onnxruntimedev
+RUN /usr/sbin/adduser --uid $BUILD_UID $BUILD_USER
+WORKDIR /home/$BUILD_USER
+USER $BUILD_USER
+
--- a/tools/ci_build/github/linux/docker/scripts/install_centos.sh
+++ b/tools/ci_build/github/linux/docker/scripts/install_centos.sh
@ -7,16 +7,33 @@ os_major_version=$(cat /etc/redhat-release | tr -dc '0-9.'|cut -d \. -f1)
 if ! rpm -q --quiet epel-release ; then
  yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-$os_major_version.noarch.rpm
 fi
+
+echo "installing for os major version : $os_major_version"
 if [ "$os_major_version" == "5" ]; then
- #Be careful, don't pull gcc into the base system, because we already have one in /opt/rh/devtoolset-2/root/usr/bin
- yum install -y redhat-lsb expat-devel libcurl-devel tar unzip curl zlib-devel make  python2-devel icu  rsync bzip2 git bzip2-devel
+  #Be careful, don't pull gcc into the base system, because we already have one in /opt/rh/devtoolset-2/root/usr/bin
+  yum install -y redhat-lsb expat-devel libcurl-devel tar unzip curl zlib-devel make  python2-devel icu  rsync bzip2 git bzip2-devel
+elif [ "$os_major_version" == "6" ] && [ ! -d "/opt/python/cp35-cp35m" ]; then
+  yum install -y centos-release-scl
+  yum repolist
+  yum install -y redhat-lsb-core expat-devel libcurl-devel tar unzip curl zlib-devel make libunwind icu aria2 rsync bzip2 git bzip2-devel
+  yum upgrade -y
+  yum install -y \
+    ccache \
+    devtoolset-7-binutils \
+    devtoolset-7-gcc \
+    devtoolset-7-gcc-c++ \
+    devtoolset-7-gcc-gfortran 
+  # The way to get python 3.6.8
+  yum install -y https://centos6.iuscommunity.org/ius-release.rpm 
+  yum --enablerepo=ius install -y python36u python36u-devel python36u-pip python36u-numpy python36u-setuptools python36u-wheel protobuf
+  /usr/bin/python3.6 -m pip install --upgrade pip
 else
- yum install -y redhat-lsb-core expat-devel libcurl-devel tar unzip curl zlib-devel make  python2-devel  libunwind icu aria2 rsync bzip2 git bzip2-devel
+  yum install -y redhat-lsb-core expat-devel libcurl-devel tar unzip curl zlib-devel make  python2-devel  libunwind icu aria2 rsync bzip2 git bzip2-devel
 fi


 #If the /opt/python folder exists, we assume this is the manylinux docker image
-if [ ! -d "/opt/python/cp35-cp35m"  ] 
+if [ "$os_major_version" != "6" ] && [ ! -d "/opt/python/cp35-cp35m"  ] 
 then
-yum install -y ccache gcc gcc-c++ python3-devel python3-pip python3-numpy python3-setuptools python3-wheel
+  yum install -y ccache gcc gcc-c++ python3 python3-devel python3-pip python3-numpy python3-setuptools python3-wheel
 fi
--- a/tools/ci_build/github/linux/docker/scripts/install_deps.sh
+++ b/tools/ci_build/github/linux/docker/scripts/install_deps.sh
@ -1,6 +1,7 @@
 #!/bin/bash
 set -e

+
 while getopts p:d: parameter_Option
 do case "${parameter_Option}"
 in
--- a/tools/ci_build/github/linux/docker/scripts/install_onnx.sh
+++ b/tools/ci_build/github/linux/docker/scripts/install_onnx.sh
@ -25,6 +25,8 @@ else
   PYTHON_EXE="/usr/bin/python${PYTHON_VER}"
 fi

+${PYTHON_EXE} -m pip install protobuf
+
 version2tag=(5af210ca8a1c73aa6bae8754c9346ec54d0a756e-onnx123
             bae6333e149a59a3faa9c4d9c44974373dcf5256-onnx130
             9e55ace55aad1ada27516038dfbdc66a8a0763db-onnx141
@ -46,7 +48,14 @@ for v2t in ${version2tag[*]}; do
  if [ ! -d "third_party/pybind11/pybind11" ]; then
    git clone https://github.com/pybind/pybind11.git third_party/pybind11
  fi 
-  ${PYTHON_EXE} -m pip install .
+  # This adjustment is only needed for CentOS 6. We substitute ${PYTHON_EXE} into the
+  # script's first line, so the slashes in its path must be escaped for sed.
+  # Make sure we do not hit python2, as it does not work on CentOS 6.
+  ESCAPED_PY=$(echo "${PYTHON_EXE}" | sed 's/\//\\\//g')
+  sed "1,1 s/\/usr\/bin\/env python/$ESCAPED_PY/" /tmp/src/onnx-$onnx_version/tools/protoc-gen-mypy.py > /tmp/src/onnx-$onnx_version/tools/repl_protoc-gen-mypy.py
+  chmod a+w /tmp/src/onnx-$onnx_version/tools/protoc-gen-mypy.py
+  mv /tmp/src/onnx-$onnx_version/tools/repl_protoc-gen-mypy.py /tmp/src/onnx-$onnx_version/tools/protoc-gen-mypy.py
  mkdir -p /data/onnx/${onnx_tag}
+  ${PYTHON_EXE} -m pip install .
  backend-test-tools generate-data -o /data/onnx/$onnx_tag
 done