cross compile x86 linux (#562)

* cross compile x86 linux

* fix comments

* install multilib for ubuntu cross compile

* remove tailing slash

* fix -fPIC relocations for x86 target too

* add asm make flag

* fix x86 compile err

* test x86 with zlib and png

* Disable zlib from x86

* install x86 python header

* remove cross-compiling changes

* test 32bit ubuntu

* add x86 ubuntu docker file

* add x86 as arch parametr for docker build

* config pipeline

* avoid dotnet install

* install cmake

* skip dep install

* use latest ubuntu

* install latest cmake

* install x86 deps

* configure cmake

* install ninja

* correct ninja dir

* apt get re2c

* install onnx

* set processor x86

* disable warning

* skip test

* disable test

* disable test

* find lib

* fix typo

* restore test

* disable backend model test

* disable test

* fix test err

* stop installing onnx

* disable onnx test on x86

* restore yml

* mergef with master yml

* cancel needless config setting

* enable x86 flag

* restore all onnx tests

* fix yml typo

* install onnx

* add back x86 flag

* disable cases

* disable case

* disable cases

* add macro to disable cases

* fix typo

* print platform

* remove condition
This commit is contained in:
Randy 2019-03-12 09:47:45 -07:00 committed by Pranav Sharma
parent 3ef273b84b
commit f048fc5fb0
15 changed files with 1067 additions and 177 deletions

View file

@ -134,6 +134,11 @@ else()
string(APPEND CMAKE_CXX_FLAGS_RELWITHDEBINFO " -march=native -mtune=native")
string(APPEND CMAKE_C_FLAGS_RELWITHDEBINFO " -march=native -mtune=native")
endif()
if(onnxruntime_BUILD_x86)
set (CMAKE_SYSTEM_PROCESSOR "x86")
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse2 -mfpmath=sse -Wno-narrowing")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse2 -mfpmath=sse -Wno-narrowing")
endif()
endif()
if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")

View file

@ -93,6 +93,25 @@ else()
${ONNXRUNTIME_ROOT}/core/mlas/lib/aarch64/sgemma.s
)
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "^(i.86|x86?)$")
enable_language(ASM)
set(mlas_platform_srcs_sse2
${ONNXRUNTIME_ROOT}/core/mlas/lib/x86/SgemmKernelSse2.S
)
set_source_files_properties(${mlas_platform_srcs_sse2} PROPERTIES COMPILE_FLAGS "-msse2")
set(mlas_platform_srcs_avx
${ONNXRUNTIME_ROOT}/core/mlas/lib/x86/SgemmKernelAvx.S
)
set_source_files_properties(${mlas_platform_srcs_avx} PROPERTIES COMPILE_FLAGS "-mavx")
set(mlas_platform_srcs
${mlas_platform_srcs_sse2}
${mlas_platform_srcs_avx}
)
elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
enable_language(ASM)
@ -106,7 +125,7 @@ else()
${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/SgemmKernelSse2.S
${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/SgemmTransposePackB16x4Sse2.S
)
set_source_files_properties(${mlas_platform_srcs_sse} PROPERTIES COMPILE_FLAGS "-msse2")
set_source_files_properties(${mlas_platform_srcs_sse2} PROPERTIES COMPILE_FLAGS "-msse2")
set(mlas_platform_srcs_avx
${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/SgemmKernelAvx.S

View file

@ -34,9 +34,6 @@ Abstract:
#include <cpuid.h>
#include <immintrin.h>
#endif
#if defined(__x86_64__)
#include "x86_64/xgetbv.h"
#endif
#endif
//

View file

@ -23,6 +23,41 @@ Abstract:
MLAS_PLATFORM MlasPlatform;
#ifdef MLAS_TARGET_AMD64_IX86
//
// Reads the processor extended control register to determine platform
// capabilities.
//
#if !defined(_XCR_XFEATURE_ENABLED_MASK)
#define _XCR_XFEATURE_ENABLED_MASK 0
#endif
inline
uint64_t
MlasReadExtendedControlRegister(
unsigned int ext_ctrl_reg
)
{
#if defined(_WIN32)
return _xgetbv(ext_ctrl_reg);
#else
uint32_t eax, edx;
__asm__
(
"xgetbv"
: "=a" (eax), "=d" (edx)
: "c" (ext_ctrl_reg)
);
return ((uint64_t)edx << 32) | eax;
#endif
}
#endif
MLAS_PLATFORM::MLAS_PLATFORM(
void
)
@ -74,11 +109,7 @@ Return Value:
// Check if the operating system supports saving SSE and AVX states.
//
#if defined(_WIN32)
uint64_t xcr0 = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
#else
uint64_t xcr0 = xgetbv(_XCR_XFEATURE_ENABLED_MASK);
#endif
uint64_t xcr0 = MlasReadExtendedControlRegister(_XCR_XFEATURE_ENABLED_MASK);
if ((xcr0 & 0x6) == 0x6) {

View file

@ -0,0 +1,433 @@
/*++
Copyright (c) Microsoft Corporation. All rights reserved.
Licensed under the MIT License.
Module Name:
SgemmKernelAvx.s
Abstract:
This module implements the kernels for the single precision matrix/matrix
multiply operation (SGEMM).
This implementation uses AVX instructions.
--*/
#include "asmmacro.h"
.intel_syntax noprefix
//
// Stack frame layout for the SGEMM kernel.
//
#define SgemmKernelFrame 0
#define SgemmKernelFrame_SavedEsi 4
#define SgemmKernelFrame_SavedEbx 8
#define SgemmKernelFrame_SavedEbp 12
#define SgemmKernelFrame_ReturnAddress 16
#define SgemmKernelFrame_MatrixA 20
#define SgemmKernelFrame_MatrixB 24
#define SgemmKernelFrame_MatrixC 28
#define SgemmKernelFrame_CountK 32
#define SgemmKernelFrame_CountM 36
#define SgemmKernelFrame_CountN 40
#define SgemmKernelFrame_lda 44
#define SgemmKernelFrame_ldc 48
#define SgemmKernelFrame_alpha 52
.text
/*++
Macro Description:
This macro multiplies and accumulates for a 16xN block (where N is 1,2)
of the output matrix.
Arguments:
Count - Supplies the number of rows to access from matrix A.
VectorOffset - Supplies the byte offset from matrix B to fetch elements.
BroadcastOffset - Supplies the byte offset from matrix A to fetch elements.
Implicit Arguments:
ebx - Supplies the length in bytes of a row from matrix A.
ecx - Supplies the address into the matrix A data.
edx - Supplies the address into the matrix B data.
ymm4-ymm7 - Supplies the block accumulators.
--*/
.macro ComputeBlockAvxBy16 Count, VectorOffset, BroadcastOffset
.if \Count\() == 1
vbroadcastss ymm3,DWORD PTR [ecx+\BroadcastOffset\()]
vmulps ymm1,ymm3,YMMWORD PTR [edx+\VectorOffset\()]
vaddps ymm4,ymm1,ymm4
vmulps ymm3,ymm3,YMMWORD PTR [edx+\VectorOffset\()+32]
vaddps ymm5,ymm3,ymm5
.else
vmovaps ymm0,YMMWORD PTR [edx+\VectorOffset\()]
vmovaps ymm1,YMMWORD PTR [edx+\VectorOffset\()+32]
vbroadcastss ymm3,DWORD PTR [ecx+\BroadcastOffset\()]
vmulps ymm2,ymm3,ymm0
vaddps ymm4,ymm2,ymm4
vmulps ymm2,ymm3,ymm1
vaddps ymm5,ymm2,ymm5
vbroadcastss ymm3,DWORD PTR [ecx+ebx+\BroadcastOffset\()]
vmulps ymm2,ymm3,ymm0
vaddps ymm6,ymm2,ymm6
vmulps ymm2,ymm3,ymm1
vaddps ymm7,ymm2,ymm7
.endif
.endm
/*++
Macro Description:
This macro multiplies and accumulates for a 8xN block (where N is 1,2)
of the output matrix.
Arguments:
Count - Supplies the number of rows to access from matrix A.
VectorOffset - Supplies the byte offset from matrix B to fetch elements.
BroadcastOffset - Supplies the byte offset from matrix A to fetch elements.
Implicit Arguments:
ebx - Supplies the length in bytes of a row from matrix A.
ecx - Supplies the address into the matrix A data.
edx - Supplies the address into the matrix B data.
ymm4-ymm7 - Supplies the block accumulators.
--*/
.macro ComputeBlockAvxBy8 Count, VectorOffset, BroadcastOffset
.if \Count\() == 1
vbroadcastss ymm3,DWORD PTR [ecx+\BroadcastOffset\()]
vmulps ymm3,ymm3,YMMWORD PTR [edx+\VectorOffset\()]
vaddps ymm5,ymm3,ymm5
.else
vmovaps ymm0,YMMWORD PTR [edx+\VectorOffset\()]
vbroadcastss ymm3,DWORD PTR [ecx+\BroadcastOffset\()]
vmulps ymm3,ymm3,ymm0
vaddps ymm5,ymm3,ymm5
vbroadcastss ymm3,DWORD PTR [ecx+ebx+\BroadcastOffset\()]
vmulps ymm3,ymm3,ymm0
vaddps ymm7,ymm3,ymm7
.endif
.endm
/*++
Macro Description:
This macro generates code to execute the block compute macro multiple
times and advancing the matrix A and matrix B data pointers.
Arguments:
ComputeBlock - Supplies the macro to compute a single block.
Count - Supplies the number of rows to access from matrix A.
Implicit Arguments:
ebx - Supplies the number of bytes to the next row of matrix A.
ecx - Supplies the address into the matrix A data.
edx - Supplies the address into the matrix B data.
edi - Supplies the number of columns from matrix A and the number of rows
from matrix B to iterate over.
ymm4-ymm7 - Supplies the block accumulators.
--*/
.macro ComputeBlockAvxLoop Mode, ComputeBlock, Count
sub edi,4
jb .L\Mode\().\ComputeBlock\().\Count\().ProcessRemainingBlocks
.L\Mode\().\ComputeBlock\().\Count\().ComputeBlockBy4Loop:
\ComputeBlock\() \Count\(), 0, 0
\ComputeBlock\() \Count\(), 16*4, 4
sub edx,-32*4 # advance matrix B by 32 columns
\ComputeBlock\() \Count\(), 0, 8
\ComputeBlock\() \Count\(), 16*4, 12
sub edx,-32*4 # advance matrix B by 32 columns
add ecx,4*4 # advance matrix A by 4 columns
sub edi,4
jae .L\Mode\().\ComputeBlock\().\Count\().ComputeBlockBy4Loop
.L\Mode\().\ComputeBlock\().\Count\().ProcessRemainingBlocks:
add edi,4 # correct for over-subtract above
jz .L\Mode\().\ComputeBlock\().\Count\().OutputBlock
.L\Mode\().\ComputeBlock\().\Count\().ComputeBlockBy1Loop:
\ComputeBlock\() \Count\(), 0, 0
add edx,16*4 # advance matrix B by 16 columns
add ecx,4 # advance matrix A by 1 column
dec edi
jne .L\Mode\().\ComputeBlock\().\Count\().ComputeBlockBy1Loop
.L\Mode\().\ComputeBlock\().\Count\().OutputBlock:
.endm
/*++
Routine Description:
This routine is an inner kernel to compute matrix multiplication for a
set of rows.
Arguments:
A - Supplies the address of matrix A.
B - Supplies the address of matrix B. The matrix data has been packed using
MlasSgemmCopyPackB or MlasSgemmTransposePackB.
C - Supplies the address of matrix C.
CountK - Supplies the number of columns from matrix A and the number of
rows from matrix B to iterate over.
CountM - Supplies the maximum number of rows that can be processed for
matrix A and matrix C. The actual number of rows handled for this
invocation depends on the kernel implementation.
CountN - Supplies the number of columns from matrix B and matrix C to
iterate over.
lda - Supplies the first dimension of matrix A.
ldc - Supplies the first dimension of matrix C.
Alpha - Supplies the scaler multiplier (see SGEMM definition).
Return Value:
Returns the number of rows handled.
--*/
.macro SgemmKernelAvxFunction Mode
.globl C_UNDERSCORE(MlasSgemmKernel\Mode\()Avx)
C_UNDERSCORE(MlasSgemmKernel\Mode\()Avx):
push ebp
push ebx
push esi
push edi
mov edx,SgemmKernelFrame_MatrixB[esp]
mov esi,SgemmKernelFrame_MatrixC[esp]
mov ebp,SgemmKernelFrame_CountN[esp]
//
// Process 2 rows of the matrices.
//
cmp DWORD PTR SgemmKernelFrame_CountM[esp],2
jb .L\Mode\().ProcessCountMLessThan2
mov BYTE PTR SgemmKernelFrame_CountM[esp],2
mov eax,SgemmKernelFrame_ldc[esp]
mov ebx,SgemmKernelFrame_lda[esp]
shl eax,2 # convert ldc to bytes
shl ebx,2 # convert lda to bytes
cmp ebp,8
jbe .L\Mode\().ProcessRemainingCountN2
.L\Mode\().ProcessNextColumnLoop16x2:
mov edi,SgemmKernelFrame_CountK[esp]
mov ecx,SgemmKernelFrame_MatrixA[esp]
vxorps xmm4,xmm4,xmm4 # clear block accumulators
vxorps xmm5,xmm5,xmm5
vxorps xmm6,xmm6,xmm6
vxorps xmm7,xmm7,xmm7
ComputeBlockAvxLoop \Mode\(), ComputeBlockAvxBy16, 2
vbroadcastss ymm2,DWORD PTR SgemmKernelFrame_alpha[esp]
vmulps ymm4,ymm4,ymm2 # multiply by alpha
vmulps ymm5,ymm5,ymm2
vmulps ymm6,ymm6,ymm2
vmulps ymm7,ymm7,ymm2
sub ebp,16
jb .L\Mode\().OutputMasked16x2Block
.ifeqs "\Mode\()","Add"
vaddps ymm4,ymm4,YMMWORD PTR [esi]
vaddps ymm5,ymm5,YMMWORD PTR [esi+32]
vaddps ymm6,ymm6,YMMWORD PTR [esi+eax]
vaddps ymm7,ymm7,YMMWORD PTR [esi+eax+32]
.endif
vmovups YMMWORD PTR [esi],ymm4
vmovups YMMWORD PTR [esi+32],ymm5
vmovups YMMWORD PTR [esi+eax],ymm6
vmovups YMMWORD PTR [esi+eax+32],ymm7
add esi,16*4 # advance matrix C by 16 columns
cmp ebp,8
ja .L\Mode\().ProcessNextColumnLoop16x2
test ebp,ebp
jz .L\Mode\().ExitKernel
.L\Mode\().ProcessRemainingCountN2:
mov edi,SgemmKernelFrame_CountK[esp]
mov ecx,SgemmKernelFrame_MatrixA[esp]
vxorps xmm5,xmm5,xmm5 # clear block accumulators
vxorps xmm7,xmm7,xmm7
ComputeBlockAvxLoop \Mode\(), ComputeBlockAvxBy8, 2
vbroadcastss ymm2,DWORD PTR SgemmKernelFrame_alpha[esp]
vmulps ymm5,ymm5,ymm2 # multiply by alpha
vmulps ymm7,ymm7,ymm2
cmp ebp,8
jb .L\Mode\().OutputMasked8x2Block
.ifeqs "\Mode\()","Add"
vaddps ymm5,ymm5,YMMWORD PTR [esi]
vaddps ymm7,ymm7,YMMWORD PTR [esi+eax]
.endif
vmovups YMMWORD PTR [esi],ymm5
vmovups YMMWORD PTR [esi+eax],ymm7
//
// Restore non-volatile registers and return.
//
.L\Mode\().ExitKernel:
movzx eax,BYTE PTR SgemmKernelFrame_CountM[esp]
vzeroupper
pop edi
pop esi
pop ebx
pop ebp
ret
.L\Mode\().OutputMasked16x2Block:
.ifeqs "\Mode\()","Add"
vaddps ymm4,ymm4,YMMWORD PTR [esi]
vaddps ymm6,ymm6,YMMWORD PTR [esi+eax]
.endif
vmovups YMMWORD PTR [esi],ymm4
vmovups YMMWORD PTR [esi+eax],ymm6
add esi,8*4 # advance matrix C by 8 columns
add ebp,8 # correct for over-subtract above
.L\Mode\().OutputMasked8x2Block:
call __x86.get_pc_thunk.bx
add ebx,OFFSET _GLOBAL_OFFSET_TABLE_
mov ebx,DWORD PTR C_UNDERSCORE(MlasMaskMoveAvx)@GOT[ebx]
mov SgemmKernelFrame_CountN[esp],ebp
vbroadcastss xmm0,SgemmKernelFrame_CountN[esp]
vpcmpgtd xmm1,xmm0,XMMWORD PTR [ebx+16]
vpcmpgtd xmm0,xmm0,XMMWORD PTR [ebx]
vinsertf128 ymm0,ymm0,xmm1,1
.ifeqs "\Mode\()","Add"
vmaskmovps ymm4,ymm0,YMMWORD PTR [esi]
vmaskmovps ymm6,ymm0,YMMWORD PTR [esi+eax]
vaddps ymm5,ymm5,ymm4
vaddps ymm7,ymm7,ymm6
.endif
vmaskmovps YMMWORD PTR [esi],ymm0,ymm5
vmaskmovps YMMWORD PTR [esi+eax],ymm0,ymm7
jmp .L\Mode\().ExitKernel
//
// Process 1 row of the matrices.
//
.L\Mode\().ProcessCountMLessThan2:
mov BYTE PTR SgemmKernelFrame_CountM[esp],1
mov ebx,SgemmKernelFrame_MatrixA[esp]
vbroadcastss ymm2,DWORD PTR SgemmKernelFrame_alpha[esp]
cmp ebp,8
jbe .L\Mode\().ProcessRemainingCountN1
.L\Mode\().ProcessNextColumnLoop16x1:
mov edi,SgemmKernelFrame_CountK[esp]
mov ecx,ebx # reload matrix A
vxorps xmm4,xmm4,xmm4 # clear block accumulators
vxorps xmm5,xmm5,xmm5
ComputeBlockAvxLoop \Mode\(), ComputeBlockAvxBy16, 1
vmulps ymm4,ymm4,ymm2 # multiply by alpha
vmulps ymm5,ymm5,ymm2
sub ebp,16
jb .L\Mode\().OutputMasked16x1Block
.ifeqs "\Mode\()","Add"
vaddps ymm4,ymm4,YMMWORD PTR [esi]
vaddps ymm5,ymm5,YMMWORD PTR [esi+32]
.endif
vmovups YMMWORD PTR [esi],ymm4
vmovups YMMWORD PTR [esi+32],ymm5
add esi,16*4 # advance matrix C by 16 columns
cmp ebp,8
ja .L\Mode\().ProcessNextColumnLoop16x1
test ebp,ebp
jz .L\Mode\().ExitKernel
.L\Mode\().ProcessRemainingCountN1:
mov edi,SgemmKernelFrame_CountK[esp]
mov ecx,ebx # reload matrix A
vxorps xmm5,xmm5,xmm5 # clear block accumulators
ComputeBlockAvxLoop \Mode\(), ComputeBlockAvxBy8, 1
vmulps ymm5,ymm5,ymm2 # multiply by alpha
cmp ebp,8
jb .L\Mode\().OutputMasked8x1Block
.ifeqs "\Mode\()","Add"
vaddps ymm5,ymm5,YMMWORD PTR [esi]
.endif
vmovups YMMWORD PTR [esi],ymm5
jmp .L\Mode\().ExitKernel
.L\Mode\().OutputMasked16x1Block:
.ifeqs "\Mode\()","Add"
vaddps ymm4,ymm4,YMMWORD PTR [esi]
.endif
vmovups YMMWORD PTR [esi],ymm4
add esi,8*4 # advance matrix C by 8 columns
add ebp,8 # correct for over-subtract above
.L\Mode\().OutputMasked8x1Block:
call __x86.get_pc_thunk.bx
add ebx,OFFSET _GLOBAL_OFFSET_TABLE_
mov ebx,DWORD PTR C_UNDERSCORE(MlasMaskMoveAvx)@GOT[ebx]
mov SgemmKernelFrame_CountN[esp],ebp
vbroadcastss xmm0,SgemmKernelFrame_CountN[esp]
vpcmpgtd xmm1,xmm0,XMMWORD PTR [ebx+16]
vpcmpgtd xmm0,xmm0,XMMWORD PTR [ebx]
vinsertf128 ymm0,ymm0,xmm1,1
.ifeqs "\Mode\()","Add"
vmaskmovps ymm4,ymm0,YMMWORD PTR [esi]
vaddps ymm5,ymm5,ymm4
.endif
vmaskmovps YMMWORD PTR [esi],ymm0,ymm5
jmp .L\Mode\().ExitKernel
.endm
SgemmKernelAvxFunction Zero
SgemmKernelAvxFunction Add
.end

View file

@ -0,0 +1,389 @@
/*++
Copyright (c) Microsoft Corporation. All rights reserved.
Licensed under the MIT License.
Module Name:
SgemmKernelSse2.s
Abstract:
This module implements the kernels for the single precision matrix/matrix
multiply operation (SGEMM).
This implementation uses SSE2 instructions.
--*/
#include "asmmacro.h"
.intel_syntax noprefix
//
// Stack frame layout for the SGEMM kernel.
//
#define SgemmKernelFrame 0
#define SgemmKernelFrame_SavedEsi 4
#define SgemmKernelFrame_SavedEbx 8
#define SgemmKernelFrame_SavedEbp 12
#define SgemmKernelFrame_ReturnAddress 16
#define SgemmKernelFrame_MatrixA 20
#define SgemmKernelFrame_MatrixB 24
#define SgemmKernelFrame_MatrixC 28
#define SgemmKernelFrame_CountK 32
#define SgemmKernelFrame_CountM 36
#define SgemmKernelFrame_CountN 40
#define SgemmKernelFrame_lda 44
#define SgemmKernelFrame_ldc 48
#define SgemmKernelFrame_alpha 52
.text
/*++
Macro Description:
This macro multiplies and accumulates for a Nx1 block (where N is 1,2,3,4)
of the output matrix.
Arguments:
VectorOffset - Supplies the byte offset from matrix B to fetch elements.
Shuffle - Supplies the shuffle mask to extract the element from matrix A.
Implicit Arguments:
ebx - Supplies the length in bytes of a row from matrix A.
ecx - Supplies the address into the matrix A data.
edx - Supplies the address into the matrix B data.
xmm2 - Supplies up to four elements loaded from matrix A.
xmm4-xmm7 - Supplies the block accumulators.
--*/
.macro ComputeBlockSseBy4 VectorOffset, Shuffle
pshufd xmm3,xmm1,\Shuffle\()
movaps xmm0,XMMWORD PTR [edx+\VectorOffset\()]
mulps xmm0,xmm3
addps xmm4,xmm0
movaps xmm0,XMMWORD PTR [edx+\VectorOffset\()+16]
mulps xmm0,xmm3
addps xmm5,xmm0
movaps xmm0,XMMWORD PTR [edx+\VectorOffset\()+32]
mulps xmm0,xmm3
addps xmm6,xmm0
movaps xmm0,XMMWORD PTR [edx+\VectorOffset\()+48]
mulps xmm0,xmm3
addps xmm7,xmm0
.endm
.macro ComputeBlockSseBy3 VectorOffset, Shuffle
pshufd xmm3,xmm1,\Shuffle\()
movaps xmm0,XMMWORD PTR [edx+\VectorOffset\()]
mulps xmm0,xmm3
addps xmm5,xmm0
movaps xmm0,XMMWORD PTR [edx+\VectorOffset\()+16]
mulps xmm0,xmm3
addps xmm6,xmm0
movaps xmm0,XMMWORD PTR [edx+\VectorOffset\()+32]
mulps xmm0,xmm3
addps xmm7,xmm0
.endm
.macro ComputeBlockSseBy2 VectorOffset, Shuffle
pshufd xmm3,xmm1,\Shuffle\()
movaps xmm0,XMMWORD PTR [edx+\VectorOffset\()]
mulps xmm0,xmm3
addps xmm6,xmm0
movaps xmm0,XMMWORD PTR [edx+\VectorOffset\()+16]
mulps xmm0,xmm3
addps xmm7,xmm0
.endm
.macro ComputeBlockSseBy1 VectorOffset, Shuffle
pshufd xmm3,xmm1,\Shuffle\()
movaps xmm0,XMMWORD PTR [edx+\VectorOffset\()]
mulps xmm0,xmm3
addps xmm7,xmm0
.endm
/*++
Macro Description:
This macro generates code to execute the block compute macro multiple
times and advancing the matrix A and matrix B data pointers.
Arguments:
ComputeBlock - Supplies the macro to compute a single block.
Count - Supplies the number of rows to access from matrix A.
Implicit Arguments:
ebx - Supplies the number of bytes to the next row of matrix A.
ecx - Supplies the address into the matrix A data.
edx - Supplies the address into the matrix B data.
edi - Supplies the number of columns from matrix A and the number of rows
from matrix B to iterate over.
xmm4-xmm7 - Supplies the block accumulators.
--*/
.macro ComputeBlockSseLoop Mode, Count
sub edi,4
jb .L\Mode\().\Count\().ProcessRemainingBlocks
.L\Mode\().\Count\().ComputeBlockBy4Loop:
movups xmm1,XMMWORD PTR [ecx]
ComputeBlockSseBy\Count\() 0, 0x00
ComputeBlockSseBy\Count\() 16*4, 0x55
sub edx,-32*4 # advance matrix B by 32 columns
ComputeBlockSseBy\Count\() 0, 0xAA
ComputeBlockSseBy\Count\() 16*4, 0xFF
sub edx,-32*4 # advance matrix B by 32 columns
add ecx,4*4 # advance matrix A by 4 columns
sub edi,4
jae .L\Mode\().\Count\().ComputeBlockBy4Loop
.L\Mode\().\Count\().ProcessRemainingBlocks:
add edi,4 # correct for over-subtract above
jz .L\Mode\().\Count\().OutputBlock
.L\Mode\().\Count\().ComputeBlockBy1Loop:
movss xmm1,DWORD PTR [ecx]
ComputeBlockSseBy\Count\() 0, 0x00
add edx,16*4 # advance matrix B by 16 columns
add ecx,4 # advance matrix A by 1 column
dec edi
jne .L\Mode\().\Count\().ComputeBlockBy1Loop
.L\Mode\().\Count\().OutputBlock:
.endm
/*++
Routine Description:
This routine is an inner kernel to compute matrix multiplication for a
set of rows.
Arguments:
A - Supplies the address of matrix A.
B - Supplies the address of matrix B. The matrix data has been packed using
MlasSgemmCopyPackB or MlasSgemmTransposePackB.
C - Supplies the address of matrix C.
CountK - Supplies the number of columns from matrix A and the number of
rows from matrix B to iterate over.
CountM - Supplies the maximum number of rows that can be processed for
matrix A and matrix C. The actual number of rows handled for this
invocation depends on the kernel implementation.
CountN - Supplies the number of columns from matrix B and matrix C to
iterate over.
lda - Supplies the first dimension of matrix A.
ldc - Supplies the first dimension of matrix C.
Alpha - Supplies the scaler multiplier (see SGEMM definition).
Return Value:
Returns the number of rows handled.
--*/
.macro SgemmKernelSseFunction Mode
.globl C_UNDERSCORE(MlasSgemmKernel\Mode\()Sse)
C_UNDERSCORE(MlasSgemmKernel\Mode\()Sse):
push ebp
push ebx
push esi
push edi
mov edx,SgemmKernelFrame_MatrixB[esp]
mov esi,SgemmKernelFrame_MatrixC[esp]
mov ebp,SgemmKernelFrame_CountN[esp]
//
// Process 1 row of the matrices.
//
mov eax,SgemmKernelFrame_CountK[esp]
mov ebx,SgemmKernelFrame_MatrixA[esp]
cmp ebp,12
jbe .L\Mode\().ProcessRemainingCountN
.L\Mode\().ProcessNextColumnLoop16x1:
mov edi,eax # reload CountK
mov ecx,ebx # reload matrix A
xorps xmm4,xmm4 # clear block accumulators
xorps xmm5,xmm5
xorps xmm6,xmm6
xorps xmm7,xmm7
ComputeBlockSseLoop \Mode\(), 4
movss xmm2,DWORD PTR SgemmKernelFrame_alpha[esp]
shufps xmm2,xmm2,0
mulps xmm4,xmm2 # multiply by alpha
mulps xmm5,xmm2
mulps xmm6,xmm2
mulps xmm7,xmm2
.ifeqs "\Mode\()","Add"
movups xmm0,XMMWORD PTR [esi]
movups xmm1,XMMWORD PTR [esi+16]
movups xmm2,XMMWORD PTR [esi+32]
addps xmm4,xmm0
addps xmm5,xmm1
addps xmm6,xmm2
.endif
movups XMMWORD PTR [esi],xmm4
movups XMMWORD PTR [esi+16],xmm5
movups XMMWORD PTR [esi+32],xmm6
sub ebp,16
jb .L\Mode\().OutputMasked16x1Block
.ifeqs "\Mode\()","Add"
movups xmm3,XMMWORD PTR [esi+48]
addps xmm7,xmm3
.endif
movups XMMWORD PTR [esi+48],xmm7
add esi,16*4 # advance matrix C by 16 columns
cmp ebp,12
ja .L\Mode\().ProcessNextColumnLoop16x1
test ebp,ebp
jnz .L\Mode\().ProcessRemainingCountN
//
// Restore non-volatile registers and return.
//
.L\Mode\().ExitKernel:
mov eax,1 # return 1 row handled
pop edi
pop esi
pop ebx
pop ebp
ret
//
// Process the remaining 1 to 12 columns of the matrices.
//
.L\Mode\().ProcessRemainingCountN:
mov edi,eax # reload CountK
mov ecx,ebx # reload matrix A
movss xmm4,DWORD PTR SgemmKernelFrame_alpha[esp]
shufps xmm4,xmm4,0
xorps xmm5,xmm5 # clear block accumulators
xorps xmm6,xmm6
xorps xmm7,xmm7
cmp ebp,4
jbe .L\Mode\().ProcessRemainingCountN4OrLess
cmp ebp,8
jbe .L\Mode\().ProcessRemainingCountN8OrLess
.L\Mode\().ProcessRemainingCountN12OrLess:
ComputeBlockSseLoop \Mode\(), 3
mulps xmm5,xmm4 # multiply by alpha
mulps xmm6,xmm4
mulps xmm7,xmm4
.ifeqs "\Mode\()","Add"
movups xmm0,XMMWORD PTR [esi]
movups xmm1,XMMWORD PTR [esi+16]
addps xmm5,xmm0
addps xmm6,xmm1
.endif
movups XMMWORD PTR [esi],xmm5
movups XMMWORD PTR [esi+16],xmm6
add esi,8*4 # advance matrix C by 8 columns
jmp .L\Mode\().OutputTrailingBlock
.L\Mode\().ProcessRemainingCountN8OrLess:
ComputeBlockSseLoop \Mode\(), 2
mulps xmm6,xmm4 # multiply by alpha
mulps xmm7,xmm4
.ifeqs "\Mode\()","Add"
movups xmm0,XMMWORD PTR [esi]
addps xmm6,xmm0
.endif
movups XMMWORD PTR [esi],xmm6
add esi,4*4 # advance matrix C by 4 columns
jmp .L\Mode\().OutputTrailingBlock
.L\Mode\().ProcessRemainingCountN4OrLess:
ComputeBlockSseLoop \Mode\(), 1
mulps xmm7,xmm4 # multiply by alpha
jmp .L\Mode\().OutputTrailingBlock
.L\Mode\().OutputMasked16x1Block:
add esi,12*4 # advance matrix C by 12 columns
.L\Mode\().OutputTrailingBlock:
test ebp,3
jz .L\Mode\().OutputTrailingBlock4Elements
test ebp,2
jz .L\Mode\().OutputTrailingBlock1Element
.L\Mode\().OutputTrailingBlock2Elements:
.ifeqs "\Mode\()","Add"
movsd xmm0,QWORD PTR [esi]
addps xmm7,xmm0
.endif
movsd QWORD PTR [esi],xmm7
test ebp,1
jz .L\Mode\().ExitKernel
shufps xmm7,xmm7,0xAA # shuffle third float down
add esi,2*4 # advance matrix C by 2 columns
.L\Mode\().OutputTrailingBlock1Element:
.ifeqs "\Mode\()","Add"
movss xmm0,DWORD PTR [esi]
addss xmm7,xmm0
.endif
movss DWORD PTR [esi],xmm7
jmp .L\Mode\().ExitKernel
.L\Mode\().OutputTrailingBlock4Elements:
.ifeqs "\Mode\()","Add"
movups xmm0,XMMWORD PTR [esi]
addps xmm7,xmm0
.endif
movups XMMWORD PTR [esi],xmm7
jmp .L\Mode\().ExitKernel
.endm
SgemmKernelSseFunction Zero
SgemmKernelSseFunction Add
.end

View file

@ -0,0 +1,21 @@
/*++
Copyright (c) Microsoft Corporation. All rights reserved.
Licensed under the MIT License.
Module Name:
asmmacro.h
Abstract:
This module implements common macros for the assembly modules.
--*/
#if defined(__APPLE__)
#define C_UNDERSCORE(symbol) _##symbol
#else
#define C_UNDERSCORE(symbol) symbol
#endif

View file

@ -350,6 +350,17 @@ int real_main(int argc, char* argv[], OrtEnv** p_env) {
broken_tests["tf_nasnet_large"] = "failed: bad allocation";
broken_tests["tf_pnasnet_large"] = "failed: bad allocation";
#endif
#ifdef __GNUG__
#ifndef __LP64__
broken_tests["nonzero_example"] = "failed: type mismatch";
broken_tests["tf_resnet_v2_152"] = "failed: type mismatch";
broken_tests["tf_nasnet_large"] = "failed: bad allocation";
broken_tests["tf_resnet_v1_152"] = "failed: type mismatch";
broken_tests["tf_resnet_v2_101"] = "failed: type mismatch";
broken_tests["tf_pnasnet_large"] = "failed: bad allocation";
#endif
#endif
int result = 0;

View file

@ -8,6 +8,7 @@ import onnx.backend.test
import numpy as np
import onnxruntime.backend as c2
import platform
pytest_plugins = 'onnx.backend.test.report',
@ -79,6 +80,9 @@ backend_test.exclude(r'('
'|^test_operator_params_cpu.*'
'|^test_operator_pow_cpu.*'
'|^test_shrink_cpu.*'
'|^test_vgg19_cpu.*'
'|^test_zfnet512_cpu.*'
'|^test_gru_seq_length_cpu.*'
')')
# import all test cases at global scope to make

View file

@ -308,6 +308,7 @@ def generate_build_tree(cmake_path, source_dir, build_dir, cuda_home, cudnn_home
"-Donnxruntime_USE_TRT=" + ("ON" if args.use_trt else "OFF"),
# By default - we currently support only cross compiling for ARM/ARM64 (no native compilation supported through this script)
"-Donnxruntime_CROSS_COMPILING=" + ("ON" if args.arm64 or args.arm else "OFF"),
"-Donnxruntime_BUILD_x86=" + ("ON" if args.x86 else "OFF"),
]
if args.use_brainslice:
bs_pkg_name = args.brain_slice_package_name.split('.', 1)

View file

@ -1,25 +1,10 @@
jobs:
- job: Linux_C_API_Packaging_CPU_x64
pool: Linux-CPU
steps:
- template: templates/set-test-data-variables-step.yml
- script: 'tools/ci_build/github/linux/run_dockerbuild.sh -o ubuntu16.04 -d cpu -r $(Build.BinariesDirectory) -x " --test_data_url $(TestDataUrl) --test_data_checksum $(TestDataChecksum)"'
displayName: 'Build and Test Linux on Docker'
- template: templates/c-api-artifacts-package-and-publish-steps-posix.yml
parameters:
buildConfig: 'Release'
artifactName: 'onnxruntime-linux-x64'
libraryName: 'libonnxruntime.so'
- template: templates/clean-agent-build-directory-step.yml
- job: Linux_C_API_Packaging_CPU_x86
pool: Linux-CPU
steps:
- template: templates/set-test-data-variables-step.yml
- script: 'tools/ci_build/github/linux/run_dockerbuild.sh -o ubuntu16.04 -d cpu -r $(Build.BinariesDirectory) -x " --x86 --test_data_url $(TestDataUrl) --test_data_checksum $(TestDataChecksum)"'
- script: 'tools/ci_build/github/linux/run_dockerbuild.sh -o ubuntu16.04 -d cpu -r $(Build.BinariesDirectory) -a x86 -x " --x86 --test_data_url $(TestDataUrl) --test_data_checksum $(TestDataChecksum)"'
displayName: 'Build and Test Linux on Docker'
- template: templates/c-api-artifacts-package-and-publish-steps-posix.yml
parameters:
@ -27,153 +12,3 @@ jobs:
artifactName: 'onnxruntime-linux-x86'
libraryName: 'libonnxruntime.so'
- template: templates/clean-agent-build-directory-step.yml
- job: Linux_C_API_Packaging_GPU_x64
pool: Linux-GPU
steps:
- template: templates/set-test-data-variables-step.yml
- script: 'tools/ci_build/github/linux/run_dockerbuild.sh -o ubuntu16.04 -d gpu -c cuda9.1-cudnn7.1 -r $(Build.BinariesDirectory) -x " --test_data_url $(TestDataUrl) --test_data_checksum $(TestDataChecksum)"'
displayName: 'Build and Test Linux on Docker'
- template: templates/c-api-artifacts-package-and-publish-steps-posix.yml
parameters:
buildConfig: 'Release'
artifactName: 'onnxruntime-linux-x64-gpu'
libraryName: 'libonnxruntime.so'
- template: templates/clean-agent-build-directory-step.yml
- job: MacOS_C_API_Packaging_CPU_x64
pool:
vmImage: 'macOS-10.13'
steps:
- template: templates/set-test-data-variables-step.yml
- script: |
sudo xcode-select --switch /Applications/Xcode_10.app/Contents/Developer
python3 $(Build.SourcesDirectory)/tools/ci_build/build.py --use_openmp --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --parallel --build_shared_lib --config RelWithDebInfo --enable_onnx_tests --test_data_url $(TestDataUrl) --test_data_checksum $(TestDataChecksum)
displayName: 'Build and Test MacOS'
- template: templates/c-api-artifacts-package-and-publish-steps-posix.yml
parameters:
buildConfig: 'RelWithDebInfo'
artifactName: 'onnxruntime-osx-x64'
libraryName: 'libonnxruntime.dylib'
- template: templates/clean-agent-build-directory-step.yml
- job: Windows_Packaging_CPU_x64
pool: 'Win-CPU'
variables:
buildDirectory: '$(Build.BinariesDirectory)'
buildConfig: 'RelWithDebInfo'
buildArch: 'x64'
steps:
- template: templates/set-test-data-variables-step.yml
- template: templates/windows-build-tools-setup-steps.yml
parameters:
EnvSetupScript: 'setup_env.bat'
buildArch: 'amd64' # amd64 is needed for vcvars target arch
setVcvars: false
- template: templates/windows-build-and-test-steps.yml
parameters:
buildAdditionalParams: ' --use_openmp '
buildArch: $(buildArch)
msbuildPlatform: $(buildArch)
buildConfig: $(buildConfig)
- template: templates/c-api-artifacts-package-and-publish-steps-windows.yml
parameters:
buildConfig: $(buildConfig)
artifactName: 'onnxruntime-win-$(buildArch)'
- template: templates/clean-agent-build-directory-step.yml
- job: Windows_Packaging_CPU_x86
pool: 'Win-CPU'
variables:
buildDirectory: '$(Build.BinariesDirectory)'
buildConfig: 'RelWithDebInfo'
buildArch: 'x86'
steps:
- template: templates/set-test-data-variables-step.yml
- template: templates/windows-build-tools-setup-steps.yml
parameters:
EnvSetupScript: 'setup_env.bat'
buildArch: $(buildArch)
setVcVars: false
- template: templates/windows-build-and-test-steps.yml
parameters:
buildAdditionalParams: ' --use_openmp --x86 '
buildArch: $(buildArch)
msbuildPlatform: 'Win32'
buildConfig: $(buildConfig)
- template: templates/c-api-artifacts-package-and-publish-steps-windows.yml
parameters:
buildConfig: $(buildConfig)
artifactName: 'onnxruntime-win-$(buildArch)'
- template: templates/clean-agent-build-directory-step.yml
- job: Windows_Packaging_GPU_x64
pool: 'Win-GPU'
variables:
buildDirectory: '$(Build.BinariesDirectory)'
buildConfig: 'RelWithDebInfo'
buildArch: 'x64'
steps:
- template: templates/set-test-data-variables-step.yml
- task: CmdLine@2
displayName: 'Set CUDA 9.1 path'
inputs:
script: |
set PATH=C:\local\cuda-9.1.85-windows10-x64-0\bin;C:\local\cudnn-9.1-windows10-x64-v7.1\cuda\bin;%PATH%
modifyEnvironment: true
workingDirectory: '$(Build.BinariesDirectory)'
- task: PowerShell@2
displayName: 'Set CUDA 9.1 MSBuild properties'
inputs:
targetType: 'filePath'
filePath: '$(Build.SourcesDirectory)/tools/ci_build/github/windows/set_cuda_path.ps1'
arguments: '-CudaMsbuildPath C:\local\cudaMsbuildIntegration-9.1.85-windows10-x64-0 -CudaVersion 9.1'
- template: templates/windows-build-tools-setup-steps.yml
parameters:
EnvSetupScript: 'setup_env.bat'
buildArch: 'amd64' # amd64 is needed for vcvars target arch
setVcvars: true
- task: CmdLine@2
displayName: 'Build and Test OnnxRuntime'
inputs:
script: |
$(Build.BinariesDirectory)\packages\python\python.exe $(Build.SourcesDirectory)\tools\ci_build\build.py --config $(buildConfig) --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --enable_onnx_tests --test_data_url $(TestDataUrl) --test_data_checksum $(TestDataChecksum) --use_openmp --msvc_toolset=14.11 --use_cuda --cuda_version 9.1 --cuda_home="C:\local\cuda-9.1.85-windows10-x64-0" --cudnn_home="C:\local\cudnn-9.1-windows10-x64-v7.1\cuda"
workingDirectory: '$(Build.BinariesDirectory)'
- template: templates/c-api-artifacts-package-and-publish-steps-windows.yml
parameters:
buildConfig: $(buildConfig)
artifactName: 'onnxruntime-win-gpu-$(buildArch)'
- task: PowerShell@2
displayName: 'Clean up Cuda Path 9.1'
inputs:
targetType: 'filePath'
filePath: '$(Build.SourcesDirectory)/tools/ci_build/github/windows/clean_up_cuda_prop_files.ps1'
arguments: '-CudaVersion 9.1'
- template: templates/clean-agent-build-directory-step.yml

View file

@ -0,0 +1,18 @@
ARG OS_VERSION=16.04
FROM i386/ubuntu:${OS_VERSION}
ARG PYTHON_VERSION=3.5
ADD scripts /tmp/scripts
ENV PATH="/opt/cmake/bin:${PATH}"
RUN /tmp/scripts/install_ubuntu_x86.sh -p ${PYTHON_VERSION} && /tmp/scripts/install_deps_x86.sh && rm -rf /tmp/scripts
WORKDIR /root
ENV LD_LIBRARY_PATH /usr/local/openblas/lib:$LD_LIBRARY_PATH
ARG BUILD_UID=1000
ARG BUILD_USER=onnxruntimedev
WORKDIR /home/$BUILD_USER
RUN adduser --gecos 'onnxruntime Build User' --disabled-password $BUILD_USER --uid $BUILD_UID
USER $BUILD_USER

View file

@ -0,0 +1,62 @@
#!/bin/bash
set -e
aria2c -q -d /tmp/src https://github.com/Kitware/CMake/releases/download/v3.12.4/cmake-3.12.4.tar.gz
tar -xf /tmp/src/cmake-3.12.4.tar.gz -C /tmp/src
cd /tmp/src/cmake-3.12.4
./configure
make
make install
aria2c -q -d /tmp/src https://github.com/protocolbuffers/protobuf/archive/v3.6.1.tar.gz
tar -xf /tmp/src/protobuf-3.6.1.tar.gz -C /tmp/src
cd /tmp/src/protobuf-3.6.1
if [ -f /etc/redhat-release ] ; then
PB_LIBDIR=lib64
else
PB_LIBDIR=lib
fi
for build_type in 'Debug' 'Relwithdebinfo'; do
pushd .
mkdir build_$build_type
cd build_$build_type
cmake -G Ninja ../cmake -DCMAKE_INSTALL_PREFIX=/usr -DCMAKE_INSTALL_LIBDIR=$PB_LIBDIR -DCMAKE_INSTALL_SYSCONFDIR=/etc -DCMAKE_POSITION_INDEPENDENT_CODE=ON -Dprotobuf_BUILD_TESTS=OFF -DCMAKE_BUILD_TYPE=$build_type
ninja
ninja install
popd
done
export ONNX_ML=1
INSTALLED_PYTHON_VERSION=$(python3 -c 'import sys; version=sys.version_info[:2]; print("{0}.{1}".format(*version));')
if [ "$INSTALLED_PYTHON_VERSION" = "3.7" ];then
pip3 install --upgrade setuptools
else
#Install ONNX
#5af210ca8a1c73aa6bae8754c9346ec54d0a756e is v1.2.3
#bae6333e149a59a3faa9c4d9c44974373dcf5256 is v1.3.0
#9e55ace55aad1ada27516038dfbdc66a8a0763db is v1.4.1
#873ddbbc33c6e54d90c5628387edd391fb651dfc is v1.4.1 latest
for onnx_version in "5af210ca8a1c73aa6bae8754c9346ec54d0a756e" "bae6333e149a59a3faa9c4d9c44974373dcf5256" "9e55ace55aad1ada27516038dfbdc66a8a0763db" "873ddbbc33c6e54d90c5628387edd391fb651dfc"; do
if [ -z ${lastest_onnx_version+x} ]; then
echo "first pass";
else
echo "deleting old onnx-${lastest_onnx_version}";
pip3 uninstall -y onnx
fi
lastest_onnx_version=$onnx_version
aria2c -q -d /tmp/src https://github.com/onnx/onnx/archive/$onnx_version.tar.gz
tar -xf /tmp/src/onnx-$onnx_version.tar.gz -C /tmp/src
cd /tmp/src/onnx-$onnx_version
git clone https://github.com/pybind/pybind11.git third_party/pybind11
python3 setup.py bdist_wheel
pip3 install onnx
mkdir -p /data/onnx/$onnx_version
backend-test-tools generate-data -o /data/onnx/$onnx_version
done
fi
#The last onnx version will be kept
aria2c -q -d /tmp/src http://bitbucket.org/eigen/eigen/get/3.3.7.tar.bz2
tar -jxf /tmp/src/eigen-eigen-323c052e1731.tar.bz2 -C /usr/include
mv /usr/include/eigen-eigen-323c052e1731 /usr/include/eigen3
rm -rf /tmp/src

View file

@ -0,0 +1,59 @@
#!/bin/bash
set -e
while getopts p: parameter_Option
do case "${parameter_Option}"
in
p) PYTHON_VER=${OPTARG};;
esac
done
PYTHON_VER=${PYTHON_VER:=3.5}
DEBIAN_FRONTEND=noninteractive
apt-get update && apt-get install -y software-properties-common
add-apt-repository ppa:deadsnakes/ppa
apt-get update && apt-get install -y --no-install-recommends \
autotools-dev \
build-essential \
git apt-transport-https \
ca-certificates \
pkg-config \
wget \
zlib1g \
zlib1g-dev \
libssl-dev \
curl \
autoconf \
sudo \
gfortran \
python3-dev \
language-pack-en \
libopenblas-dev \
liblttng-ust0 \
libcurl3 \
libssl1.0.0 \
libkrb5-3 \
libicu55 \
aria2 \
bzip2 \
unzip \
zip \
rsync libunwind8 libpng16-dev \
python3-setuptools python3-numpy python3-wheel python python3-pip python3-pytest \
re2c \
ninja-build
locale-gen en_US.UTF-8
update-locale LANG=en_US.UTF-8
if [ $PYTHON_VER!="3.5" ]; then
apt-get install -y --no-install-recommends \
python${PYTHON_VER} \
python${PYTHON_VER}-dev
update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VER} 1
update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.5 2
update-alternatives --set python3 /usr/bin/python${PYTHON_VER}
fi
/usr/bin/python${PYTHON_VER} -m pip install --upgrade --force-reinstall numpy==1.15.0
rm -rf /var/lib/apt/lists/*

View file

@ -5,7 +5,7 @@ SCRIPT_DIR="$( dirname "${BASH_SOURCE[0]}" )"
SOURCE_ROOT=$(realpath $SCRIPT_DIR/../../../../)
CUDA_VER=cuda10.0-cudnn7.3
while getopts c:o:d:r:p:x: parameter_Option
while getopts c:o:d:r:p:x:a: parameter_Option
do case "${parameter_Option}"
in
#ubuntu16.04
@ -19,6 +19,7 @@ p) PYTHON_VER=${OPTARG};;
x) BUILD_EXTR_PAR=${OPTARG};;
# "cuda10.0-cudnn7.3, cuda9.1-cudnn7.1"
c) CUDA_VER=${OPTARG};;
a) BUILD_ARCH=${OPTARG};;
esac
done
@ -36,7 +37,11 @@ if [ $BUILD_DEVICE = "gpu" ]; then
docker build -t "onnxruntime-$IMAGE" --build-arg BUILD_USER=onnxruntimedev --build-arg BUILD_UID=$(id -u) --build-arg PYTHON_VERSION=${PYTHON_VER} -f $DOCKER_FILE .
else
IMAGE="ubuntu16.04"
docker build -t "onnxruntime-$IMAGE" --build-arg BUILD_USER=onnxruntimedev --build-arg BUILD_UID=$(id -u) --build-arg OS_VERSION=16.04 --build-arg PYTHON_VERSION=${PYTHON_VER} -f Dockerfile.ubuntu .
if [ $BUILD_ARCH = "x86" ]; then
docker build -t "onnxruntime-$IMAGE" --build-arg BUILD_USER=onnxruntimedev --build-arg BUILD_UID=$(id -u) --build-arg OS_VERSION=16.04 --build-arg PYTHON_VERSION=${PYTHON_VER} -f Dockerfile.ubuntu_x86 .
else
docker build -t "onnxruntime-$IMAGE" --build-arg BUILD_USER=onnxruntimedev --build-arg BUILD_UID=$(id -u) --build-arg OS_VERSION=16.04 --build-arg PYTHON_VERSION=${PYTHON_VER} -f Dockerfile.ubuntu .
fi
fi
set +e