mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-22 22:01:08 +00:00
Simple eager training models (#12180)
* Simple NN using ort, and added or modified ort op support.
This commit is contained in:
parent
fafb24142f
commit
89bf6c9b5d
4 changed files with 126 additions and 22 deletions
|
|
@ -138,7 +138,8 @@ hand_implemented = {
|
|||
"aten::addmm": Gemm("mat1", "mat2", "self", alpha="alpha", beta="beta"),
|
||||
"aten::add_.Tensor": SignatureOnly(),
|
||||
"aten::t": Transpose("self"),
|
||||
"aten::mm.out": MatMul("self", "mat2"),
|
||||
# MatMul("self", "mat2"), fails since it resizes based on self but should be based on result shape of the mult
|
||||
"aten::mm.out": MakeTorchFallback(),
|
||||
"aten::zeros_like": ConstantOfShape(
|
||||
Shape("self")
|
||||
), # the default constant is 0, so we don't need to specify the attribute
|
||||
|
|
@ -169,6 +170,10 @@ hand_implemented = {
|
|||
"aten::argmax.out": SignatureOnly(),
|
||||
"aten::nonzero": Transpose(NonZero("self")),
|
||||
"aten::nonzero.out": SignatureOnly(),
|
||||
"aten::_log_softmax.out": MakeTorchFallback(),
|
||||
"aten::nll_loss_forward.output": MakeTorchFallback(),
|
||||
"aten::nll_loss_backward.grad_input": MakeTorchFallback(),
|
||||
"aten::_log_softmax_backward_data.out": MakeTorchFallback(),
|
||||
}
|
||||
|
||||
# If the aten op expects a specific output type that differs from self
|
||||
|
|
|
|||
|
|
@ -1,26 +1,27 @@
|
|||
## This code is from https://github.com/pytorch/examples/blob/master/mnist/main.py
|
||||
## with modification to do training using onnxruntime as backend on cuda device.
|
||||
## A private PyTorch build from https://aiinfra.visualstudio.com/Lotus/_git/pytorch (ORTTraining branch) is needed to run the demo.
|
||||
## with modification to do training using onnxruntime as backend.
|
||||
|
||||
## Model testing is not complete.
|
||||
# pylint: disable=missing-docstring
|
||||
# pylint: disable=C0103
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
import argparse
|
||||
import torch
|
||||
import onnxruntime_pybind11_state as torch_ort
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import torch.optim as optim
|
||||
from torchvision import datasets, transforms
|
||||
import numpy as np
|
||||
import os
|
||||
|
||||
dataset_root_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data")
|
||||
import onnxruntime_pybind11_state as torch_ort
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from torch import nn, optim
|
||||
from torchvision import datasets, transforms
|
||||
|
||||
# we use the build directory so gitignore applies.
|
||||
dataset_root_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "build/data")
|
||||
|
||||
|
||||
class NeuralNet(nn.Module):
|
||||
def __init__(self, input_size, hidden_size, num_classes):
|
||||
super(NeuralNet, self).__init__()
|
||||
super().__init__()
|
||||
self.fc1 = nn.Linear(input_size, hidden_size)
|
||||
self.relu = nn.ReLU()
|
||||
self.fc2 = nn.Linear(hidden_size, num_classes)
|
||||
|
|
@ -40,9 +41,10 @@ def train_with_eager(args, model, optimizer, device, train_loader, epoch):
|
|||
for batch_idx, (data, target) in enumerate(train_loader):
|
||||
data_cpu = data.reshape(data.shape[0], -1)
|
||||
data = data_cpu.to(device)
|
||||
target_ort = target.to(device)
|
||||
|
||||
x = model(data)
|
||||
loss = my_loss(x.cpu(), target)
|
||||
loss = my_loss(x, target_ort)
|
||||
|
||||
loss.backward()
|
||||
optimizer.step()
|
||||
|
|
@ -70,9 +72,8 @@ def main():
|
|||
parser.add_argument(
|
||||
"--test-batch-size", type=int, default=1000, metavar="N", help="input batch size for testing (default: 1000)"
|
||||
)
|
||||
parser.add_argument("--epochs", type=int, default=10, metavar="N", help="number of epochs to train (default: 10)")
|
||||
parser.add_argument("--epochs", type=int, default=1, metavar="N", help="number of epochs to train (default: 1)")
|
||||
parser.add_argument("--lr", type=float, default=0.01, metavar="LR", help="learning rate (default: 0.01)")
|
||||
parser.add_argument("--no-cuda", action="store_true", default=False, help="disables CUDA training")
|
||||
parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)")
|
||||
parser.add_argument(
|
||||
"--log-interval",
|
||||
|
|
@ -83,7 +84,6 @@ def main():
|
|||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
use_cuda = not args.no_cuda and torch.cuda.is_available()
|
||||
|
||||
torch.manual_seed(args.seed)
|
||||
|
||||
|
|
@ -110,18 +110,18 @@ def main():
|
|||
**kwargs,
|
||||
)
|
||||
|
||||
device = torch.device("ort")
|
||||
device_ort = torch_ort.device()
|
||||
input_size = 784
|
||||
hidden_size = 500
|
||||
num_classes = 10
|
||||
model = NeuralNet(input_size, hidden_size, num_classes)
|
||||
model.to(device)
|
||||
optimizer = optim.SGD(model.parameters(), lr=0.01)
|
||||
model_nn = NeuralNet(input_size, hidden_size, num_classes)
|
||||
model_nn.to(device_ort)
|
||||
optimizer = optim.SGD(model_nn.parameters(), lr=0.01)
|
||||
|
||||
print("\nStart Training.")
|
||||
|
||||
for epoch in range(1, args.epochs + 1):
|
||||
train_with_eager(args, model, optimizer, device, train_loader, epoch)
|
||||
train_with_eager(args, model_nn, optimizer, device_ort, train_loader, epoch)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
99
orttraining/orttraining/eager/test_models/training_test.py
Normal file
99
orttraining/orttraining/eager/test_models/training_test.py
Normal file
|
|
@ -0,0 +1,99 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
# pylint: disable=missing-docstring
|
||||
# pylint: disable=C0103
|
||||
# pylint: disable=R0903
|
||||
|
||||
# The following is a simple neural network trained and tested using FashionMNIST data.
|
||||
# It is using eager mode targeting the ort device. After building eager mode run
|
||||
# PYTHONPATH=~/{repo root}/build/Linux/Debug python ~/{repo root}/orttraining/orttraining/eager/test_models/training_test.py
|
||||
|
||||
import os
|
||||
|
||||
import onnxruntime_pybind11_state as torch_ort
|
||||
import torch
|
||||
from torch import nn
|
||||
from torch.utils.data import DataLoader
|
||||
from torchvision import datasets
|
||||
from torchvision.transforms import ToTensor
|
||||
|
||||
# we copy training data to the build folder as it is gitignored
|
||||
dataset_root_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "build/data")
|
||||
training_data = datasets.FashionMNIST(root=dataset_root_dir, train=True, download=True, transform=ToTensor())
|
||||
test_data = datasets.FashionMNIST(root=dataset_root_dir, train=False, download=True, transform=ToTensor())
|
||||
|
||||
train_dataloader = DataLoader(training_data, batch_size=64)
|
||||
test_dataloader = DataLoader(test_data, batch_size=64)
|
||||
|
||||
device = torch_ort.device()
|
||||
|
||||
|
||||
class NeuralNetwork(nn.Module):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.flatten = nn.Flatten()
|
||||
self.linear_relu_stack = nn.Sequential(
|
||||
nn.Linear(28 * 28, 512),
|
||||
nn.ReLU(),
|
||||
nn.Linear(512, 512),
|
||||
nn.ReLU(),
|
||||
nn.Linear(512, 10),
|
||||
)
|
||||
|
||||
def forward(self, sample):
|
||||
sample = self.flatten(sample)
|
||||
logits = self.linear_relu_stack(sample)
|
||||
return logits
|
||||
|
||||
|
||||
def train_loop(dataloader, model, loss_fn, optimizer):
|
||||
size = len(dataloader.dataset)
|
||||
for batch, (X, y) in enumerate(dataloader):
|
||||
# Compute prediction and loss
|
||||
x_ort = X.to(device)
|
||||
y_ort = y.to(device)
|
||||
pred = model(x_ort)
|
||||
loss = loss_fn(pred, y_ort)
|
||||
|
||||
# Backpropagation
|
||||
optimizer.zero_grad()
|
||||
loss.backward()
|
||||
optimizer.step()
|
||||
|
||||
if batch % 100 == 0:
|
||||
loss, current = loss.item(), batch * len(X)
|
||||
print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")
|
||||
|
||||
|
||||
def test_loop(dataloader, model, loss_fn):
|
||||
size = len(dataloader.dataset)
|
||||
num_batches = len(dataloader)
|
||||
test_loss, correct = 0, 0
|
||||
|
||||
with torch.no_grad():
|
||||
for X, y in dataloader:
|
||||
x_ort = X.to(device)
|
||||
y_ort = y.to(device)
|
||||
pred = model(x_ort)
|
||||
test_loss += loss_fn(pred, y_ort).item()
|
||||
correct += (pred.argmax(1) == y_ort).type(torch.float).sum().item()
|
||||
|
||||
test_loss /= num_batches
|
||||
correct /= size
|
||||
print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
|
||||
|
||||
|
||||
model_nn = NeuralNetwork().to(device)
|
||||
learning_rate = 1e-3
|
||||
|
||||
loss_fn_nn = nn.CrossEntropyLoss().to(device)
|
||||
optimizer_nn = torch.optim.SGD(model_nn.parameters(), lr=learning_rate)
|
||||
|
||||
batch_size = 64
|
||||
epochs = 1
|
||||
for t in range(epochs):
|
||||
print(f"Epoch {t+1}\n-------------------------------")
|
||||
train_loop(train_dataloader, model_nn, loss_fn_nn, optimizer_nn)
|
||||
test_loop(test_dataloader, model_nn, loss_fn_nn)
|
||||
print("Done!")
|
||||
Loading…
Reference in a new issue