Hard-code input types for DropoutGrad on BERT

2026-07-17 18:40:28 +00:00 · 2020-11-03 14:40:32 -08:00 · 2020-11-03 14:40:32 -08:00 · d4917f2d65
commit d4917f2d65
parent 3b267d1d60
3 changed files with 54 additions and 22 deletions
--- a/orttraining/orttraining/python/training/ortmodule.py
+++ b/orttraining/orttraining/python/training/ortmodule.py
@ -616,7 +616,12 @@ class ORTModule(torch.nn.Module):
        for input in backward_graph_inputs:
            if input in forward_graph_outputs:
                # inputs of backward graph that are also outputs from forward graph need to be added to backward graph input
-                add_input(backward_model, input, tensor_elem_types[input] if input in tensor_elem_types else 1)
+                # TODO: thiagofc: BERT: Remove this once graph splitter can handle unspecified optional input (without type)
+                input_type = tensor_elem_types[input] if input in tensor_elem_types else 1
+                if input in {'1835', '1813', '1781','1760', '1683','1651','1630','1553','1521','1500','1423','1391','1370','1293','1261','1240','1163','1131','1110','1033','1001','980','871',
+                             '267','330','351','383','460','481','513','590','611','643','720','741','773','850','903'}:
+                    input_type = 9
+                add_input(backward_model, input, input_type)
            elif input in forward_graph_initializer_names:
                # inputs from forward graph initializers need to be added to backward graph input
                add_input_from_initializer(backward_model, initializers[input])
--- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_basic.py
+++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_basic.py
@ -136,6 +136,10 @@ def main():
        print('Training MNIST on ORTModule....')
        model = ORTModule(model)

+        # TODO: change it to False to stop saving ONNX models
+        model._save_onnx = True
+        model._save_onnx_prefix = 'MNIST'
+
        # Set log level
        numeric_level = getattr(logging, args.log_level.upper(), None)
        if not isinstance(numeric_level, int):
--- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier.py
+++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier.py
@ -1,3 +1,7 @@
+
+import pdb
+
+import argparse
 import torch
 import wget
 import os
@ -18,17 +22,28 @@ import datetime
 import onnxruntime
 from onnxruntime.training import ORTModule

-# 1. Device setup
-# TODO: Hard-coding for CPU for ORTModule

-# if torch.cuda.is_available():
-#     device = torch.device("cuda")
-#     print('There are %d GPU(s) available.' % torch.cuda.device_count())
-#     print('We will use the GPU:', torch.cuda.get_device_name(0))
-# else:
-#     print('No GPU available, using the CPU instead.')
-#     device = torch.device("cpu")
-device = torch.device("cpu")
+# 0. Common stuff
+parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
+parser.add_argument('--pytorch-only', action='store_true', default=False,
+                    help='disables ONNX Runtime training')
+parser.add_argument('--view-graphs', action='store_true', default=False,
+                    help='views forward and backward graphs')
+parser.add_argument('--no-cuda', action='store_true', default=False,
+                    help='disables CUDA training')
+parser.add_argument('--epochs', type=int, default=4, metavar='N',
+                        help='number of epochs to train (default: 4)')
+args = parser.parse_args()
+
+
+# 1. Device setup
+if torch.cuda.is_available() and not args.no_cuda:
+    device = torch.device("cuda")
+    print('There are %d GPU(s) available.' % torch.cuda.device_count())
+    print('We will use the GPU:', torch.cuda.get_device_name(0))
+else:
+    print('No GPU available, using the CPU instead.')
+    device = torch.device("cpu")

 # 2. Loading CoLA Dataset
 print('Downloading dataset...')
@ -141,15 +156,17 @@ model = BertForSequenceClassification.from_pretrained(
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
 )
-model = ORTModule(model)
+
+if not args.pytorch_only:
+    model = ORTModule(model)

 # TODO: change it to False to stop saving ONNX models
 model._save_onnx = True
 model._save_onnx_prefix = 'BertForSequenceClassification'

 # Tell pytorch to run this model on the GPU.
-# TODO: Hard coding it to CPU for ORTModule
-# model.cuda()
+if torch.cuda.is_available() and not args.no_cuda:
+    model.cuda()

 # Note: AdamW is a class from the huggingface library (as opposed to pytorch)
 optimizer = AdamW(model.parameters(),
@ -157,11 +174,9 @@ optimizer = AdamW(model.parameters(),
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )

-# Number of training epochs (authors recommend between 2 and 4)
-epochs = 4
-
+# Authors recommend between 2 and 4 epochs
 # Total number of training steps is number of batches * number of epochs.
-total_steps = len(train_dataloader) * epochs
+total_steps = len(train_dataloader) * args.epochs

 # Create the learning rate scheduler.
 scheduler = get_linear_schedule_with_warmup(optimizer,
@ -193,8 +208,8 @@ random.seed(seed_val)
 np.random.seed(seed_val)
 torch.manual_seed(seed_val)
 onnxruntime.set_seed(seed_val)
-# TODO: We are not using CUDA for ORTModule just yet
-# torch.cuda.manual_seed_all(seed_val)
+if torch.cuda.is_available() and not args.no_cuda:
+    torch.cuda.manual_seed_all(seed_val)

 # Store the average loss after each epoch so we can plot them.
 loss_values = []
@ -202,10 +217,10 @@ loss_values = []
 # ========================================
 #               Training
 # ========================================
-for epoch_i in range(0, epochs):
+for epoch_i in range(0, args.epochs):
    # Perform one full pass over the training set.
    print("")
-    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
+    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, args.epochs))

    # Measure how long the training epoch takes.
    t0 = time.time()
@ -259,14 +274,22 @@ for epoch_i in range(0, epochs):
                    attention_mask=b_input_mask,
                    labels=b_labels)

+        if args.view_graphs:
+            import torchviz
+            pytorch_backward_graph = torchviz.make_dot(outputs[0], params=dict(list(model.named_parameters())))
+            pytorch_backward_graph.view()
+
        # The call to `model` always returns a tuple, so we need to pull the
        # loss value out of the tuple.
+        # pdb.set_trace()
        loss = outputs[0]

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value
        # from the tensor.
+        print(loss.shape)
+        print(loss)
        total_loss += loss.item()
        # total_loss += loss