mirror of
https://github.com/saymrwulf/pytorch.git
synced 2026-05-15 21:00:47 +00:00
Summary:
Try running this script through `nvprof`:
```py
# Repro script: builds a small conv net with an SGD optimizer on a CUDA
# device and runs it repeatedly, so the memcpy counts can be inspected
# with `nvprof`.
import numpy as np
from caffe2.proto import caffe2_pb2
from caffe2.python import brew, core, optimizer, workspace
from caffe2.python.model_helper import ModelHelper

do = core.DeviceOption(caffe2_pb2.CUDA, 0)
# NOTE(review): the original indentation was lost; the extent of this
# `with` block is reconstructed (model construction and blob feeding are
# assumed to happen inside the device scope) -- confirm against the
# upstream commit message.
with core.DeviceScope(do):
    model = ModelHelper(arg_scope={'order': 'NCHW'})
    conv1 = brew.conv(model, 'data', 'conv1', 1, 20, 5)
    pool1 = brew.max_pool(model, conv1, 'pool1', kernel=2, stride=2)
    conv2 = brew.conv(model, pool1, 'conv2', 20, 50, 5)
    pool2 = brew.max_pool(model, conv2, 'pool2', kernel=2, stride=2)
    fc3 = brew.fc(model, pool2, 'fc3', 50 * 4 * 4, 500)
    fc3 = brew.relu(model, fc3, fc3)
    pred = brew.fc(model, fc3, 'pred', 500, 10)
    softmax, loss = model.SoftmaxWithLoss([pred, 'label'], ['softmax', 'loss'])
    model.AddGradientOperators([loss])
    optimizer.build_sgd(model, 0.01,
                        policy='step', stepsize=1, gamma=0.999,
                        momentum=0.9, nesterov=False)
    # Dummy all-zero inputs: one 1x28x28 image and one integer label.
    workspace.FeedBlob('data', np.zeros((1, 1, 28, 28), dtype=np.float32))
    workspace.FeedBlob('label', np.zeros((1, 1), dtype=np.int32))

workspace.RunNetOnce(model.param_init_net)
workspace.CreateNet(model.net)

# Run the training net 100 times so the per-iteration copies show up
# clearly in the profiler output.
for _ in range(100):
    workspace.RunNet(model.net)
```
Before this change:
```
1.55% 1.4185ms 837 1.6940us 1.6630us 2.4000us [CUDA memcpy HtoD]
0.72% 656.03us 200 3.2800us 3.1350us 3.5840us [CUDA memcpy DtoD]
0.39% 7.1574ms 1034 6.9220us 3.8300us 18.677us cudaMemcpyAsync
0.00% 34.180us 3 11.393us 9.0960us 12.910us cudaMemcpy
```
And after it (look at the third column, the number of calls):
```
0.73% 657.15us 200 3.2850us 3.1040us 3.6160us [CUDA memcpy DtoD]
0.26% 235.07us 137 1.7150us 1.6640us 2.3680us [CUDA memcpy HtoD]
0.20% 3.4493ms 334 10.327us 6.4220us 16.958us cudaMemcpyAsync
0.00% 37.376us 3 12.458us 9.4120us 15.412us cudaMemcpy
```
That makes a pretty big difference in performance. Is there any particular reason you decided to have a separate `LearningRate` op for every parameter in
| | | |
|---|---|---|
| .. | ||
| docs | ||
| examples | ||
| helpers | ||
| layers | ||
| mint | ||
| mkl | ||
| modeling | ||
| models | ||
| operator_test | ||
| predictor | ||
| rnn | ||
| _import_c_extension.py | ||
| attention.py | ||
| brew.py | ||
| brew_test.py | ||
| caffe_translator.py | ||
| caffe_translator_test.py | ||
| checkpoint.py | ||
| checkpoint_test.py | ||
| CMakeLists.txt | ||
| cnn.py | ||
| context.py | ||
| context_test.py | ||
| control.py | ||
| control_test.py | ||
| convnet_benchmarks.py | ||
| convnet_benchmarks_test.py | ||
| core.py | ||
| core_gradients_test.py | ||
| core_test.py | ||
| crf.py | ||
| data_parallel_model.py | ||
| data_parallel_model_test.py | ||
| data_workers.py | ||
| data_workers_test.py | ||
| dataio.py | ||
| dataio_test.py | ||
| dataset.py | ||
| db_test.py | ||
| device_checker.py | ||
| dyndep.py | ||
| empty.so | ||
| experiment_util.py | ||
| extension_loader.py | ||
| gradient_check_test.py | ||
| gradient_checker.py | ||
| gru_cell.py | ||
| hsm_util.py | ||
| hypothesis_test.py | ||
| hypothesis_test_util.py | ||
| layer_model_helper.py | ||
| layer_model_instantiator.py | ||
| layer_test_util.py | ||
| layers_test.py | ||
| load_save_test.py | ||
| lstm_benchmark.py | ||
| memonger.py | ||
| memonger_test.py | ||
| mkl_test_util.py | ||
| model_device_test.py | ||
| model_helper.py | ||
| mpi_python.cc | ||
| muji.py | ||
| muji_test.py | ||
| net_builder.py | ||
| net_builder_test.py | ||
| net_drawer.py | ||
| net_printer.py | ||
| net_printer_test.py | ||
| optimizer.py | ||
| optimizer_context.py | ||
| optimizer_test.py | ||
| optimizer_test_util.py | ||
| parallelize_gpu_bmuf_distributed_test.py | ||
| pipeline.py | ||
| predictor_constants.py | ||
| pybind_state.cc | ||
| pybind_state.h | ||
| pybind_state_gpu.cc | ||
| pybind_state_mkl.cc | ||
| python_op_test.py | ||
| queue_util.py | ||
| record_queue.py | ||
| recurrent.py | ||
| rnn_cell.py | ||
| schema.py | ||
| schema_test.py | ||
| scope.py | ||
| scope_test.py | ||
| session.py | ||
| session_test.py | ||
| sparse_to_dense_mask_test.py | ||
| task.py | ||
| test_util.py | ||
| text_file_reader.py | ||
| timeout_guard.py | ||
| toy_regression_test.py | ||
| tt_core.py | ||
| tt_core_test.py | ||
| utils.py | ||
| visualize.py | ||
| workspace.py | ||
| workspace_test.py | ||