This is useful for splitting grad to run in two parts while preserving intermediates:
<details>
<summary>
Click to see code
</summary>
```python
import collections
import weakref
from torch.autograd.graph import GradientEdge
def _get_grad_fn_or_grad_acc(t):
    """Return the autograd graph node associated with tensor ``t``.

    If ``t`` requires grad but has no ``grad_fn`` of its own, the node is
    looked up indirectly: a ``view_as`` of ``t`` is taken and the first entry
    of that view's ``grad_fn.next_functions`` is returned (presumably the
    gradient accumulator of a leaf tensor). In every other case
    ``t.grad_fn`` is returned unchanged, which may be ``None``.
    """
    if not t.requires_grad or t.grad_fn is not None:
        return t.grad_fn
    # Nothing is recorded on ``t`` itself; reach its node through a view.
    view_node = t.view_as(t).grad_fn
    return view_node.next_functions[0][0]
def reverse_closure(roots, target_nodes):
    """Collect the nodes reachable from ``roots`` by following reverse edges.

    The traversal is breadth-first over the ``"reverse_edges"`` list stored
    in each node's ``metadata`` (a list of ``(holder_weakref, idx)`` pairs).
    It stops at, and does not enter, any node contained in ``target_nodes``.

    Args:
        roots: iterable of starting nodes; ``None`` entries are skipped.
        target_nodes: container of nodes at which the traversal stops.

    Returns:
        A tuple ``(closure, actual_target_nodes)``: the set of visited nodes
        (roots included) and the set of target nodes that were reached.

    Raises:
        RuntimeError: if a holder weak reference is dead, i.e. the reverse
            graph has already been released.
    """
    closure = set()
    actual_target_nodes = set()
    q = collections.deque()
    for node in roots:
        if node is not None and node not in closure:
            closure.add(node)
            q.append(node)
    while q:
        node = q.popleft()
        reverse_edges = node.metadata.get("reverse_edges", [])
        for holder_ref, idx in reverse_edges:
            ref = holder_ref()
            # A dead weak reference means the holders that make up the
            # reverse graph were freed; a live one must be followed.
            if ref is None:
                raise RuntimeError("Reverse graph is no longer alive")
            fn = ref.node
            if fn in closure or fn is None:
                continue
            if fn in target_nodes:
                # Record the boundary node but do not traverse past it.
                actual_target_nodes.add(fn)
                continue
            closure.add(fn)
            q.append(fn)
    return closure, actual_target_nodes
# Enable weak pointer
class Holder:
    """Thin wrapper around a graph node so that it can be weakly referenced.

    ``construct_reverse_graph`` keeps the holders alive in a list and stores
    only ``weakref.ref`` objects to them on the graph.
    """

    def __init__(self, node):
        self.node = node
# TODO: use weak references to avoid reference cycle
def construct_reverse_graph(roots):
    """Annotate the graph below ``roots`` with reverse (child -> parent) edges.

    Walks ``next_functions`` breadth-first from the roots. For every non-None
    child it appends ``(weakref_to_holder, idx)`` to the list stored under
    ``child.metadata["reverse_edges"]``, where the holder wraps the parent
    node.

    Args:
        roots: iterable of starting nodes; ``None`` and duplicates are skipped.

    Returns:
        The list of ``Holder`` objects created. The graph only stores weak
        references to them, so the caller must keep this list alive for as
        long as the reverse edges are needed.
    """
    pending = collections.deque()
    seen_roots = set()
    holders = []
    for root in roots:
        if root is None or root in seen_roots:
            continue
        pending.append(root)
        seen_roots.add(root)
    while pending:
        parent = pending.popleft()
        for child, idx in parent.next_functions:
            if child is None:
                continue
            # Don't necessarily need to store on the graph
            edges = child.metadata.get("reverse_edges", [])
            if not edges:
                # No reverse edge recorded yet: first visit, so schedule it.
                pending.append(child)
            holder = Holder(parent)
            holder_ref = weakref.ref(holder)
            holders.append(holder)
            edges.append((holder_ref, idx))
            child.metadata["reverse_edges"] = edges
    return holders
def get_param_groups(inputs, params):
    """Group parameter nodes by the intermediate nodes they share with inputs.

    First computes the reverse closure of ``inputs``. Then, for each entry of
    ``params``, computes its reverse closure bounded by that input closure;
    the boundary nodes reached are the param's "intermediates". Params that
    hit an intermediate already claimed by an earlier group are merged into
    that group.

    Args:
        inputs: nodes of the inputs.
        params: nodes of the parameters.

    Returns:
        A list of distinct group dicts, each with the keys ``"params"`` and
        ``"intermediates"`` (both sets of nodes).

    Raises:
        AssertionError: if the union of all groups' params differs from
            ``set(params)``.
    """
    inputs_closure, _ = reverse_closure(inputs, set())
    groups_by_intermediate = {}  # keyed on intermediates
    for param in params:
        _, intersected = reverse_closure([param], inputs_closure)
        group = {
            "params": {param},
            "intermediates": set(intersected),
        }
        for boundary_node in intersected:
            found = groups_by_intermediate.get(boundary_node, None)
            if found is None:
                groups_by_intermediate[boundary_node] = group
                continue
            # Fold the current group into the one that already owns this node
            # and keep working with the merged group.
            found["params"] = found["params"] | group["params"]
            found["intermediates"] = found["intermediates"] | group["intermediates"]
            group = found

    # Sanity check: union of all param_groups params should be equal to all params
    merged_params = set()
    visited_ids = set()
    unique_groups = []
    for group in groups_by_intermediate.values():
        group_id = id(group)
        if group_id in visited_ids:
            # The same dict can be registered under several intermediates.
            continue
        visited_ids.add(group_id)
        unique_groups.append(group)
        merged_params |= group["params"]
    assert merged_params == set(params)
    return unique_groups
def compute_grads_only_inputs2(roots, inps, weights):
    """Run the first half of a split backward: gradients w.r.t. inputs only.

    Builds the reverse graph from ``roots``, groups the weights by the
    intermediate nodes they share with the inputs, and registers a prehook on
    each intermediate that stores the gradients it receives into its group
    under ``"grads"``. It then differentiates ``roots`` with respect to
    ``inps``, using all-ones output gradients and ``retain_graph=True`` so the
    graph can be reused for the weight half.

    Args:
        roots: tensors to differentiate (the outputs).
        inps: input tensors whose gradients are wanted.
        weights: parameter tensors whose gradients are deferred.

    Returns:
        ``(dinputs, param_groups)``: the input gradients from
        ``torch.autograd.grad`` and the group dicts, which gain a ``"grads"``
        entry as the registered hooks fire.
    """
    root_grad_fns = [_get_grad_fn_or_grad_acc(r) for r in roots]
    inp_grad_fns = [_get_grad_fn_or_grad_acc(i) for i in inps]
    weight_grad_fns = [_get_grad_fn_or_grad_acc(w) for w in weights]

    reverse_graph_refs = construct_reverse_graph(root_grad_fns)
    param_groups = get_param_groups(inp_grad_fns, weight_grad_fns)
    # The reverse edges are only needed while grouping; drop the holders.
    del reverse_graph_refs

    def get_hook(param_group, i):
        # Bind the group and slot per hook to avoid late-binding closures.
        def hook(grad_inputs):
            if param_group.get("grads", None) is None:
                param_group["grads"] = [None] * len(param_group["intermediates"])
            param_group["grads"][i] = grad_inputs

        return hook

    for param_group in param_groups:
        for i, intermediate in enumerate(param_group["intermediates"]):
            # These are always "split" nodes that we need to recompute, so
            # save their inputs.
            intermediate.register_prehook(get_hook(param_group, i))

    # Differentiate the roots that were passed in rather than a module-level
    # ``out`` variable, so the function does not depend on caller globals.
    roots = tuple(roots)
    dinputs = torch.autograd.grad(
        roots,
        inputs=tuple(inps),
        grad_outputs=tuple(torch.ones_like(r) for r in roots),
        retain_graph=True,
    )
    return dinputs, param_groups
def compute_grads_only_weights2(user_weights, param_groups):
    """Run the second half of a split backward: gradients w.r.t. weights.

    For each group, calls ``torch.autograd.grad`` from the group's
    intermediate nodes to its param nodes (both passed as ``GradientEdge``
    objects with output index 0), using the gradients stored under the
    group's ``"grads"`` key as ``grad_outputs``.

    Args:
        user_weights: weight tensors, in the order results should be returned.
        param_groups: group dicts with ``"params"``, ``"intermediates"`` and
            ``"grads"`` entries.

    Returns:
        A tuple of weight gradients ordered like ``user_weights``.

    Raises:
        AssertionError: if any stored grads entry does not have length 1.
    """
    dweight_by_node = {}
    for group in param_groups:
        # TODO: Handle case where intermediate can have multiple outputs
        intermediate_edges = tuple(
            GradientEdge(node, 0) for node in group["intermediates"]
        )
        weights_edges = tuple(GradientEdge(node, 0) for node in group["params"])
        assert all(len(g) == 1 for g in group["grads"])
        # [NEW!] Able to pass a GradientEdge to autograd.grad as output
        # We do not need to retain_graph because... guarantee no overlap?
        print("trying to execute: ", intermediate_edges, weights_edges)
        # Each stored entry is a 1-tuple; concatenate them into one flat tuple.
        flat_grads = sum(group["grads"], tuple())
        dweights = torch.autograd.grad(
            intermediate_edges, weights_edges, grad_outputs=flat_grads
        )
        dweight_by_node.update(zip(group["params"], dweights))
    # return grads in the original order weights were provided in
    return tuple(dweight_by_node[_get_grad_fn_or_grad_acc(w)] for w in user_weights)
```
</details>
```python
import torch.nn as nn
# Setup
mod1 = nn.Linear(10, 10)
mod2 = nn.Linear(10, 10)
a = torch.rand(10, requires_grad=True)
weights = tuple(mod1.parameters()) + tuple(mod2.parameters())
inps = (a,)
out = mod2(mod1(a))
class LoggingTensorMode(torch.utils._python_dispatch.TorchDispatchMode):
    """Dispatch mode that prints the name of each op after executing it."""

    def __torch_dispatch__(self, func, types, args=(), kwargs=None):
        # Run the op first, then log ``<module>.<name>`` for it.
        result = func(*args, **({} if kwargs is None else kwargs))
        print(f"{func.__module__}.{func.__name__}")
        return result
print(" -- SPLIT -- ")
# Compute gradients in two parts
with LoggingTensorMode():
print("PART 1")
dinputs, state = compute_grads_only_inputs2((out,), inps, weights)
print("PART 2")
dweights = compute_grads_only_weights2(weights, state)
out = mod2(mod1(a))
print(" -- REF -- ")
# Compare with reference
with LoggingTensorMode():
ref_all_gradients = torch.autograd.grad(out, inputs=tuple(inps) + weights, grad_outputs=(torch.ones_like(out),))
for actual, ref in zip(dinputs + dweights, ref_all_gradients):
print(torch.allclose(actual, ref))
```
<img width="598" alt="image" src="https://github.com/pytorch/pytorch/assets/13428986/3681b8a7-3ab4-4d1d-a836-abef6913e671">
```
PART 1
torch._ops.aten.view.default
torch._ops.aten.view.default
torch._ops.aten.view.default
torch._ops.aten.view.default
torch._ops.aten.view.default
torch._ops.aten.ones_like.default
V0603 10:17:21.590878 8300067520 torch/autograd/graph.py:751] Executing: <ViewBackward0 object at 0x12a1ee160> with grad_outputs: [f32[10]]
torch._ops.aten.view.default
V0603 10:17:21.591204 8300067520 torch/autograd/graph.py:751] Executing: <AddmmBackward0 object at 0x12a1ee0d0> with grad_outputs: [f32[1, 10]]
torch._ops.aten.t.default
torch._ops.aten.mm.default
V0603 10:17:21.591578 8300067520 torch/autograd/graph.py:751] Executing: <ViewBackward0 object at 0x100d7ae50> with grad_outputs: [f32[1, 10]]
torch._ops.aten.view.default
V0603 10:17:21.591747 8300067520 torch/autograd/graph.py:751] Executing: <ViewBackward0 object at 0x12a1e4a60> with grad_outputs: [f32[10]]
torch._ops.aten.view.default
V0603 10:17:21.591834 8300067520 torch/autograd/graph.py:751] Executing: <AddmmBackward0 object at 0x12a1e4bb0> with grad_outputs: [f32[1, 10]]
torch._ops.aten.t.default
torch._ops.aten.mm.default
V0603 10:17:21.591922 8300067520 torch/autograd/graph.py:751] Executing: <ViewBackward0 object at 0x12a1e4a90> with grad_outputs: [f32[1, 10]]
torch._ops.aten.view.default
PART 2
trying to execute: (GradientEdge(node=<AddmmBackward0 object at 0x12a1e4bb0>, output_nr=0),) (GradientEdge(node=<AccumulateGrad object at 0x12a21b130>, output_nr=0), GradientEdge(node=<AccumulateGrad object at 0x12a21b7c0>, output_nr=0))
V0603 10:17:21.592223 8300067520 torch/autograd/graph.py:751] Executing: <AddmmBackward0 object at 0x12a1e4bb0> with grad_outputs: [f32[1, 10]]
torch._ops.aten.t.default
torch._ops.aten.mm.default
torch._ops.aten.t.default
torch._ops.aten.sum.dim_IntList
torch._ops.aten.view.default
V0603 10:17:21.592421 8300067520 torch/autograd/graph.py:751] Executing: <TBackward0 object at 0x12a1cad60> with grad_outputs: [f32[10, 10]]
torch._ops.aten.t.default
trying to execute: (GradientEdge(node=<AddmmBackward0 object at 0x12a1ee0d0>, output_nr=0),) (GradientEdge(node=<AccumulateGrad object at 0x12a1e41c0>, output_nr=0), GradientEdge(node=<AccumulateGrad object at 0x12a21b670>, output_nr=0))
V0603 10:17:21.593481 8300067520 torch/autograd/graph.py:751] Executing: <AddmmBackward0 object at 0x12a1ee0d0> with grad_outputs: [f32[1, 10]]
torch._ops.aten.t.default
torch._ops.aten.mm.default
torch._ops.aten.t.default
torch._ops.aten.sum.dim_IntList
torch._ops.aten.view.default
V0603 10:17:21.593750 8300067520 torch/autograd/graph.py:751] Executing: <TBackward0 object at 0x12a21b2b0> with grad_outputs: [f32[10, 10]]
torch._ops.aten.t.default
torch._ops.aten.view.default
torch._ops.aten.view.default
torch._ops.aten.view.default
torch._ops.aten.view.default
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/127766
Approved by: https://github.com/albanD
Summary:
We should log compile ID as well for easier comparison.
Currently going through some of this data, I think we should make a few more changes as well.
Reland for D59725870
Test Plan: Sandcastle and Pytorch
Differential Revision: D59789110
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130801
Approved by: https://github.com/oulgen
FSDP2 eager pre-allocates the output buffer for AllGather and the AllGather just writes into that buffer. However, under compile, by default we use out-of-place AllGather, which means in Traceable FSDP2 case we will be unnecessarily using more memory than eager. We want to re-inplace that AllGather instead.
This PR adds a post_grad pass to re-inplace all_gather_into_tensor (i.e. changing it from `all_gather_into_tensor.default` out-of-place op to `all_gather_into_tensor_out.default` out-variant op).
One thing to note is that since with this pass we are introducing a mutable op into the post_grad FX graph, we must do this pass after `reinplace_inplaceable_ops` (at which point we are okay again with having mutable ops in the graph). To facilitate this, this PR adds a `post_grad_custom_post_reinplace_pass` extension point to allow user-defined post-reinplace FX passes.
---
Test commands:
- `pytest -rA test/distributed/_composable/fsdp/test_fully_shard_compile.py::TestFullyShardCompile::test_transformer_fullgraph_backend_inductor`
---
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129773
Approved by: https://github.com/eellison
Summary: Similar to the handling of metrics, save inductor counter deltas in the FX graph cache entry and increment the counters appropriately on a cache hit
Test Plan: new unit test
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130635
Approved by: https://github.com/eellison
beartype has served us well in identifying type errors and ensuring we call internal functions with the correct arguments (thanks!). However, the value of having beartype is diminished because of the following:
1. When beartype improved its support for Dict[] type checking, it discovered typing mistakes in some functions that were previously uncaught. This caused the exporter to fail with newer versions of beartype where it used to succeed. Since we cannot fix PyTorch and release a new version just because of this, it creates confusion for users of torch.onnx who have beartype in their environment.
2. beartype adds an additional call line in the traceback, which makes the already thick dynamo stack even larger, affecting readability when users diagnose errors with the traceback.
3. Since the typing annotations need to be evaluated, we cannot use new syntaxes like `|` because we need to maintain compatibility with Python 3.8. We don't want to wait for PyTorch to take py310 as the lowest supported Python version before using the new typing syntaxes.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130484
Approved by: https://github.com/titaiwangms
Summary: Adds non-strict implementation of training IR export. Any expected non-strict training IR failures are also either existing strict training IR or non-strict failures (no new failures added). 4 strict training IR failures also resolved.
Refraining from unifying export/export_for_training, per @ydwu4's feedback :)
Test Plan: added test_export_training_ir_to_run_decomp_non_strict.py for non-strict training IR
Differential Revision: D59349454
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130062
Approved by: https://github.com/ydwu4, https://github.com/zhxchen17
Summary:
Move the alloc_trace logic into a separate class, to reduce risk of deadlocks when mixing with CCA's lock. Switch to an std::mutex instead of std::recursive_mutex.
Lets us re-use the logic in the TraceEntryRingBuffer class for later diffs.
Test Plan: CI, resnet run, and FBR model.
Differential Revision: D59690408
Pulled By: aaronenyeshi
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130741
Approved by: https://github.com/davidberard98
Uses `dict.fromkeys` whenever possible as covered by flake8-comprehensions rule C420. While the ruff rule RUF025 is still in preview, flake8-comprehensions has added a new rule which covers this. Using dict.fromkeys is faster when the value being added to the dictionary is the same at every iteration and is immutable; it also removes an unnecessary dict comprehension.
This rule will be enabled with our current ruleset in RUF in 0.6 as C420.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130699
Approved by: https://github.com/lezcano, https://github.com/ezyang
Summary:
By default, performance tests (speedup experiments) will run the baseline and test backend alternately.
However, this does not work for the torchao backend, which will change the model in-place, therefore the baseline run will also run with torchao backend since the model has already been quantized.
Add a new experiment "latency_experiment" to run performance tests non-alternately (first run baseline for a few iterations, then run the test backend).
Test Plan:
```
buck2 run mode/opt //pytorch/benchmark:pt2 -- --only AlbertForMaskedLM --quantization noquant --performance --inference --bfloat16
```
```
buck2 run mode/opt //pytorch/benchmark:pt2 -- --only AlbertForMaskedLM --quantization autoquant --performance --inference --bfloat16 --inductor-compile-mode max-autotune
```
Differential Revision: D59332736
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130136
Approved by: https://github.com/jerryzh168
The conversion cache used for fixing https://github.com/pytorch/pytorch/issues/115260 depended on "store", which might be removed and ignored. This would lead to inconsistent code being generated between the vec and scalar kernels: we generate the scalar kernel first, followed by the vector kernel, so a store buffer removed by the scalar kernel impacts the vector kernel codegen. This PR moves the caching from "store" to the "to_dtype" calls, which are not impacted by the removed buffers.
`pytest -k test_consistent_remove_buffers test/inductor/test_cpu_repro.py`
before
```c++
extern "C" void kernel(const bfloat16* in_ptr0,
bfloat16* out_ptr1)
{
{
for(long x0=static_cast<long>(0L); x0<static_cast<long>(64L); x0+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x0), 16);
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = tmp1 + tmp1;
auto tmp3 = at::vec::convert<bfloat16>(tmp2);
auto tmp4 = at::vec::convert<float>(tmp3);
auto tmp5 = tmp1 + tmp4;
auto tmp6 = at::vec::convert<bfloat16>(tmp5);
tmp6.store(out_ptr1 + static_cast<long>(x0), 16);
}
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(64L); x0<static_cast<long>(65L); x0+=static_cast<long>(1L))
{
auto tmp0 = in_ptr0[static_cast<long>(x0)];
auto tmp1 = c10::convert<float>(tmp0);
auto tmp2 = decltype(tmp1)(tmp1 + tmp1);
auto tmp3 = c10::convert<bfloat16>(tmp2);
auto tmp4 = decltype(tmp1)(tmp1 + tmp2);
auto tmp5 = c10::convert<bfloat16>(tmp4);
out_ptr1[static_cast<long>(x0)] = tmp5;
}
}
}
```
after
```c++
extern "C" void kernel(const bfloat16* in_ptr0,
bfloat16* out_ptr1)
{
{
for(long x0=static_cast<long>(0L); x0<static_cast<long>(64L); x0+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x0), 16);
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = tmp1 + tmp1;
auto tmp3 = at::vec::convert<bfloat16>(tmp2);
auto tmp4 = tmp1 + tmp2;
auto tmp5 = at::vec::convert<bfloat16>(tmp4);
tmp5.store(out_ptr1 + static_cast<long>(x0), 16);
}
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(64L); x0<static_cast<long>(65L); x0+=static_cast<long>(1L))
{
auto tmp0 = in_ptr0[static_cast<long>(x0)];
auto tmp1 = c10::convert<float>(tmp0);
auto tmp2 = decltype(tmp1)(tmp1 + tmp1);
auto tmp3 = c10::convert<bfloat16>(tmp2);
auto tmp4 = decltype(tmp1)(tmp1 + tmp2);
auto tmp5 = c10::convert<bfloat16>(tmp4);
out_ptr1[static_cast<long>(x0)] = tmp5;
}
}
}
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130677
Approved by: https://github.com/leslie-fang-intel
Summary:
- Updated SparseAdam to run test_state_dict_deterministic unit test.
- Made gradients sparse while keeping weights dense in the above test.
Test Plan:
- Ran test_optim.py locally.
Fixes #116507
Co-authored-by: Jane (Yuan) Xu <31798555+janeyx99@users.noreply.github.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130645
Approved by: https://github.com/janeyx99
# Motivation
I found a difference between sympy 1.12 and 1.13.
```python
# for 1.12
>>> import sympy
>>> a = sympy.Number(0.0)
>>> a == 0
True
```
```python
# for 1.13
>>> import sympy
>>> a = sympy.Number(0.0)
>>> a == 0
False
```
The different behavior will impact the result of [safe_mul](6beec34b1c/torch/utils/_sympy/value_ranges.py (L521-L528)), resulting in incorrect results when `a = sympy.Number(0.0)` and `b = inf`: the result is `nan` if the sympy version is 1.13 (the expected result is **0**).
```python
def safe_mul(a, b):
    """Multiply ``a`` and ``b``, returning a zero operand unchanged.

    If ``a`` compares equal to ``0.0`` it is returned as is; otherwise, if
    ``b`` does, ``b`` is returned. Only when neither is zero is ``a * b``
    computed.
    """
    # Make unknown() * wrap(0.0) == wrap(0.0)
    if a == 0.0:
        return a
    if b == 0.0:
        return b
    return a * b
```
In different sympy versions, `sympy.Number(0)` always has the same behavior that equals to 0.0.
```python
>>> import sympy
>>> a = sympy.Number(0)
>>> a == 0.0
True # for different sympy versions
```
So, use 0.0 when checking for zero in safe_mul to stay compatible with different sympy versions.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130729
Approved by: https://github.com/lezcano, https://github.com/EikanWang
As titled, this fixes a case where `ord` is passed as 2 (the default value) and
op dispatching does not receive the default value.
We simply check whether the args schema received an `ord` field or not.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130753
Approved by: https://github.com/awgu
My attempt at a fix for https://github.com/pytorch/pytorch/issues/130335, see issue for more details / internal xref. Any feedback from inductor folks is appreciated. I attempted to make the move-constructors-to-cuda pass a bit less aggressive by detecting when the movement would incur a H2D sync for `aten.index_put_`. I'm not sure if there are any other ops that inductor falls back to eager on, that may-or-may-not incur a H2D sync if we change any of their inputs from cpu to cuda.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130338
Approved by: https://github.com/eellison