pytorch

mirror of https://github.com/saymrwulf/pytorch.git synced 2026-05-14 20:57:59 +00:00

Author	SHA1	Message	Date
soulitzer	2eec02523b	[autograd] Support GradientEdge as output for torch.autograd.grad (#127766 ) This is useful for splitting grad to run in two parts while preserving intermediates: <details> <summary> Click to see code </summary> ```python import collections import weakref from torch.autograd.graph import GradientEdge def _get_grad_fn_or_grad_acc(t): if t.requires_grad and t.grad_fn is None: return t.view_as(t).grad_fn.next_functions[0][0] else: return t.grad_fn def reverse_closure(roots, target_nodes): # Recurse until we reach a target node closure = set() actual_target_nodes = set() q: Deque = collections.deque() for node in roots: if node is not None and node not in closure: closure.add(node) q.append(node) while q: node = q.popleft() reverse_edges = node.metadata.get("reverse_edges", []) for holder_ref, idx in reverse_edges: ref = holder_ref() if ref is None: raise RuntimeError("Reverse graph is no longer alive") fn = ref.node if fn in closure or fn is None: continue if fn in target_nodes: actual_target_nodes.add(fn) continue closure.add(fn) q.append(fn) return closure, actual_target_nodes # Enable weak pointer class Holder(): def __init__(self, node): self.node = node # TODO: use weak references to avoid reference cycle def construct_reverse_graph(roots): q: Deque = collections.deque() root_seen = set() reverse_graph_refs = [] for node in roots: if node is not None and node not in root_seen: q.append(node) root_seen.add(node) while q: node = q.popleft() for fn, idx in node.next_functions: if fn is not None: # Don't necessarily need to store on the graph reverse_edges = fn.metadata.get("reverse_edges", []) if len(reverse_edges) == 0: q.append(fn) holder = Holder(node) holder_ref = weakref.ref(holder) reverse_graph_refs.append(holder) reverse_edges.append((holder_ref, idx)) fn.metadata["reverse_edges"] = reverse_edges return reverse_graph_refs def get_param_groups(inputs, params): inputs_closure, _ = reverse_closure(inputs, set()) param_groups = dict() # 
keyed on intermediates for i, param in enumerate(params): closure, intersected = reverse_closure([param], inputs_closure) param_group = { "params": set([param]), "intermediates": set(intersected), } for input_node in intersected: existing = param_groups.get(input_node, None) if existing is not None: existing["params"] = existing["params"].union(param_group["params"]) existing["intermediates"] = existing["intermediates"].union(param_group["intermediates"]) param_group = existing else: param_groups[input_node] = param_group # Sanity check: union of all param_groups params should be equal to all params union_params = set() seen_ids = set() unique_param_groups = [] for param_group in param_groups.values(): if id(param_group) not in seen_ids: seen_ids.add(id(param_group)) unique_param_groups.append(param_group) union_params = union_params.union(param_group["params"]) assert union_params == set(params) return unique_param_groups def compute_grads_only_inputs2(roots, inps, weights): root_grad_fns = list(map(_get_grad_fn_or_grad_acc, roots)) inp_grad_fns = list(map(_get_grad_fn_or_grad_acc, inps)) weight_grad_fns = list(map(_get_grad_fn_or_grad_acc, weights)) reverse_graph_refs = construct_reverse_graph(root_grad_fns) param_groups = get_param_groups(inp_grad_fns, weight_grad_fns) del reverse_graph_refs for param_group in param_groups: for i, intermediate in enumerate(param_group["intermediates"]): def get_hook(param_group, i): def hook(grad_inputs): if param_group.get("grads", None) is None: param_group["grads"] = [None] * len(param_group["intermediates"]) param_group["grads"][i] = grad_inputs return hook # These are always "split" nodes that we need to recompute, so # save their inputs. 
intermediate.register_prehook(get_hook(param_group, i)) dinputs = torch.autograd.grad((out,), inputs=tuple(inps), grad_outputs=(torch.ones_like(out),), retain_graph=True) return dinputs, param_groups def compute_grads_only_weights2(user_weights, param_groups): all_dweights = dict() for param_group in param_groups: # TODO: Handle case where intermediate can have multiple outputs intermediate_edges = tuple(GradientEdge(i, 0) for i in param_group["intermediates"]) weights_edges = tuple(GradientEdge(w, 0) for w in param_group["params"]) assert all(len(g) == 1 for g in param_group["grads"]) # [NEW!] Able to pass a GradientEdge to autograd.grad as output # We do not need to retain_graph because... guarantee no overlap? print("trying to execute: ", intermediate_edges, weights_edges) dweights = torch.autograd.grad(intermediate_edges, weights_edges, grad_outputs=sum(param_group["grads"], tuple())) for w, dw in zip(param_group["params"], dweights): all_dweights[w] = dw # return grads in the original order weights were provided in out = [] for w in user_weights: grad_acc = _get_grad_fn_or_grad_acc(w) out.append(all_dweights[grad_acc]) return tuple(out) ``` </details> ```python import torch.nn as nn # Setup mod1 = nn.Linear(10, 10) mod2 = nn.Linear(10, 10) a = torch.rand(10, requires_grad=True) weights = tuple(mod1.parameters()) + tuple(mod2.parameters()) inps = (a,) out = mod2(mod1(a)) class LoggingTensorMode(torch.utils._python_dispatch.TorchDispatchMode): def __torch_dispatch__(self, func, types, args=(), kwargs=None): if kwargs is None: kwargs = {} rs = func(*args, **kwargs) print(f"{func.__module__}.{func.__name__}") return rs print(" -- SPLIT -- ") # Compute gradients in two parts with LoggingTensorMode(): print("PART 1") dinputs, state = compute_grads_only_inputs2((out,), inps, weights) print("PART 2") dweights = compute_grads_only_weights2(weights, state) out = mod2(mod1(a)) print(" -- REF -- ") # Compare with reference with LoggingTensorMode(): ref_all_gradients = 
torch.autograd.grad(out, inputs=tuple(inps) + weights, grad_outputs=(torch.ones_like(out),)) for actual, ref in zip(dinputs + dweights, ref_all_gradients): print(torch.allclose(actual, ref)) ``` <img width="598" alt="image" src="https://github.com/pytorch/pytorch/assets/13428986/3681b8a7-3ab4-4d1d-a836-abef6913e671"> ``` PART 1 torch._ops.aten.view.default torch._ops.aten.view.default torch._ops.aten.view.default torch._ops.aten.view.default torch._ops.aten.view.default torch._ops.aten.ones_like.default V0603 10:17:21.590878 8300067520 torch/autograd/graph.py:751] Executing: <ViewBackward0 object at 0x12a1ee160> with grad_outputs: [f32[10]] torch._ops.aten.view.default V0603 10:17:21.591204 8300067520 torch/autograd/graph.py:751] Executing: <AddmmBackward0 object at 0x12a1ee0d0> with grad_outputs: [f32[1, 10]] torch._ops.aten.t.default torch._ops.aten.mm.default V0603 10:17:21.591578 8300067520 torch/autograd/graph.py:751] Executing: <ViewBackward0 object at 0x100d7ae50> with grad_outputs: [f32[1, 10]] torch._ops.aten.view.default V0603 10:17:21.591747 8300067520 torch/autograd/graph.py:751] Executing: <ViewBackward0 object at 0x12a1e4a60> with grad_outputs: [f32[10]] torch._ops.aten.view.default V0603 10:17:21.591834 8300067520 torch/autograd/graph.py:751] Executing: <AddmmBackward0 object at 0x12a1e4bb0> with grad_outputs: [f32[1, 10]] torch._ops.aten.t.default torch._ops.aten.mm.default V0603 10:17:21.591922 8300067520 torch/autograd/graph.py:751] Executing: <ViewBackward0 object at 0x12a1e4a90> with grad_outputs: [f32[1, 10]] torch._ops.aten.view.default PART 2 trying to execute: (GradientEdge(node=<AddmmBackward0 object at 0x12a1e4bb0>, output_nr=0),) (GradientEdge(node=<AccumulateGrad object at 0x12a21b130>, output_nr=0), GradientEdge(node=<AccumulateGrad object at 0x12a21b7c0>, output_nr=0)) V0603 10:17:21.592223 8300067520 torch/autograd/graph.py:751] Executing: <AddmmBackward0 object at 0x12a1e4bb0> with grad_outputs: [f32[1, 10]] torch._ops.aten.t.default 
torch._ops.aten.mm.default torch._ops.aten.t.default torch._ops.aten.sum.dim_IntList torch._ops.aten.view.default V0603 10:17:21.592421 8300067520 torch/autograd/graph.py:751] Executing: <TBackward0 object at 0x12a1cad60> with grad_outputs: [f32[10, 10]] torch._ops.aten.t.default trying to execute: (GradientEdge(node=<AddmmBackward0 object at 0x12a1ee0d0>, output_nr=0),) (GradientEdge(node=<AccumulateGrad object at 0x12a1e41c0>, output_nr=0), GradientEdge(node=<AccumulateGrad object at 0x12a21b670>, output_nr=0)) V0603 10:17:21.593481 8300067520 torch/autograd/graph.py:751] Executing: <AddmmBackward0 object at 0x12a1ee0d0> with grad_outputs: [f32[1, 10]] torch._ops.aten.t.default torch._ops.aten.mm.default torch._ops.aten.t.default torch._ops.aten.sum.dim_IntList torch._ops.aten.view.default V0603 10:17:21.593750 8300067520 torch/autograd/graph.py:751] Executing: <TBackward0 object at 0x12a21b2b0> with grad_outputs: [f32[10, 10]] torch._ops.aten.t.default torch._ops.aten.view.default torch._ops.aten.view.default torch._ops.aten.view.default torch._ops.aten.view.default ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/127766 Approved by: https://github.com/albanD	2024-07-16 21:46:19 +00:00
PyTorch MergeBot	c1e7e40f24	Revert "[Traceable FSDP2][Inductor] Re-inplace all_gather_into_tensor (#129773 )" This reverts commit `f2f31027ce`. Reverted https://github.com/pytorch/pytorch/pull/129773 on behalf of https://github.com/clee2000 due to failed inductor/test_torchinductor_dynamic_shapes.py on mac https://github.com/pytorch/pytorch/actions/runs/9963396991/job/27530249256 `f2f31027ce`. The build failed on PR so test jobs didn't run ([comment](https://github.com/pytorch/pytorch/pull/129773#issuecomment-2231808437))	2024-07-16 20:54:14 +00:00
Atul Jangra	4e479568df	[PT2] Log compile ID in the signpost event (#130801 ) Summary: We should log the compile ID as well for easier comparison. Currently going through some of this data, I think we should make a few more changes as well. Reland for D59725870 Test Plan: Sandcastle and Pytorch Differential Revision: D59789110 Pull Request resolved: https://github.com/pytorch/pytorch/pull/130801 Approved by: https://github.com/oulgen	2024-07-16 20:47:36 +00:00
Yifu Wang	2ceade37c5	[SymmetricMemory] put socket files in /tmp (#130757 ) Currently the socket files are put in the current directory, which may not be writable in all environments. Pull Request resolved: https://github.com/pytorch/pytorch/pull/130757 Approved by: https://github.com/Chillee ghstack dependencies: #130756	2024-07-16 20:21:05 +00:00
Yifu Wang	0468f2616a	[SymmetricMemory] make sure different subgroups with the same name use different store prefixes (#130756 ) This fixes a race condition in which different subgroups with the same name on the same host would use the same store. Pull Request resolved: https://github.com/pytorch/pytorch/pull/130756 Approved by: https://github.com/Chillee	2024-07-16 20:21:05 +00:00
Will Feng	f2f31027ce	[Traceable FSDP2][Inductor] Re-inplace all_gather_into_tensor (#129773 ) FSDP2 eager pre-allocates the output buffer for AllGather and the AllGather just writes into that buffer. However, under compile, by default we use out-of-place AllGather, which means in Traceable FSDP2 case we will be unnecessarily using more memory than eager. We want to re-inplace that AllGather instead. This PR adds a post_grad pass to re-inplace all_gather_into_tensor (i.e. changing it from `all_gather_into_tensor.default` out-of-place op to `all_gather_into_tensor_out.default` out-variant op). One thing to note is that since with this pass we are introducing a mutable op into the post_grad FX graph, we must do this pass after `reinplace_inplaceable_ops` (at which point we are okay again with having mutable ops in the graph). To facilitate this, this PR adds a `post_grad_custom_post_reinplace_pass` extension point to allow user-defined post-reinplace FX passes. --- Test commands: - `pytest -rA test/distributed/_composable/fsdp/test_fully_shard_compile.py::TestFullyShardCompile::test_transformer_fullgraph_backend_inductor` --- Pull Request resolved: https://github.com/pytorch/pytorch/pull/129773 Approved by: https://github.com/eellison	2024-07-16 20:07:41 +00:00
Sam Larsen	156b99cfb1	[inductor] Handle inductor counters in fx graph cache (#130635 ) Summary: Similar to the handling of metrics, save inductor counter deltas in the FX graph cache entry and increment the counters appropriately on a cache hit Test Plan: new unit test Pull Request resolved: https://github.com/pytorch/pytorch/pull/130635 Approved by: https://github.com/eellison	2024-07-16 20:07:16 +00:00
David Berard	d548417d95	[NJT] throw an exception if nested_tensor_from_jagged is fx-traced without being fx.wrapped (#130702 ) The NJT constructor can't be fx-traced safely due to the dummy nt used: `774ca93fd2/torch/nested/_internal/nested_tensor.py (L501-L508)` The error doesn't appear immediately, but appears if you try to move a module with an fx-traced NJT constructor onto a different device, or try to serialize it. Let's throw an error if we try to fx-trace the NJT constructor so users know to wrap the call. Pull Request resolved: https://github.com/pytorch/pytorch/pull/130702 Approved by: https://github.com/jbschlosser, https://github.com/soulitzer	2024-07-16 19:21:10 +00:00
PyTorch MergeBot	0851de5b16	Revert "[ONNX] Remove beartype usage (#130484 )" This reverts commit `1794c35912`. Reverted https://github.com/pytorch/pytorch/pull/130484 on behalf of https://github.com/clee2000 due to test_sympy_utils failure is real https://github.com/pytorch/pytorch/actions/runs/9961499559/job/27523758780 `1794c35912`. Dr CI is matching with commits in current commit? ([comment](https://github.com/pytorch/pytorch/pull/130484#issuecomment-2231575577))	2024-07-16 18:41:51 +00:00
Joel Schlosser	09b1b113f5	Cache min / max seq len for torch.nested.as_nested_tensor(t) (#130766 ) For the `torch.nested.as_nested_tensor(t)` constructor, computing min / max seq len is trivial since the sequence lengths are all the same. Might as well cache them during construction. Pull Request resolved: https://github.com/pytorch/pytorch/pull/130766 Approved by: https://github.com/YuqingJ, https://github.com/soulitzer	2024-07-16 18:32:47 +00:00
Edward Z. Yang	408c921d96	Make hashing a SymInt raise an error again (#130548 ) See https://github.com/pytorch/pytorch/issues/130547 Signed-off-by: Edward Z. Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/130548 Approved by: https://github.com/Skylion007, https://github.com/albanD, https://github.com/lezcano	2024-07-16 18:30:30 +00:00
Xu Zhao	1d8baa4df2	[torchbench][servicelab] Fix servicelab test failures (#130781 ) Fix servicelab test failures Pull Request resolved: https://github.com/pytorch/pytorch/pull/130781 Approved by: https://github.com/desertfire	2024-07-16 17:35:13 +00:00
Justin Chu	1794c35912	[ONNX] Remove beartype usage (#130484 ) beartype has served us well in identifying type errors and ensuring we call internal functions with the correct arguments (thanks!). However, the value of having beartype is diminished because of the following: 1. When beartype improved support for better Dict[] type checking, it discovered typing mistakes in some functions that were previously uncaught. This caused the exporter to fail with newer versions of beartype when it used to succeed. Since we cannot fix PyTorch and release a new version just because of this, it creates confusion for users that have beartype in their environment from using torch.onnx. 2. beartype adds an additional call line in the traceback, which makes the already thick dynamo stack even larger, affecting readability when users diagnose errors with the traceback. 3. Since the typing annotations need to be evaluated, we cannot use new syntaxes like `|` because we need to maintain compatibility with Python 3.8. We don't want to wait for PyTorch to take py310 as the lowest supported Python before using the new typing syntaxes. Pull Request resolved: https://github.com/pytorch/pytorch/pull/130484 Approved by: https://github.com/titaiwangms	2024-07-16 17:34:36 +00:00
Jiashen Cao	67e22d6c61	[Fix]: Convert operator that does specialization to its symbolic counterpart (#129578 ) #### Issue During conversion, use symbolic operator when exist. #### Test Plan `pytest test/export/test_converter.py` Pull Request resolved: https://github.com/pytorch/pytorch/pull/129578 Approved by: https://github.com/angelayi	2024-07-16 17:19:57 +00:00
Pian Pawakapan	e8998d68c8	[export] add non-strict training IR (#130062 ) Summary: Adds non-strict implementation of training IR export. Any expected non-strict training IR failures are also either existing strict training IR or non-strict failures (no new failures added). 4 strict training IR failures also resolved. Refraining from unifying export/export_for_training, per @ydwu4's feedback :) Test Plan: added test_export_training_ir_to_run_decomp_non_strict.py for non-strict training IR Differential Revision: D59349454 Pull Request resolved: https://github.com/pytorch/pytorch/pull/130062 Approved by: https://github.com/ydwu4, https://github.com/zhxchen17	2024-07-16 17:08:00 +00:00
Sidney Tsang	d2f44eabe7	[Export] Support aten.full.default and aten.full_like.default (#130639 ) Summary: Add operator tests for full & full_like operators Test Plan: Rerun kernel test using ``` buck2 run //glow/fba/tests:run_kernel mode/dev -- --kernel splat --config "input=1;dtype=fp32;fill_value=42.0" -tl_time ``` {F1752274071} Operator tests ``` buck2 run mode/{opt,inplace} //caffe2/torch/fb/test_library:afg_operator_test -- -k __full__ ``` {F1752340913} Differential Revision: D59593849 Pull Request resolved: https://github.com/pytorch/pytorch/pull/130639 Approved by: https://github.com/StellarrZ	2024-07-16 16:50:04 +00:00
Colin Peppler	f272e0ab4a	[inductor] support unbacked symint divisors in vars_and_sizes (#130595 ) Scenario: ``` >>> nodes IterationRangesEntry( x2, divisor=192u0 + 192576, length=s1, (xindex//(192u0 + 192576)), {x0: 192, x1: u0 + 1003, x2: s1, x3: 192s1u0 + 192576s1, x4: 192u0 + 192576}) IterationRangesEntry( x1, divisor=192, length=u0 + 1003, ModularIndexing(xindex, 192, u0 + 1003), {x0: 192, x1: u0 + 1003, x2: s1, x3: 192s1u0 + 192576s1, x4: 192u0 + 192576}) IterationRangesEntry( x0, divisor=1, length=192, ModularIndexing(xindex, 1, 192), {x0: 192, x1: u0 + 1003, x2: s1, x3: 192s1u0 + 192576s1, x4: 192u0 + 192576}) ``` Think about whether using fallback is safe here. I think it's safe because the divisor of one IterationRangesEntry should be the product of the lengths of the preceding IterationRangesEntry? Unless, one of the lengths divides by an unbacked symint? Pull Request resolved: https://github.com/pytorch/pytorch/pull/130595 Approved by: https://github.com/aakhundov, https://github.com/ezyang	2024-07-16 16:21:38 +00:00
drisspg	2b43d339fe	Make FlexAttention API public (#130755 ) # Summary Makes the prototype API flex_attention public Pull Request resolved: https://github.com/pytorch/pytorch/pull/130755 Approved by: https://github.com/Chillee	2024-07-16 16:21:25 +00:00
PyTorch MergeBot	cbda8be537	Revert "Propagate buffer and parameter indices through AOT (#130393 )" This reverts commit `69a77389e2`. Reverted https://github.com/pytorch/pytorch/pull/130393 on behalf of https://github.com/clee2000 due to broke lint for torch/_functorch/_aot_autograd/subclass_utils.py https://github.com/pytorch/pytorch/actions/runs/9948630877/job/27483551649 `80236dca90` lint was green on PR, probably a landrace ([comment](https://github.com/pytorch/pytorch/pull/130393#issuecomment-2231263753))	2024-07-16 15:43:34 +00:00
PyTorch MergeBot	9cb23ba85b	Revert "Add buffer static input tests to cudagraph trees (#130402 )" This reverts commit `80236dca90`. Reverted https://github.com/pytorch/pytorch/pull/130402 on behalf of https://github.com/clee2000 due to broke lint for torch/_functorch/_aot_autograd/subclass_utils.py https://github.com/pytorch/pytorch/actions/runs/9948630877/job/27483551649 `80236dca90` lint was green on PR, probably a landrace ([comment](https://github.com/pytorch/pytorch/pull/130393#issuecomment-2231263753))	2024-07-16 15:43:34 +00:00
Sam Larsen	c509319210	[inductor] Disable remote fx graph cache in test_snode_runtime (#130655 ) Summary: Unfortunately we can't save / restore metrics.metrics.node_runtimes in the cache entries because these contain objects that don't pickle: `TypeError: cannot pickle 'PyCapsule' object`. Test Plan: `buck2 test 'fbcode//mode/opt' fbcode//caffe2/test/inductor:snode_runtime -- --exact 'caffe2/test/inductor:snode_runtime - test_mm (caffe2.test.inductor.test_snode_runtime.ComputeBoundedTests)' --run-disabled --jobs 18 --stress-runs 10` Differential Revision: D59705654 Pull Request resolved: https://github.com/pytorch/pytorch/pull/130655 Approved by: https://github.com/oulgen	2024-07-16 15:11:17 +00:00
Aaron Enye Shi	aa4ad711ef	[CCA][Memory Snapshot] Create TraceEntryRingBuffer class for alloc_trace logic (#130741 ) Summary: Move the alloc_trace logic into a separate class, to reduce risk of deadlocks when mixing with CCA's lock. Switch to a std::mutex instead of std::recursive_mutex. Lets us re-use the logic in the TraceEntryRingBuffer class for later diffs. Test Plan: CI, resnet run, and FBR model. Differential Revision: D59690408 Pulled By: aaronenyeshi Pull Request resolved: https://github.com/pytorch/pytorch/pull/130741 Approved by: https://github.com/davidberard98	2024-07-16 15:01:48 +00:00
eellison	e11c41035c	Directly use empty strided in cudagraph copy (#130777 ) We had an issue with the `-1` somehow ending up in negative num elements required. not sure why the original didn't work - we should land if CI is green. Pull Request resolved: https://github.com/pytorch/pytorch/pull/130777 Approved by: https://github.com/BoyuanFeng	2024-07-16 14:37:30 +00:00
Aaron Orenstein	4c3348932c	typing: convert_frame (#130670 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/130670 Approved by: https://github.com/Skylion007 ghstack dependencies: #130669	2024-07-16 14:31:35 +00:00
Aaron Orenstein	ea25febfab	typing: storage (#130669 ) This isn't a full typing of the file - it just fixes some uses of unbound 'T' (if you use a TypeVar as an output it also needs to be an input). Pull Request resolved: https://github.com/pytorch/pytorch/pull/130669 Approved by: https://github.com/oulgen, https://github.com/Skylion007	2024-07-16 14:31:35 +00:00
Isuru Fernando	8390843eba	Invalidate StorageImpl instances when tensor is overwritten with cudagraphs (#125264 ) Fixes #104435 Pull Request resolved: https://github.com/pytorch/pytorch/pull/125264 Approved by: https://github.com/ezyang	2024-07-16 14:29:29 +00:00
David Berard	1fbfb3202d	[docs][TorchScript] document c10::AliasAnalysisKind::CONSERVATIVE (#130765 ) I spent a while trying to search this to remember what this was called. Adding it to the OVERVIEW.md docs so it's easier to search Pull Request resolved: https://github.com/pytorch/pytorch/pull/130765 Approved by: https://github.com/nmacchioni, https://github.com/eellison, https://github.com/aaronenyeshi	2024-07-16 14:20:31 +00:00
Xu Han	69e9917245	[inductor] adapt windows file path (#130713 ) This PR depends on https://github.com/pytorch/pytorch/pull/130132 being landed successfully. The detailed log: https://github.com/pytorch/pytorch/issues/124245#issuecomment-2211889758 After the file path was adapted for Windows, the first Windows inductor case ran successfully. ```python import torch def foo(x, y): a = torch.sin(x) b = torch.cos(x) return a + b opt_foo1 = torch.compile(foo) print(opt_foo1(torch.randn(10, 10), torch.randn(10, 10))) ``` Result: ![image](https://github.com/user-attachments/assets/4944df47-e74d-476b-8eb5-1d1fd5abeb41) Co-authored-by: Jiong Gong <jiong.gong@intel.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/130713 Approved by: https://github.com/jgong5, https://github.com/jansel, https://github.com/desertfire	2024-07-16 13:53:39 +00:00
Aaron Gokaslan	53e5b8ac5b	[BE]: Update flake8-comprehensions and enable C420 (#130699 ) Uses `dict.fromkeys` whenever possible as covered by flake8-comprehensions rule C420. While the ruff rule RUF025 is still in preview, flake8-comprehensions have added a new rule which covers this. Use dict.fromkeys is faster when the value being added to the dictionary is the same at every iteration and is immutable, it also removes an unnecessary dict comprehension. This rule will be enabled with our current ruleset in RUF in 0.6 as C420. Pull Request resolved: https://github.com/pytorch/pytorch/pull/130699 Approved by: https://github.com/lezcano, https://github.com/ezyang	2024-07-16 13:47:49 +00:00
Xu Zhao	213685ba97	[torchao][pt2 benchmark runner] Run performance test non-alternately (#130136 ) Summary: By default, performance tests (speedup experiments) will run the baseline and test backend alternately. However, this does not work for the torchao backend, which will change the model in-place, therefore the baseline run will also run with torchao backend since the model has already been quantized. Add a new experiment "latency_experiment" to run performance tests non-alternately (first run baseline for a few iterations, then run the test backend). Test Plan: ``` buck2 run mode/opt //pytorch/benchmark:pt2 -- --only AlbertForMaskedLM --quantization noquant --performance --inference --bfloat16 ``` ``` buck2 run mode/opt //pytorch/benchmark:pt2 -- --only AlbertForMaskedLM --quantization autoquant --performance --inference --bfloat16 --inductor-compile-mode max-autotune ``` Differential Revision: D59332736 Pull Request resolved: https://github.com/pytorch/pytorch/pull/130136 Approved by: https://github.com/jerryzh168	2024-07-16 13:38:17 +00:00
eellison	67c6941b4e	Update torch.cat decomp for 0-dim (#130763 ) Fix for https://github.com/pytorch/pytorch/issues/130615 Pull Request resolved: https://github.com/pytorch/pytorch/pull/130763 Approved by: https://github.com/Skylion007, https://github.com/mlazos	2024-07-16 13:34:01 +00:00
Jiong Gong	705da70f2c	[inductor][cpp] align dtype convert cache between vec and scalar kernels (#130677 ) The conversion cache used for fixing https://github.com/pytorch/pytorch/issues/115260 depended on "store" which might be removed and ignored. This would lead to inconsistent code generated between vec and scalar kernels since we generate scalar kernel first followed by the vector kernel and the store buffer might be removed by the scalar and impacts the vector kernel codegen. This PR move the caching from "store" to the "to_dtype" calls which won't be impacted by the removed buffers. `pytest -k test_consistent_remove_buffers test/inductor/test_cpu_repro.py` before ```c++ extern "C" void kernel(const bfloat16* in_ptr0, bfloat16* out_ptr1) { { for(long x0=static_cast<long>(0L); x0<static_cast<long>(64L); x0+=static_cast<long>(16L)) { auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x0), 16); auto tmp1 = at::vec::convert<float>(tmp0); auto tmp2 = tmp1 + tmp1; auto tmp3 = at::vec::convert<bfloat16>(tmp2); auto tmp4 = at::vec::convert<float>(tmp3); auto tmp5 = tmp1 + tmp4; auto tmp6 = at::vec::convert<bfloat16>(tmp5); tmp6.store(out_ptr1 + static_cast<long>(x0), 16); } #pragma omp simd simdlen(8) for(long x0=static_cast<long>(64L); x0<static_cast<long>(65L); x0+=static_cast<long>(1L)) { auto tmp0 = in_ptr0[static_cast<long>(x0)]; auto tmp1 = c10::convert<float>(tmp0); auto tmp2 = decltype(tmp1)(tmp1 + tmp1); auto tmp3 = c10::convert<bfloat16>(tmp2); auto tmp4 = decltype(tmp1)(tmp1 + tmp2); auto tmp5 = c10::convert<bfloat16>(tmp4); out_ptr1[static_cast<long>(x0)] = tmp5; } } } ``` after ```c++ extern "C" void kernel(const bfloat16* in_ptr0, bfloat16* out_ptr1) { { for(long x0=static_cast<long>(0L); x0<static_cast<long>(64L); x0+=static_cast<long>(16L)) { auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x0), 16); auto tmp1 = at::vec::convert<float>(tmp0); auto tmp2 = tmp1 + tmp1; auto tmp3 = 
at::vec::convert<bfloat16>(tmp2); auto tmp4 = tmp1 + tmp2; auto tmp5 = at::vec::convert<bfloat16>(tmp4); tmp5.store(out_ptr1 + static_cast<long>(x0), 16); } #pragma omp simd simdlen(8) for(long x0=static_cast<long>(64L); x0<static_cast<long>(65L); x0+=static_cast<long>(1L)) { auto tmp0 = in_ptr0[static_cast<long>(x0)]; auto tmp1 = c10::convert<float>(tmp0); auto tmp2 = decltype(tmp1)(tmp1 + tmp1); auto tmp3 = c10::convert<bfloat16>(tmp2); auto tmp4 = decltype(tmp1)(tmp1 + tmp2); auto tmp5 = c10::convert<bfloat16>(tmp4); out_ptr1[static_cast<long>(x0)] = tmp5; } } } ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/130677 Approved by: https://github.com/leslie-fang-intel	2024-07-16 13:25:05 +00:00
PyTorch MergeBot	68a4f2a3df	Revert "Tighten torch.library.infer_schema input types (#130705 )" This reverts commit `ca2d424c6e`. Reverted https://github.com/pytorch/pytorch/pull/130705 on behalf of https://github.com/atalman due to Failing internal CI ([comment](https://github.com/pytorch/pytorch/pull/130705#issuecomment-2230821876))	2024-07-16 12:57:11 +00:00
Andrea Frittoli	dee0f43fde	Add a CI job to check runner det sync (#129746 ) Add a new CI job that runs only when the runner determinator files are modified. The job checks that the runner_determinator.py script is in sync with the version embedded in _runner-determinator.yaml. Fixes TBD Pull Request resolved: https://github.com/pytorch/pytorch/pull/129746 Approved by: https://github.com/zxiiro, https://github.com/ZainRizvi, https://github.com/jeanschmidt	2024-07-16 11:44:55 +00:00
Jovian Anthony Jaison	e57101d927	Add testing regarding SparseAdam state_dicts (#130645 ) Summary: - Updated SparseAdam to run test_state_dict_deterministic unit test. - Made gradients sparse while keeping weights dense in the above test. Test Plan: - Ran test_optim.py locally. Fixes #116507 Co-authored-by: Jane (Yuan) Xu <31798555+janeyx99@users.noreply.github.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/130645 Approved by: https://github.com/janeyx99	2024-07-16 11:29:22 +00:00
cyy	168e41009b	[structural binding][10/N] Replace std::tie with structural binding (#130784 ) Follows #130404 Pull Request resolved: https://github.com/pytorch/pytorch/pull/130784 Approved by: https://github.com/malfet	2024-07-16 10:28:14 +00:00
Xuehai Pan	747b38c131	[BE][Easy][2/19] enforce style for empty lines in import segments in `.ci/` and `.github/` (#129753 ) See https://github.com/pytorch/pytorch/pull/129751#issue-2380881501. Most changes are auto-generated by linter. You can review these PRs via: ```bash git diff --ignore-all-space --ignore-blank-lines HEAD~1 ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/129753 Approved by: https://github.com/malfet ghstack dependencies: #129752	2024-07-16 09:40:00 +00:00
Yu, Guangye	096dc444ce	Keep zero check compatible with different sympy versions (#130729 ) # Motivation I found a difference between sympy 1.12 and 1.13. ```python # for 1.12 >>> import sympy >>> a = sympy.Number(0.0) >>> a == 0 True ``` ```python # for 1.13 >>> import sympy >>> a = sympy.Number(0.0) >>> a == 0 False ``` This difference in behavior affects the result of [safe_mul](`6beec34b1c/torch/utils/_sympy/value_ranges.py (L521-L528)`), producing an incorrect result when `a = sympy.Number(0.0)` and `b = inf`: the result is `nan` if the sympy version is 1.13 (the expected result is 0). ```python def safe_mul(a, b): # Make unknown() * wrap(0.0) == wrap(0.0) if a == 0.0: return a elif b == 0.0: return b else: return a * b ``` Across different sympy versions, `sympy.Number(0)` always behaves the same way: it compares equal to 0.0. ```python >>> import sympy >>> a = sympy.Number(0) >>> a == 0.0 True # for different sympy versions ``` So, use 0.0 when checking for zero in safe_mul to stay compatible with different sympy versions. Pull Request resolved: https://github.com/pytorch/pytorch/pull/130729 Approved by: https://github.com/lezcano, https://github.com/EikanWang	2024-07-16 08:39:00 +00:00
Animesh Jain	fedae41c57	[dynamo] Do not mark nn.module containers as BuiltinNNModuleVariable (#130773 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/130773 Approved by: https://github.com/williamwen42, https://github.com/mlazos	2024-07-16 06:55:46 +00:00
Aaron Gokaslan	83eedf66b9	Update libfmt submodule to 11.0.1 (#130628 ) Update libfmt to 11.0.1 reopen of https://github.com/pytorch/pytorch/pull/129962. Requires a kineto update and moves fmt::join into a separate include so added it where necessary. Pull Request resolved: https://github.com/pytorch/pytorch/pull/130628 Approved by: https://github.com/aaronenyeshi	2024-07-16 06:12:11 +00:00
chuanqiw	c549629696	[CD] Fix xpu nightly wheel test failure (#130742 ) The xpu nightly wheel test hit a permission issue on the `linux.idc.xpu` runner: those runners are onboarded with the `jenkins` user, but the binary test runs in the docker container as `root` directly, so the temp files can't be deleted. See https://github.com/pytorch/pytorch/actions/runs/9935452320/job/27448053625#step:8:91 Pull Request resolved: https://github.com/pytorch/pytorch/pull/130742 Approved by: https://github.com/atalman	2024-07-16 05:31:20 +00:00
cyy	95dbbf713e	[Distributed] [9/N] Fix clang-tidy warnings in torch/csrc/distributed/rpc (#130109 ) Follows #125102 Pull Request resolved: https://github.com/pytorch/pytorch/pull/130109 Approved by: https://github.com/ezyang	2024-07-16 04:23:42 +00:00
Wanchao Liang	7b2e802f31	[dtensor] add a few dunder methods to pointwise ops (#130754 ) fixes https://github.com/pytorch/pytorch/issues/130671 Pull Request resolved: https://github.com/pytorch/pytorch/pull/130754 Approved by: https://github.com/Skylion007, https://github.com/awgu, https://github.com/msaroufim ghstack dependencies: #130753	2024-07-16 02:53:35 +00:00
Wanchao Liang	2b2671a7b1	[dtensor] fix foreach_norm when ord is 2 (#130753 ) As titled: fixes the case where ord is passed as 2 (the default value), in which the op dispatching does not receive the default value. We simply check whether the args schema received an `ord` field or not. Pull Request resolved: https://github.com/pytorch/pytorch/pull/130753 Approved by: https://github.com/awgu	2024-07-16 02:53:35 +00:00
Aaron Gokaslan	a29052a0bf	[BE][Ez]: Update ruff to 0.5.2 (#130698 ) Update ruff to 0.5.2, which brings bugfixes and performance improvements. Pull Request resolved: https://github.com/pytorch/pytorch/pull/130698 Approved by: https://github.com/ezyang	2024-07-16 01:31:30 +00:00
Adrian Wälchli	ad314a2f05	Pass `torch.load(weights_only=)` internally to avoid FutureWarning (#130663 ) Fixes #130658 Pull Request resolved: https://github.com/pytorch/pytorch/pull/130663 Approved by: https://github.com/malfet, https://github.com/LucasLLC	2024-07-16 01:24:38 +00:00
Sam Larsen	3cd2ae331a	Use inductor TestCase for distributed tests (#129494 ) Summary: At least some of the tests deriving from MultiProcessTestCase exercise inductor. Using the inductor TestCase class makes sure we always get a clean cache dir. Pull Request resolved: https://github.com/pytorch/pytorch/pull/129494 Approved by: https://github.com/eellison	2024-07-16 01:24:35 +00:00
Brian Hirsh	39eeaac4e5	inductor: avoiding moving constructor to cuda when it would cause h2d sync in index_put_ fallback (#130338 ) My attempt at a fix for https://github.com/pytorch/pytorch/issues/130335, see issue for more details / internal xref. Any feedback from inductor folks is appreciated. I attempted to make the move-constructors-to-cuda pass a bit less aggressive by detecting when the movement would incur a H2D sync for `aten.index_put_`. I'm not sure if there are any other ops that inductor falls back to eager on, that may-or-may-not incur a H2D sync if we change any of their inputs from cpu to cuda. Pull Request resolved: https://github.com/pytorch/pytorch/pull/130338 Approved by: https://github.com/eellison	2024-07-16 00:48:58 +00:00
Jiang, Yanbing	93a03edcf9	Update error message in meta__convert_weight_to_int4pack (#130707 ) This PR is to fix error message in https://github.com/pytorch/pytorch/pull/129940. Pull Request resolved: https://github.com/pytorch/pytorch/pull/130707 Approved by: https://github.com/lezcano, https://github.com/malfet	2024-07-16 00:44:35 +00:00
Xuehai Pan	a3abfa5cb5	[BE][Easy][1/19] enforce style for empty lines in import segments (#129752 ) See https://github.com/pytorch/pytorch/pull/129751#issue-2380881501. Most changes are auto-generated by linter. You can review these PRs via: ```bash git diff --ignore-all-space --ignore-blank-lines HEAD~1 ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/129752 Approved by: https://github.com/ezyang, https://github.com/malfet	2024-07-16 00:42:56 +00:00

1 2 3 4 5 ...

75611 commits