diff --git a/.ci/pytorch/common_utils.sh b/.ci/pytorch/common_utils.sh
index 643fca3e408..795d216f080 100644
--- a/.ci/pytorch/common_utils.sh
+++ b/.ci/pytorch/common_utils.sh
@@ -241,6 +241,12 @@ function checkout_install_torchbench() {
   popd
 }
 
+function install_torchao() {
+  local commit
+  commit=$(get_pinned_commit torchao)
+  pip_install --no-use-pep517 --user "git+https://github.com/pytorch/ao.git@${commit}"
+}
+
 function print_sccache_stats() {
   echo 'PyTorch Build Statistics'
   sccache --show-stats
diff --git a/.ci/pytorch/test.sh b/.ci/pytorch/test.sh
index e237661d2ed..6709be483ba 100755
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@@ -615,6 +615,11 @@ test_single_dynamo_benchmark() {
 }
 
 test_inductor_micro_benchmark() {
+  # torchao requires CUDA compute capability 8.0 or above for bfloat16 support
+  if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
+    export TORCH_CUDA_ARCH_LIST="8.0;8.6"
+  fi
+  install_torchao
   TEST_REPORTS_DIR=$(pwd)/test/test-reports
   if [[ "${TEST_CONFIG}" == *cpu* ]]; then
     test_inductor_set_cpu_affinity
diff --git a/.github/ci_commit_pins/torchao.txt b/.github/ci_commit_pins/torchao.txt
new file mode 100644
index 00000000000..d12c20e6a11
--- /dev/null
+++ b/.github/ci_commit_pins/torchao.txt
@@ -0,0 +1 @@
+51c87b6ead6b7e098ada95d6a7609ee873b854cf
diff --git a/benchmarks/gpt_fast/benchmark.py b/benchmarks/gpt_fast/benchmark.py
index 6270f9744f6..95d378aa355 100644
--- a/benchmarks/gpt_fast/benchmark.py
+++ b/benchmarks/gpt_fast/benchmark.py
@@ -265,9 +265,16 @@ DEFAULT_OUTPUT_FILE = "gpt_fast_benchmark.csv"
 
 all_experiments = {
     # A list of GPT models: LlaMa, Mixtral, etc.
+    # waiting for an A100-80G machine to be available in CI
+    # https://github.com/pytorch/pytorch/actions/runs/12018005803/job/33503683582?pr=140627
+    # before we can turn on autoquant
+    # alternatively, we can save the model after autoquant and just load it here to track
+    # the performance
+    # run_llama2_7b_autoquant,
     run_llama2_7b_bf16,
     run_llama2_7b_int8,
     run_mixtral_8x7b_int8,
+    # run_mixtral_8x7b_autoquant,
     # A list of micro-benchmarks.
     run_mlp_layer_norm_gelu,
     run_layer_norm,
@@ -286,6 +293,7 @@ def main(output_file=DEFAULT_OUTPUT_FILE):
             # This happens when torch is compiled with CUDA turning off completely
             device = "cpu"
 
+        torch.compiler.cudagraph_mark_step_begin()
         lst = func(device)
         for x in lst:
             results.append(dataclasses.astuple(x))
diff --git a/benchmarks/gpt_fast/generate.py b/benchmarks/gpt_fast/generate.py
index 56e6cff1cf8..8ea90d20b85 100644
--- a/benchmarks/gpt_fast/generate.py
+++ b/benchmarks/gpt_fast/generate.py
@@ -4,6 +4,7 @@ import platform
 import time
 from typing import Optional, Tuple
 
+import torchao
 from mixtral_moe_model import ConditionalFeedForward, Transformer as MixtralMoE
 from mixtral_moe_quantize import (
     ConditionalFeedForwardInt8,
@@ -21,6 +22,8 @@
 torch._inductor.config.triton.unique_kernel_names = True
 torch._inductor.config.fx_graph_cache = True  # Experimental feature to reduce compilation times, will be on by default in future
 torch._inductor.config.assert_indirect_indexing = False
+compiled = False
+
 
 @dataclasses.dataclass
 class GPTModelConfig:
@@ -31,6 +34,7 @@ class GPTModelConfig:
     token_per_sec: float
     memory_bandwidth: float
     compilation_time: float
+    batch_size: Optional[int] = None
 
 
 def device_sync(device):
@@ -74,7 +78,6 @@ def sample(logits, temperature: float = 1.0, top_k: Optional[int] = None):
     return idx_next, probs
 
 
-@torch.compile(fullgraph=True)
 def prefill(
     model: torch.nn.Module, x: torch.Tensor, input_pos: torch.Tensor, **sampling_kwargs
 ) -> torch.Tensor:
@@ -83,7 +86,6 @@ def prefill(
     return sample(logits, **sampling_kwargs)[0]
 
 
-@torch.compile(fullgraph=True, mode="reduce-overhead")
 def decode_one_token(
     model: torch.nn.Module, x: torch.Tensor, input_pos: torch.Tensor, **sampling_kwargs
 ) -> Tuple[torch.Tensor, torch.Tensor]:
@@ -223,9 +225,48 @@ def run_experiment(
 
     start = -1
     compilation_time = None
+    if x.mode == "autoquant":
+        print("Using autoquant")
+        model = torchao.autoquant(model, manual=True, error_on_unseen=False)
+        generate(model, prompt, max_new_tokens, temperature=temperature, top_k=top_k)
+        model.finalize_autoquant()
+
+    if x.mode == "autoquant_v2":
+        print("Using autoquant_v2")
+        from torchao.prototype.quantization.autoquant_v2 import autoquant_v2
+
+        p = prompt.view(1, -1)
+        T = prompt.size(0)
+        T_new = T + max_new_tokens
+        max_seq_length = min(T_new, model.config.block_size)
+        input_pos = torch.arange(0, T, device=device)
+        example_input = (p, input_pos)
+
+        with torch.device(device):
+            model.setup_caches(max_batch_size=1, max_seq_length=max_seq_length)
+        model = autoquant_v2(
+            model,
+            manual=True,
+            error_on_unseen=False,
+            example_input=example_input,
+            batch_size=x.batch_size,
+        )
+        torch.compiler.cudagraph_mark_step_begin()
+        generate(model, prompt, max_new_tokens, temperature=temperature, top_k=top_k)
+        model.finalize_autoquant()
+
+    global decode_one_token, prefill, compiled
+    if not compiled:
+        compiled = True
+        decode_one_token = torch.compile(
+            decode_one_token, mode="reduce-overhead", fullgraph=True
+        )
+        prefill = torch.compile(prefill, fullgraph=True)
+
     for i in range(start, num_samples):
         device_sync(device=device)  # MKG
+        torch.compiler.cudagraph_mark_step_begin()
         t0 = time.perf_counter()
         y = generate(
             model, prompt, max_new_tokens, temperature=temperature, top_k=top_k
         )
@@ -402,3 +443,207 @@ def run_mixtral_8x7b_int8(device: str = "cuda"):
             True,
         ),
     ]
+
+
+# token_per_sec and memory_bandwidth target numbers are for A100-40GB, which is different from the typical A100-80GB.
+def run_llama2_7b_autoquant(device: str = "cuda"):
+    from benchmark import Experiment
+
+    model = GPTModelConfig(
+        "Llama-2-7b-chat-hf",
+        LLaMA,
+        "autoquant",
+        None,
+        144,
+        957,
+        136,
+    )
+    token_per_sec, memory_bandwidth, compilation_time = run_experiment(
+        model, device=device
+    )
+    return [
+        Experiment(
+            model.name,
+            "token_per_sec",
+            model.token_per_sec,
+            f"{token_per_sec:.02f}",
+            model.mode,
+            device,
+            get_arch_name(),
+            True,
+        ),
+        Experiment(
+            model.name,
+            "memory_bandwidth(GB/s)",
+            model.memory_bandwidth,
+            f"{memory_bandwidth:.02f}",
+            model.mode,
+            device,
+            get_arch_name(),
+            True,
+        ),
+        Experiment(
+            model.name,
+            "compilation_time(s)",
+            model.compilation_time,
+            f"{compilation_time:.02f}",
+            model.mode,
+            device,
+            get_arch_name(),
+            True,
+        ),
+    ]
+
+
+# token_per_sec and memory_bandwidth target numbers are for A100-40GB, which is different from the typical A100-80GB.
+def run_mixtral_8x7b_autoquant(device: str = "cuda"):
+    from benchmark import Experiment
+
+    # We reduced the original number of layers from 32 to 16 to fit within the CI memory limit.
+    model = GPTModelConfig(
+        "Mixtral-8x7B-v0.1",
+        MixtralMoE,
+        "autoquant",
+        None,
+        175,
+        1130,
+        133,
+    )
+    token_per_sec, memory_bandwidth, compilation_time = run_experiment(
+        model, device=device
+    )
+    return [
+        Experiment(
+            model.name,
+            "token_per_sec",
+            model.token_per_sec,
+            f"{token_per_sec:.02f}",
+            model.mode,
+            device,
+            get_arch_name(),
+            True,
+        ),
+        Experiment(
+            model.name,
+            "memory_bandwidth(GB/s)",
+            model.memory_bandwidth,
+            f"{memory_bandwidth:.02f}",
+            model.mode,
+            device,
+            get_arch_name(),
+            True,
+        ),
+        Experiment(
+            model.name,
+            "compilation_time(s)",
+            model.compilation_time,
+            f"{compilation_time:.02f}",
+            model.mode,
+            device,
+            get_arch_name(),
+            True,
+        ),
+    ]
+
+
+# token_per_sec and memory_bandwidth target numbers are for A100-40GB, which is different from the typical A100-80GB.
+def run_llama2_7b_autoquant_v2(device: str = "cuda"):
+    from benchmark import Experiment
+
+    model = GPTModelConfig(
+        "Llama-2-7b-chat-hf",
+        LLaMA,
+        "autoquant_v2",
+        None,
+        144,
+        957,
+        136,
+        6,  # batch_size
+    )
+    token_per_sec, memory_bandwidth, compilation_time = run_experiment(
+        model, device=device
+    )
+    return [
+        Experiment(
+            model.name,
+            "token_per_sec",
+            model.token_per_sec,
+            f"{token_per_sec:.02f}",
+            model.mode,
+            device,
+            get_arch_name(),
+            True,
+        ),
+        Experiment(
+            model.name,
+            "memory_bandwidth(GB/s)",
+            model.memory_bandwidth,
+            f"{memory_bandwidth:.02f}",
+            model.mode,
+            device,
+            get_arch_name(),
+            True,
+        ),
+        Experiment(
+            model.name,
+            "compilation_time(s)",
+            model.compilation_time,
+            f"{compilation_time:.02f}",
+            model.mode,
+            device,
+            get_arch_name(),
+            True,
+        ),
+    ]
+
+
+# token_per_sec and memory_bandwidth target numbers are for A100-40GB, which is different from the typical A100-80GB.
+def run_mixtral_8x7b_autoquant_v2(device: str = "cuda"):
+    from benchmark import Experiment
+
+    # We reduced the original number of layers from 32 to 16 to fit within the CI memory limit.
+    model = GPTModelConfig(
+        "Mixtral-8x7B-v0.1",
+        MixtralMoE,
+        "autoquant_v2",
+        None,
+        175,
+        1130,
+        133,
+        6,  # batch_size
+    )
+    token_per_sec, memory_bandwidth, compilation_time = run_experiment(
+        model, device=device
+    )
+    return [
+        Experiment(
+            model.name,
+            "token_per_sec",
+            model.token_per_sec,
+            f"{token_per_sec:.02f}",
+            model.mode,
+            device,
+            get_arch_name(),
+            True,
+        ),
+        Experiment(
+            model.name,
+            "memory_bandwidth(GB/s)",
+            model.memory_bandwidth,
+            f"{memory_bandwidth:.02f}",
+            model.mode,
+            device,
+            get_arch_name(),
+            True,
+        ),
+        Experiment(
+            model.name,
+            "compilation_time(s)",
+            model.compilation_time,
+            f"{compilation_time:.02f}",
+            model.mode,
+            device,
+            get_arch_name(),
+            True,
+        ),
+    ]
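
For reference, the autoquant branch added to run_experiment() follows torchao's manual two-step flow: wrap the model, run it once so each layer sees a representative input, then call finalize_autoquant() to pick kernels. A minimal sketch of that flow, assuming a bf16-capable CUDA GPU; the toy MLP and shapes below are illustrative stand-ins for the benchmark's LLaMA/Mixtral Transformer and its generate() calibration call.

import torch
import torchao

# Toy stand-in for the Transformer used by the benchmark.
model = torch.nn.Sequential(
    torch.nn.Linear(1024, 4096),
    torch.nn.GELU(),
    torch.nn.Linear(4096, 1024),
).to(device="cuda", dtype=torch.bfloat16)
example_input = torch.randn(8, 1024, device="cuda", dtype=torch.bfloat16)

# manual=True defers kernel selection until finalize_autoquant() is called;
# error_on_unseen=False tolerates layers that never receive a calibration input.
model = torchao.autoquant(model, manual=True, error_on_unseen=False)
model(example_input)        # calibration pass (the patch uses generate() here)
model.finalize_autoquant()  # choose the best quantization variant per layer

out = model(example_input)  # subsequent calls run with the selected kernels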
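
The patch also replaces the @torch.compile decorators on prefill and decode_one_token with a one-time torch.compile call guarded by the global compiled flag, so quantization can rewrite the model before compilation, and it calls torch.compiler.cudagraph_mark_step_begin() before each timed run because mode="reduce-overhead" uses CUDA graphs. A small sketch of the same pattern, with a hypothetical step() standing in for decode_one_token:

import torch

compiled = False


def step(x: torch.Tensor) -> torch.Tensor:
    # Hypothetical stand-in for decode_one_token in the benchmark.
    return torch.nn.functional.gelu(x) * 2.0


def run(x: torch.Tensor, iters: int = 3) -> torch.Tensor:
    global step, compiled
    # Compile lazily and only once, after any model rewrites (e.g. autoquant).
    if not compiled:
        compiled = True
        step = torch.compile(step, mode="reduce-overhead", fullgraph=True)
    out = x
    for _ in range(iters):
        # reduce-overhead mode replays CUDA graphs; marking each logical step
        # tells the runtime that outputs from the previous replay may be overwritten.
        torch.compiler.cudagraph_mark_step_begin()
        out = step(out)
    return out


if torch.cuda.is_available():
    print(run(torch.randn(1024, device="cuda")).shape)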
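
The TORCH_CUDA_ARCH_LIST="8.0;8.6" export in test.sh builds torchao only for Ampere targets, since bfloat16 kernels need compute capability 8.0 or higher. A quick runtime check of the same requirement; bf16_capable() is a hypothetical helper, not part of the CI scripts:

import torch


def bf16_capable() -> bool:
    # bfloat16 tensor-core support starts at compute capability 8.0 (A100/SM80, RTX 30xx/SM86).
    if not torch.cuda.is_available():
        return False
    major, _minor = torch.cuda.get_device_capability()
    return major >= 8


# torch.cuda.is_bf16_supported() performs an equivalent built-in check.
print("bf16 capable:", bf16_capable())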