diff --git a/.circleci/config.yml b/.circleci/config.yml
index de3fe5daece..10ab1614d0c 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -611,6 +611,7 @@ jobs:
         # =================== The following code will be executed inside Docker container ===================
         set -ex
         export BUILD_ENVIRONMENT=${BUILD_ENVIRONMENT}
+        export SCRIBE_GRAPHQL_ACCESS_TOKEN="${SCRIBE_GRAPHQL_ACCESS_TOKEN}"
         ${PARALLEL_FLAGS}
         source ./workspace/env
         cd workspace
diff --git a/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml b/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml
index 5239eb8d9cc..84eb3275c04 100644
--- a/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml
+++ b/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml
@@ -159,6 +159,7 @@ jobs:
         # =================== The following code will be executed inside Docker container ===================
         set -ex
         export BUILD_ENVIRONMENT=${BUILD_ENVIRONMENT}
+        export SCRIBE_GRAPHQL_ACCESS_TOKEN="${SCRIBE_GRAPHQL_ACCESS_TOKEN}"
         ${PARALLEL_FLAGS}
         source ./workspace/env
         cd workspace
diff --git a/.gitignore b/.gitignore
index 7f196c61b80..d1001b29bd5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,6 +25,7 @@
 aten/build/
 aten/src/ATen/Config.h
 aten/src/ATen/cuda/CUDAConfig.h
+benchmarks/.data
 caffe2/cpp_test/
 dist/
 docs/cpp/src
diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index 6b17e08e08a..f68ba874293 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -340,6 +340,18 @@ test_bazel() {
   tools/bazel test --test_timeout=480 --test_output=all --test_tag_filters=-gpu-required --test_filter=-*CUDA :all_tests
 }
 
+test_benchmarks() {
+  if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
+    pip_install --user "pytest-benchmark==3.2.3"
+    pip_install --user "requests"
+    BENCHMARK_DATA="benchmarks/.data"
+    mkdir -p ${BENCHMARK_DATA}
+    pytest benchmarks/fastrnns/test_bench.py --benchmark-sort=Name --benchmark-json=${BENCHMARK_DATA}/fastrnns.json
+    python benchmarks/upload_scribe.py --pytest_bench_json ${BENCHMARK_DATA}/fastrnns.json
+    assert_git_not_dirty
+  fi
+}
+
 test_cpp_extensions() {
   # This is to test whether cpp extension build is compatible with current env. No need to test both ninja and no-ninja build
   time python test/run_test.py --include test_cpp_extensions_aot_ninja --verbose --determine-from="$DETERMINE_FROM"
@@ -392,5 +404,6 @@ else
   test_custom_backend
   test_torch_function_benchmark
   test_distributed
+  test_benchmarks
   test_rpc
 fi
diff --git a/benchmarks/fastrnns/factory.py b/benchmarks/fastrnns/factory.py
index 769e31f802b..056bcd746ae 100644
--- a/benchmarks/fastrnns/factory.py
+++ b/benchmarks/fastrnns/factory.py
@@ -48,8 +48,8 @@ def simple_backward_setup(output, seed=None):
     return output, grad_output
 
 
-def simple_backward(output, grad_output):
-    return output.backward(grad_output)
+def simple_backward(output, grad_output, **kwargs):
+    return output.backward(grad_output, **kwargs)
 
 
 def pytorch_lstm_creator(**kwargs):
diff --git a/benchmarks/fastrnns/test_bench.py b/benchmarks/fastrnns/test_bench.py
new file mode 100644
index 00000000000..988c7e3fefd
--- /dev/null
+++ b/benchmarks/fastrnns/test_bench.py
@@ -0,0 +1,54 @@
+from __future__ import print_function
+import pytest
+import torch
+from .runner import get_nn_runners
+
+default_rnns = ['cudnn', 'aten', 'jit', 'jit_premul', 'jit_premul_bias', 'jit_simple',
+                'jit_multilayer', 'py']
+default_cnns = ['resnet18', 'resnet18_jit', 'resnet50', 'resnet50_jit']
+all_nets = default_rnns + default_cnns
+
+def pytest_generate_tests(metafunc):
+    # This creates lists of tests to generate, can be customized
+    if metafunc.cls.__name__ == "TestBenchNetwork":
+        metafunc.parametrize('net_name', all_nets, scope="class")
+
+@pytest.fixture(scope='class')
+def modeldef(request, net_name):
+    # Given a 'net_name' provided by generate_tests, build the thing
+    name, rnn_creator, context = get_nn_runners(net_name)[0]
+    creator_args = {
+        'seqLength': 100, 'numLayers': 1,
+        'inputSize': 512, 'hiddenSize': 512,
+        'miniBatch': 64, 'device': 'cuda', 'seed': None
+    }
+    return rnn_creator(**creator_args)
+
+def cuda_sync(func, *args, **kwargs):
+    out = func(*args, **kwargs)
+    torch.cuda.synchronize()
+    return out
+
+@pytest.mark.benchmark(
+    warmup=True,
+    warmup_iterations=3,
+    disable_gc=True,
+    max_time=0.1,
+    group="fastrnns",
+)
+class TestBenchNetwork:
+    # See 'modeldef' fixture, which provides the things to benchmark
+    def test_forward(self, modeldef, benchmark):
+        forward_output = benchmark(cuda_sync, modeldef.forward, *modeldef.inputs)
+
+    def test_backward(self, modeldef, benchmark):
+        backward_input = modeldef.forward(*modeldef.inputs)
+        if modeldef.backward_setup is not None:
+            backward_input = modeldef.backward_setup(backward_input)
+
+        if modeldef.backward is not None:
+            benchmark(cuda_sync, modeldef.backward, *backward_input, retain_graph=True)
+
+        for param in modeldef.params:
+            assert param.grad is not None
+            param.grad.data.zero_()
diff --git a/benchmarks/upload_scribe.py b/benchmarks/upload_scribe.py
new file mode 100644
index 00000000000..785bd55a766
--- /dev/null
+++ b/benchmarks/upload_scribe.py
@@ -0,0 +1,130 @@
+"""Scribe Uploader for Pytorch Benchmark Data
+
+Currently supports data in pytest-benchmark format but can be extended.
+
+New fields can be added just by modifying the schema in this file, schema
+checking is only here to encourage reusing existing fields and avoiding typos.
+""" + +import argparse +import time +import json +import os +import requests +import subprocess +from collections import defaultdict + + +class ScribeUploader: + def __init__(self, category): + self.category = category + + def format_message(self, field_dict): + assert 'time' in field_dict, "Missing required Scribe field 'time'" + message = defaultdict(dict) + for field, value in field_dict.items(): + if field in self.schema['normal']: + message['normal'][field] = str(value) + elif field in self.schema['int']: + message['int'][field] = int(value) + elif field in self.schema['float']: + message['float'][field] = float(value) + else: + + raise ValueError("Field {} is not currently used, " + "be intentional about adding new fields".format(field)) + return message + + def _upload_intern(self, messages): + for m in messages: + json_str = json.dumps(m) + cmd = ['scribe_cat', self.category, json_str] + subprocess.run(cmd) + + def upload(self, messages): + if os.environ.get('SCRIBE_INTERN'): + return self._upload_intern(messages) + access_token = os.environ.get("SCRIBE_GRAPHQL_ACCESS_TOKEN") + if not access_token: + raise ValueError("Can't find access token from environment variable") + url = "https://graph.facebook.com/scribe_logs" + r = requests.post( + url, + data={ + "access_token": access_token, + "logs": json.dumps( + [ + { + "category": self.category, + "message": json.dumps(message), + "line_escape": False, + } + for message in messages + ] + ), + }, + ) + print(r.text) + r.raise_for_status() + +class PytorchBenchmarkUploader(ScribeUploader): + def __init__(self): + super().__init__('perfpipe_pytorch_benchmarks') + self.schema = { + 'int': [ + 'time', 'rounds', + ], + 'normal': [ + 'benchmark_group', 'benchmark_name', 'benchmark_class', 'benchmark_time', + 'pytorch_commit_id', 'pytorch_branch', 'pytorch_commit_time', 'pytorch_version', + 'pytorch_git_dirty', + 'machine_kernel', 'machine_processor', 'machine_hostname', + 'circle_build_num', 'circle_project_reponame', + ], + 'float': [ + 'stddev', 'min', 'median', 'max', 'mean', + ] + } + + def post_pytest_benchmarks(self, pytest_json): + machine_info = pytest_json['machine_info'] + commit_info = pytest_json['commit_info'] + upload_time = int(time.time()) + messages = [] + for b in pytest_json['benchmarks']: + m = self.format_message({ + "time": upload_time, + "benchmark_group": b['group'], + "benchmark_name": b['name'], + "benchmark_class": b['fullname'], + "benchmark_time": pytest_json['datetime'], + "pytorch_commit_id": commit_info['id'], + "pytorch_branch": commit_info['branch'], + "pytorch_commit_time": commit_info['time'], + "pytorch_version": None, + "pytorch_git_dirty": commit_info['dirty'], + "machine_kernel": machine_info['release'], + "machine_processor": machine_info['processor'], + "machine_hostname": machine_info['node'], + "circle_build_num": os.environ.get("CIRCLE_BUILD_NUM"), + "circle_project_reponame": os.environ.get("CIRCLE_PROJECT_REPONAME"), + "stddev": b['stats']['stddev'], + "rounds": b['stats']['rounds'], + "min": b['stats']['min'], + "median": b['stats']['median'], + "max": b['stats']['max'], + "mean": b['stats']['mean'], + }) + messages.append(m) + self.upload(messages) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--pytest_bench_json", type=argparse.FileType('r'), + help='Upload json data formatted by pytest-benchmark module') + args = parser.parse_args() + if args.pytest_bench_json: + benchmark_uploader = PytorchBenchmarkUploader() + json_data = 
json.load(args.pytest_bench_json) + benchmark_uploader.post_pytest_benchmarks(json_data)
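A minimal sketch of exercising the new flow outside CI, mirroring test_benchmarks() above; it assumes a CUDA machine, uses plain pip install in place of the repo's pip_install helper, and <token> is a placeholder for a real Scribe access token (skip the last step to only produce the JSON report):

    pip install --user "pytest-benchmark==3.2.3" requests
    mkdir -p benchmarks/.data
    pytest benchmarks/fastrnns/test_bench.py --benchmark-sort=Name --benchmark-json=benchmarks/.data/fastrnns.json
    SCRIBE_GRAPHQL_ACCESS_TOKEN="<token>" python benchmarks/upload_scribe.py --pytest_bench_json benchmarks/.data/fastrnns.json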