From f2e41257e4e72060084073308a9ef1b4841a2ed5 Mon Sep 17 00:00:00 2001 From: Will Constable Date: Thu, 28 Jan 2021 19:27:29 -0800 Subject: [PATCH] Back out "Revert D26077905: Back out "Revert D25850783: Add torch::deploy, an embedded torch-python interpreter"" (#51267) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/51267 Original commit changeset: b70185916502 Test Plan: test locally, oss ci-all, fbcode incl deferred Reviewed By: suo Differential Revision: D26121251 fbshipit-source-id: 4315b7fd5476914c8e5d6f547e1cfbcf0c227781 --- .github/workflows/lint.yml | 6 + .gitignore | 3 + .jenkins/pytorch/build.sh | 11 + .jenkins/pytorch/test.sh | 8 + CMakeLists.txt | 10 +- torch/__init__.py | 10 +- torch/_ops.py | 3 +- torch/_utils_internal.py | 16 +- torch/csrc/Module.cpp | 2 + torch/csrc/deploy/.gitignore | 1 + torch/csrc/deploy/CMakeLists.txt | 3 + torch/csrc/deploy/README.md | 10 + torch/csrc/deploy/example/simple.pt | Bin 0 -> 2432 bytes torch/csrc/deploy/example/trace_simple.py | 20 ++ torch/csrc/deploy/interpreter/CMakeLists.txt | 115 +++++++ .../deploy/interpreter/CMakePythonModules.txt | 69 ++++ torch/csrc/deploy/interpreter/freeze.py | 269 +++++++++++++++ .../deploy/interpreter/hide_symbols.script | 5 + torch/csrc/deploy/interpreter/interpreter.cpp | 324 ++++++++++++++++++ torch/csrc/deploy/interpreter/interpreter.h | 67 ++++ .../deploy/interpreter/interpreter_impl.h | 26 ++ torch/csrc/deploy/interpreter/test_main.cpp | 49 +++ .../deploy/interpreter/third_party/README.md | 2 + torch/cuda/__init__.py | 4 + torch/utils/__init__.py | 8 +- 25 files changed, 1027 insertions(+), 14 deletions(-) create mode 100644 torch/csrc/deploy/.gitignore create mode 100644 torch/csrc/deploy/CMakeLists.txt create mode 100644 torch/csrc/deploy/README.md create mode 100644 torch/csrc/deploy/example/simple.pt create mode 100644 torch/csrc/deploy/example/trace_simple.py create mode 100644 torch/csrc/deploy/interpreter/CMakeLists.txt create mode 100644 torch/csrc/deploy/interpreter/CMakePythonModules.txt create mode 100644 torch/csrc/deploy/interpreter/freeze.py create mode 100644 torch/csrc/deploy/interpreter/hide_symbols.script create mode 100644 torch/csrc/deploy/interpreter/interpreter.cpp create mode 100644 torch/csrc/deploy/interpreter/interpreter.h create mode 100644 torch/csrc/deploy/interpreter/interpreter_impl.h create mode 100644 torch/csrc/deploy/interpreter/test_main.cpp create mode 100644 torch/csrc/deploy/interpreter/third_party/README.md diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 54acbe7b1c6..9c215540108 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -170,6 +170,8 @@ jobs: # FunctionsManual.cpp is excluded to keep this diff clean. It will be fixed # in a follow up PR. # /torch/csrc/generic/*.cpp is excluded because those files aren't actually built. 
+          # deploy/interpreter files are excluded due to using macros and other techniques
+          # that are not easily converted to accepted c++
          python tools/clang_tidy.py \
            --verbose \
            --paths torch/csrc/ \
@@ -186,6 +188,10 @@ jobs:
            -g"-torch/csrc/autograd/FunctionsManual.cpp" \
            -g"-torch/csrc/generic/*.cpp" \
            -g"-torch/csrc/jit/codegen/cuda/runtime/*" \
+           -g"-torch/csrc/deploy/interpreter/interpreter.cpp" \
+           -g"-torch/csrc/deploy/interpreter/interpreter.h" \
+           -g"-torch/csrc/deploy/interpreter/interpreter_impl.h" \
+           -g"-torch/csrc/deploy/interpreter/test_main.cpp" \
            "$@" > ${GITHUB_WORKSPACE}/clang-tidy-output.txt

          cat ${GITHUB_WORKSPACE}/clang-tidy-output.txt
diff --git a/.gitignore b/.gitignore
index e1fe94cb9bf..a3a832ce755 100644
--- a/.gitignore
+++ b/.gitignore
@@ -66,6 +66,9 @@ torch/csrc/autograd/generated/*
 torch/testing/_internal/generated/annotated_fn_args.py
 torch/testing/_internal/data/*.pt
 torch/csrc/cudnn/cuDNN.cpp
+torch/csrc/deploy/interpreter/cpython
+torch/csrc/deploy/interpreter/frozen
+torch/csrc/deploy/interpreter/third_party/typing_extensions.py
 torch/csrc/generated
 torch/csrc/generic/TensorMethods.cpp
 torch/csrc/jit/generated/*
diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh
index fad9c8e49e6..1b605c53ea7 100755
--- a/.jenkins/pytorch/build.sh
+++ b/.jenkins/pytorch/build.sh
@@ -23,6 +23,17 @@ if [[ "$BUILD_ENVIRONMENT" == *-mobile-code-analysis* ]]; then
   exec "$(dirname "${BASH_SOURCE[0]}")/build-mobile-code-analysis.sh" "$@"
 fi

+if [[ "$BUILD_ENVIRONMENT" == pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7* ]]; then
+  # Enabling DEPLOY build (embedded torch python interpreter, experimental)
+  # only on one config for now, can expand later
+  export USE_DEPLOY=ON
+
+  # Deploy feature builds cpython. It requires these packages.
+  # TODO move this to dockerfile?
+  sudo apt-get -qq update
+  sudo apt-get -qq install libffi-dev libbz2-dev libreadline-dev libncurses5-dev libncursesw5-dev libgdbm-dev libsqlite3-dev uuid-dev tk-dev
+fi
+
 echo "Python version:"
 python --version
diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index 73563f145eb..d70a377ec08 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -354,6 +354,11 @@ test_vec256() {
   fi
 }

+test_torch_deploy() {
+  SIMPLE_MODEL_PATH=torch/csrc/deploy/example/simple.pt LIBINTERPRETER_PATH=build/lib/libinterpreter.so build/bin/interpreter_test
+  assert_git_not_dirty
+}
+
 if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then
   (cd test && python -c "import torch; print(torch.__config__.show())")
   (cd test && python -c "import torch; print(torch.__config__.parallel_info())")
@@ -371,6 +376,9 @@ elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then
   # TODO: run some C++ tests
   echo "no-op at the moment"
 elif [[ "${BUILD_ENVIRONMENT}" == *-test1 || "${JOB_BASE_NAME}" == *-test1 ]]; then
+  if [[ "${BUILD_ENVIRONMENT}" == pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7-test1 ]]; then
+    test_torch_deploy
+  fi
   install_torchvision
   test_python_shard1
 elif [[ "${BUILD_ENVIRONMENT}" == *-test2 || "${JOB_BASE_NAME}" == *-test2 ]]; then
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a23208752af..550b7812f34 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -249,8 +249,9 @@ cmake_dependent_option(
 option(USE_TBB "Use TBB" OFF)
 option(ONNX_ML "Enable traditional ONNX ML API."
ON) option(HAVE_SOVERSION "Whether to add SOVERSION to the shared objects" OFF) -option(USE_DEPLOY "Enable torch::deploy embedded python interpreter" OFF) - +cmake_dependent_option( + USE_DEPLOY "Build embedded torch::deploy interpreter" OFF + "BUILD_PYTHON" OFF) # Since TensorPipe does not support Windows, set it to OFF when WIN32 detected # On Windows platform, if user does not install libuv in build conda env and # does not set libuv_ROOT environment variable. Set USE_DISTRIBUTED to OFF. @@ -919,3 +920,8 @@ endif() include(cmake/Summary.cmake) caffe2_print_configuration_summary() + +# ---[ Torch Deploy +if(USE_DEPLOY) + add_subdirectory(torch/csrc/deploy) +endif() diff --git a/torch/__init__.py b/torch/__init__.py index 3f9df8bc009..f27af91eb49 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -22,7 +22,11 @@ if sys.version_info < (3,): from ._utils import _import_dotted_name from ._utils_internal import get_file_path, prepare_multiprocessing_environment, \ USE_RTLD_GLOBAL_WITH_LIBTORCH, USE_GLOBAL_DEPS -from .version import __version__ +# TODO(torch_deploy) figure out how to freeze version.py in fbcode build +if sys.executable == 'torch_deploy': + __version__ = "torch-deploy-1.8" +else: + from .version import __version__ from ._six import string_classes as _string_classes from typing import Set, Type, TYPE_CHECKING @@ -134,7 +138,7 @@ if sys.platform == 'win32': # See Note [Global dependencies] def _load_global_deps(): - if platform.system() == 'Windows': + if platform.system() == 'Windows' or sys.executable == 'torch_deploy': return lib_name = 'libtorch_global_deps' + ('.dylib' if platform.system() == 'Darwin' else '.so') @@ -516,7 +520,7 @@ from ._tensor_str import set_printoptions ################################################################################ def manager_path(): - if platform.system() == 'Windows': + if platform.system() == 'Windows' or sys.executable == 'torch_deploy': return b"" path = get_file_path('torch', 'bin', 'torch_shm_manager') prepare_multiprocessing_environment(get_file_path('torch')) diff --git a/torch/_ops.py b/torch/_ops.py index dd0c8cd19fd..96c8baac783 100644 --- a/torch/_ops.py +++ b/torch/_ops.py @@ -2,7 +2,6 @@ import torch._C import contextlib import ctypes -import os import sys import types @@ -67,7 +66,7 @@ class _OpNamespace(types.ModuleType): return op class _Ops(types.ModuleType): - __file__ = os.path.join(os.path.dirname(__file__), '_ops.py') + __file__ = '_ops.py' def __init__(self): super(_Ops, self).__init__('torch.ops') diff --git a/torch/_utils_internal.py b/torch/_utils_internal.py index be7d8fcaa68..c77e960ae65 100644 --- a/torch/_utils_internal.py +++ b/torch/_utils_internal.py @@ -1,6 +1,7 @@ import os import inspect +import sys import tempfile # this arbitrary-looking assortment of functionality is provided here @@ -8,11 +9,16 @@ import tempfile # use is the FB build environment, where this source file is replaced # by an equivalent. -if os.path.basename(os.path.dirname(__file__)) == 'shared': - torch_parent = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) +if sys.executable == 'torch_deploy': + # __file__ is meaningless in the context of frozen torch used in torch deploy. + # setting empty torch_parent should allow below functions to operate without crashing, + # but it's unclear if there is a valid use case for them in the context of deploy. 
+ torch_parent = "" else: - torch_parent = os.path.dirname(os.path.dirname(__file__)) - + if os.path.basename(os.path.dirname(__file__)) == 'shared': + torch_parent = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) + else: + torch_parent = os.path.dirname(os.path.dirname(__file__)) def get_file_path(*path_components): return os.path.join(torch_parent, *path_components) @@ -60,7 +66,7 @@ def get_source_lines_and_file(obj, error_msg=None): TEST_MASTER_ADDR = '127.0.0.1' TEST_MASTER_PORT = 29500 -# USE_GLOBAL_DEPS controls whether __init__.py tries to load +# USE_GLOBAL_DEPS controls whether __init__.py tries to load # libtorch_global_deps, see Note [Global dependencies] USE_GLOBAL_DEPS = True # USE_RTLD_GLOBAL_WITH_LIBTORCH controls whether __init__.py tries to load diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index 88b8afc8440..ee6b6859571 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -713,6 +713,8 @@ extern "C" #ifdef _WIN32 __declspec(dllexport) #endif +TORCH_API PyObject* initModule(); +// separate decl and defn for msvc error C2491 PyObject* initModule() { HANDLE_TH_ERRORS at::internal::lazy_init_num_threads(); diff --git a/torch/csrc/deploy/.gitignore b/torch/csrc/deploy/.gitignore new file mode 100644 index 00000000000..aa484a97a20 --- /dev/null +++ b/torch/csrc/deploy/.gitignore @@ -0,0 +1 @@ +example/generated/* diff --git a/torch/csrc/deploy/CMakeLists.txt b/torch/csrc/deploy/CMakeLists.txt new file mode 100644 index 00000000000..9da31490586 --- /dev/null +++ b/torch/csrc/deploy/CMakeLists.txt @@ -0,0 +1,3 @@ +set(DEPLOY_DIR "${CMAKE_CURRENT_SOURCE_DIR}") + +add_subdirectory(interpreter) diff --git a/torch/csrc/deploy/README.md b/torch/csrc/deploy/README.md new file mode 100644 index 00000000000..4fab5aa4ef5 --- /dev/null +++ b/torch/csrc/deploy/README.md @@ -0,0 +1,10 @@ +# Torch Deploy +This is an experimental feature to embed multiple python interpreters inside the torch library, +providing a solution to the 'GIL problem' for multithreading with the convenience of python +and eager or torchscripted pytorch programs. + +# libinterpreter +This is an internal library used behind the scenes to enable multiple python interpreters in +a single deploy runtime. libinterpreter.so is DLOPENed multiple times by the deploy library. +Each copy of libinterpreter exposes a simple interpreter interface but hides its python and other +internal symbols, preventing the different python instances from seeing each other. 
diff --git a/torch/csrc/deploy/example/simple.pt b/torch/csrc/deploy/example/simple.pt new file mode 100644 index 0000000000000000000000000000000000000000..50f9a087aa822821647a8acabe28bb84207475b2 GIT binary patch literal 2432 zcmah~4Ny~87JmE%jN!K_C{c`n7DSRLExN!vhoWYYRE&s2)-FvT36Ky7`GEqhO3~?p zL|_?ZsctLdMyZHOE5mjH5kaV>s1(uBHlpj!S}e8F-KAEovhVR{g6sCo+_~?)d%knu z`R@776^k4k3BtvNc+OmjmkEO^D@UW`D^iRpe1R~=lh``jhIZRyK=*I)!cZZq8){(n z@pmC?B^ynOG?YYeu>YNUa`b@-j{Wg<^dcXFL{|=d_n*LpxJB4{DiZ_0cLHYUD8#+K z1H*4J@SRPF(w;cn*y4?rHxI+^Z-0Txury4*pNcYmEbge-j72SNa8-N+niN~G_qF}N z<@_7^%N(&n#=@$r$*7cjV)N&FVM)6^KJ=2I;OA!2;OT%JqWUNghic$ze6Z|;`Ixk4 zK3-5SLHR&3X6&{=+nIhCDBc5>@l4=GrIPiSf`i2ftn~r#S$8M-R@H4N@;w^W9&!18t;$kz&B73@dhIeC+LVs` zlLq9;xM(`F6k0QwC>~BDzdtq({_3B=an%TTuO5a^Bl;k4EEHMiO5sOy27bIifQgN2 zB-}WWA20bC8nwOTsGTFcx{C#kmHi}KbDNxWcOnTn)#y)>Gs-O0PW1JTh zx_jV>{nvnD3B?xEILwnAALMW+lz(s#$;PQ1cyw7?Ix5xsJ`Ii&$ zMoAm|aArNmfEhSP6EJ$c5-*z~F>S0H@&#F7Svm^Qi5Wn6X(N+=iojq=D4xeZ!0GNS z_)UBu4zBgWo|~V*DY*)xx09rY-*HlLX#)}-b5IW_z$M%b<64iwf$(M6o6`(+8@-=5os3(8l*-#T1TEOK_~{^#dUxef$D5Ah52=1v4y>y4L_ zsS(C3dR}q13M{ghOQlAwUY;S9GB*@#&?<5@N~Vqoes5clgh7e=ZKWzb!>F^9PdM?U zxki=7z?ABhskthRLTXfI8?<_9-b$vgghr##}J)2Zqhl6h#(PD#5M+r*djY)iCnJLXq0lJN}Fw9ZqzH3dZl8m zN^WH8T!Isf3A#BFy5|`6DXMH$cDjyPlAEfV%cJS)jB2R3M6J<-ri)Frr|0|AJR)S8 zF1c2ryq!>>$|8?x(K!lPgZp|mzPV4UDpD?SS2)b9a*EAyjQ>UMh zppHCUI#E_GGxU(y`Uv~?R zH22tDTQeU2<}#k8qp-i&eDsqR-y4#3*9XQpA0E7W)HCUK1BpX}Y@ZMJ|J~a?yR+=^ z3hfWqUuo3J@RDhN-3~f$-}Idi{!HaOFqQZJzN^@T|FgWa4aSsgqhV4&elwX)pSbN@ zWFI_HbXoj4Nz_G1(Bs7#bt9su_sLW04ODFlW1@a#2iyH-zlAaBEWi3FeTzM{Gd;v2 zJ9?VAldZdlE4^T2HaGNasI`*ZZD2W+c&(r`SCimpVzaKec{bpK6iHLy=cG3~iuL*V z+eB${dU2FzwK_W`>vLk;#A~3s-OOKE)xhis)?B;UL|{|hZz{sn>54@R#_}m2KZnp6 pZ@RJ4Ha(l@o=900-9IU#a1&2{b0**EWwFR/configure --prefix ${PYTHON_INSTALL_DIR} + BUILD_COMMAND CFLAGS=-fPIC CPPFLAGS=-fPIC make -j8 + INSTALL_COMMAND make install + BYPRODUCTS ${PYTHON_MODULES} ${PYTHON_LIB} ${PYTHON_BIN} + LOG_OUTPUT_ON_FAILURE True +) + +# We find the built python modules, this is confusing because python build already outputs +# the modules in a strange nested path, and then that path is relative to the +# Cmake ExternalProject root in the cmake build dir. 
+ExternalProject_Get_property(cpython SOURCE_DIR)
+SET(PYTHON_MODULE_DIR "${SOURCE_DIR}/build/temp.linux-x86_64-3.8/${SOURCE_DIR}/Modules")
+SET(PYTHON_STDLIB_DIR "${SOURCE_DIR}/Lib")
+SET(PYTHON_STDLIB "${PYTHON_INSTALL_DIR}/lib/libpython_stdlib3.8.a")
+# Then we use a hardcoded list of expected module names and include them in our lib
+include("CMakePythonModules.txt")
+ExternalProject_Add_Step(
+  cpython
+  archive_stdlib
+  DEPENDEES install
+  BYPRODUCTS ${PYTHON_STDLIB}
+  COMMAND ar -rc ${PYTHON_STDLIB} ${PYTHON_MODULES}
+  VERBATIM
+)
+# Get python typing extension, needed by torch
+SET(TYPING_PKG "${INTERPRETER_DIR}/third_party/typing_extensions.py")
+ExternalProject_Add(
+  typing
+  PREFIX typing
+  GIT_REPOSITORY https://github.com/python/typing.git
+  GIT_TAG 3.7.4.3
+  UPDATE_COMMAND ""
+  CONFIGURE_COMMAND ""
+  BUILD_COMMAND ""
+  INSTALL_COMMAND cp ../typing/typing_extensions/src_py3/typing_extensions.py ${TYPING_PKG}
+  BYPRODUCTS ${TYPING_PKG}
+  LOG_OUTPUT_ON_FAILURE True
+)
+
+# Output files generated by freeze script, containing frozen bytecode
+SET(FROZEN_DIR "${INTERPRETER_DIR}/frozen")
+set(FROZEN_FILES
+  ${FROZEN_DIR}/main.c
+  ${FROZEN_DIR}/bytecode_0.c
+  ${FROZEN_DIR}/bytecode_1.c
+  ${FROZEN_DIR}/bytecode_2.c
+  ${FROZEN_DIR}/bytecode_3.c
+  ${FROZEN_DIR}/bytecode_4.c
+)
+# Packages to freeze: python stdlib, typing extension, and torch
+add_custom_command(
+  OUTPUT ${FROZEN_FILES}
+  WORKING_DIRECTORY ${INTERPRETER_DIR}
+  COMMAND mkdir -p ${FROZEN_DIR}
+  COMMAND ${PYTHON_BIN} freeze.py ${PYTHON_STDLIB_DIR} ${TYPING_PKG} ${PYTORCH_ROOT}/torch --oss --install_dir ${FROZEN_DIR} --verbose
+  DEPENDS cpython typing
+  VERBATIM
+)
+
+# Instantiate a library based on the objects that make up torch_python,
+# making sure the system python isn't used here
+target_include_directories(torch_python_obj BEFORE PRIVATE ${PYTHON_INC_DIR})
+add_library(torch_python_static STATIC $<TARGET_OBJECTS:torch_python_obj>)
+
+# Build the interpreter lib, designed to be standalone and dlopened.
+# We bake the python and torch_python binding objs into libinterpreter.
+set(LINKER_SCRIPT "${INTERPRETER_DIR}/hide_symbols.script")
+set(INTERPRETER_LIB_SOURCES
+  ${INTERPRETER_DIR}/interpreter.cpp
+  ${FROZEN_FILES}
+  ${LINKER_SCRIPT}
+)
+add_library(interpreter ${INTERPRETER_LIB_SOURCES})
+set_property(TARGET interpreter APPEND_STRING PROPERTY
+    LINK_FLAGS " -Wl,--version-script=${LINKER_SCRIPT}")
+# Need to ensure headers are present before any .cpp in interpreter is compiled;
+# the .cpp files themselves don't declare a dependency on cpython, so there
+# would be a race otherwise.
+add_dependencies(interpreter cpython)
+target_compile_options(
+  interpreter PRIVATE
+  -fvisibility=hidden
+)
+target_include_directories(interpreter PRIVATE ${INTERPRETER_DIR})
+target_include_directories(interpreter PUBLIC ${PYTHON_INC_DIR})
+target_link_libraries(interpreter PRIVATE ${PYTHON_LIB} ${PYTHON_STDLIB} torch_python_static)
+target_link_libraries(interpreter PRIVATE crypt crypto ssl pthread dl util m z ffi lzma readline nsl ncursesw panelw) # for python builtins
+target_link_libraries(interpreter PRIVATE fmt::fmt-header-only protobuf::libprotobuf-lite)
+
+# handy to have a standalone app to verify linkage and usage of interpreter before embedding it in another lib
+set(INTERPRETER_TEST_SOURCES
+  ${INTERPRETER_DIR}/test_main.cpp
+)
+add_executable(interpreter_test ${INTERPRETER_TEST_SOURCES})
+target_include_directories(interpreter_test PRIVATE ${PYTORCH_ROOT}/torch)
+target_include_directories(interpreter_test PRIVATE ${PYTHON_INC_DIR})
+target_link_libraries(interpreter_test PUBLIC gtest dl) +# no-as-needed to ensure shm and torch are included to satisfy runtime dlopen +# dependencies for libinterpreter, regardless of whether they are used in interpreter_test +target_link_libraries(interpreter_test PUBLIC "-Wl,--no-as-needed" shm torch protobuf::libprotobuf-lite) diff --git a/torch/csrc/deploy/interpreter/CMakePythonModules.txt b/torch/csrc/deploy/interpreter/CMakePythonModules.txt new file mode 100644 index 00000000000..c6bc9cab76f --- /dev/null +++ b/torch/csrc/deploy/interpreter/CMakePythonModules.txt @@ -0,0 +1,69 @@ +SET(PYTHON_MODULES + ${PYTHON_MODULE_DIR}/arraymodule.o + ${PYTHON_MODULE_DIR}/_asynciomodule.o + ${PYTHON_MODULE_DIR}/audioop.o + ${PYTHON_MODULE_DIR}/binascii.o + ${PYTHON_MODULE_DIR}/_bisectmodule.o + ${PYTHON_MODULE_DIR}/_blake2/blake2module.o ${PYTHON_MODULE_DIR}/_blake2/blake2b_impl.o ${PYTHON_MODULE_DIR}/_blake2/blake2s_impl.o + ${PYTHON_MODULE_DIR}/_bz2module.o + ${PYTHON_MODULE_DIR}/cmathmodule.o + # ${PYTHON_MODULE_DIR}/_math.o + ${PYTHON_MODULE_DIR}/cjkcodecs/_codecs_cn.o + ${PYTHON_MODULE_DIR}/cjkcodecs/_codecs_hk.o + ${PYTHON_MODULE_DIR}/cjkcodecs/_codecs_iso2022.o + ${PYTHON_MODULE_DIR}/cjkcodecs/_codecs_jp.o + ${PYTHON_MODULE_DIR}/cjkcodecs/_codecs_kr.o + ${PYTHON_MODULE_DIR}/cjkcodecs/_codecs_tw.o + ${PYTHON_MODULE_DIR}/_contextvarsmodule.o + ${PYTHON_MODULE_DIR}/_cryptmodule.o + ${PYTHON_MODULE_DIR}/_csv.o + ${PYTHON_MODULE_DIR}/_ctypes/_ctypes.o ${PYTHON_MODULE_DIR}/_ctypes/callbacks.o ${PYTHON_MODULE_DIR}/_ctypes/callproc.o ${PYTHON_MODULE_DIR}/_ctypes/stgdict.o ${PYTHON_MODULE_DIR}/_ctypes/cfield.o + ${PYTHON_MODULE_DIR}/_ctypes/_ctypes_test.o + ${PYTHON_MODULE_DIR}/_cursesmodule.o + ${PYTHON_MODULE_DIR}/_curses_panel.o + ${PYTHON_MODULE_DIR}/_datetimemodule.o + ${PYTHON_MODULE_DIR}/_decimal/_decimal.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/basearith.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/constants.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/context.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/convolute.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/crt.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/difradix2.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/fnt.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/fourstep.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/io.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/memory.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/mpdecimal.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/numbertheory.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/sixstep.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/transpose.o + ${PYTHON_MODULE_DIR}/_elementtree.o + ${PYTHON_MODULE_DIR}/fcntlmodule.o + ${PYTHON_MODULE_DIR}/grpmodule.o + ${PYTHON_MODULE_DIR}/_hashopenssl.o + ${PYTHON_MODULE_DIR}/_heapqmodule.o + ${PYTHON_MODULE_DIR}/_json.o + ${PYTHON_MODULE_DIR}/_lsprof.o + ${PYTHON_MODULE_DIR}/_lzmamodule.o + ${PYTHON_MODULE_DIR}/mathmodule.o + ${PYTHON_MODULE_DIR}/md5module.o + ${PYTHON_MODULE_DIR}/mmapmodule.o + ${PYTHON_MODULE_DIR}/cjkcodecs/multibytecodec.o + ${PYTHON_MODULE_DIR}/_multiprocessing/multiprocessing.o ${PYTHON_MODULE_DIR}/_multiprocessing/semaphore.o + ${PYTHON_MODULE_DIR}/nismodule.o + ${PYTHON_MODULE_DIR}/_opcode.o + ${PYTHON_MODULE_DIR}/ossaudiodev.o + ${PYTHON_MODULE_DIR}/parsermodule.o + ${PYTHON_MODULE_DIR}/_pickle.o + ${PYTHON_MODULE_DIR}/_posixsubprocess.o + ${PYTHON_MODULE_DIR}/pyexpat.o ${PYTHON_MODULE_DIR}/expat/xmlparse.o ${PYTHON_MODULE_DIR}/expat/xmlrole.o ${PYTHON_MODULE_DIR}/expat/xmltok.o + ${PYTHON_MODULE_DIR}/_queuemodule.o + ${PYTHON_MODULE_DIR}/_randommodule.o + ${PYTHON_MODULE_DIR}/readline.o 
+  ${PYTHON_MODULE_DIR}/resource.o
+  ${PYTHON_MODULE_DIR}/selectmodule.o
+  ${PYTHON_MODULE_DIR}/sha1module.o
+  ${PYTHON_MODULE_DIR}/sha256module.o
+  ${PYTHON_MODULE_DIR}/_sha3/sha3module.o
+  ${PYTHON_MODULE_DIR}/sha512module.o
+  ${PYTHON_MODULE_DIR}/socketmodule.o
+  ${PYTHON_MODULE_DIR}/spwdmodule.o
+  ${PYTHON_MODULE_DIR}/_ssl.o
+  ${PYTHON_MODULE_DIR}/_struct.o
+  ${PYTHON_MODULE_DIR}/syslogmodule.o
+  ${PYTHON_MODULE_DIR}/termios.o
+  ${PYTHON_MODULE_DIR}/_testbuffer.o
+  ${PYTHON_MODULE_DIR}/_testcapimodule.o
+  ${PYTHON_MODULE_DIR}/_testimportmultiple.o
+  ${PYTHON_MODULE_DIR}/_testmultiphase.o
+  ${PYTHON_MODULE_DIR}/unicodedata.o
+  ${PYTHON_MODULE_DIR}/xxlimited.o
+  ${PYTHON_MODULE_DIR}/_xxtestfuzz/_xxtestfuzz.o ${PYTHON_MODULE_DIR}/_xxtestfuzz/fuzzer.o
+  ${PYTHON_MODULE_DIR}/zlibmodule.o
+)
diff --git a/torch/csrc/deploy/interpreter/freeze.py b/torch/csrc/deploy/interpreter/freeze.py
new file mode 100644
index 00000000000..459b7be9381
--- /dev/null
+++ b/torch/csrc/deploy/interpreter/freeze.py
@@ -0,0 +1,269 @@
+"""
+Freeze Python packages.
+
+Freezing makes it possible to ship arbitrary Python modules as part of a C++
+library. The Python source of the module is compiled to bytecode and written
+to `.c` files, to be imported by Python's built-in FrozenImporter.
+
+In a normal Python installation, FrozenImporter is only used to bootstrap the
+initialization of the import machinery. Python's importers are defined in
+Python (see `_bootstrap.py` and `_bootstrap_external.py`) but need to be
+retrieved before any importers are available. Freezing the module bytecode
+resolves this circular dependency.
+
+This script will freeze the Python standard library. It produces two things:
+- Bytecode files: A set of `.c` files that define C variables containing Python bytecode.
+- Main file: A `main.c` file listing all of these modules in the right form to be
+  consumed by FrozenImporter.
+
+A library that wishes to use these modules makes them available to the local
+Python instance by extending `PyImport_FrozenModules` appropriately (see
+https://docs.python.org/3/c-api/import.html#c.PyImport_FrozenModules).
+"""
+
+import argparse
+import functools
+import itertools
+import marshal
+import os
+from dataclasses import dataclass
+from pathlib import Path
+from typing import List
+
+
+MAIN_INCLUDES = """#include <Python.h>
+
+"""
+
+MAIN_PREFIX = """
+// Compiled standard library modules. These should be appended to the existing
+// `PyImport_FrozenModules` that ships with CPython.
+struct _frozen _PyImport_FrozenModules_torch[] = {
+"""
+
+FAKE_PREFIX = """
+// Compiled standard library modules. These should be appended to the existing
+// `PyImport_FrozenModules` that ships with CPython.
+struct _frozen _PyImport_FrozenModules[] = {
+"""
+
+MAIN_SUFFIX = """\
+ {0, 0, 0} /* sentinel */
+};
+"""
+
+# Exclude some standard library modules to:
+# 1. Slim down the final frozen lib.
+# 2. Remove functionality we don't want to support.
+DENY_LIST = [
+    # Interface to unix databases
+    "dbm",
+    # ncurses bindings (terminal interfaces)
+    "curses",
+    # Tcl/Tk GUI
+    "tkinter",
+    # Tests for the standard library
+    "test",
+    "tests",
+    "idle_test",
+    "__phello__.foo.py",
+    # importlib frozen modules. These are already baked into CPython.
+ "_bootstrap.py", + "_bootstrap_external.py", +] + +NUM_BYTECODE_FILES = 5 + + +def indent_msg(fn): + @functools.wraps(fn) + def wrapper(*args, **kwargs): + args[0].indent += 1 + ret = fn(*args, **kwargs) + args[0].indent -= 1 + return ret + + return wrapper + + +@dataclass +class FrozenModule: + # The fully qualified module name, e.g. 'foo.bar.baz' + module_name: str + # The name of the C variable that holds the bytecode, e.g. 'M_foo__bar__baz' + c_name: str + # The size of the C variable. Negative if this module is a package. + size: int + # The frozen bytecode + bytecode: bytes + + +class Freezer: + def __init__(self, verbose: bool): + self.frozen_modules: List[FrozenModule] = [] + self.indent: int = 0 + self.verbose: bool = verbose + + def msg(self, path: Path, code: str): + if not self.verbose: + return + # P: package dir + # F: python file + # S: skipped (not a package dir) + # X: skipped (deny-listed) + # N: skipped (not a python file) + for i in range(self.indent): + print(" ", end="") + print(f"{code} {path}") + + def write_bytecode(self, install_root): + """ + Write the `.c` files containing the frozen bytecode. Shard frozen + modules evenly across the files. + """ + bytecode_file_names = [ + f"bytecode_{i}.c" for i in range(NUM_BYTECODE_FILES) + ] + bytecode_files = [open(os.path.join(install_root, name), "w") for name in bytecode_file_names] + it = itertools.cycle(bytecode_files) + for m in self.frozen_modules: + self.write_frozen(m, next(it)) + + for f in bytecode_files: + f.close() + + def write_main(self, install_root, oss): + """ + Write the `main.c` file containing a table enumerating all the + frozen modules. + """ + with open(os.path.join(install_root, "main.c"), "w") as outfp: + outfp.write(MAIN_INCLUDES) + for m in self.frozen_modules: + outfp.write(f"extern unsigned char {m.c_name}[];\n") + + outfp.write(MAIN_PREFIX) + for m in self.frozen_modules: + outfp.write(f'\t{{"{m.module_name}", {m.c_name}, {m.size}}},\n') + outfp.write(MAIN_SUFFIX) + if oss: + outfp.write(FAKE_PREFIX) + outfp.write(MAIN_SUFFIX) + + def write_frozen(self, m: FrozenModule, outfp): + """ + Write a single frozen module's bytecode out to a C variable. + """ + outfp.write(f"unsigned char {m.c_name}[] = {{") + for i in range(0, len(m.bytecode), 16): + outfp.write("\n\t") + for c in bytes(m.bytecode[i : i + 16]): + outfp.write("%d," % c) + outfp.write("\n};\n") + + def compile_path(self, path: Path, top_package_path: Path): + """Generic entry point for compiling a Path object.""" + if path.is_dir(): + self.compile_package(path, top_package_path) + else: + self.compile_file(path, top_package_path) + + @indent_msg + def compile_package(self, path: Path, top_package_path: Path): + """Compile all the files within a Python package dir.""" + assert path.is_dir() + if path.name in DENY_LIST: + self.msg(path, "X") + return + + # Python packages are directories that have __init__.py in them. + is_package_dir = any([child.name == "__init__.py" for child in path.iterdir()]) + if not is_package_dir: + self.msg(path, "S") + return + + self.msg(path, "P") + # Recursively compile all children in this dir + for child in path.iterdir(): + self.compile_path(child, top_package_path) + + def get_module_qualname(self, file_path: Path, top_package_path: Path) -> List[str]: + # `path` looks like 'Lib/foo/bar/baz.py' + + # chop off 'Lib/' to get something that represents a Python module hierarchy. + # e.g. 
'foo/bar/baz.py', which maps to 'foo.bar.baz'
+        normalized_path = file_path.relative_to(top_package_path.parent)
+
+        if normalized_path.name == "__init__.py":
+            # Special handling for `__init__.py`. In this case, this file
+            # specifies that the containing directory should be treated as a package.
+            # For 'foo/bar/baz/__init__.py':
+            # - The module name is 'baz'
+            module_basename = normalized_path.parent.name
+            # - The parent is foo.bar (need to shave off the 'baz')
+            module_parent = normalized_path.parent.parent.parts
+        else:
+            module_basename = normalized_path.stem
+            module_parent = normalized_path.parent.parts
+        return list(module_parent) + [module_basename]
+
+    @indent_msg
+    def compile_file(self, path: Path, top_package_path: Path):
+        """
+        Compile a Python source file to frozen bytecode. Append the result to
+        `self.frozen_modules`.
+        """
+        assert path.is_file()
+        if path.suffix != ".py":
+            self.msg(path, "N")
+            return
+
+        if path.name in DENY_LIST:
+            self.msg(path, "X")
+            return
+
+        self.msg(path, "F")
+        module_qualname = self.get_module_qualname(path, top_package_path)
+        module_mangled_name = "__".join(module_qualname)
+        c_name = "M_" + module_mangled_name
+
+        with open(path, "r") as src_file:
+            co = compile(src_file.read(), path, "exec")
+
+        bytecode = marshal.dumps(co)
+        size = len(bytecode)
+        if path.name == '__init__.py':
+            # Python packages are signified by negative size.
+            size = -size
+        self.frozen_modules.append(
+            FrozenModule(".".join(module_qualname), c_name, size, bytecode)
+        )
+
+
+parser = argparse.ArgumentParser(description="Compile py source")
+parser.add_argument("paths", nargs="*", help="Paths to freeze.")
+parser.add_argument("--verbose", action="store_true", help="Print debug logs")
+parser.add_argument("--install_dir", help="Root directory for all output files")
+parser.add_argument("--fbcode_dir", help="fbcode root; if set, input paths are interpreted relative to it")
+parser.add_argument("--oss", action="store_true", help="If it's an OSS build, add a fake _PyImport_FrozenModules")
+
+args = parser.parse_args()
+
+f = Freezer(args.verbose)
+
+for p in args.paths:
+    if args.fbcode_dir:
+        p = os.path.join(args.fbcode_dir, p)
+    path = Path(p)
+    if path.is_dir() and not (path / '__init__.py').exists():
+        # this 'top level path p' is a standard directory containing modules,
+        # not a module itself.
+        # each 'mod' could be a dir containing __init__.py, or a .py file
+        for mod in path.glob("*"):
+            f.compile_path(mod, mod)
+    else:
+        f.compile_path(path, path)
+
+f.write_bytecode(args.install_dir)
+f.write_main(args.install_dir, args.oss)
diff --git a/torch/csrc/deploy/interpreter/hide_symbols.script b/torch/csrc/deploy/interpreter/hide_symbols.script
new file mode 100644
index 00000000000..c748c8bfec9
--- /dev/null
+++ b/torch/csrc/deploy/interpreter/hide_symbols.script
@@ -0,0 +1,5 @@
+INTERPRETER_0.1 {
+  global:
+    initialize_interface;
+  local: *; # hide everything else
+};
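The version script above exports exactly one symbol, `initialize_interface`, leaving everything else (all of CPython, pybind11, the frozen stdlib) local to each dlopened copy. A minimal sketch of the same visibility pattern, assuming a hypothetical `demo.cpp` built with the flags the CMake file passes (`-fvisibility=hidden` plus `-Wl,--version-script=...`):

```cpp
// demo.cpp -- hypothetical; build with something like:
//   g++ -shared -fPIC -fvisibility=hidden \
//       -Wl,--version-script=hide_symbols.script demo.cpp -o libdemo.so
// With -fvisibility=hidden every symbol defaults to local; the attribute
// (and the version script's 'global:' list) re-exports only the entry point.

static int internal_state = 0;  // hidden: never visible to other libraries

extern "C" __attribute__((visibility("default")))
void initialize_interface(void* table) {
  // In interpreter.cpp this fills `table` with pointers to the hidden
  // internals (see FOREACH_INTERFACE_FUNCTION); here it just touches state.
  ++internal_state;
  (void)table;
}
```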
diff --git a/torch/csrc/deploy/interpreter/interpreter.cpp b/torch/csrc/deploy/interpreter/interpreter.cpp
new file mode 100644
index 00000000000..7d685d33435
--- /dev/null
+++ b/torch/csrc/deploy/interpreter/interpreter.cpp
@@ -0,0 +1,324 @@
+#include <torch/csrc/deploy/interpreter/interpreter_impl.h>
+
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+#include <pybind11/embed.h>
+#include <c10/util/Exception.h>
+#include <fmt/format.h>
+#include <array>
+#include <atomic>
+#include <cstring>
+#include <iostream>
+#include <map>
+#include <stdexcept>
+#include <string>
+
+namespace py = pybind11;
+using namespace py::literals;
+
+// TODO this should come from cmake
+#define DEBUG 0
+
+template <typename T>
+void PYOBJ_ASSERT(T obj) {
+#if (DEBUG == 1)
+  if (NULL == obj) {
+    PyErr_Print();
+  }
+#endif
+  TORCH_INTERNAL_ASSERT(NULL != obj);
+}
+
+static wchar_t* program;
+
+#define FOREACH_LIBRARY(_) \
+  _(array)                 \
+  _(_asyncio)              \
+  _(audioop)               \
+  _(binascii)              \
+  _(_bisect)               \
+  _(_blake2)               \
+  _(_bz2)                  \
+  _(cmath)                 \
+  _(_codecs_cn)            \
+  _(_codecs_hk)            \
+  _(_codecs_iso2022)       \
+  _(_codecs_jp)            \
+  _(_codecs_kr)            \
+  _(_codecs_tw)            \
+  _(_contextvars)          \
+  _(_crypt)                \
+  _(_csv)                  \
+  _(_ctypes)               \
+  _(_ctypes_test)          \
+  _(_curses)               \
+  _(_curses_panel)         \
+  _(_datetime)             \
+  _(_decimal)              \
+  _(_elementtree)          \
+  _(fcntl)                 \
+  _(grp)                   \
+  _(_hashlib)              \
+  _(_heapq)                \
+  _(_json)                 \
+  _(_lsprof)               \
+  _(_lzma)                 \
+  _(math)                  \
+  _(_md5)                  \
+  _(mmap)                  \
+  _(_multibytecodec)       \
+  _(_multiprocessing)      \
+  _(nis)                   \
+  _(_opcode)               \
+  _(ossaudiodev)           \
+  _(parser)                \
+  _(_pickle)               \
+  _(_posixsubprocess)      \
+  _(pyexpat)               \
+  _(_queue)                \
+  _(_random)               \
+  _(readline)              \
+  _(resource)              \
+  _(select)                \
+  _(_sha1)                 \
+  _(_sha256)               \
+  _(_sha3)                 \
+  _(_sha512)               \
+  _(_socket)               \
+  _(spwd)                  \
+  _(_ssl)                  \
+  _(_struct)               \
+  _(syslog)                \
+  _(termios)               \
+  _(_testbuffer)           \
+  _(_testcapi)             \
+  _(_testimportmultiple)   \
+  _(_testmultiphase)       \
+  _(unicodedata)           \
+  _(xxlimited)             \
+  _(_xxtestfuzz)           \
+  _(zlib)
+
+#define DECLARE_LIBRARY_INIT(name) extern "C" PyObject* PyInit_##name(void);
+FOREACH_LIBRARY(DECLARE_LIBRARY_INIT)
+#undef DECLARE_LIBRARY_INIT
+
+extern "C" __attribute__((visibility("default"))) void initialize_interface(
+    InterpreterImpl* s) {
+#define INITIALIZE_MEMBER(func) s->func = func;
+  FOREACH_INTERFACE_FUNCTION(INITIALIZE_MEMBER)
+#undef INITIALIZE_MEMBER
+}
+
+// These numbers of modules should not change as long as the cpython version
+// embedded in the build remains fixed
+static const size_t NUM_FROZEN_PY_BUILTIN_MODULES = 6;
+static const size_t NUM_FROZEN_PY_STDLIB_MODULES = 680;
+
+// We need to preserve the existing FrozenModules list, since it includes
+// important importlib machinery. This code is adapted from the similar
+// `PyImport_ExtendInittab`.
+int extendFrozenModules(struct _frozen *frozenpython, struct _frozen *frozentorch) {
+  struct _frozen *p = nullptr;
+  size_t a = 0, b = 0, c = 0;
+  int res = 0;
+
+  /* Count the number of entries in both tables */
+  for (a = 0; frozenpython[a].name != nullptr; a++) {
+    // std::cout << "frozenpython[" << a << "]: " << frozenpython[a].name << std::endl;
+  }
+  for (b = 0; frozentorch[b].name != nullptr; b++) {
+    // std::cout << "frozentorch[" << b << "]: " << frozentorch[b].name << std::endl;
+  }
+  for (c = 0; PyImport_FrozenModules[c].name != nullptr; c++) {
+    // std::cout << "oldfrozen[" << c << "]: " << PyImport_FrozenModules[c].name << std::endl;
+  }
+
+  // Num frozen builtins shouldn't change (unless modifying the underlying cpython version)
+  TORCH_INTERNAL_ASSERT(c == NUM_FROZEN_PY_BUILTIN_MODULES, "Missing python builtin frozen modules");
+  // Check a+b together since in OSS a is empty and b contains stdlib+torch, while
+  // in fbcode they are separated due to thirdparty2 frozenpython.
+  // There is no fixed number of torch modules to check for, but there should be at least one.
+  TORCH_INTERNAL_ASSERT(a + b > NUM_FROZEN_PY_STDLIB_MODULES + 1, "Missing frozen python stdlib or torch modules");
+
+  /* Allocate new memory for the combined table */
+  if (a + b + c <= SIZE_MAX / sizeof(struct _frozen) - 1) {
+    size_t size = sizeof(struct _frozen) * (a + b + c + 1);
+    p = (_frozen*)PyMem_Realloc(p, size);
+  }
+  if (p == nullptr) {
+    return -1;
+  }
+
+  /* Copy the tables into the new memory */
+  memcpy(p, PyImport_FrozenModules, (c + 1) * sizeof(struct _frozen));
+  memcpy(p + c, frozenpython, (a + 1) * sizeof(struct _frozen));
+  memcpy(p + a + c, frozentorch, (b + 1) * sizeof(struct _frozen));
+  PyImport_FrozenModules = p;
+  return res;
+}
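For reference, the tables being spliced by `extendFrozenModules` have a simple shape: each entry pairs a dotted module name with a marshalled code object emitted by `freeze.py`, and a negative size flags a package (this mirrors the `MAIN_PREFIX`/`MAIN_SUFFIX` templates above). A hand-written miniature of what the generated `main.c`/`bytecode_*.c` contents amount to, with the byte payload elided:

```cpp
#include <Python.h>

// Miniature of freeze.py's output: the marshal.dumps() bytes of a compiled
// module live in a C array (real payload elided here)...
static unsigned char M_foo[] = {/* marshal.dumps(compile(src, ..., "exec")) */ 0};

// ...and a _frozen table names them for FrozenImporter.
static struct _frozen demo_frozen[] = {
    {"foo", M_foo, 1},      // ordinary module; a package would use a negative size
    {nullptr, nullptr, 0},  // sentinel terminator, matching MAIN_SUFFIX above
};

// Installing the table (before Py_Initialize) routes `import foo` through
// FrozenImporter:
//   PyImport_FrozenModules = demo_frozen;
```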
+// We need to register a custom finder because we are registering `torch._C` as
+// a built-in module, and it will otherwise get skipped by the default importer.
+const char* finder = R"RAW(
+import sys
+# Remove the path-based importer, as we don't want our isolated interpreter to read the file system
+sys.meta_path = sys.meta_path[:-1]
+
+class F:
+    def find_spec(self, fullname, path, target=None):
+        if fullname == 'torch._C':
+            return sys.meta_path[1].find_spec('torch._C', None, None)
+        return None
+sys.meta_path.insert(0, F())
+
+# make loader importable
+)RAW";
+
+const char* sysprint = R"RAW(
+import sys
+print("base_exec_prefix:", sys.base_exec_prefix)
+print("_base_executable:", sys._base_executable)
+print("base_prefix:", sys.base_prefix)
+print("exec_prefix:", sys.exec_prefix)
+print("executable:", sys.executable)
+print("path:", sys.path)
+print("prefix:", sys.prefix)
+
+)RAW";
+
+extern "C" PyObject* initModule(void);
+extern "C" struct _frozen _PyImport_FrozenModules[];
+extern "C" struct _frozen _PyImport_FrozenModules_torch[];
+
+static std::atomic<size_t> s_id;
+std::map<size_t, py::object> forwards;  // model id -> forward callable (not used yet)
+
+__attribute__((constructor)) void init() {}
+
+void startup() {
+#define APPEND_INIT(name) PyImport_AppendInittab(#name, PyInit_##name);
+  FOREACH_LIBRARY(APPEND_INIT)
+#undef APPEND_INIT
+  PyImport_AppendInittab("torch._C", initModule);
+
+  int ret = extendFrozenModules(_PyImport_FrozenModules, _PyImport_FrozenModules_torch);
+  TORCH_INTERNAL_ASSERT(ret == 0);
+
+  PyPreConfig preconfig;
+  PyPreConfig_InitIsolatedConfig(&preconfig);
+  PyStatus status = Py_PreInitialize(&preconfig);
+  TORCH_INTERNAL_ASSERT(!PyStatus_Exception(status));
+
+  PyConfig config;
+  PyConfig_InitIsolatedConfig(&config);
+
+  // Completely blank out the path configuration. This ensures we have complete
+  // control of how our embedded Python searches for modules, and we will never
+  // consult the external filesystem. See:
+  // https://docs.python.org/3/c-api/init_config.html#path-configuration
+  config.site_import = 0;
+
+  status = PyConfig_SetString(&config, &config.base_exec_prefix, L"");
+  status = PyConfig_SetString(&config, &config.base_executable, L"torch_deploy");
+  status = PyConfig_SetString(&config, &config.base_prefix, L"");
+  status = PyConfig_SetString(&config, &config.exec_prefix, L"");
+  status = PyConfig_SetString(&config, &config.executable, L"torch_deploy");
+  status = PyConfig_SetString(&config, &config.prefix, L"");
+
+  config.module_search_paths_set = 1;
+  std::array<wchar_t*, 0> module_search_paths = {};
+  status = PyConfig_SetWideStringList(
+      &config, &config.module_search_paths, 0, module_search_paths.data());
+
+  status = Py_InitializeFromConfig(&config);
+  PyConfig_Clear(&config);
+  TORCH_INTERNAL_ASSERT(!PyStatus_Exception(status));
+
+  // Uncomment to debug python config
+  // PyRun_SimpleString(sysprint);
+
+  PyRun_SimpleString(finder);
+  // Release the GIL that PyInitialize acquires
+  PyEval_SaveThread();
+}
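`startup()` combines two standard embedding hooks: `PyImport_AppendInittab` registers statically linked extension modules (the `FOREACH_LIBRARY` set plus `torch._C`) before initialization, and the isolated `PyConfig` severs all filesystem lookup. A self-contained sketch of the inittab half, with a hypothetical `demo` module in place of the real ones:

```cpp
#define PY_SSIZE_T_CLEAN
#include <Python.h>

// A trivial statically linked extension module, registered the same way
// startup() registers torch._C and the stdlib C extensions.
static struct PyModuleDef demo_def = {
    PyModuleDef_HEAD_INIT, "demo", nullptr, -1, nullptr};

extern "C" PyObject* PyInit_demo() {
  return PyModule_Create(&demo_def);
}

int main() {
  // Must happen before Py_Initialize*, exactly as in startup() above.
  PyImport_AppendInittab("demo", PyInit_demo);
  Py_Initialize();
  PyRun_SimpleString("import demo; print(demo)");
  return Py_FinalizeEx();
}
```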
See: + // https://docs.python.org/3/c-api/init_config.html#path-configuration + config.site_import = 0; + + status = PyConfig_SetString(&config, &config.base_exec_prefix, L""); + status = PyConfig_SetString(&config, &config.base_executable, L"torch_deploy"); + status = PyConfig_SetString(&config, &config.base_prefix, L""); + status = PyConfig_SetString(&config, &config.exec_prefix, L""); + status = PyConfig_SetString(&config, &config.executable, L"torch_deploy"); + status = PyConfig_SetString(&config, &config.prefix, L""); + + + config.module_search_paths_set = 1; + std::array module_search_paths = {}; + status = PyConfig_SetWideStringList( + &config, &config.module_search_paths, 0, module_search_paths.data()); + + status = Py_InitializeFromConfig(&config); + PyConfig_Clear(&config); + TORCH_INTERNAL_ASSERT(!PyStatus_Exception(status)) + + // Uncomment to debug python config + // PyRun_SimpleString(sysprint); + + PyRun_SimpleString(finder); + // Release the GIL that PyInitialize acquires + PyEval_SaveThread(); +} + +void teardown() { + PyGILState_Ensure(); + + if (Py_FinalizeEx() < 0) { + std::cout << "IT BROKE SO WE ARE EXITING\n"; + exit(120); + } + PyMem_RawFree(program); +} + +__attribute__((destructor)) void deinit() {} + +void run_some_python(const char* code) { + PyGILState_STATE gstate = PyGILState_Ensure(); + + if (PyRun_SimpleString(code) == -1) { + throw std::runtime_error("python eval failed\n"); + } + PyGILState_Release(gstate); +} + +void run_python_file(const char* code) { + PyGILState_STATE gstate = PyGILState_Ensure(); + + FILE* f = fopen(code, "r"); + if (PyRun_SimpleFile(f, code) == -1) { + throw std::runtime_error("python eval failed\n"); + } + fclose(f); + + PyGILState_Release(gstate); +} + + +size_t load_model(const char* filename, bool hermetic) { + PyGILState_STATE gstate = PyGILState_Ensure(); + TORCH_INTERNAL_ASSERT(PyGILState_Check() == 1); + std::string code; + + if (hermetic) { + code = fmt::format(R"( +from torch.package import PackageImporter + +i = PackageImporter('{}') +model = i.load_pickle('model', 'model.pkl') +)", filename); + } else { + code = std::string("model = torch.jit.load('") + + std::string(filename) + std::string("')"); + } + py::exec(code); + + auto id = ++s_id; + + PyGILState_Release(gstate); + return id; +} + +at::Tensor forward_model(size_t model_id, at::Tensor const & input) { + at::Tensor output; + PyGILState_STATE gstate = PyGILState_Ensure(); + { + TORCH_INTERNAL_ASSERT(PyGILState_Check() == 1); + auto forward = py::globals()["model"].attr("forward"); + + py::object py_output = forward(input); + // TODO is this going to leak? 
diff --git a/torch/csrc/deploy/interpreter/interpreter.h b/torch/csrc/deploy/interpreter/interpreter.h
new file mode 100644
index 00000000000..29e435e4497
--- /dev/null
+++ b/torch/csrc/deploy/interpreter/interpreter.h
@@ -0,0 +1,67 @@
+#pragma once
+#include <dlfcn.h>
+#include <unistd.h>
+#include <cstdio>
+#include <cstdlib>
+#include <fstream>
+#include <stdexcept>
+#include <string>
+#include <torch/csrc/deploy/interpreter/interpreter_impl.h>
+
+
+class Interpreter : public InterpreterImpl {
+ private:
+  std::string library_name_;
+  void* handle_;
+
+ public:
+  Interpreter() : handle_(nullptr) {
+    char library_name[L_tmpnam];
+    char* libinterpreter_path = std::getenv("LIBINTERPRETER_PATH");
+    if (libinterpreter_path == nullptr) {
+      throw std::runtime_error("libinterpreter_path is NULL, set LIBINTERPRETER_PATH env.");
+    }
+    // Copy libinterpreter.so to a unique temp file so each Interpreter gets
+    // its own dlopen instance (dlopen dedupes handles by path).
+    std::tmpnam(library_name);
+    library_name_ = library_name;
+    {
+      std::ifstream src(libinterpreter_path, std::ios::binary);
+      std::ofstream dst(library_name, std::ios::binary);
+      dst << src.rdbuf();
+    }
+    handle_ = dlopen(library_name, RTLD_LOCAL | RTLD_LAZY);
+    if (!handle_) {
+      throw std::runtime_error(dlerror());
+    }
+
+    // technically, we can unlink the library right after dlopen, and this is
+    // better for cleanup because even if we crash the library doesn't stick
+    // around. However, it's bad for debugging because gdb can't find the
+    // symbols if the library is no longer present.
+    unlink(library_name_.c_str());
+
+    void* initialize_interface = dlsym(handle_, "initialize_interface");
+    if (!initialize_interface) {
+      throw std::runtime_error("Unable to load initialize_interface function from interpreter lib.");
+    }
+    ((void (*)(InterpreterImpl*))initialize_interface)(this);
+
+    this->startup();
+
+    // the actual torch loading process is not thread safe; by doing it
+    // in the constructor, before we have multiple worker threads, we
+    // ensure it doesn't race.
+    run_some_python("import torch");
+  }
+  ~Interpreter() {
+    if (handle_) {
+      this->teardown();
+
+      // it segfaults its face off trying to unload, but it's not clear
+      // if this is something we caused or if libtorch_python would also do the
+      // same if it were opened/closed a lot...
+      dlclose(handle_);
+    }
+  }
+  Interpreter(const Interpreter&) = delete;
+};
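With the loader above, the payoff is straightforward: each `Interpreter` carries a private CPython (and a private GIL), so Python work can proceed in parallel across instances. A usage sketch, assuming `LIBINTERPRETER_PATH` points at the built `libinterpreter.so` as in the test script earlier:

```cpp
#include <thread>
#include <torch/csrc/deploy/interpreter/interpreter.h>

int main() {
  Interpreter a;  // each constructor dlopens a fresh copy of libinterpreter.so
  Interpreter b;

  // Two GILs, two threads: the CPU-bound Python below runs genuinely in
  // parallel, which a single embedded interpreter could not do.
  std::thread ta([&] { a.run_some_python("print(sum(range(10**7)))"); });
  std::thread tb([&] { b.run_some_python("print(sum(range(10**7)))"); });
  ta.join();
  tb.join();
  return 0;
}
```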
diff --git a/torch/csrc/deploy/interpreter/interpreter_impl.h b/torch/csrc/deploy/interpreter/interpreter_impl.h
new file mode 100644
index 00000000000..82326bd370f
--- /dev/null
+++ b/torch/csrc/deploy/interpreter/interpreter_impl.h
@@ -0,0 +1,26 @@
+#pragma once
+#include <ATen/ATen.h>
+
+// NOTE- if adding new interface functions,
+// update interpreter.cpp initialize_interface.
+size_t load_model(const char* model_file, bool hermetic=false);
+at::Tensor forward_model(size_t model_id, at::Tensor const & input);
+void run_some_python(const char* code);
+void startup();
+void teardown();
+void run_python_file(const char* code);
+
+
+#define FOREACH_INTERFACE_FUNCTION(_) \
+  _(load_model)                       \
+  _(forward_model)                    \
+  _(run_some_python)                  \
+  _(startup)                          \
+  _(teardown)                         \
+  _(run_python_file)
+
+struct InterpreterImpl {
+#define DEFINE_POINTER(func) decltype(&::func) func;
+  FOREACH_INTERFACE_FUNCTION(DEFINE_POINTER)
+#undef DEFINE_POINTER
+};
diff --git a/torch/csrc/deploy/interpreter/test_main.cpp b/torch/csrc/deploy/interpreter/test_main.cpp
new file mode 100644
index 00000000000..6107267c9f2
--- /dev/null
+++ b/torch/csrc/deploy/interpreter/test_main.cpp
@@ -0,0 +1,49 @@
+#include <ATen/ATen.h>
+#include <gtest/gtest.h>
+#include <torch/csrc/deploy/interpreter/interpreter.h>
+#include <torch/script.h>
+#include <cstdlib>
+#include <vector>
+
+int main(int argc, char* argv[]) {
+  ::testing::InitGoogleTest(&argc, argv);
+
+  int rc = RUN_ALL_TESTS();
+
+  return rc;
+}
+
+TEST(Interpreter, Sanity) {
+  ASSERT_TRUE(true);
+}
+
+TEST(Interpreter, Hello) {
+  Interpreter interp;
+  interp.run_some_python("print('hello from first interpreter!')");
+
+  Interpreter interp2;
+  interp2.run_some_python("print('hello from second interpreter!')");
+}
+
+void compare_torchpy_jit(const char* model_filename, at::Tensor const & input) {
+  Interpreter interp;
+  // Test
+  auto model_id = interp.load_model(model_filename, false);
+  at::Tensor output = interp.forward_model(model_id, input);
+
+  // Reference
+  auto ref_model = torch::jit::load(model_filename);
+  std::vector<torch::jit::IValue> ref_inputs;
+  ref_inputs.emplace_back(torch::jit::IValue(input));
+  at::Tensor ref_output = ref_model.forward(ref_inputs).toTensor();
+
+  ASSERT_TRUE(ref_output.equal(output));
+}
+
+TEST(Interpreter, SimpleModel) {
+  char* model_path = std::getenv("SIMPLE_MODEL_PATH");
+  ASSERT_NE(model_path, nullptr);
+  const int A = 10, B = 20;
+  compare_torchpy_jit(
+      model_path, torch::ones(at::IntArrayRef({A, B})));
+}
diff --git a/torch/csrc/deploy/interpreter/third_party/README.md b/torch/csrc/deploy/interpreter/third_party/README.md
new file mode 100644
index 00000000000..2c5d9241d2b
--- /dev/null
+++ b/torch/csrc/deploy/interpreter/third_party/README.md
@@ -0,0 +1,2 @@
+Python libraries that we want to package along with the Python implementation
+bundled in libinterpreter.
diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py
index 7286387644a..faddb8cb16e 100644
--- a/torch/cuda/__init__.py
+++ b/torch/cuda/__init__.py
@@ -113,6 +113,10 @@ def _lazy_call(callable):
     if is_initialized():
         callable()
     else:
+        # TODO(torch_deploy): this accesses linecache, which attempts to read the
+        # file system to get traceback info. Patch linecache or do something
+        # else here if this ends up being important.
+        # Don't store the actual traceback to avoid a memory cycle
         _queued_calls.append((callable, traceback.format_stack()))
diff --git a/torch/utils/__init__.py b/torch/utils/__init__.py
index df6a3793e90..73eb7f93cf1 100644
--- a/torch/utils/__init__.py
+++ b/torch/utils/__init__.py
@@ -2,6 +2,7 @@
 from .throughput_benchmark import ThroughputBenchmark

 import os.path as _osp
+import sys

 # Set the module for a given object for nicer printing
 def set_module(obj, mod):
@@ -9,5 +10,8 @@ def set_module(obj, mod):
         raise TypeError("The mod argument should be a string")
     obj.__module__ = mod

-#: Path to folder containing CMake definitions for Torch package
-cmake_prefix_path = _osp.join(_osp.dirname(_osp.dirname(__file__)), 'share', 'cmake')
+if sys.executable == "torch_deploy":
+    # not valid inside the torch_deploy interpreter; no paths exist for frozen modules
+    cmake_prefix_path = None
+else:
+    cmake_prefix_path = _osp.join(_osp.dirname(_osp.dirname(__file__)), 'share', 'cmake')