diff --git a/extract_features_pytorch.py b/extract_features_pytorch.py
index 5dfed6901..8d1054f96 100644
--- a/extract_features_pytorch.py
+++ b/extract_features_pytorch.py
@@ -18,16 +18,24 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import argparse
 import codecs
 import collections
+import logging
 import json
 import re
-import modeling
 import tokenization
-import tensorflow as tf
-import argparse
+import torch
+from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
+from torch.utils.data.distributed import DistributedSampler
+
+from modeling_pytorch import BertConfig, BertModel
+
+logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
+                    datefmt='%m/%d/%Y %H:%M:%S',
+                    level=logging.INFO)
+logger = logging.getLogger(__name__)
 
 parser = argparse.ArgumentParser()
 
@@ -47,19 +55,14 @@ parser.add_argument("--layers", default="-1,-2,-3,-4", type=str)
 parser.add_argument("--max_seq_length", default=128, type=int,
                     help="The maximum total input sequence length after WordPiece tokenization. Sequences longer "
                          "than this will be truncated, and sequences shorter than this will be padded.")
-parser.add_argument("--do_lower_case", default=True, type=bool,
+parser.add_argument("--do_lower_case", default=True, action='store_true',
                     help="Whether to lower case the input text. Should be True for uncased "
                          "models and False for cased models.")
 parser.add_argument("--batch_size", default=32, type=int, help="Batch size for predictions.")
-### BEGIN - TO DELETE EVENTUALLY --> NO SENSE IN PYTORCH ###
-parser.add_argument("--use_tpu", default=False, type=bool, help="Whether to use TPU or GPU/CPU.")
-parser.add_argument("--master", default=None, type=str, help="If using a TPU, the address of the master.")
-parser.add_argument("--num_tpu_cores", default=8, type=int,
-                    help="Only used if `use_tpu` is True. Total number of TPU cores to use.")
-### END - TO DELETE EVENTUALLY --> NO SENSE IN PYTORCH ###
-parser.add_argument("--use_one_hot_embeddings", default=False, type=bool,
-                    help="If True, tf.one_hot will be used for embedding lookups, otherwise tf.nn.embedding_lookup "
-                         "will be used. On TPUs, this should be True since it is much faster.")
+parser.add_argument("--local_rank",
+                    type=int,
+                    default=-1,
+                    help="local_rank for distributed training on GPUs")
+parser.add_argument("--no_cuda",
+                    default=False,
+                    action='store_true',
+                    help="Whether not to use CUDA when available")
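+# Note: --local_rank is normally set by a distributed launcher that starts one
+# process per GPU; the default of -1 keeps the script in single-process mode.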
 
 args = parser.parse_args()
 
@@ -83,107 +86,6 @@ class InputFeatures(object):
         self.input_type_ids = input_type_ids
 
-
-def input_fn_builder(features, seq_length):
-    """Creates an `input_fn` closure to be passed to TPUEstimator."""
-
-    all_unique_ids = []
-    all_input_ids = []
-    all_input_mask = []
-    all_input_type_ids = []
-
-    for feature in features:
-        all_unique_ids.append(feature.unique_id)
-        all_input_ids.append(feature.input_ids)
-        all_input_mask.append(feature.input_mask)
-        all_input_type_ids.append(feature.input_type_ids)
-
-    def input_fn(params):
-        """The actual input function."""
-        batch_size = params["batch_size"]
-
-        num_examples = len(features)
-
-        # This is for demo purposes and does NOT scale to large data sets. We do
-        # not use Dataset.from_generator() because that uses tf.py_func which is
-        # not TPU compatible. The right way to load data is with TFRecordReader.
-        d = tf.data.Dataset.from_tensor_slices({
-            "unique_ids":
-                tf.constant(all_unique_ids, shape=[num_examples], dtype=tf.int32),
-            "input_ids":
-                tf.constant(
-                    all_input_ids, shape=[num_examples, seq_length],
-                    dtype=tf.int32),
-            "input_mask":
-                tf.constant(
-                    all_input_mask,
-                    shape=[num_examples, seq_length],
-                    dtype=tf.int32),
-            "input_type_ids":
-                tf.constant(
-                    all_input_type_ids,
-                    shape=[num_examples, seq_length],
-                    dtype=tf.int32),
-        })
-
-        d = d.batch(batch_size=batch_size, drop_remainder=False)
-        return d
-
-    return input_fn
-
-
-def model_fn_builder(bert_config, init_checkpoint, layer_indexes, use_tpu,
-                     use_one_hot_embeddings):
-    """Returns `model_fn` closure for TPUEstimator."""
-
-    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
-        """The `model_fn` for TPUEstimator."""
-
-        unique_ids = features["unique_ids"]
-        input_ids = features["input_ids"]
-        input_mask = features["input_mask"]
-        input_type_ids = features["input_type_ids"]
-
-        model = modeling.BertModel(
-            config=bert_config,
-            is_training=False,
-            input_ids=input_ids,
-            input_mask=input_mask,
-            token_type_ids=input_type_ids,
-            use_one_hot_embeddings=use_one_hot_embeddings)
-
-        if mode != tf.estimator.ModeKeys.PREDICT:
-            raise ValueError("Only PREDICT modes are supported: %s" % (mode))
-
-        tvars = tf.trainable_variables()
-        scaffold_fn = None
-        (assignment_map, _) = modeling.get_assigment_map_from_checkpoint(
-            tvars, init_checkpoint)
-        if use_tpu:
-
-            def tpu_scaffold():
-                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
-                return tf.train.Scaffold()
-
-            scaffold_fn = tpu_scaffold
-        else:
-            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
-
-        all_layers = model.get_all_encoder_layers()
-
-        predictions = {
-            "unique_id": unique_ids,
-        }
-
-        for (i, layer_index) in enumerate(layer_indexes):
-            predictions["layer_output_%d" % i] = all_layers[layer_index]
-
-        output_spec = tf.contrib.tpu.TPUEstimatorSpec(
-            mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)
-        return output_spec
-
-    return model_fn
-
-
 def convert_examples_to_features(examples, seq_length, tokenizer):
     """Loads a data file into a list of `InputBatch`s."""
 
@@ -257,12 +159,12 @@ assert len(input_type_ids) == seq_length
 
         if ex_index < 5:
-            tf.logging.info("*** Example ***")
-            tf.logging.info("unique_id: %s" % (example.unique_id))
-            tf.logging.info("tokens: %s" % " ".join([str(x) for x in tokens]))
-            tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
-            tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
-            tf.logging.info(
+            logger.info("*** Example ***")
+            logger.info("unique_id: %s" % (example.unique_id))
+            logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
+            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
+            logger.info(
                 "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))
 
         features.append(
@@ -296,7 +198,7 @@ """Read a list of `InputExample`s from an input file."""
     examples = []
     unique_id = 0
-    with tf.gfile.GFile(input_file, "r") as reader:
+    with open(input_file, "r") as reader:
         while True:
             line = tokenization.convert_to_unicode(reader.readline())
             if not line:
                 break
 
@@ -317,22 +219,22 @@ def main():
-    tf.logging.set_verbosity(tf.logging.INFO)
+    if args.local_rank == -1 or args.no_cuda:
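+        # Single-process mode: run on the default CUDA device when available
+        # (or on CPU); n_gpu is only logged, no DataParallel is applied here.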
torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") + n_gpu = torch.cuda.device_count() + else: + device = torch.device("cuda", args.local_rank) + n_gpu = 1 + # print("Initializing the distributed backend: NCCL") + print("device", device, "n_gpu", n_gpu) layer_indexes = [int(x) for x in args.layers.split(",")] - bert_config = modeling.BertConfig.from_json_file(args.bert_config_file) + bert_config = BertConfig.from_json_file(args.bert_config_file) tokenizer = tokenization.FullTokenizer( vocab_file=args.vocab_file, do_lower_case=args.do_lower_case) - is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 - run_config = tf.contrib.tpu.RunConfig( - master=args.master, - tpu_config=tf.contrib.tpu.TPUConfig( - num_shards=args.num_tpu_cores, - per_host_input_for_training=is_per_host)) - examples = read_examples(args.input_file) features = convert_examples_to_features( @@ -342,48 +244,55 @@ def main(): for feature in features: unique_id_to_feature[feature.unique_id] = feature - model_fn = model_fn_builder( - bert_config=bert_config, - init_checkpoint=args.init_checkpoint, - layer_indexes=layer_indexes, - use_tpu=args.use_tpu, - use_one_hot_embeddings=args.use_one_hot_embeddings) + model = BertModel(bert_config) + if args.init_checkpoint is not None: + model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu')) + model.to(device) - # If TPU is not available, this will fall back to normal Estimator on CPU - # or GPU. - estimator = tf.contrib.tpu.TPUEstimator( - use_tpu=args.use_tpu, - model_fn=model_fn, - config=run_config, - predict_batch_size=args.batch_size) + all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) + all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) + all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long) + all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) - input_fn = input_fn_builder( - features=features, seq_length=args.max_seq_length) + eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) + if args.local_rank == -1: + eval_sampler = SequentialSampler(eval_data) + else: + eval_sampler = DistributedSampler(eval_data) + eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size) - with codecs.getwriter("utf-8")(tf.gfile.Open(args.output_file, - "w")) as writer: - for result in estimator.predict(input_fn, yield_single_examples=True): - unique_id = int(result["unique_id"]) - feature = unique_id_to_feature[unique_id] - output_json = collections.OrderedDict() - output_json["linex_index"] = unique_id - all_features = [] - for (i, token) in enumerate(feature.tokens): - all_layers = [] - for (j, layer_index) in enumerate(layer_indexes): - layer_output = result["layer_output_%d" % j] - layers = collections.OrderedDict() - layers["index"] = layer_index - layers["values"] = [ - round(float(x), 6) for x in layer_output[i:(i + 1)].flat - ] - all_layers.append(layers) - features = collections.OrderedDict() - features["token"] = token - features["layers"] = all_layers - all_features.append(features) - output_json["features"] = all_features - writer.write(json.dumps(output_json) + "\n") + model.eval() + with open(args.output_file, "w", encoding='utf-8') as writer: + for input_ids, input_mask, segment_ids, example_indices in eval_dataloader: + input_ids = input_ids.to(device) + input_mask = input_mask.float().to(device) + segment_ids = segment_ids.to(device) + + 
+            all_encoder_layers, _ = model(input_ids, segment_ids, input_mask)
+
+            # all_encoder_layers is a list over layers, so iterate examples by
+            # batch position rather than zipping layers against example indices
+            for b, example_index in enumerate(example_indices):
+                feature = features[example_index.item()]
+                unique_id = int(feature.unique_id)
+                # feature = unique_id_to_feature[unique_id]
+                output_json = collections.OrderedDict()
+                output_json["linex_index"] = unique_id
+                all_features = []
+                for (i, token) in enumerate(feature.tokens):
+                    all_layers = []
+                    for (j, layer_index) in enumerate(layer_indexes):
+                        layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()[b]
+                        layers = collections.OrderedDict()
+                        layers["index"] = layer_index
+                        layers["values"] = [
+                            round(float(x), 6) for x in layer_output[i:(i + 1)].flat
+                        ]
+                        all_layers.append(layers)
+                    # use a distinct name so the outer `features` list is not shadowed
+                    out_features = collections.OrderedDict()
+                    out_features["token"] = token
+                    out_features["layers"] = all_layers
+                    all_features.append(out_features)
+                output_json["features"] = all_features
+                writer.write(json.dumps(output_json) + "\n")
 
 
 if __name__ == "__main__":
diff --git a/run_squad_pytorch.py b/run_squad_pytorch.py
index a0f4bcd4d..2cd736556 100644
--- a/run_squad_pytorch.py
+++ b/run_squad_pytorch.py
@@ -23,8 +23,6 @@ import logging
 import json
 import math
 import os
-import modeling
-import optimization
 import tokenization
 import six
 import argparse
@@ -57,7 +55,7 @@ parser.add_argument("--predict_file", default=None, type=str,
                     help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
 parser.add_argument("--init_checkpoint", default=None, type=str,
                     help="Initial checkpoint (usually from a pre-trained BERT model).")
-parser.add_argument("--do_lower_case", default=True, type=bool,
+parser.add_argument("--do_lower_case", default=True, action='store_true',
                     help="Whether to lower case the input text. Should be True for uncased "
                          "models and False for cased models.")
 parser.add_argument("--max_seq_length", default=384, type=int,
@@ -68,8 +66,8 @@ parser.add_argument("--doc_stride", default=128, type=int,
 parser.add_argument("--max_query_length", default=64, type=int,
                     help="The maximum number of tokens for the question. Questions longer than this will "
                          "be truncated to this length.")
-parser.add_argument("--do_train", default=False, type=bool, help="Whether to run training.")
-parser.add_argument("--do_predict", default=False, type=bool, help="Whether to run eval on the dev set.")
+parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.")
+parser.add_argument("--do_predict", default=False, action='store_true', help="Whether to run eval on the dev set.")
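+# Note: with action='store_true' these flags default to False and are enabled
+# by simply passing --do_train / --do_predict on the command line.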
 parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
 parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.")
 parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
@@ -87,19 +85,19 @@ parser.add_argument("--max_answer_length", default=30, type=int,
                          "and end predictions are not conditioned on one another.")
 
 ### BEGIN - TO DELETE EVENTUALLY --> NO SENSE IN PYTORCH ###
-parser.add_argument("--use_tpu", default=False, type=bool, help="Whether to use TPU or GPU/CPU.")
-parser.add_argument("--tpu_name", default=None, type=str,
-                    help="The Cloud TPU to use for training. This should be either the name used when creating the "
-                         "Cloud TPU, or a grpc://ip.address.of.tpu:8470 url.")
-parser.add_argument("--tpu_zone", default=None, type=str,
-                    help="[Optional] GCE zone where the Cloud TPU is located in. If not specified, we will attempt "
-                         "to automatically detect the GCE project from metadata.")
-parser.add_argument("--gcp_project", default=None, type=str,
-                    help="[Optional] Project name for the Cloud TPU-enabled project. If not specified, we will attempt "
-                         "to automatically detect the GCE project from metadata.")
-parser.add_argument("--master", default=None, type=str, help="[Optional] TensorFlow master URL.")
-parser.add_argument("--num_tpu_cores", default=8, type=int, help="Only used if `use_tpu` is True. "
-                    "Total number of TPU cores to use.")
+# parser.add_argument("--use_tpu", default=False, type=bool, help="Whether to use TPU or GPU/CPU.")
+# parser.add_argument("--tpu_name", default=None, type=str,
+#                     help="The Cloud TPU to use for training. This should be either the name used when creating the "
+#                          "Cloud TPU, or a grpc://ip.address.of.tpu:8470 url.")
+# parser.add_argument("--tpu_zone", default=None, type=str,
+#                     help="[Optional] GCE zone where the Cloud TPU is located in. If not specified, we will attempt "
+#                          "to automatically detect the GCE project from metadata.")
+# parser.add_argument("--gcp_project", default=None, type=str,
+#                     help="[Optional] Project name for the Cloud TPU-enabled project. If not specified, we will attempt "
+#                          "to automatically detect the GCE project from metadata.")
+# parser.add_argument("--master", default=None, type=str, help="[Optional] TensorFlow master URL.")
+# parser.add_argument("--num_tpu_cores", default=8, type=int, help="Only used if `use_tpu` is True. "
+#                     "Total number of TPU cores to use.")
 ### END - TO DELETE EVENTUALLY --> NO SENSE IN PYTORCH ###
 
 parser.add_argument("--verbose_logging", default=False, type=bool,
@@ -864,7 +862,7 @@
         eval_sampler = SequentialSampler(eval_data)
     else:
         eval_sampler = DistributedSampler(eval_data)
-    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
+    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)
 
     model.eval()
     all_results = []