diff --git a/extract_features_pytorch.py b/extract_features_pytorch.py
index 5dfed6901..8d1054f96 100644
--- a/extract_features_pytorch.py
+++ b/extract_features_pytorch.py
@@ -18,16 +18,24 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import argparse
 import codecs
 import collections
+import logging
 import json
 import re
-import modeling
 import tokenization
-import tensorflow as tf
-import argparse
+import torch
+from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
+from torch.utils.data.distributed import DistributedSampler
+
+from modeling_pytorch import BertConfig, BertModel
+
+logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
+                    datefmt='%m/%d/%Y %H:%M:%S',
+                    level=logging.INFO)
+logger = logging.getLogger(__name__)
 
 parser = argparse.ArgumentParser()
 
@@ -47,19 +55,14 @@ parser.add_argument("--layers", default="-1,-2,-3,-4", type=str)
 parser.add_argument("--max_seq_length", default=128, type=int,
                     help="The maximum total input sequence length after WordPiece tokenization. Sequences longer "
                          "than this will be truncated, and sequences shorter than this will be padded.")
-parser.add_argument("--do_lower_case", default=True, type=bool,
+parser.add_argument("--do_lower_case", default=True, action='store_true',
                     help="Whether to lower case the input text. Should be True for uncased "
                          "models and False for cased models.")
 parser.add_argument("--batch_size", default=32, type=int, help="Batch size for predictions.")
-### BEGIN - TO DELETE EVENTUALLY --> NO SENSE IN PYTORCH ###
-parser.add_argument("--use_tpu", default=False, type=bool, help="Whether to use TPU or GPU/CPU.")
-parser.add_argument("--master", default=None, type=str, help="If using a TPU, the address of the master.")
-parser.add_argument("--num_tpu_cores", default=8, type=int,
-                    help="Only used if `use_tpu` is True. Total number of TPU cores to use.")
-### END - TO DELETE EVENTUALLY --> NO SENSE IN PYTORCH ###
-parser.add_argument("--use_one_hot_embeddings", default=False, type=bool,
-                    help="If True, tf.one_hot will be used for embedding lookups, otherwise tf.nn.embedding_lookup "
-                         "will be used. On TPUs, this should be True since it is much faster.")
+parser.add_argument("--local_rank",
+                    type=int,
+                    default=-1,
+                    help="local_rank for distributed training on GPUs")
+parser.add_argument("--no_cuda",
+                    default=False,
+                    action='store_true',
+                    help="Whether not to use CUDA when available")
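+# Note: --local_rank is normally set by a distributed launcher that starts one
+# process per GPU; the default of -1 keeps the script in single-process mode.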
 
 args = parser.parse_args()
 
@@ -83,107 +86,6 @@ class InputFeatures(object):
         self.input_type_ids = input_type_ids
 
-
-def input_fn_builder(features, seq_length):
-    """Creates an `input_fn` closure to be passed to TPUEstimator."""
-
-    all_unique_ids = []
-    all_input_ids = []
-    all_input_mask = []
-    all_input_type_ids = []
-
-    for feature in features:
-        all_unique_ids.append(feature.unique_id)
-        all_input_ids.append(feature.input_ids)
-        all_input_mask.append(feature.input_mask)
-        all_input_type_ids.append(feature.input_type_ids)
-
-    def input_fn(params):
-        """The actual input function."""
-        batch_size = params["batch_size"]
-
-        num_examples = len(features)
-
-        # This is for demo purposes and does NOT scale to large data sets. We do
-        # not use Dataset.from_generator() because that uses tf.py_func which is
-        # not TPU compatible. The right way to load data is with TFRecordReader.
-        d = tf.data.Dataset.from_tensor_slices({
-            "unique_ids":
-                tf.constant(all_unique_ids, shape=[num_examples], dtype=tf.int32),
-            "input_ids":
-                tf.constant(
-                    all_input_ids, shape=[num_examples, seq_length],
-                    dtype=tf.int32),
-            "input_mask":
-                tf.constant(
-                    all_input_mask,
-                    shape=[num_examples, seq_length],
-                    dtype=tf.int32),
-            "input_type_ids":
-                tf.constant(
-                    all_input_type_ids,
-                    shape=[num_examples, seq_length],
-                    dtype=tf.int32),
-        })
-
-        d = d.batch(batch_size=batch_size, drop_remainder=False)
-        return d
-
-    return input_fn
-
-
-def model_fn_builder(bert_config, init_checkpoint, layer_indexes, use_tpu,
-                     use_one_hot_embeddings):
-    """Returns `model_fn` closure for TPUEstimator."""
-
-    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
-        """The `model_fn` for TPUEstimator."""
-
-        unique_ids = features["unique_ids"]
-        input_ids = features["input_ids"]
-        input_mask = features["input_mask"]
-        input_type_ids = features["input_type_ids"]
-
-        model = modeling.BertModel(
-            config=bert_config,
-            is_training=False,
-            input_ids=input_ids,
-            input_mask=input_mask,
-            token_type_ids=input_type_ids,
-            use_one_hot_embeddings=use_one_hot_embeddings)
-
-        if mode != tf.estimator.ModeKeys.PREDICT:
-            raise ValueError("Only PREDICT modes are supported: %s" % (mode))
-
-        tvars = tf.trainable_variables()
-        scaffold_fn = None
-        (assignment_map, _) = modeling.get_assigment_map_from_checkpoint(
-            tvars, init_checkpoint)
-        if use_tpu:
-
-            def tpu_scaffold():
-                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
-                return tf.train.Scaffold()
-
-            scaffold_fn = tpu_scaffold
-        else:
-            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
-
-        all_layers = model.get_all_encoder_layers()
-
-        predictions = {
-            "unique_id": unique_ids,
-        }
-
-        for (i, layer_index) in enumerate(layer_indexes):
-            predictions["layer_output_%d" % i] = all_layers[layer_index]
-
-        output_spec = tf.contrib.tpu.TPUEstimatorSpec(
-            mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)
-        return output_spec
-
-    return model_fn
-
-
 def convert_examples_to_features(examples, seq_length, tokenizer):
     """Loads a data file into a list of `InputBatch`s."""
 
@@ -257,12 +159,12 @@ assert len(input_type_ids) == seq_length
 
         if ex_index < 5:
-            tf.logging.info("*** Example ***")
-            tf.logging.info("unique_id: %s" % (example.unique_id))
-            tf.logging.info("tokens: %s" % " ".join([str(x) for x in tokens]))
-            tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
-            tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
-            tf.logging.info(
+            logger.info("*** Example ***")
+            logger.info("unique_id: %s" % (example.unique_id))
+            logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
+            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
+            logger.info(
                 "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))
 
         features.append(
@@ -296,7 +198,7 @@ """Read a list of `InputExample`s from an input file."""
     examples = []
     unique_id = 0
-    with tf.gfile.GFile(input_file, "r") as reader:
+    with open(input_file, "r") as reader:
         while True:
             line = tokenization.convert_to_unicode(reader.readline())
             if not line:
                 break
 
@@ -317,22 +219,22 @@ def main():
-    tf.logging.set_verbosity(tf.logging.INFO)
+    if args.local_rank == -1 or args.no_cuda:
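+        # Single-process mode: run on the default CUDA device when available
+        # (or on CPU); n_gpu is only logged, no DataParallel is applied here.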
torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") + n_gpu = torch.cuda.device_count() + else: + device = torch.device("cuda", args.local_rank) + n_gpu = 1 + # print("Initializing the distributed backend: NCCL") + print("device", device, "n_gpu", n_gpu) layer_indexes = [int(x) for x in args.layers.split(",")] - bert_config = modeling.BertConfig.from_json_file(args.bert_config_file) + bert_config = BertConfig.from_json_file(args.bert_config_file) tokenizer = tokenization.FullTokenizer( vocab_file=args.vocab_file, do_lower_case=args.do_lower_case) - is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 - run_config = tf.contrib.tpu.RunConfig( - master=args.master, - tpu_config=tf.contrib.tpu.TPUConfig( - num_shards=args.num_tpu_cores, - per_host_input_for_training=is_per_host)) - examples = read_examples(args.input_file) features = convert_examples_to_features( @@ -342,48 +244,55 @@ def main(): for feature in features: unique_id_to_feature[feature.unique_id] = feature - model_fn = model_fn_builder( - bert_config=bert_config, - init_checkpoint=args.init_checkpoint, - layer_indexes=layer_indexes, - use_tpu=args.use_tpu, - use_one_hot_embeddings=args.use_one_hot_embeddings) + model = BertModel(bert_config) + if args.init_checkpoint is not None: + model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu')) + model.to(device) - # If TPU is not available, this will fall back to normal Estimator on CPU - # or GPU. - estimator = tf.contrib.tpu.TPUEstimator( - use_tpu=args.use_tpu, - model_fn=model_fn, - config=run_config, - predict_batch_size=args.batch_size) + all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) + all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) + all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long) + all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) - input_fn = input_fn_builder( - features=features, seq_length=args.max_seq_length) + eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) + if args.local_rank == -1: + eval_sampler = SequentialSampler(eval_data) + else: + eval_sampler = DistributedSampler(eval_data) + eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size) - with codecs.getwriter("utf-8")(tf.gfile.Open(args.output_file, - "w")) as writer: - for result in estimator.predict(input_fn, yield_single_examples=True): - unique_id = int(result["unique_id"]) - feature = unique_id_to_feature[unique_id] - output_json = collections.OrderedDict() - output_json["linex_index"] = unique_id - all_features = [] - for (i, token) in enumerate(feature.tokens): - all_layers = [] - for (j, layer_index) in enumerate(layer_indexes): - layer_output = result["layer_output_%d" % j] - layers = collections.OrderedDict() - layers["index"] = layer_index - layers["values"] = [ - round(float(x), 6) for x in layer_output[i:(i + 1)].flat - ] - all_layers.append(layers) - features = collections.OrderedDict() - features["token"] = token - features["layers"] = all_layers - all_features.append(features) - output_json["features"] = all_features - writer.write(json.dumps(output_json) + "\n") + model.eval() + with open(args.output_file, "w", encoding='utf-8') as writer: + for input_ids, input_mask, segment_ids, example_indices in eval_dataloader: + input_ids = input_ids.to(device) + input_mask = input_mask.float().to(device) + segment_ids = segment_ids.to(device) + + 
+            all_encoder_layers, _ = model(input_ids, segment_ids, input_mask)
+
+            # all_encoder_layers is a list over layers, so iterate examples by
+            # batch position rather than zipping layers against example indices
+            for b, example_index in enumerate(example_indices):
+                feature = features[example_index.item()]
+                unique_id = int(feature.unique_id)
+                # feature = unique_id_to_feature[unique_id]
+                output_json = collections.OrderedDict()
+                output_json["linex_index"] = unique_id
+                all_features = []
+                for (i, token) in enumerate(feature.tokens):
+                    all_layers = []
+                    for (j, layer_index) in enumerate(layer_indexes):
+                        layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()[b]
+                        layers = collections.OrderedDict()
+                        layers["index"] = layer_index
+                        layers["values"] = [
+                            round(float(x), 6) for x in layer_output[i:(i + 1)].flat
+                        ]
+                        all_layers.append(layers)
+                    # use a distinct name so the outer `features` list is not shadowed
+                    out_features = collections.OrderedDict()
+                    out_features["token"] = token
+                    out_features["layers"] = all_layers
+                    all_features.append(out_features)
+                output_json["features"] = all_features
+                writer.write(json.dumps(output_json) + "\n")
 
 
 if __name__ == "__main__":
diff --git a/run_squad_pytorch.py b/run_squad_pytorch.py
index a0f4bcd4d..2cd736556 100644
--- a/run_squad_pytorch.py
+++ b/run_squad_pytorch.py
@@ -23,8 +23,6 @@ import logging
 import json
 import math
 import os
-import modeling
-import optimization
 import tokenization
 import six
 import argparse
@@ -57,7 +55,7 @@ parser.add_argument("--predict_file", default=None, type=str,
                     help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
 parser.add_argument("--init_checkpoint", default=None, type=str,
                     help="Initial checkpoint (usually from a pre-trained BERT model).")
-parser.add_argument("--do_lower_case", default=True, type=bool,
+parser.add_argument("--do_lower_case", default=True, action='store_true',
                     help="Whether to lower case the input text. Should be True for uncased "
                          "models and False for cased models.")
 parser.add_argument("--max_seq_length", default=384, type=int,
@@ -68,8 +66,8 @@ parser.add_argument("--doc_stride", default=128, type=int,
 parser.add_argument("--max_query_length", default=64, type=int,
                     help="The maximum number of tokens for the question. Questions longer than this will "
                          "be truncated to this length.")
-parser.add_argument("--do_train", default=False, type=bool, help="Whether to run training.")
-parser.add_argument("--do_predict", default=False, type=bool, help="Whether to run eval on the dev set.")
+parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.")
+parser.add_argument("--do_predict", default=False, action='store_true', help="Whether to run eval on the dev set.")
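+# Note: with action='store_true' these flags default to False and are enabled
+# by simply passing --do_train / --do_predict on the command line.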
 parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
 parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.")
 parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
@@ -87,19 +85,19 @@ parser.add_argument("--max_answer_length", default=30, type=int,
                          "and end predictions are not conditioned on one another.")
 
 ### BEGIN - TO DELETE EVENTUALLY --> NO SENSE IN PYTORCH ###
-parser.add_argument("--use_tpu", default=False, type=bool, help="Whether to use TPU or GPU/CPU.")
-parser.add_argument("--tpu_name", default=None, type=str,
-                    help="The Cloud TPU to use for training. This should be either the name used when creating the "
-                         "Cloud TPU, or a grpc://ip.address.of.tpu:8470 url.")
-parser.add_argument("--tpu_zone", default=None, type=str,
-                    help="[Optional] GCE zone where the Cloud TPU is located in. If not specified, we will attempt "
-                         "to automatically detect the GCE project from metadata.")
-parser.add_argument("--gcp_project", default=None, type=str,
-                    help="[Optional] Project name for the Cloud TPU-enabled project. If not specified, we will attempt "
-                         "to automatically detect the GCE project from metadata.")
-parser.add_argument("--master", default=None, type=str, help="[Optional] TensorFlow master URL.")
-parser.add_argument("--num_tpu_cores", default=8, type=int, help="Only used if `use_tpu` is True. "
-                    "Total number of TPU cores to use.")
+# parser.add_argument("--use_tpu", default=False, type=bool, help="Whether to use TPU or GPU/CPU.")
+# parser.add_argument("--tpu_name", default=None, type=str,
+#                     help="The Cloud TPU to use for training. This should be either the name used when creating the "
+#                          "Cloud TPU, or a grpc://ip.address.of.tpu:8470 url.")
+# parser.add_argument("--tpu_zone", default=None, type=str,
+#                     help="[Optional] GCE zone where the Cloud TPU is located in. If not specified, we will attempt "
+#                          "to automatically detect the GCE project from metadata.")
+# parser.add_argument("--gcp_project", default=None, type=str,
+#                     help="[Optional] Project name for the Cloud TPU-enabled project. If not specified, we will attempt "
+#                          "to automatically detect the GCE project from metadata.")
+# parser.add_argument("--master", default=None, type=str, help="[Optional] TensorFlow master URL.")
+# parser.add_argument("--num_tpu_cores", default=8, type=int, help="Only used if `use_tpu` is True. "
+#                     "Total number of TPU cores to use.")
 ### END - TO DELETE EVENTUALLY --> NO SENSE IN PYTORCH ###
 
 parser.add_argument("--verbose_logging", default=False, type=bool,
@@ -864,7 +862,7 @@
         eval_sampler = SequentialSampler(eval_data)
     else:
         eval_sampler = DistributedSampler(eval_data)
-    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
+    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)
 
     model.eval()
     all_results = []