mirror of
https://github.com/saymrwulf/transformers.git
synced 2026-05-14 20:58:08 +00:00
updated extract_features
This commit is contained in:
parent
9af479b3b9
commit
ebfffa0ab2
2 changed files with 95 additions and 188 deletions
|
|
@ -18,16 +18,24 @@ from __future__ import absolute_import
|
|||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import argparse
|
||||
import codecs
|
||||
import collections
|
||||
import logging
|
||||
import json
|
||||
import re
|
||||
|
||||
import modeling
|
||||
import tokenization
|
||||
import tensorflow as tf
|
||||
|
||||
import argparse
|
||||
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
|
||||
from modeling_pytorch import BertConfig, BertModel
|
||||
|
||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||
datefmt = '%m/%d/%Y %H:%M:%S',
|
||||
level = logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
|
|
@ -47,19 +55,14 @@ parser.add_argument("--layers", default="-1,-2,-3,-4", type=str)
|
|||
parser.add_argument("--max_seq_length", default=128, type=int,
|
||||
help="The maximum total input sequence length after WordPiece tokenization. Sequences longer "
|
||||
"than this will be truncated, and sequences shorter than this will be padded.")
|
||||
parser.add_argument("--do_lower_case", default=True, type=bool,
|
||||
parser.add_argument("--do_lower_case", default=True, action='store_true',
|
||||
help="Whether to lower case the input text. Should be True for uncased "
|
||||
"models and False for cased models.")
|
||||
parser.add_argument("--batch_size", default=32, type=int, help="Batch size for predictions.")
|
||||
### BEGIN - TO DELETE EVENTUALLY --> NO SENSE IN PYTORCH ###
|
||||
parser.add_argument("--use_tpu", default=False, type=bool, help="Whether to use TPU or GPU/CPU.")
|
||||
parser.add_argument("--master", default=None, type=str, help="If using a TPU, the address of the master.")
|
||||
parser.add_argument("--num_tpu_cores", default=8, type=int,
|
||||
help="Only used if `use_tpu` is True. Total number of TPU cores to use.")
|
||||
### END - TO DELETE EVENTUALLY --> NO SENSE IN PYTORCH ###
|
||||
parser.add_argument("--use_one_hot_embeddings", default=False, type=bool,
|
||||
help="If True, tf.one_hot will be used for embedding lookups, otherwise tf.nn.embedding_lookup "
|
||||
"will be used. On TPUs, this should be True since it is much faster.")
|
||||
parser.add_argument("--local_rank",
|
||||
type=int,
|
||||
default=-1,
|
||||
help = "local_rank for distributed training on gpus")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
|
|
@ -83,107 +86,6 @@ class InputFeatures(object):
|
|||
self.input_type_ids = input_type_ids
|
||||
|
||||
|
||||
def input_fn_builder(features, seq_length):
|
||||
"""Creates an `input_fn` closure to be passed to TPUEstimator."""
|
||||
|
||||
all_unique_ids = []
|
||||
all_input_ids = []
|
||||
all_input_mask = []
|
||||
all_input_type_ids = []
|
||||
|
||||
for feature in features:
|
||||
all_unique_ids.append(feature.unique_id)
|
||||
all_input_ids.append(feature.input_ids)
|
||||
all_input_mask.append(feature.input_mask)
|
||||
all_input_type_ids.append(feature.input_type_ids)
|
||||
|
||||
def input_fn(params):
|
||||
"""The actual input function."""
|
||||
batch_size = params["batch_size"]
|
||||
|
||||
num_examples = len(features)
|
||||
|
||||
# This is for demo purposes and does NOT scale to large data sets. We do
|
||||
# not use Dataset.from_generator() because that uses tf.py_func which is
|
||||
# not TPU compatible. The right way to load data is with TFRecordReader.
|
||||
d = tf.data.Dataset.from_tensor_slices({
|
||||
"unique_ids":
|
||||
tf.constant(all_unique_ids, shape=[num_examples], dtype=tf.int32),
|
||||
"input_ids":
|
||||
tf.constant(
|
||||
all_input_ids, shape=[num_examples, seq_length],
|
||||
dtype=tf.int32),
|
||||
"input_mask":
|
||||
tf.constant(
|
||||
all_input_mask,
|
||||
shape=[num_examples, seq_length],
|
||||
dtype=tf.int32),
|
||||
"input_type_ids":
|
||||
tf.constant(
|
||||
all_input_type_ids,
|
||||
shape=[num_examples, seq_length],
|
||||
dtype=tf.int32),
|
||||
})
|
||||
|
||||
d = d.batch(batch_size=batch_size, drop_remainder=False)
|
||||
return d
|
||||
|
||||
return input_fn
|
||||
|
||||
|
||||
def model_fn_builder(bert_config, init_checkpoint, layer_indexes, use_tpu,
|
||||
use_one_hot_embeddings):
|
||||
"""Returns `model_fn` closure for TPUEstimator."""
|
||||
|
||||
def model_fn(features, labels, mode, params): # pylint: disable=unused-argument
|
||||
"""The `model_fn` for TPUEstimator."""
|
||||
|
||||
unique_ids = features["unique_ids"]
|
||||
input_ids = features["input_ids"]
|
||||
input_mask = features["input_mask"]
|
||||
input_type_ids = features["input_type_ids"]
|
||||
|
||||
model = modeling.BertModel(
|
||||
config=bert_config,
|
||||
is_training=False,
|
||||
input_ids=input_ids,
|
||||
input_mask=input_mask,
|
||||
token_type_ids=input_type_ids,
|
||||
use_one_hot_embeddings=use_one_hot_embeddings)
|
||||
|
||||
if mode != tf.estimator.ModeKeys.PREDICT:
|
||||
raise ValueError("Only PREDICT modes are supported: %s" % (mode))
|
||||
|
||||
tvars = tf.trainable_variables()
|
||||
scaffold_fn = None
|
||||
(assignment_map, _) = modeling.get_assigment_map_from_checkpoint(
|
||||
tvars, init_checkpoint)
|
||||
if use_tpu:
|
||||
|
||||
def tpu_scaffold():
|
||||
tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
|
||||
return tf.train.Scaffold()
|
||||
|
||||
scaffold_fn = tpu_scaffold
|
||||
else:
|
||||
tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
|
||||
|
||||
all_layers = model.get_all_encoder_layers()
|
||||
|
||||
predictions = {
|
||||
"unique_id": unique_ids,
|
||||
}
|
||||
|
||||
for (i, layer_index) in enumerate(layer_indexes):
|
||||
predictions["layer_output_%d" % i] = all_layers[layer_index]
|
||||
|
||||
output_spec = tf.contrib.tpu.TPUEstimatorSpec(
|
||||
mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)
|
||||
return output_spec
|
||||
|
||||
return model_fn
|
||||
|
||||
|
||||
def convert_examples_to_features(examples, seq_length, tokenizer):
|
||||
"""Loads a data file into a list of `InputBatch`s."""
|
||||
|
||||
|
|
@ -257,12 +159,12 @@ def convert_examples_to_features(examples, seq_length, tokenizer):
|
|||
assert len(input_type_ids) == seq_length
|
||||
|
||||
if ex_index < 5:
|
||||
tf.logging.info("*** Example ***")
|
||||
tf.logging.info("unique_id: %s" % (example.unique_id))
|
||||
tf.logging.info("tokens: %s" % " ".join([str(x) for x in tokens]))
|
||||
tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
|
||||
tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
|
||||
tf.logging.info(
|
||||
logger.info("*** Example ***")
|
||||
logger.info("unique_id: %s" % (example.unique_id))
|
||||
logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
|
||||
logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
|
||||
logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
|
||||
logger.info(
|
||||
"input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))
|
||||
|
||||
features.append(
|
||||
|
|
@ -296,7 +198,7 @@ def read_examples(input_file):
|
|||
"""Read a list of `InputExample`s from an input file."""
|
||||
examples = []
|
||||
unique_id = 0
|
||||
with tf.gfile.GFile(input_file, "r") as reader:
|
||||
with open(input_file, "r") as reader:
|
||||
while True:
|
||||
line = tokenization.convert_to_unicode(reader.readline())
|
||||
if not line:
|
||||
|
|
@ -317,22 +219,22 @@ def read_examples(input_file):
|
|||
|
||||
|
||||
def main():
|
||||
tf.logging.set_verbosity(tf.logging.INFO)
|
||||
if args.local_rank == -1 or args.no_cuda:
|
||||
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
|
||||
n_gpu = torch.cuda.device_count()
|
||||
else:
|
||||
device = torch.device("cuda", args.local_rank)
|
||||
n_gpu = 1
|
||||
# print("Initializing the distributed backend: NCCL")
|
||||
print("device", device, "n_gpu", n_gpu)
|
||||
|
||||
layer_indexes = [int(x) for x in args.layers.split(",")]
|
||||
|
||||
bert_config = modeling.BertConfig.from_json_file(args.bert_config_file)
|
||||
bert_config = BertConfig.from_json_file(args.bert_config_file)
|
||||
|
||||
tokenizer = tokenization.FullTokenizer(
|
||||
vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)
|
||||
|
||||
is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
|
||||
run_config = tf.contrib.tpu.RunConfig(
|
||||
master=args.master,
|
||||
tpu_config=tf.contrib.tpu.TPUConfig(
|
||||
num_shards=args.num_tpu_cores,
|
||||
per_host_input_for_training=is_per_host))
|
||||
|
||||
examples = read_examples(args.input_file)
|
||||
|
||||
features = convert_examples_to_features(
|
||||
|
|
@ -342,48 +244,55 @@ def main():
|
|||
for feature in features:
|
||||
unique_id_to_feature[feature.unique_id] = feature
|
||||
|
||||
model_fn = model_fn_builder(
|
||||
bert_config=bert_config,
|
||||
init_checkpoint=args.init_checkpoint,
|
||||
layer_indexes=layer_indexes,
|
||||
use_tpu=args.use_tpu,
|
||||
use_one_hot_embeddings=args.use_one_hot_embeddings)
|
||||
model = BertModel(bert_config)
|
||||
if args.init_checkpoint is not None:
|
||||
model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'))
|
||||
model.to(device)
|
||||
|
||||
# If TPU is not available, this will fall back to normal Estimator on CPU
|
||||
# or GPU.
|
||||
estimator = tf.contrib.tpu.TPUEstimator(
|
||||
use_tpu=args.use_tpu,
|
||||
model_fn=model_fn,
|
||||
config=run_config,
|
||||
predict_batch_size=args.batch_size)
|
||||
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
|
||||
all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
|
||||
all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
|
||||
all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
|
||||
|
||||
input_fn = input_fn_builder(
|
||||
features=features, seq_length=args.max_seq_length)
|
||||
eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
|
||||
if args.local_rank == -1:
|
||||
eval_sampler = SequentialSampler(eval_data)
|
||||
else:
|
||||
eval_sampler = DistributedSampler(eval_data)
|
||||
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)
|
||||
|
||||
with codecs.getwriter("utf-8")(tf.gfile.Open(args.output_file,
|
||||
"w")) as writer:
|
||||
for result in estimator.predict(input_fn, yield_single_examples=True):
|
||||
unique_id = int(result["unique_id"])
|
||||
feature = unique_id_to_feature[unique_id]
|
||||
output_json = collections.OrderedDict()
|
||||
output_json["linex_index"] = unique_id
|
||||
all_features = []
|
||||
for (i, token) in enumerate(feature.tokens):
|
||||
all_layers = []
|
||||
for (j, layer_index) in enumerate(layer_indexes):
|
||||
layer_output = result["layer_output_%d" % j]
|
||||
layers = collections.OrderedDict()
|
||||
layers["index"] = layer_index
|
||||
layers["values"] = [
|
||||
round(float(x), 6) for x in layer_output[i:(i + 1)].flat
|
||||
]
|
||||
all_layers.append(layers)
|
||||
features = collections.OrderedDict()
|
||||
features["token"] = token
|
||||
features["layers"] = all_layers
|
||||
all_features.append(features)
|
||||
output_json["features"] = all_features
|
||||
writer.write(json.dumps(output_json) + "\n")
|
||||
model.eval()
|
||||
with open(args.output_file, "w", encoding='utf-8') as writer:
|
||||
for input_ids, input_mask, segment_ids, example_indices in eval_dataloader:
|
||||
input_ids = input_ids.to(device)
|
||||
input_mask = input_mask.float().to(device)
|
||||
segment_ids = segment_ids.to(device)
|
||||
|
||||
all_encoder_layers, _ = model(input_ids, segment_ids, input_mask)
|
||||
|
||||
for enc_layers, example_index in zip(all_encoder_layers, example_indices):
|
||||
feature = features[example_index.item()]
|
||||
unique_id = int(feature.unique_id)
|
||||
# feature = unique_id_to_feature[unique_id]
|
||||
output_json = collections.OrderedDict()
|
||||
output_json["linex_index"] = unique_id
|
||||
all_features = []
|
||||
for (i, token) in enumerate(feature.tokens):
|
||||
all_layers = []
|
||||
for (j, layer_index) in enumerate(layer_indexes):
|
||||
layer_output = enc_layers[int(layer_index)].detach().cpu().numpy()
|
||||
layers = collections.OrderedDict()
|
||||
layers["index"] = layer_index
|
||||
layers["values"] = [
|
||||
round(float(x), 6) for x in layer_output[i:(i + 1)].flat
|
||||
]
|
||||
all_layers.append(layers)
|
||||
features = collections.OrderedDict()
|
||||
features["token"] = token
|
||||
features["layers"] = all_layers
|
||||
all_features.append(features)
|
||||
output_json["features"] = all_features
|
||||
writer.write(json.dumps(output_json) + "\n")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
|
|
@ -23,8 +23,6 @@ import logging
|
|||
import json
|
||||
import math
|
||||
import os
|
||||
import modeling
|
||||
import optimization
|
||||
import tokenization
|
||||
import six
|
||||
import argparse
|
||||
|
|
@ -57,7 +55,7 @@ parser.add_argument("--predict_file", default=None, type=str,
|
|||
help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
|
||||
parser.add_argument("--init_checkpoint", default=None, type=str,
|
||||
help="Initial checkpoint (usually from a pre-trained BERT model).")
|
||||
parser.add_argument("--do_lower_case", default=True, type=bool,
|
||||
parser.add_argument("--do_lower_case", default=True, action='store_true',
|
||||
help="Whether to lower case the input text. Should be True for uncased "
|
||||
"models and False for cased models.")
|
||||
parser.add_argument("--max_seq_length", default=384, type=int,
|
||||
|
|
@ -68,8 +66,8 @@ parser.add_argument("--doc_stride", default=128, type=int,
|
|||
parser.add_argument("--max_query_length", default=64, type=int,
|
||||
help="The maximum number of tokens for the question. Questions longer than this will "
|
||||
"be truncated to this length.")
|
||||
parser.add_argument("--do_train", default=False, type=bool, help="Whether to run training.")
|
||||
parser.add_argument("--do_predict", default=False, type=bool, help="Whether to run eval on the dev set.")
|
||||
parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.")
|
||||
parser.add_argument("--do_predict", default=False, action='store_true', help="Whether to run eval on the dev set.")
|
||||
parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
|
||||
parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.")
|
||||
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
|
||||
|
|
@ -87,19 +85,19 @@ parser.add_argument("--max_answer_length", default=30, type=int,
|
|||
"and end predictions are not conditioned on one another.")
|
||||
|
||||
### BEGIN - TO DELETE EVENTUALLY --> NO SENSE IN PYTORCH ###
|
||||
parser.add_argument("--use_tpu", default=False, type=bool, help="Whether to use TPU or GPU/CPU.")
|
||||
parser.add_argument("--tpu_name", default=None, type=str,
|
||||
help="The Cloud TPU to use for training. This should be either the name used when creating the "
|
||||
"Cloud TPU, or a grpc://ip.address.of.tpu:8470 url.")
|
||||
parser.add_argument("--tpu_zone", default=None, type=str,
|
||||
help="[Optional] GCE zone where the Cloud TPU is located in. If not specified, we will attempt "
|
||||
"to automatically detect the GCE project from metadata.")
|
||||
parser.add_argument("--gcp_project", default=None, type=str,
|
||||
help="[Optional] Project name for the Cloud TPU-enabled project. If not specified, we will attempt "
|
||||
"to automatically detect the GCE project from metadata.")
|
||||
parser.add_argument("--master", default=None, type=str, help="[Optional] TensorFlow master URL.")
|
||||
parser.add_argument("--num_tpu_cores", default=8, type=int, help="Only used if `use_tpu` is True. "
|
||||
"Total number of TPU cores to use.")
|
||||
# parser.add_argument("--use_tpu", default=False, type=bool, help="Whether to use TPU or GPU/CPU.")
|
||||
# parser.add_argument("--tpu_name", default=None, type=str,
|
||||
# help="The Cloud TPU to use for training. This should be either the name used when creating the "
|
||||
# "Cloud TPU, or a grpc://ip.address.of.tpu:8470 url.")
|
||||
# parser.add_argument("--tpu_zone", default=None, type=str,
|
||||
# help="[Optional] GCE zone where the Cloud TPU is located in. If not specified, we will attempt "
|
||||
# "to automatically detect the GCE project from metadata.")
|
||||
# parser.add_argument("--gcp_project", default=None, type=str,
|
||||
# help="[Optional] Project name for the Cloud TPU-enabled project. If not specified, we will attempt "
|
||||
# "to automatically detect the GCE project from metadata.")
|
||||
# parser.add_argument("--master", default=None, type=str, help="[Optional] TensorFlow master URL.")
|
||||
# parser.add_argument("--num_tpu_cores", default=8, type=int, help="Only used if `use_tpu` is True. "
|
||||
# "Total number of TPU cores to use.")
|
||||
### END - TO DELETE EVENTUALLY --> NO SENSE IN PYTORCH ###
|
||||
|
||||
parser.add_argument("--verbose_logging", default=False, type=bool,
|
||||
|
|
@ -864,7 +862,7 @@ def main():
|
|||
eval_sampler = SequentialSampler(eval_data)
|
||||
else:
|
||||
eval_sampler = DistributedSampler(eval_data)
|
||||
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
|
||||
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)
|
||||
|
||||
model.eval()
|
||||
all_results = []
|
||||
|
|
|
|||
Loading…
Reference in a new issue