diff --git a/docs/source/_static/css/huggingface.css b/docs/source/_static/css/huggingface.css
index 4adf8f753..808f8005f 100644
--- a/docs/source/_static/css/huggingface.css
+++ b/docs/source/_static/css/huggingface.css
@@ -1,3 +1,25 @@
+/* Our DOM objects */
+
+.framework-selector {
+    display: flex;
+    flex-direction: row;
+    justify-content: flex-end;
+}
+
+.framework-selector > button {
+    background-color: white;
+    color: #6670FF;
+    border: 1px solid #6670FF;
+    padding: 5px;
+}
+
+.framework-selector > button.selected {
+    background-color: #6670FF;
+    color: white;
+    border: 1px solid #6670FF;
+    padding: 5px;
+}
+
 /* The literal code blocks */
 .rst-content tt.literal, .rst-content tt.literal, .rst-content code.literal {
     color: #6670FF;
diff --git a/docs/source/_static/js/custom.js b/docs/source/_static/js/custom.js
index 04cdfc1de..ac9388531 100644
--- a/docs/source/_static/js/custom.js
+++ b/docs/source/_static/js/custom.js
@@ -68,6 +68,74 @@ function addHfMenu() {
     document.body.insertAdjacentHTML('afterbegin', div);
 }
 
+function platformToggle() {
+    const codeBlocks = Array.from(document.getElementsByClassName("highlight"));
+    const pytorchIdentifier = "## PYTORCH CODE";
+    const tensorflowIdentifier = "## TENSORFLOW CODE";
+    // The identifiers as they appear in the highlighted HTML: the syntax
+    // highlighter wraps comments in <span class="c1">...</span> elements.
+    const pytorchSpanIdentifier = `<span class="c1">${pytorchIdentifier}</span>`;
+    const tensorflowSpanIdentifier = `<span class="c1">${tensorflowIdentifier}</span>`;
+
+    const getFrameworkSpans = filteredCodeBlock => {
+        const spans = filteredCodeBlock.element.innerHTML;
+        const pytorchSpanPosition = spans.indexOf(pytorchSpanIdentifier);
+        const tensorflowSpanPosition = spans.indexOf(tensorflowSpanIdentifier);
+
+        let pytorchSpans;
+        let tensorflowSpans;
+
+        if (pytorchSpanPosition < tensorflowSpanPosition) {
+            pytorchSpans = spans.slice(pytorchSpanPosition + pytorchSpanIdentifier.length + 1, tensorflowSpanPosition);
+            tensorflowSpans = spans.slice(tensorflowSpanPosition + tensorflowSpanIdentifier.length + 1, spans.length);
+        } else {
+            tensorflowSpans = spans.slice(tensorflowSpanPosition + tensorflowSpanIdentifier.length + 1, pytorchSpanPosition);
+            pytorchSpans = spans.slice(pytorchSpanPosition + pytorchSpanIdentifier.length + 1, spans.length);
+        }
+
+        return {
+            ...filteredCodeBlock,
+            pytorchSample: pytorchSpans,
+            tensorflowSample: tensorflowSpans
+        }
+    };
+
+    const createFrameworkButtons = sample => {
+        const pytorchButton = document.createElement("button");
+        pytorchButton.innerText = "PyTorch";
+
+        const tensorflowButton = document.createElement("button");
+        tensorflowButton.innerText = "TensorFlow";
+
+        const selectorDiv = document.createElement("div");
+        selectorDiv.classList.add("framework-selector");
+        selectorDiv.appendChild(pytorchButton);
+        selectorDiv.appendChild(tensorflowButton);
+        sample.element.parentElement.prepend(selectorDiv);
+
+        // Init on PyTorch
+        sample.element.innerHTML = sample.pytorchSample;
+        pytorchButton.classList.add("selected");
+        tensorflowButton.classList.remove("selected");
+
+        pytorchButton.addEventListener("click", () => {
+            sample.element.innerHTML = sample.pytorchSample;
+            pytorchButton.classList.add("selected");
+            tensorflowButton.classList.remove("selected");
+        });
+        tensorflowButton.addEventListener("click", () => {
+            sample.element.innerHTML = sample.tensorflowSample;
+            tensorflowButton.classList.add("selected");
+            pytorchButton.classList.remove("selected");
+        });
+    };
+
+    codeBlocks
+        .map(element => {return {element: element.firstChild, innerText: element.innerText}})
+        .filter(codeBlock => codeBlock.innerText.includes(pytorchIdentifier) && codeBlock.innerText.includes(tensorflowIdentifier))
+        .map(getFrameworkSpans)
+        .forEach(createFrameworkButtons);
+}
+
+
 /*!
  * github-buttons v2.2.10
  * (c) 2019 なつき
@@ -85,6 +153,7 @@ function onLoad() {
     addGithubButton();
     parseGithubButtons();
     addHfMenu();
+    platformToggle();
 }
 
 window.addEventListener("load", onLoad);
diff --git a/docs/source/_static/js/huggingface_logo.svg b/docs/source/_static/js/huggingface_logo.svg
index 84974866c..79a9e5d8a 100644
--- a/docs/source/_static/js/huggingface_logo.svg
+++ b/docs/source/_static/js/huggingface_logo.svg
@@ -1,47 +1 @@
-[47 lines of SVG markup: title "icon", comment "Created with Sketch."]
\ No newline at end of file
+[single-line SVG markup]
\ No newline at end of file
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 18d52e9f2..763b3ac70 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -20,7 +20,7 @@ sys.path.insert(0, os.path.abspath('../../src'))
 # -- Project information -----------------------------------------------------
 
 project = u'transformers'
-copyright = u'2019, huggingface'
+copyright = u'2020, huggingface'
 author = u'huggingface'
 
 # The short X.Y version
@@ -105,6 +105,12 @@ html_static_path = ['_static']
 #
 # html_sidebars = {}
 
+# This must be the name of an image file (path relative to the configuration
+# directory) that is the favicon of the docs. Modern browsers use this as
+# the icon for tabs, windows and bookmarks. It should be a Windows-style
+# icon file (.ico).
+html_favicon = 'favicon.ico'
+
 
 # -- Options for HTMLHelp output ---------------------------------------------
 
diff --git a/docs/source/favicon.ico b/docs/source/favicon.ico
new file mode 100644
index 000000000..424101de7
Binary files /dev/null and b/docs/source/favicon.ico differ
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 215e6cba6..5c593eacf 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -61,6 +61,7 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train
     quickstart
     glossary
     pretrained_models
+    usage
     model_sharing
     examples
     notebooks
diff --git a/docs/source/usage.rst b/docs/source/usage.rst
new file mode 100644
index 000000000..8fb7a4472
--- /dev/null
+++ b/docs/source/usage.rst
@@ -0,0 +1,597 @@
+Usage
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This page shows the most frequent use-cases of the library. The available models allow for many different
+configurations and great versatility in use-cases. The simplest ones are presented here, showcasing usage for
+tasks such as question answering, sequence classification, named entity recognition and others.
+
+These examples leverage auto-models, which are classes that instantiate a model according to a given checkpoint,
+automatically selecting the correct model architecture. Please check the :class:`~transformers.AutoModel`
+documentation for more information. Feel free to modify the code and adapt it to your specific use-case.
+
+In order for a model to perform well on a task, it must be loaded from a checkpoint corresponding to that task. These
+checkpoints are usually pre-trained on a large corpus of data and fine-tuned on a specific task. This means the
+following:
+
+- Not all models were fine-tuned on all tasks. If you want to fine-tune a model on a specific task, you can leverage
+  one of the `run_$TASK.py` scripts in the
+  `examples <https://github.com/huggingface/transformers/tree/master/examples>`_ directory.
+- Fine-tuned models were fine-tuned on a specific dataset. This dataset may or may not overlap with your use-case
+  and domain. As mentioned previously, you may leverage the
+  `examples <https://github.com/huggingface/transformers/tree/master/examples>`_ scripts to fine-tune your model, or
+  you may create your own training script.
+
+In order to do inference on a task, the library makes several mechanisms available:
+
+- Pipelines: very easy-to-use abstractions, which require as little as two lines of code.
+- Using a model directly with a tokenizer (PyTorch/TensorFlow): the full inference, using the model directly. Less
+  abstraction, but much more powerful.
+
+Both approaches are showcased here.
+
+.. note::
+
+    All tasks presented here leverage pre-trained checkpoints that were fine-tuned on specific tasks. Loading a
+    checkpoint that was not fine-tuned on a specific task would load only the base transformer layers and not the
+    additional head that is used for the task, initializing the weights of that head randomly.
+
+    This would produce random output.
+
+Sequence Classification
+--------------------------
+
+Sequence classification is the task of classifying sequences according to a given number of classes. An example
+of sequence classification is the GLUE dataset, which is entirely based on that task. If you would like to fine-tune
+a model on a GLUE sequence classification task, you may leverage the
+`run_glue.py <https://github.com/huggingface/transformers/blob/master/examples/run_glue.py>`_ or
+`run_tf_glue.py <https://github.com/huggingface/transformers/blob/master/examples/run_tf_glue.py>`_ scripts.
+
+Here is an example of using the pipelines to do sentiment analysis: identifying if a sequence is positive or negative.
+It leverages a fine-tuned model on SST-2, which is a GLUE task.
+
+::
+
+    from transformers import pipeline
+
+    nlp = pipeline("sentiment-analysis")
+
+    print(nlp("I hate you"))
+    print(nlp("I love you"))
+
+This returns a label ("POSITIVE" or "NEGATIVE") alongside a score, as follows:
+
+::
+
+    [{'label': 'NEGATIVE', 'score': 0.9991129}]
+    [{'label': 'POSITIVE', 'score': 0.99986565}]
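+
+The pipeline picks a default checkpoint for the task, but the checkpoint may also be selected explicitly. As a quick
+sketch, the following selects a DistilBERT checkpoint fine-tuned on SST-2 (the checkpoint name here is only an
+illustration; any sequence classification checkpoint works):
+
+::
+
+    from transformers import pipeline
+
+    # Explicitly select the checkpoint backing the pipeline (illustrative name).
+    nlp = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
+
+    print(nlp("I love you"))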
+
+Here is an example of doing sequence classification using a model to determine if two sequences are paraphrases
+of each other. The process is the following:
+
+- Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a BERT model and is loaded
+  with the weights stored in the checkpoint.
+- Build a sequence from the two sentences, with the correct model-specific separators, token type ids
+  and attention masks (:func:`~transformers.PreTrainedTokenizer.encode` and
+  :func:`~transformers.PreTrainedTokenizer.encode_plus` take care of this).
+- Pass this sequence through the model so that it is classified in one of the two available classes: 0
+  (not a paraphrase) and 1 (is a paraphrase).
+- Compute the softmax of the result to get probabilities over the classes.
+- Print the results.
+
+::
+
+    ## PYTORCH CODE
+    from transformers import AutoTokenizer, AutoModelForSequenceClassification
+    import torch
+
+    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
+    model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")
+
+    classes = ["not paraphrase", "is paraphrase"]
+
+    sequence_0 = "The company HuggingFace is based in New York City"
+    sequence_1 = "Apples are especially bad for your health"
+    sequence_2 = "HuggingFace's headquarters are situated in Manhattan"
+
+    paraphrase = tokenizer.encode_plus(sequence_0, sequence_2, return_tensors="pt")
+    not_paraphrase = tokenizer.encode_plus(sequence_0, sequence_1, return_tensors="pt")
+
+    paraphrase_classification_logits = model(**paraphrase)[0]
+    not_paraphrase_classification_logits = model(**not_paraphrase)[0]
+
+    paraphrase_results = torch.softmax(paraphrase_classification_logits, dim=1).tolist()[0]
+    not_paraphrase_results = torch.softmax(not_paraphrase_classification_logits, dim=1).tolist()[0]
+
+    print("Should be paraphrase")
+    for i in range(len(classes)):
+        print(f"{classes[i]}: {round(paraphrase_results[i] * 100)}%")
+
+    print("\nShould not be paraphrase")
+    for i in range(len(classes)):
+        print(f"{classes[i]}: {round(not_paraphrase_results[i] * 100)}%")
+    ## TENSORFLOW CODE
+    from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
+    import tensorflow as tf
+
+    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
+    model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")
+
+    classes = ["not paraphrase", "is paraphrase"]
+
+    sequence_0 = "The company HuggingFace is based in New York City"
+    sequence_1 = "Apples are especially bad for your health"
+    sequence_2 = "HuggingFace's headquarters are situated in Manhattan"
+
+    paraphrase = tokenizer.encode_plus(sequence_0, sequence_2, return_tensors="tf")
+    not_paraphrase = tokenizer.encode_plus(sequence_0, sequence_1, return_tensors="tf")
+
+    paraphrase_classification_logits = model(paraphrase)[0]
+    not_paraphrase_classification_logits = model(not_paraphrase)[0]
+
+    paraphrase_results = tf.nn.softmax(paraphrase_classification_logits, axis=1).numpy()[0]
+    not_paraphrase_results = tf.nn.softmax(not_paraphrase_classification_logits, axis=1).numpy()[0]
+
+    print("Should be paraphrase")
+    for i in range(len(classes)):
+        print(f"{classes[i]}: {round(paraphrase_results[i] * 100)}%")
+
+    print("\nShould not be paraphrase")
+    for i in range(len(classes)):
+        print(f"{classes[i]}: {round(not_paraphrase_results[i] * 100)}%")
+
+This outputs the following results:
+
+::
+
+    Should be paraphrase
+    not paraphrase: 10%
+    is paraphrase: 90%
+
+    Should not be paraphrase
+    not paraphrase: 94%
+    is paraphrase: 6%
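+
+Note that :func:`~transformers.PreTrainedTokenizer.encode_plus`, as used above, returns a dictionary holding
+everything the model needs. As a quick sketch of what it builds (the exact entries can vary with the tokenizer and
+library version):
+
+::
+
+    # encode_plus adds the model-specific special tokens and builds the token
+    # type ids and attention mask alongside the input ids.
+    inputs = tokenizer.encode_plus(sequence_0, sequence_2)
+    print(inputs.keys())
+    # e.g. dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])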
+
+Extractive Question Answering
+----------------------------------------------------
+
+Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a
+question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune
+a model on a SQuAD task, you may leverage the `run_squad.py` script.
+
+Here is an example of using the pipelines to do question answering: extracting an answer from a text given a question.
+It leverages a fine-tuned model on SQuAD.
+
+::
+
+    from transformers import pipeline
+
+    nlp = pipeline("question-answering")
+
+    context = r"""
+    Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a
+    question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune
+    a model on a SQuAD task, you may leverage the `run_squad.py`.
+    """
+
+    print(nlp(question="What is extractive question answering?", context=context))
+    print(nlp(question="What is a good example of a question answering dataset?", context=context))
+
+This returns an answer extracted from the text, a confidence score, and "start" and "end" values, which are the
+positions of the extracted answer in the text:
+
+::
+
+    {'score': 0.622232091629833, 'start': 34, 'end': 96, 'answer': 'the task of extracting an answer from a text given a question.'}
+    {'score': 0.5115299158662765, 'start': 147, 'end': 161, 'answer': 'SQuAD dataset,'}
+
+
+Here is an example of question answering using a model and a tokenizer. The process is the following:
+
+- Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a BERT model and is loaded
+  with the weights stored in the checkpoint.
+- Define a text and a few questions.
+- Iterate over the questions and build a sequence from the text and the current question, with the correct
+  model-specific separators, token type ids and attention masks.
+- Pass this sequence through the model. This outputs a range of scores across the entire sequence tokens (question and
+  text), for both the start and end positions.
+- Compute the softmax of the result to get probabilities over the tokens.
+- Fetch the tokens from the identified start and end values, and convert those tokens to a string.
+- Print the results.
+
+::
+
+    ## PYTORCH CODE
+    from transformers import AutoTokenizer, AutoModelForQuestionAnswering
+    import torch
+
+    tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
+    model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
+
+    text = r"""
+    🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
+    architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
+    Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
+    TensorFlow 2.0 and PyTorch.
+    """
+ """ + + questions = [ + "How many pretrained models are available in Transformers?", + "What does Transformers provide?", + "Transformers provides interoperability between which frameworks?", + ] + + for question in questions: + inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="pt") + input_ids = inputs["input_ids"].tolist()[0] + + text_tokens = tokenizer.convert_ids_to_tokens(input_ids) + answer_start_scores, answer_end_scores = model(**inputs) + + answer_start = torch.argmax( + answer_start_scores + ) # Get the most likely beginning of answer with the argmax of the score + answer_end = torch.argmax(answer_end_scores) + 1 # Get the most likely end of answer with the argmax of the score + + answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end])) + + print(f"Question: {question}") + print(f"Answer: {answer}\n") + ## TENSORFLOW CODE + from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering + import tensorflow as tf + + tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad") + model = TFAutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad") + + text = r""" + 🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose + architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural + Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between + TensorFlow 2.0 and PyTorch. + """ + + questions = [ + "How many pretrained models are available in Transformers?", + "What does Transformers provide?", + "Transformers provides interoperability between which frameworks?", + ] + + for question in questions: + inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="tf") + input_ids = inputs["input_ids"].numpy()[0] + + text_tokens = tokenizer.convert_ids_to_tokens(input_ids) + answer_start_scores, answer_end_scores = model(inputs) + + answer_start = tf.argmax( + answer_start_scores, axis=1 + ).numpy()[0] # Get the most likely beginning of answer with the argmax of the score + answer_end = ( + tf.argmax(answer_end_scores, axis=1) + 1 + ).numpy()[0] # Get the most likely end of answer with the argmax of the score + answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end])) + + print(f"Question: {question}") + print(f"Answer: {answer}\n") + +This outputs the questions followed by the predicted answers: + +:: + + Question: How many pretrained models are available in Transformers? + Answer: over 32 + + + Question: What does Transformers provide? + Answer: general - purpose architectures + + Question: Transformers provides interoperability between which frameworks? + Answer: tensorflow 2 . 0 and pytorch + + + +Language Modeling +---------------------------------------------------- + +Language modeling is the task of fitting a model to a corpus, which can be domain specific. All popular transformer +based models are trained using a variant of language modeling, e.g. BERT with masked language modeling, GPT-2 with +causal language modeling. 
+
+Language modeling can be useful outside of pre-training as well, for example to shift the model distribution to be
+domain-specific: using a language model trained over a very large corpus, and then fine-tuning it on a news dataset
+or on scientific papers, e.g. `LysandreJik/arxiv-nlp <https://huggingface.co/lysandre/arxiv-nlp>`__.
+
+Masked Language Modeling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Masked language modeling is the task of masking tokens in a sequence with a masking token, and prompting the model to
+fill that mask with an appropriate token. This allows the model to attend to both the right context (tokens on the
+right of the mask) and the left context (tokens on the left of the mask). Such training creates a strong basis
+for downstream tasks requiring bi-directional context, such as SQuAD (question answering,
+see `Lewis, Liu, Goyal et al. <https://arxiv.org/abs/1910.13461>`__, part 4.2).
+
+Here is an example of using pipelines to replace a mask from a sequence:
+
+::
+
+    from transformers import pipeline
+
+    nlp = pipeline("fill-mask")
+    print(nlp(f"HuggingFace is creating a {nlp.tokenizer.mask_token} that the community uses to solve NLP tasks."))
+
+This outputs the sequences with the mask filled, the confidence score, as well as the token id in the tokenizer
+vocabulary:
+
+::
+
+    [
+        {'sequence': '<s> HuggingFace is creating a tool that the community uses to solve NLP tasks.</s>', 'score': 0.15627853572368622, 'token': 3944},
+        {'sequence': '<s> HuggingFace is creating a framework that the community uses to solve NLP tasks.</s>', 'score': 0.11690319329500198, 'token': 7208},
+        {'sequence': '<s> HuggingFace is creating a library that the community uses to solve NLP tasks.</s>', 'score': 0.058063216507434845, 'token': 5560},
+        {'sequence': '<s> HuggingFace is creating a database that the community uses to solve NLP tasks.</s>', 'score': 0.04211743175983429, 'token': 8503},
+        {'sequence': '<s> HuggingFace is creating a prototype that the community uses to solve NLP tasks.</s>', 'score': 0.024718601256608963, 'token': 17715}
+    ]
+
+Here is an example of doing masked language modeling using a model and a tokenizer. The process is the following:
+
+- Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a DistilBERT model and is
+  loaded with the weights stored in the checkpoint.
+- Define a sequence with a masked token, placing the :obj:`tokenizer.mask_token` instead of a word.
+- Encode that sequence into IDs and find the position of the masked token in that list of IDs.
+- Retrieve the predictions at the index of the mask token: this tensor has the same size as the vocabulary, and the
+  values are the scores attributed to each token. The model gives a higher score to tokens it deems probable in that
+  context.
+- Retrieve the top 5 tokens using the PyTorch :obj:`topk` or TensorFlow :obj:`top_k` methods.
+- Replace the mask token with the tokens and print the results.
+
+::
+
+    ## PYTORCH CODE
+    from transformers import AutoModelWithLMHead, AutoTokenizer
+    import torch
+
+    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
+    model = AutoModelWithLMHead.from_pretrained("distilbert-base-cased")
+
+    sequence = f"Distilled models are smaller than the models they mimic. Using them instead of the large versions would help {tokenizer.mask_token} our carbon footprint."
+
+    input = tokenizer.encode(sequence, return_tensors="pt")
+    mask_token_index = torch.where(input == tokenizer.mask_token_id)[1]
+
+    token_logits = model(input)[0]
+    mask_token_logits = token_logits[0, mask_token_index, :]
+
+    top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
+
+    for token in top_5_tokens:
+        print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])))
+    ## TENSORFLOW CODE
+    from transformers import TFAutoModelWithLMHead, AutoTokenizer
+    import tensorflow as tf
+
+    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
+    model = TFAutoModelWithLMHead.from_pretrained("distilbert-base-cased")
+
+    sequence = f"Distilled models are smaller than the models they mimic. Using them instead of the large versions would help {tokenizer.mask_token} our carbon footprint."
+
+    input = tokenizer.encode(sequence, return_tensors="tf")
+    mask_token_index = tf.where(input == tokenizer.mask_token_id)[0, 1]
+
+    token_logits = model(input)[0]
+    mask_token_logits = token_logits[0, mask_token_index, :]
+
+    top_5_tokens = tf.math.top_k(mask_token_logits, 5).indices.numpy()
+
+    for token in top_5_tokens:
+        print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])))
+
+This prints five sequences, with the top 5 tokens predicted by the model:
+
+::
+
+    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help reduce our carbon footprint.
+    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help increase our carbon footprint.
+    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help decrease our carbon footprint.
+    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help offset our carbon footprint.
+    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help improve our carbon footprint.
+
+
+Causal Language Modeling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Causal language modeling is the task of predicting the token following a sequence of tokens. In this situation, the
+model only attends to the left context (the tokens to the left of the token being predicted). Such training is
+particularly interesting for generation tasks.
+
+There is currently no pipeline to do causal language modeling/generation.
+
+Here is an example using the tokenizer and model, leveraging the :func:`~transformers.PreTrainedModel.generate` method
+to generate the tokens following the initial sequence in PyTorch, and creating a simple loop in TensorFlow.
+
+::
+
+    ## PYTORCH CODE
+    from transformers import AutoModelWithLMHead, AutoTokenizer
+
+    tokenizer = AutoTokenizer.from_pretrained("gpt2")
+    model = AutoModelWithLMHead.from_pretrained("gpt2")
+
+    sequence = "Hugging Face is based in DUMBO, New York City, and is"
+
+    input = tokenizer.encode(sequence, return_tensors="pt")
+    generated = model.generate(input, max_length=50)
+
+    resulting_string = tokenizer.decode(generated.tolist()[0])
+    print(resulting_string)
+    ## TENSORFLOW CODE
+    from transformers import TFAutoModelWithLMHead, AutoTokenizer
+    import tensorflow as tf
+
+    tokenizer = AutoTokenizer.from_pretrained("gpt2")
+    model = TFAutoModelWithLMHead.from_pretrained("gpt2")
+
+    sequence = "Hugging Face is based in DUMBO, New York City, and is"
+    generated = tokenizer.encode(sequence)
+
+    for i in range(50):
+        predictions = model(tf.constant([generated]))[0]
+        token = tf.argmax(predictions[0], axis=1)[-1].numpy()
+        generated += [token]
+
+    resulting_string = tokenizer.decode(generated)
+    print(resulting_string)
+
+
+This outputs a (hopefully) coherent string from the original sequence, as the
+:func:`~transformers.PreTrainedModel.generate` method samples from a top_p/top_k distribution:
+
+::
+
+    Hugging Face is based in DUMBO, New York City, and is a live-action TV series based on the novel by John
+    Carpenter, and its producers, David Kustlin and Steve Pichar. The film is directed by!
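+
+The sampling behaviour of :func:`~transformers.PreTrainedModel.generate` can be tuned through its keyword arguments.
+As a sketch, reusing the :obj:`input` from the PyTorch example above (the values shown are illustrative, not
+recommendations):
+
+::
+
+    # Sample instead of decoding greedily, keeping only the 50 most likely tokens
+    # (top_k) or the smallest set of tokens whose cumulative probability exceeds
+    # 0.95 (top_p) at each step.
+    generated = model.generate(input, max_length=50, do_sample=True, top_k=50, top_p=0.95)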
+
+
+Named Entity Recognition
+----------------------------------------------------
+
+Named Entity Recognition (NER) is the task of classifying tokens according to a class, for example identifying a
+token as a person, an organisation or a location.
+An example of a named entity recognition dataset is the CoNLL-2003 dataset, which is entirely based on that task.
+If you would like to fine-tune a model on an NER task, you may leverage the `ner/run_ner.py` (PyTorch),
+`ner/run_pl_ner.py` (leveraging pytorch-lightning) or the `ner/run_tf_ner.py` (TensorFlow) scripts.
+
+Here is an example of using the pipelines to do named entity recognition, trying to identify tokens as belonging to
+one of 9 classes:
+
+- O, Outside of a named entity
+- B-MISC, Beginning of a miscellaneous entity right after another miscellaneous entity
+- I-MISC, Miscellaneous entity
+- B-PER, Beginning of a person's name right after another person's name
+- I-PER, Person's name
+- B-ORG, Beginning of an organisation right after another organisation
+- I-ORG, Organisation
+- B-LOC, Beginning of a location right after another location
+- I-LOC, Location
+
+It leverages a fine-tuned model on CoNLL-2003, fine-tuned by `@stefan-it <https://github.com/stefan-it>`__ from
+`dbmdz <https://github.com/dbmdz>`__.
+
+::
+
+    from transformers import pipeline
+
+    nlp = pipeline("ner")
+
+    sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very " \
+               "close to the Manhattan Bridge which is visible from the window."
+
+    print(nlp(sequence))
+
+This outputs a list of all words that have been identified as an entity from the 9 classes defined above. Here are
+the expected results:
+
+::
+
+    [
+        {'word': 'Hu', 'score': 0.9995632767677307, 'entity': 'I-ORG'},
+        {'word': '##gging', 'score': 0.9915938973426819, 'entity': 'I-ORG'},
+        {'word': 'Face', 'score': 0.9982671737670898, 'entity': 'I-ORG'},
+        {'word': 'Inc', 'score': 0.9994403719902039, 'entity': 'I-ORG'},
+        {'word': 'New', 'score': 0.9994346499443054, 'entity': 'I-LOC'},
+        {'word': 'York', 'score': 0.9993270635604858, 'entity': 'I-LOC'},
+        {'word': 'City', 'score': 0.9993864893913269, 'entity': 'I-LOC'},
+        {'word': 'D', 'score': 0.9825621843338013, 'entity': 'I-LOC'},
+        {'word': '##UM', 'score': 0.936983048915863, 'entity': 'I-LOC'},
+        {'word': '##BO', 'score': 0.8987102508544922, 'entity': 'I-LOC'},
+        {'word': 'Manhattan', 'score': 0.9758241176605225, 'entity': 'I-LOC'},
+        {'word': 'Bridge', 'score': 0.990249514579773, 'entity': 'I-LOC'}
+    ]
+
+Note how the words "Hugging Face" have been identified as an organisation, and "New York City", "DUMBO" and
+"Manhattan Bridge" have been identified as locations.
+
+Here is an example of doing named entity recognition using a model and a tokenizer. The process is the following:
+
+- Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a BERT model and is loaded
+  with the weights stored in the checkpoint.
+- Define the label list with which the model was trained.
+- Define a sequence with known entities, such as "Hugging Face" as an organisation and "New York City" as a location.
+- Split words into tokens so that they can be mapped to the predictions. We use a small hack by first fully encoding
+  and decoding the sequence, so that we're left with a string that contains the special tokens.
+- Encode that sequence into IDs (special tokens are added automatically).
+- Retrieve the predictions by passing the input to the model and getting the first output. This results in a
+  distribution over the 9 possible classes for each token. We take the argmax to retrieve the most likely class
+  for each token.
+- Zip together each token with its prediction and print it.
+
+::
+
+    ## PYTORCH CODE
+    from transformers import AutoModelForTokenClassification, AutoTokenizer
+    import torch
+
+    model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
+    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+
+    label_list = [
+        "O",       # Outside of a named entity
+        "B-MISC",  # Beginning of a miscellaneous entity right after another miscellaneous entity
+        "I-MISC",  # Miscellaneous entity
+        "B-PER",   # Beginning of a person's name right after another person's name
+        "I-PER",   # Person's name
+        "B-ORG",   # Beginning of an organisation right after another organisation
+        "I-ORG",   # Organisation
+        "B-LOC",   # Beginning of a location right after another location
+        "I-LOC"    # Location
+    ]
+
+    sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very " \
+               "close to the Manhattan Bridge."
+
+    # Bit of a hack to get the tokens with the special tokens
+    tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sequence)))
+    inputs = tokenizer.encode(sequence, return_tensors="pt")
+
+    outputs = model(inputs)[0]
+    predictions = torch.argmax(outputs, dim=2)
+
+    print([(token, label_list[prediction]) for token, prediction in zip(tokens, predictions[0].tolist())])
+    ## TENSORFLOW CODE
+    from transformers import TFAutoModelForTokenClassification, AutoTokenizer
+    import tensorflow as tf
+
+    model = TFAutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
+    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+
+    label_list = [
+        "O",       # Outside of a named entity
+        "B-MISC",  # Beginning of a miscellaneous entity right after another miscellaneous entity
+        "I-MISC",  # Miscellaneous entity
+        "B-PER",   # Beginning of a person's name right after another person's name
+        "I-PER",   # Person's name
+        "B-ORG",   # Beginning of an organisation right after another organisation
+        "I-ORG",   # Organisation
+        "B-LOC",   # Beginning of a location right after another location
+        "I-LOC"    # Location
+    ]
+
+    sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very " \
+               "close to the Manhattan Bridge."
+
+    # Bit of a hack to get the tokens with the special tokens
+    tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sequence)))
+    inputs = tokenizer.encode(sequence, return_tensors="tf")
+
+    outputs = model(inputs)[0]
+    predictions = tf.argmax(outputs, axis=2)
+
+    print([(token, label_list[prediction]) for token, prediction in zip(tokens, predictions[0].numpy())])
+
+This outputs a list of each token mapped to its prediction. Unlike the pipeline, here every token has a prediction,
+as we didn't remove the "O" class, which means that no particular entity was found on that token. The following
+array should be the output:
+
+::
+
+    [('[CLS]', 'O'), ('Hu', 'I-ORG'), ('##gging', 'I-ORG'), ('Face', 'I-ORG'), ('Inc', 'I-ORG'), ('.', 'O'), ('is', 'O'), ('a', 'O'), ('company', 'O'), ('based', 'O'), ('in', 'O'), ('New', 'I-LOC'), ('York', 'I-LOC'), ('City', 'I-LOC'), ('.', 'O'), ('Its', 'O'), ('headquarters', 'O'), ('are', 'O'), ('in', 'O'), ('D', 'I-LOC'), ('##UM', 'I-LOC'), ('##BO', 'I-LOC'), (',', 'O'), ('therefore', 'O'), ('very', 'O'), ('close', 'O'), ('to', 'O'), ('the', 'O'), ('Manhattan', 'I-LOC'), ('Bridge', 'I-LOC'), ('.', 'O'), ('[SEP]', 'O')]
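+
+To recover only the detected entities, similar to what the pipeline returns, one can simply filter out the "O"
+class. A minimal sketch (PyTorch variant shown):
+
+::
+
+    # Keep only the tokens that were classified as part of a named entity.
+    entities = [
+        (token, label_list[prediction])
+        for token, prediction in zip(tokens, predictions[0].tolist())
+        if label_list[prediction] != "O"
+    ]
+    print(entities)
+    # [('Hu', 'I-ORG'), ('##gging', 'I-ORG'), ('Face', 'I-ORG'), ('Inc', 'I-ORG'), ...]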