diff --git a/model_cards/huggingface/CodeBERTa-language-id/README.md b/model_cards/huggingface/CodeBERTa-language-id/README.md
new file mode 100644
index 000000000..6a90dad0c
--- /dev/null
+++ b/model_cards/huggingface/CodeBERTa-language-id/README.md
@@ -0,0 +1,298 @@
+---
+language: code
+thumbnail: https://hf-dinosaur.huggingface.co/CodeBERTa/CodeBERTa.png
+---
+
+# CodeBERTa-language-id: The World’s fanciest programming language identification algo 🤯
+
+
+To demonstrate the usefulness of our CodeBERTa pretrained model on downstream tasks beyond language modeling, we fine-tune the [`CodeBERTa-small-v1`](https://huggingface.co/huggingface/CodeBERTa-small-v1) checkpoint on the task of classifying a sample of code into the programming language it's written in (*programming language identification*).
+
+We add a sequence classification head on top of the model.
+
+On the evaluation dataset, we attain an eval accuracy and F1 > 0.999, which is not surprising given that the task of language identification is relatively easy (see below for an intuition why).
+
+## Quick start: using the raw model
+
+```python
+import torch
+from transformers import RobertaForSequenceClassification, RobertaTokenizer
+
+CODEBERTA_LANGUAGE_ID = "huggingface/CodeBERTa-language-id"
+
+tokenizer = RobertaTokenizer.from_pretrained(CODEBERTA_LANGUAGE_ID)
+model = RobertaForSequenceClassification.from_pretrained(CODEBERTA_LANGUAGE_ID)
+
+# CODE_TO_IDENTIFY is any string of source code.
+input_ids = torch.tensor(tokenizer.encode(CODE_TO_IDENTIFY)).unsqueeze(0)  # batch of size 1
+logits = model(input_ids)[0]
+
+language_idx = logits.argmax()  # index for the resulting label
+```
+
+
+## Quick start: using Pipelines 💪
+
+```python
+from transformers import RobertaForSequenceClassification, RobertaTokenizer, TextClassificationPipeline
+
+pipeline = TextClassificationPipeline(
+    model=RobertaForSequenceClassification.from_pretrained(CODEBERTA_LANGUAGE_ID),
+    tokenizer=RobertaTokenizer.from_pretrained(CODEBERTA_LANGUAGE_ID)
+)
+
+pipeline(CODE_TO_IDENTIFY)
+```
+
+Let's start with something very easy:
+
+```python
+pipeline("""
+def f(x):
+    return x**2
+""")
+# [{'label': 'python', 'score': 0.9999965}]
+```
+
+Now let's probe shorter code samples:
+
+```python
+pipeline("const foo = 'bar'")
+# [{'label': 'javascript', 'score': 0.9977546}]
+```
+
+What if I remove the `const` token from the assignment?
+
+```python
+pipeline("foo = 'bar'")
+# [{'label': 'javascript', 'score': 0.7176245}]
+```
+
+For some reason, this is still statistically detected as JS code, even though it's also valid Python code. However, if we slightly tweak it:
+
+```python
+pipeline("foo = u'bar'")
+# [{'label': 'python', 'score': 0.7638422}]
+```
+
+This is now detected as Python (notice the `u` string prefix).
+
+Okay, enough with the JS and Python domination already! Let's try fancier languages:
+
+```python
+pipeline("echo $FOO")
+# [{'label': 'php', 'score': 0.9995257}]
+```
+
+(Yes, I used the word "fancy" to describe PHP 😅)
+
+```python
+pipeline("outcome := rand.Intn(6) + 1")
+# [{'label': 'go', 'score': 0.9936151}]
+```
+
+Why is the problem of language identification so easy (with the correct toolkit)? Because code's syntax is rigid, and simple tokens such as `:=` (Go's short variable declaration operator) are perfect predictors of the underlying language:
+
+```python
+pipeline(":=")
+# [{'label': 'go', 'score': 0.9998052}]
+```
+
+By the way, because we trained our own custom tokenizer on the [CodeSearchNet](https://github.blog/2019-09-26-introducing-the-codesearchnet-challenge/) dataset, and it handles streams of bytes in a very generic way, syntactic constructs such as `:=` are represented by a single token:
+
+```python
+tokenizer.encode(" :=", add_special_tokens=False)
+# [521]
+```
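+
+As a quick, hedged sanity check (not from the original card), you can compare this against a tokenizer trained on natural language; the GPT-2 sub-tokens shown in the comment are an illustrative assumption, not measured output:
+
+```python
+from transformers import AutoTokenizer
+
+code_tok = AutoTokenizer.from_pretrained("huggingface/CodeBERTa-small-v1")
+nl_tok = AutoTokenizer.from_pretrained("gpt2")
+
+print(code_tok.tokenize(" :="))  # one merged token for the Go operator
+print(nl_tok.tokenize(" :="))    # likely several sub-tokens, e.g. ['Ġ:', '=']
+```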
+
+## Fine-tuning code
+
+
+```python
+import gzip
+import json
+import logging
+import os
+from pathlib import Path
+from typing import List, Tuple
+
+import numpy as np
+import torch
+from sklearn.metrics import f1_score
+from tokenizers.implementations.byte_level_bpe import ByteLevelBPETokenizer
+from tokenizers.processors import BertProcessing
+from torch.nn.utils.rnn import pad_sequence
+from torch.utils.data import DataLoader, Dataset
+from torch.utils.tensorboard.writer import SummaryWriter
+from tqdm import tqdm, trange
+
+from transformers import RobertaForSequenceClassification
+from transformers.data.metrics import acc_and_f1, simple_accuracy
+
+
+logging.basicConfig(level=logging.INFO)
+
+
+CODEBERTA_PRETRAINED = "huggingface/CodeBERTa-small-v1"
+
+LANGUAGES = [
+    "go",
+    "java",
+    "javascript",
+    "php",
+    "python",
+    "ruby",
+]
+FILES_PER_LANGUAGE = 1
+EVALUATE = True
+
+# Set up tokenizer
+tokenizer = ByteLevelBPETokenizer("./pretrained/vocab.json", "./pretrained/merges.txt")
+tokenizer._tokenizer.post_processor = BertProcessing(
+    ("</s>", tokenizer.token_to_id("</s>")), ("<s>", tokenizer.token_to_id("<s>")),
+)
+tokenizer.enable_truncation(max_length=512)
+
+# Set up Tensorboard
+tb_writer = SummaryWriter()
+
+
+class CodeSearchNetDataset(Dataset):
+    examples: List[Tuple[List[int], int]]
+
+    def __init__(self, split: str = "train"):
+        """
+        train | valid | test
+        """
+
+        self.examples = []
+
+        src_files = []
+        for language in LANGUAGES:
+            src_files += list(
+                Path("../CodeSearchNet/resources/data/").glob(f"{language}/final/jsonl/{split}/*.jsonl.gz")
+            )[:FILES_PER_LANGUAGE]
+        for src_file in src_files:
+            label = src_file.parents[3].name  # e.g. "python" in .../python/final/jsonl/train/
+            label_idx = LANGUAGES.index(label)
+            print("🔥", src_file, label)
+            lines = []
+            with gzip.open(src_file, mode="rt", encoding="utf-8") as fh:
+                for line in fh:
+                    o = json.loads(line)
+                    lines.append(o["code"])
+            examples = [(x.ids, label_idx) for x in tokenizer.encode_batch(lines)]
+            self.examples += examples
+        print("🔥🔥")
+
+    def __len__(self):
+        return len(self.examples)
+
+    def __getitem__(self, i):
+        # We’ll pad at the batch level.
+        return self.examples[i]
+
+
+model = RobertaForSequenceClassification.from_pretrained(CODEBERTA_PRETRAINED, num_labels=len(LANGUAGES))
+
+train_dataset = CodeSearchNetDataset(split="train")
+eval_dataset = CodeSearchNetDataset(split="test")
+
+
+def collate(examples):
+    # Pad to the longest sequence in the batch; 1 is the <pad> token id in the RoBERTa-style vocab.
+    input_ids = pad_sequence([torch.tensor(x[0]) for x in examples], batch_first=True, padding_value=1)
+    labels = torch.tensor([x[1] for x in examples])  # labels stay 1-D; no .unsqueeze(-1) needed
+    return input_ids, labels
+
+
+train_dataloader = DataLoader(train_dataset, batch_size=256, shuffle=True, collate_fn=collate)
+
+batch = next(iter(train_dataloader))  # sanity-check a single batch
+
+
+model.to("cuda")
+model.train()
+for param in model.roberta.parameters():
+    param.requires_grad = False
+## ^^ Freeze the base model: only the classification head is trained.
+ +print(f"num params:", model.num_parameters()) +print(f"num trainable params:", model.num_parameters(only_trainable=True)) + + +def evaluate(): + eval_loss = 0.0 + nb_eval_steps = 0 + preds = np.empty((0), dtype=np.int64) + out_label_ids = np.empty((0), dtype=np.int64) + + model.eval() + + eval_dataloader = DataLoader(eval_dataset, batch_size=512, collate_fn=collate) + for step, (input_ids, labels) in enumerate(tqdm(eval_dataloader, desc="Eval")): + with torch.no_grad(): + outputs = model(input_ids=input_ids.to("cuda"), labels=labels.to("cuda")) + loss = outputs[0] + logits = outputs[1] + eval_loss += loss.mean().item() + nb_eval_steps += 1 + preds = np.append(preds, logits.argmax(dim=1).detach().cpu().numpy(), axis=0) + out_label_ids = np.append(out_label_ids, labels.detach().cpu().numpy(), axis=0) + eval_loss = eval_loss / nb_eval_steps + acc = simple_accuracy(preds, out_label_ids) + f1 = f1_score(y_true=out_label_ids, y_pred=preds, average="macro") + print("=== Eval: loss ===", eval_loss) + print("=== Eval: acc. ===", acc) + print("=== Eval: f1 ===", f1) + # print(acc_and_f1(preds, out_label_ids)) + tb_writer.add_scalars("eval", {"loss": eval_loss, "acc": acc, "f1": f1}, global_step) + + +### Training loop + +global_step = 0 +train_iterator = trange(0, 4, desc="Epoch") +optimizer = torch.optim.AdamW(model.parameters()) +for _ in train_iterator: + epoch_iterator = tqdm(train_dataloader, desc="Iteration") + for step, (input_ids, labels) in enumerate(epoch_iterator): + optimizer.zero_grad() + outputs = model(input_ids=input_ids.to("cuda"), labels=labels.to("cuda")) + loss = outputs[0] + loss.backward() + tb_writer.add_scalar("training_loss", loss.item(), global_step) + optimizer.step() + global_step += 1 + if EVALUATE and global_step % 50 == 0: + evaluate() + model.train() + + +evaluate() + +os.makedirs("./models/CodeBERT-language-id", exist_ok=True) +model.save_pretrained("./models/CodeBERT-language-id") +``` + +
+ +
+
+## CodeSearchNet citation
+
+
+```bibtex
+@article{husain_codesearchnet_2019,
+    title = {{CodeSearchNet} {Challenge}: {Evaluating} the {State} of {Semantic} {Code} {Search}},
+    shorttitle = {{CodeSearchNet} {Challenge}},
+    url = {http://arxiv.org/abs/1909.09436},
+    urldate = {2020-03-12},
+    journal = {arXiv:1909.09436 [cs, stat]},
+    author = {Husain, Hamel and Wu, Ho-Hsiang and Gazit, Tiferet and Allamanis, Miltiadis and Brockschmidt, Marc},
+    month = sep,
+    year = {2019},
+    note = {arXiv: 1909.09436},
+}
+```
+
diff --git a/model_cards/huggingface/CodeBERTa-small-v1/README.md b/model_cards/huggingface/CodeBERTa-small-v1/README.md
new file mode 100644
index 000000000..6bc86756f
--- /dev/null
+++ b/model_cards/huggingface/CodeBERTa-small-v1/README.md
@@ -0,0 +1,128 @@
+---
+language: code
+thumbnail: https://hf-dinosaur.huggingface.co/CodeBERTa/CodeBERTa.png
+---
+
+# CodeBERTa
+
+CodeBERTa is a RoBERTa-like model trained on the [CodeSearchNet](https://github.blog/2019-09-26-introducing-the-codesearchnet-challenge/) dataset from GitHub.
+
+Supported languages:
+
+```shell
+"go"
+"java"
+"javascript"
+"php"
+"python"
+"ruby"
+```
+
+The **tokenizer** is a Byte-level BPE tokenizer trained on the corpus using Hugging Face `tokenizers`.
+
+Because it is trained on a corpus of code (vs. natural language), it encodes the corpus efficiently (the sequences are between 33% and 50% shorter, compared to the same corpus tokenized by gpt2/roberta).
+
+The (small) **model** is a 6-layer, 84M-parameter, RoBERTa-like Transformer model – that’s the same number of layers & heads as DistilBERT – initialized with the default settings and trained from scratch on the full corpus (~2M functions) for 5 epochs.
+
+### Tensorboard for this training ⤵️
+
+[![tb](https://hf-dinosaur.huggingface.co/CodeBERTa/tensorboard.png)](https://tensorboard.dev/experiment/irRI7jXGQlqmlxXS0I07ew/#scalars)
+
+## Quick start: masked language modeling prediction
+
+```python
+PHP_CODE = """
+public static <mask> set(string $key, $value) {
+    if (!in_array($key, self::$allowedKeys)) {
+        throw new \InvalidArgumentException('Invalid key given');
+    }
+    self::$storedValues[$key] = $value;
+}
+""".lstrip()
+```
+
+### Does the model know how to complete simple PHP code?
+
+```python
+from transformers import pipeline
+
+fill_mask = pipeline(
+    "fill-mask",
+    model="huggingface/CodeBERTa-small-v1",
+    tokenizer="huggingface/CodeBERTa-small-v1"
+)
+
+fill_mask(PHP_CODE)
+
+## Top 5 predictions:
+#
+' function' # prob 0.9999827146530151
+'function'
+' void'
+' def'
+' final'
+```
+
+### Yes! That was easy 🎉 What about some Python (warning: this is going to be meta)
+
+```python
+PYTHON_CODE = """
+def pipeline(
+    task: str,
+    model: Optional = None,
+    framework: Optional[<mask>] = None,
+    **kwargs
+) -> Pipeline:
+    pass
+""".lstrip()
+```
+
+Results:
+```python
+'framework', 'Framework', ' framework', 'None', 'str'
+```
+
+> This program can auto-complete itself! 😱
+
+### Just for fun, let's try to mask natural language (not code):
+
+```python
+fill_mask("My name is <mask>.")
+
+# {'sequence': ' My name is undefined.', 'score': 0.2548016905784607, 'token': 3353}
+# {'sequence': ' My name is required.', 'score': 0.07290805131196976, 'token': 2371}
+# {'sequence': ' My name is null.', 'score': 0.06323737651109695, 'token': 469}
+# {'sequence': ' My name is name.', 'score': 0.021919190883636475, 'token': 652}
+# {'sequence': ' My name is disabled.', 'score': 0.019681859761476517, 'token': 7434}
+```
+
+This (kind of) works because code contains comments (which contain natural language).
+
+Of course, the most frequent name for a computer scientist must be undefined 🤓.
+
+
+## Downstream task: [programming language identification](https://huggingface.co/huggingface/CodeBERTa-language-id)
+
+See the model card for **[`huggingface/CodeBERTa-language-id`](https://huggingface.co/huggingface/CodeBERTa-language-id)** 🤯.
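+
+As a rough, hedged way to eyeball the sequence-length claim above on a snippet of your own (the sample string is made up, and the exact ratio will vary with the corpus):
+
+```python
+from transformers import AutoTokenizer
+
+sample = "def get_user(self, user_id):\n    return self.db.query(User).get(user_id)\n"
+for name in ["gpt2", "huggingface/CodeBERTa-small-v1"]:
+    tok = AutoTokenizer.from_pretrained(name)
+    print(name, len(tok.tokenize(sample)))  # fewer tokens expected for CodeBERTa
+```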
+
+## CodeSearchNet citation
+
+
+```bibtex
+@article{husain_codesearchnet_2019,
+    title = {{CodeSearchNet} {Challenge}: {Evaluating} the {State} of {Semantic} {Code} {Search}},
+    shorttitle = {{CodeSearchNet} {Challenge}},
+    url = {http://arxiv.org/abs/1909.09436},
+    urldate = {2020-03-12},
+    journal = {arXiv:1909.09436 [cs, stat]},
+    author = {Husain, Hamel and Wu, Ho-Hsiang and Gazit, Tiferet and Allamanis, Miltiadis and Brockschmidt, Marc},
+    month = sep,
+    year = {2019},
+    note = {arXiv: 1909.09436},
+}
+```