In [None]:
%load_ext autoreload
%autoreload 2
from datasets import load_dataset
from transformers import AutoTokenizer

# Dataset column that holds the per-word tag ids (read by the helpers below).
NER_TAGS_COL = "tags"

# Checkpoint shared by the tokenizer and, later, the token-classification model.
model_checkpoint = "FacebookAI/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    # Inputs are fed pre-split into words (is_split_into_words=True further down).
    add_prefix_space=True,
)

raw_datasets = load_dataset("tner/mit_restaurant")

# Tag -> id mapping under the "original" scheme (presumably the one the dataset
# ships with -- confirm). Ids follow list position. Note the irregular tail:
# "I-Price" sits at 15 instead of directly after "B-Price" (9).
original_label_to_id = {
    name: idx
    for idx, name in enumerate(
        [
            "O",
            "B-Rating",
            "I-Rating",
            "B-Amenity",
            "I-Amenity",
            "B-Location",
            "I-Location",
            "B-Restaurant_Name",
            "I-Restaurant_Name",
            "B-Price",
            "B-Hours",
            "I-Hours",
            "B-Dish",
            "I-Dish",
            "B-Cuisine",
            "I-Price",
            "I-Cuisine",
        ]
    )
}
original_id_to_label = dict(zip(original_label_to_id.values(), original_label_to_id))

# Tag -> id mapping used by this notebook. Every "B-X" tag gets an odd id and
# its "I-X" partner the following even id; align_labels_with_tokens depends on
# that layout when it turns a B- tag into the matching I- tag with `+ 1`.
label2id = {
    name: idx
    for idx, name in enumerate(
        [
            "O",
            "B-Rating",
            "I-Rating",
            "B-Amenity",
            "I-Amenity",
            "B-Location",
            "I-Location",
            "B-Restaurant_Name",
            "I-Restaurant_Name",
            "B-Price",
            "I-Price",
            "B-Hours",
            "I-Hours",
            "B-Dish",
            "I-Dish",
            "B-Cuisine",
            "I-Cuisine",
        ]
    )
}
id2label = dict(zip(label2id.values(), label2id))

# Original id -> notebook id, matched through the shared tag name.
original_to_ours = {
    original_label_to_id[name]: new_id for name, new_id in label2id.items()
}


In [None]:
def relable(examples, original_to_ours):
    """Translate every tag id of a batch through ``original_to_ours``.

    ``examples[NER_TAGS_COL]`` is read as a list of tag-id sequences. The entry
    is replaced with the remapped sequences and the same ``examples`` object
    is returned.
    """
    examples[NER_TAGS_COL] = [
        [original_to_ours[tag] for tag in sequence]
        for sequence in examples[NER_TAGS_COL]
    ]
    return examples


# NOTE: ``raw_datasets`` is rebound here, so this cell is not idempotent:
# ``original_to_ours`` sends some ids to other ids (e.g. 15 -> 10 and 10 -> 11),
# so executing the cell a second time would remap already-remapped tags.
raw_datasets = raw_datasets.map(
    relable,
    fn_kwargs={"original_to_ours": original_to_ours},
    batched=True,
)

In [None]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to one label per token.

    Args:
        labels: label ids indexed by word position.
        word_ids: for each token, the index of the word it belongs to, or
            ``None`` for tokens that belong to no word.

    Returns:
        A list as long as ``word_ids``: ``-100`` where the word id is ``None``,
        the word's label for the first token of a word, and for the following
        tokens of the same word the word's label bumped by one when it is odd
        (B-XXX -> I-XXX under an odd-B / even-I id layout).
    """
    aligned = []
    previous_word = None
    for word_idx in word_ids:
        if word_idx is None:
            # Special token: ignored label, and the next real token starts a word.
            previous_word = None
            aligned.append(-100)
            continue

        tag = labels[word_idx]
        # Continuation of the previous token's word: B-XXX becomes I-XXX.
        if word_idx == previous_word and tag % 2 == 1:
            tag += 1
        previous_word = word_idx
        aligned.append(tag)

    return aligned

# Sanity check on the first training example: word-level tags first, then the
# same tags after alignment to the tokenizer output.
labels = raw_datasets["train"][0][NER_TAGS_COL]
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

In [None]:
# now on the whole dataset
def tokenize_and_align_labels(examples, is_split_into_words=True):
    """Tokenize a batch and attach token-aligned labels.

    Args:
        examples: batch with a ``"tokens"`` entry and an ``NER_TAGS_COL`` entry
            holding one label sequence per example.
        is_split_into_words: forwarded to the tokenizer call.

    Returns:
        The tokenizer output with an added ``"labels"`` entry, one aligned
        label list per example (see ``align_labels_with_tokens``).
    """
    encoded = tokenizer(
        examples["tokens"],
        is_split_into_words=is_split_into_words,
        truncation=True,
    )
    encoded["labels"] = [
        align_labels_with_tokens(word_tags, encoded.word_ids(row))
        for row, word_tags in enumerate(examples[NER_TAGS_COL])
    ]
    return encoded


# Apply to every split; the original columns are dropped from the result.
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

In [None]:
from transformers import DataCollatorForTokenClassification

# Batch collator for token classification, built around the shared tokenizer;
# it is handed to the Trainer further down.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
# Collate the first two tokenized training examples and show the batched labels.
batch = data_collator([tokenized_datasets["train"][idx] for idx in (0, 1)])
batch["labels"]

In [None]:
import evaluate
import numpy as np
# Module-level metric handle. The factory below loads its own instance and
# does not read this one.
metric = evaluate.load("seqeval")


def ner_metrics_factory(id2label: dict | list, module: str = "seqeval"):
    """Build a ``compute_metrics`` callback bound to a label mapping.

    Args:
        id2label: container indexed as ``id2label[label_id]`` to obtain the
            tag string.
        module: metric name passed to ``evaluate.load``.

    Returns:
        A function that takes ``(logits, labels)``, takes the argmax of the
        logits over the last axis, drops every position whose reference label
        is ``-100``, and returns the result of the metric's ``compute``.
    """
    scorer = evaluate.load(module)

    def compute_metrics(eval_preds):
        logits, labels = eval_preds
        predicted_ids = np.argmax(logits, axis=-1)

        # Reference tags, skipping ignored positions (label -100).
        gold_tags = []
        for gold_row in labels:
            gold_tags.append([id2label[g] for g in gold_row if g != -100])

        # Predicted tags at exactly the positions kept above.
        predicted_tags = []
        for pred_row, gold_row in zip(predicted_ids, labels):
            predicted_tags.append(
                [id2label[p] for p, g in zip(pred_row, gold_row) if g != -100]
            )

        return scorer.compute(predictions=predicted_tags, references=gold_tags)

    return compute_metrics


compute_metrics = ner_metrics_factory(id2label)

In [None]:
# Tag names of the first training example.
labels = [id2label[tag_id] for tag_id in raw_datasets["train"][0][NER_TAGS_COL]]
labels


In [None]:
from transformers import AutoModelForTokenClassification

# Load the checkpoint for token classification, passing this notebook's label maps.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint, label2id=label2id, id2label=id2label
)

In [None]:
from transformers import TrainingArguments
from transformers import Trainer

# NOTE(review): recent transformers releases use `eval_strategy` in place of
# `evaluation_strategy` -- confirm against the installed version.
args = TrainingArguments(
    "roberta-ner",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="no",  # alternative: "epoch"
    push_to_hub=False,
)

# Only the first 100 examples of the train and validation splits are used.
# NOTE(review): no `trainer.train()` call is visible in this part of the
# notebook -- confirm that training is triggered elsewhere.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets["train"].select(range(100)),
    eval_dataset=tokenized_datasets["validation"].select(range(100)),
)


In [None]:
from transformers import pipeline

# NOTE(review): this rebinds `model_checkpoint`, which the model-loading cell
# reads, yet the new value is not used here -- the pipeline loads
# `local_checkpoint`. Re-running the model cell after this one would therefore
# load from "model/roberta-ner" instead of the base checkpoint.
model_checkpoint = "model/roberta-ner"
# Replace this with your own checkpoint
local_checkpoint = "model/local_ner"
token_classifier = pipeline(
    "token-classification",
    model=local_checkpoint,
    aggregation_strategy="average",
)
token_classifier("Can you locate a place to eat that has steak on the menu and serves breakfast all day")

In [None]:
from transformers import pipeline

# NOTE(review): this cell repeats the preceding inference cell verbatim;
# `model_checkpoint` is rebound but only `local_checkpoint` is passed to the
# pipeline.
model_checkpoint = "model/roberta-ner"
# Replace this with your own checkpoint
local_checkpoint = "model/local_ner"
token_classifier = pipeline(
    "token-classification",
    model=local_checkpoint,
    aggregation_strategy="average",
)
token_classifier("Can you locate a place to eat that has steak on the menu and serves breakfast all day")