In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Column of the dataset that holds the per-word NER tag ids.
NER_TAGS_COL = 'tags'

model_checkpoint = "FacebookAI/roberta-base"
raw_datasets = load_dataset("tner/mit_restaurant")
# add_prefix_space=True is passed because the inputs are later tokenized
# word by word (is_split_into_words=True).
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# Tag names in the order of the ids used by the dataset as published.
# Note that "I-Price" sits at position 15, away from "B-Price" (9).
_ORIGINAL_LABEL_ORDER = [
    "O",
    "B-Rating",
    "I-Rating",
    "B-Amenity",
    "I-Amenity",
    "B-Location",
    "I-Location",
    "B-Restaurant_Name",
    "I-Restaurant_Name",
    "B-Price",
    "B-Hours",
    "I-Hours",
    "B-Dish",
    "I-Dish",
    "B-Cuisine",
    "I-Price",
    "I-Cuisine",
]
original_label_to_id = {name: idx for idx, name in enumerate(_ORIGINAL_LABEL_ORDER)}
original_id_to_label = dict(enumerate(_ORIGINAL_LABEL_ORDER))

# Our own id layout: every B-XXX gets an odd id and the matching I-XXX is the
# next (even) id. align_labels_with_tokens relies on this pairing.
_OUR_LABEL_ORDER = [
    "O",
    "B-Rating",
    "I-Rating",
    "B-Amenity",
    "I-Amenity",
    "B-Location",
    "I-Location",
    "B-Restaurant_Name",
    "I-Restaurant_Name",
    "B-Price",
    "I-Price",
    "B-Hours",
    "I-Hours",
    "B-Dish",
    "I-Dish",
    "B-Cuisine",
    "I-Cuisine",
]
label2id = {name: idx for idx, name in enumerate(_OUR_LABEL_ORDER)}
id2label = dict(enumerate(_OUR_LABEL_ORDER))

# Translation table: dataset tag id -> our tag id, matched by tag name.
original_to_ours = {
    original_label_to_id[tag_name]: our_id for tag_name, our_id in label2id.items()
}


In [4]:
def relable(examples, original_to_ours):
    """Translate the tag ids of a batch into our own id layout.

    Args:
        examples: batch mapping whose NER_TAGS_COL entry is a list of
            per-sentence tag-id lists.
        original_to_ours: mapping from a dataset tag id to our tag id.

    Returns:
        The same ``examples`` object, with NER_TAGS_COL replaced by the
        translated tag lists.
    """
    examples[NER_TAGS_COL] = [
        [original_to_ours[tag_id] for tag_id in sentence_tags]
        for sentence_tags in examples[NER_TAGS_COL]
    ]
    return examples

# Rewrite every split's tag ids from the dataset's layout to ours.
# NOTE: this cell is not idempotent. It overwrites `raw_datasets`, so running it
# a second time applies the translation to already-translated ids (for example
# 10 -> 11 on the first run, then 11 -> 12 on the second). Reload the dataset
# before re-running it.
raw_datasets = raw_datasets.map(relable, batched=True, fn_kwargs={'original_to_ours':original_to_ours})

In [6]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level tag ids to one tag id per token.

    Args:
        labels: tag id of each word, indexed by word position.
        word_ids: for each token, the index of the word it came from, or
            None for a special token.

    Returns:
        A list as long as ``word_ids``: -100 for special tokens, the word's
        own tag for its first token, and for the following tokens of the same
        word the tag with an odd id bumped by one (B-XXX -> I-XXX in an id
        layout where each I- id is its B- id plus one).
    """
    aligned = []
    previous_word = None
    for word_index in word_ids:
        if word_index is None:
            # Special token: excluded from the loss.
            aligned.append(-100)
        elif word_index != previous_word:
            # First token of a word keeps the word's tag unchanged.
            aligned.append(labels[word_index])
        else:
            # Continuation token: an odd (B-) id becomes the next (I-) id.
            tag = labels[word_index]
            aligned.append(tag + 1 if tag % 2 == 1 else tag)
        previous_word = word_index

    return aligned

# Sanity check on the first training example: tokenize its pre-split words,
# then print the word-level tags next to the token-level alignment.
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)
labels = raw_datasets["train"][0][NER_TAGS_COL]
# One entry per token: the source word index, or None for special tokens.
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[0, 0, 0, 0, 0, 9, 15, 0, 5]
[-100, 0, 0, 0, 0, 0, 9, 15, 16, 16, 0, 5, -100]


In [23]:
raw_datasets["train"][0]["tokens"]

['can',
 'you',
 'find',
 'me',
 'the',
 'cheapest',
 'mexican',
 'restaurant',
 'nearby']

In [9]:
tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)

{'input_ids': [0, 64, 47, 465, 162, 5, 21084, 162, 1178, 12657, 2391, 3027, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [22]:
# Only the last expression of a cell is displayed, so the encode() result on
# the next line is computed and discarded; the cell shows just the decoded
# piece for token id 12657.
tokenizer.encode('mexican')
tokenizer.decode([12657])

'ican'

In [24]:
# now on the whole dataset
def tokenize_and_align_labels(examples, is_split_into_words=True):
    """Tokenize a batch and attach token-level labels.

    Args:
        examples: batch mapping with a "tokens" entry (one word list per
            sentence) and an NER_TAGS_COL entry (one tag-id list per sentence).
        is_split_into_words: forwarded unchanged to the tokenizer call.

    Returns:
        The tokenizer output for the batch with an extra "labels" entry
        holding, per sentence, the result of align_labels_with_tokens.
    """
    encoded = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=is_split_into_words
    )
    # word_ids(idx) gives the token -> word mapping of the idx-th sentence.
    encoded["labels"] = [
        align_labels_with_tokens(sentence_tags, encoded.word_ids(idx))
        for idx, sentence_tags in enumerate(examples[NER_TAGS_COL])
    ]
    return encoded

# Tokenize every split in batches. The original columns (taken from the train
# split's column names) are dropped, leaving only what
# tokenize_and_align_labels returns.
tokenized_datasets =raw_datasets.map(tokenize_and_align_labels, batched=True, remove_columns=raw_datasets['train'].column_names)

In [25]:
tokenized_datasets['train'][:5]

{'input_ids': [[0,
   64,
   47,
   465,
   162,
   5,
   21084,
   162,
   1178,
   12657,
   2391,
   3027,
   2],
  [0, 64, 47, 465, 162, 5, 1367, 18079, 8453, 2],
  [0, 64, 47, 465, 162, 5, 8099, 21629, 35427, 5566, 2],
  [0, 64, 47, 465, 162, 5, 16198, 44355, 218, 5618, 29, 2],
  [0, 64, 47, 465, 162, 5, 3673, 516, 20212, 3027, 19, 14591, 23, 5, 2003, 2]],
 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
 'labels': [[-100, 0, 0, 0, 0, 0, 9, 15, 16, 16, 0, 5, -100],
  [-100, 0, 0, 0, 0, 0, 5, 7, 8, -100],
  [-100, 0, 0, 0, 0, 0, 5, 7, 8, 8, -100],
  [-100, 0, 0, 0, 0, 0, 5, 7, 8, 8, 8, -100],
  [-100, 0, 0, 0, 0, 0, 7, 8, 8, 5, 0, 3, 4, 4, 4, -100]]}

In [26]:
from transformers import DataCollatorForTokenClassification

# Collator used for batching; the demo batch shown below has its shorter
# "labels" row padded with -100.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [27]:
# Collate the first two training examples (13 and 10 tokens long) and inspect
# how the label rows are padded to a common length.
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

tensor([[-100,    0,    0,    0,    0,    0,    9,   15,   16,   16,    0,    5,
         -100],
        [-100,    0,    0,    0,    0,    0,    5,    7,    8, -100, -100, -100,
         -100]])

In [29]:
import evaluate
import numpy as np
metric = evaluate.load("seqeval")
def ner_metrics_factory(id2label: dict | list, module: str = "seqeval"):
    """Build a ``compute_metrics`` callable bound to a label vocabulary.

    Args:
        id2label: lookup from a tag id to its tag string (anything indexable
            by the ids appearing in predictions and labels).
        module: name handed to ``evaluate.load`` to obtain the metric.

    Returns:
        A function taking a ``(logits, labels)`` pair and returning whatever
        the loaded metric's ``compute`` returns for the decoded tag sequences.
    """
    scorer = evaluate.load(module)

    def compute_metrics(eval_preds):
        logits, labels = eval_preds
        predicted_ids = np.argmax(logits, axis=-1)

        # Gold tags: drop positions marked -100 (ignored index), map ids to names.
        gold_tags = []
        for gold_row in labels:
            gold_tags.append([id2label[g] for g in gold_row if g != -100])

        # Predicted tags: keep only the positions whose gold label is not -100.
        predicted_tags = []
        for predicted_row, gold_row in zip(predicted_ids, labels):
            predicted_tags.append(
                [id2label[p] for (p, g) in zip(predicted_row, gold_row) if g != -100]
            )

        return scorer.compute(predictions=predicted_tags, references=gold_tags)

    return compute_metrics

# Metric callback bound to our id -> tag-name table; handed to the Trainer later.
compute_metrics = ner_metrics_factory(id2label)

In [30]:
# Word-level tags of the first training example, converted from ids to names.
# Rebinds `labels`, which an earlier cell used for the id list.
labels = raw_datasets["train"][0][NER_TAGS_COL]
labels = [id2label[i] for i in labels]
labels

['O', 'O', 'O', 'O', 'O', 'B-Price', 'B-Cuisine', 'O', 'B-Location']

In [202]:
# Toy check of the metric: copy the gold tags, turn the last one into "O"
# (a missed entity) and score the result with the module-level `metric`.
predictions = labels.copy()
predictions[-1] = "O"
metric.compute(predictions=[predictions], references=[labels])

  _warn_prf(average, modifier, msg_start, len(result))


{'Cuisine': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'Location': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1},
 'Price': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 0.6666666666666666,
 'overall_f1': 0.8,
 'overall_accuracy': 0.8888888888888888}

In [31]:
from transformers import AutoModelForTokenClassification

# Load the checkpoint named by `model_checkpoint` for token classification,
# passing both label tables. The warning printed by this cell reports that the
# classifier weights are newly initialised, so the model must be fine-tuned
# before its predictions mean anything.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
from transformers import TrainingArguments
from transformers import Trainer

# Training configuration: evaluate once per epoch, write no intermediate
# checkpoints, 3 epochs, nothing pushed to the Hub. "roberta-ner" is the
# first positional argument (the output directory in the TrainingArguments API).
# NOTE(review): newer transformers releases spell `evaluation_strategy` as
# `eval_strategy` — confirm against the installed version.
args = TrainingArguments(
    "roberta-ner",
    evaluation_strategy="epoch",
    save_strategy='no', #epoch
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
)

# The commented-out `.select(range(100))` calls are a quick way to run on a
# small subset while debugging.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"], #.select(range(100)),
    eval_dataset=tokenized_datasets["validation"], #.select(range(100)),
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
# NOTE(review): no visible cell calls trainer.train() before the model is
# saved; the execution counts jump from In [34] to In [36]. Confirm that the
# training cell was not dropped, otherwise Restart & Run All saves an
# untrained model.


In [36]:
# Write the trainer's model to a local directory so it can be reloaded by path.
trainer.save_model('model/local_ner')

In [37]:
# Rebind `model` to an instance loaded from the directory saved above.
# `trainer` was built with the earlier object and is not affected by this.
model = AutoModelForTokenClassification.from_pretrained('model/local_ner')

In [44]:
from transformers import pipeline

# Replace this with your own checkpoint
# NOTE(review): this rebinds `model_checkpoint`, which the first cell set to
# the base checkpoint name; re-running the model-loading cell afterwards would
# load from this path instead.
# NOTE(review): `local_checkpoint` is assigned but never passed to pipeline();
# the pipeline loads "model/roberta-ner", which no visible cell writes
# (save_model targets 'model/local_ner'). Confirm which path is intended.
model_checkpoint = "model/roberta-ner"
local_checkpoint = 'model/local_ner' # <--------------
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="average"
)
token_classifier("Where is the best american style steak for breakfast")

[{'entity_group': 'Rating',
  'score': 0.8687941,
  'word': ' best',
  'start': 13,
  'end': 17},
 {'entity_group': 'Cuisine',
  'score': 0.47829947,
  'word': ' american',
  'start': 18,
  'end': 26},
 {'entity_group': 'Dish',
  'score': 0.40889955,
  'word': ' style',
  'start': 27,
  'end': 32},
 {'entity_group': 'Dish',
  'score': 0.54043555,
  'word': ' steak',
  'start': 33,
  'end': 38}]

In [45]:
from transformers import pipeline

# Replace this with your own checkpoint
# NOTE(review): this cell is a verbatim copy of the previous pipeline cell,
# yet its recorded output differs, so the files behind "model/roberta-ner"
# presumably changed between the two runs — confirm. `local_checkpoint` is
# assigned but not used by the pipeline() call.
model_checkpoint = "model/roberta-ner"
local_checkpoint = 'model/local_ner' # <--------------
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="average"
)
token_classifier("Where is the best american style steak for breakfast")

[{'entity_group': 'Rating',
  'score': 0.9469606,
  'word': ' best',
  'start': 13,
  'end': 17},
 {'entity_group': 'Dish',
  'score': 0.67589,
  'word': ' american style steak',
  'start': 18,
  'end': 38},
 {'entity_group': 'Hours',
  'score': 0.8031102,
  'word': ' breakfast',
  'start': 43,
  'end': 52}]

In [154]:
# Forward pass on a collated batch of the first two training examples. The
# batch includes "labels", and the displayed output carries both a loss and
# the logits.
model(**data_collator([tokenized_datasets["train"][i] for i in range(2)]))

TokenClassifierOutput(loss=tensor(2.8301, grad_fn=<NllLossBackward0>), logits=tensor([[[ 0.1519, -0.0436, -0.3144,  0.1548,  0.0801, -0.1030, -0.4586,
          -0.1639,  0.0112, -0.2178,  0.0613, -0.0064,  0.3944, -0.2061,
           0.1541,  0.3359, -0.0254],
         [ 0.1326, -0.2831, -0.3997,  0.3890,  0.4758, -0.0217, -0.6431,
          -0.1239, -0.2796, -0.2442, -0.2296,  0.3537,  0.4291, -0.2513,
           0.2084,  0.5746,  0.2295],
         [ 0.1017, -0.0653, -0.1833,  0.1308,  0.3437, -0.0944, -0.7408,
          -0.2187, -0.3630, -0.3572, -0.2900,  0.2107,  0.6002, -0.0853,
           0.2342,  0.3991,  0.2268],
         [ 0.3121, -0.2272, -0.3717,  0.2202,  0.3417, -0.0545, -0.3707,
          -0.3016, -0.4658, -0.3904, -0.3797,  0.3842,  0.4422, -0.0133,
           0.1260,  0.6062,  0.4494],
         [ 0.2335,  0.0402, -0.2908,  0.2044,  0.4027, -0.2161, -0.6017,
          -0.2181, -0.1466, -0.2539, -0.1411,  0.3368,  0.5524, -0.3622,
           0.2096,  0.4108,  0.1510],
  

In [138]:
raw_datasets['train'][105]

{'tokens': ['can',
  'you',
  'locate',
  'a',
  'place',
  'to',
  'eat',
  'that',
  'has',
  'steak',
  'on',
  'the',
  'menu',
  'and',
  'serves',
  'breakfast',
  'all',
  'day'],
 'tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 15, 11, 12]}

In [144]:
from transformers import pipeline

# Replace this with your own checkpoint
# Same pipeline setup as the earlier inference cells, this time run on a
# capitalised version of training example 105's sentence.
model_checkpoint = "model/roberta-ner"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="average"
)
token_classifier("Can you locate a place to eat that has steak on the menu and serves breakfast all day")

[{'entity_group': 'Dish',
  'score': 0.924998,
  'word': ' steak',
  'start': 39,
  'end': 44},
 {'entity_group': 'Hours',
  'score': 0.72994566,
  'word': ' breakfast all day',
  'start': 68,
  'end': 85}]

In [223]:
import torch
model.classifier.weight.shape

torch.Size([17, 768])

In [157]:
# Keep only the logits of a forward pass on the first two training examples.
logits = model(**data_collator([tokenized_datasets["train"][i] for i in range(2)])).logits

In [214]:
tokenizer("Hello, how are you?", return_tensors="pt")

torch.Size([1, 8, 17])