Entity Recognition with BERT
Introduction
This post uses BERT (from HuggingFace) and tf.keras to train an NER model. The data is expected in the following format:
    Sentence #   Word           Tag
0   Sentence: 1  Thousands      O
1   NaN          of             O
2   NaN          demonstrators  O
3   NaN          have           O
4   NaN          marched        O
5   NaN          through        O
6   NaN          London         B-geo
7   NaN          to             O
8   NaN          protest        O
...
24  Sentence: 2  Families       O
25  NaN          of             O
26  NaN          soldiers       O
...
Setup
import os
import re
import json
import string
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn import preprocessing
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel, BertConfig
max_len = 384
configuration = BertConfig()
data_csv = "ner_dataset.csv"
Setup Tokenizers
# Save the slow pretrained tokenizer
slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
save_path = "bert_base_uncased/"
if not os.path.exists(save_path):
os.makedirs(save_path)
slow_tokenizer.save_pretrained(save_path)
# Load the fast tokenizer from saved file
tokenizer = BertWordPieceTokenizer("bert_base_uncased/vocab.txt", lowercase=True)
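Before moving on, it can help to see what the fast tokenizer produces. A quick illustrative check (not part of the original pipeline) shows the WordPiece split of a word, and the special-token ids 101 ([CLS]) and 102 ([SEP]) that are hard-coded later:
enc = tokenizer.encode("differentiator", add_special_tokens=False)
print(enc.tokens)  # sub-word pieces; the exact split depends on the vocabulary
print(enc.ids)     # the corresponding vocabulary ids
print(tokenizer.encode("differentiator").ids)  # wrapped in [CLS] (101) and [SEP] (102)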
Define model
Model
Add a fully connected layer that takes the token embeddings from BERT as input and
predicts the probability of each token belonging to each of the possible tags.
The fully connected layer in the code below is a keras.layers.Dense layer
with num_tags + 1 units to accommodate a padding label.
Masked Loss
Each batch consists of variable-length token sequences, padded to max_len in
both the input and the target.
During loss calculation, we mask out the loss corresponding to the padding
tokens in the target.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=False, reduction=tf.keras.losses.Reduction.NONE
)

def masked_ce_loss(real, pred):
    # 17 is the padding label (num_tags); see create_inputs_targets below
    mask = tf.math.logical_not(tf.math.equal(real, 17))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask  # zero out the loss at padding positions
    return tf.reduce_mean(loss_)
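# A quick check (illustrative, not from the original post) that padding
# positions, labelled 17, contribute nothing to the masked loss:
real = tf.constant([[3, 5, 17, 17]])                       # two real tokens, two padding
pred = tf.random.uniform((1, 4, 18))                       # 18 = num_tags + 1 classes
pred = pred / tf.reduce_sum(pred, axis=-1, keepdims=True)  # rows sum to 1, like a softmax
print(masked_ce_loss(real, pred).numpy())                  # only positions 0 and 1 contribute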
def create_model(num_tags):
## BERT encoder
encoder = TFBertModel.from_pretrained("bert-base-uncased")
## NER Model
input_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
token_type_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
attention_mask = layers.Input(shape=(max_len,), dtype=tf.int32)
embedding = encoder(
input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
)[0]
embedding = layers.Dropout(0.3)(embedding)
    # num_tags + 1 output units: one per tag plus one for the padding label
    tag_logits = layers.Dense(num_tags + 1, activation='softmax')(embedding)
model = keras.Model(
inputs=[input_ids, token_type_ids, attention_mask],
outputs=[tag_logits],
)
    optimizer = keras.optimizers.Adam(learning_rate=3e-5)
model.compile(optimizer=optimizer, loss=masked_ce_loss, metrics=['accuracy'])
return model
Preprocess dataset
First, we convert the rows of our data file into sentences and lists of
tags. sklearn.preprocessing.LabelEncoder
encodes each tag as an integer.
Then, we tokenize each sentence using the BERT tokenizer from HuggingFace.
After tokenization, each sentence is represented by a set of input_ids,
attention_masks and token_type_ids. Note that a single word may be tokenized into
multiple tokens; in that case, each token gets the label of the original word,
e.g.
this is a differentiator -> ['this', 'is', 'a', 'different', '##ia', '##tor']
tag1 tag2 tag3 tag4      -> [tag1, tag2, tag3, tag4, tag4, tag4]
def process_csv(data_path):
    df = pd.read_csv(data_path, encoding="latin-1")
    # Forward-fill the sentence id so every word row knows its sentence
    df.loc[:, "Sentence #"] = df["Sentence #"].ffill()
    # Encode each tag string as an integer id
    enc_tag = preprocessing.LabelEncoder()
    df.loc[:, "Tag"] = enc_tag.fit_transform(df["Tag"])
    sentences = df.groupby("Sentence #")["Word"].apply(list).values
    tag = df.groupby("Sentence #")["Tag"].apply(list).values
    return sentences, tag, enc_tag
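# Sanity check (illustrative, not part of the original pipeline): confirm the
# label ids hard-coded below. With the 17-tag Kaggle "Entity Annotated Corpus"
# this post appears to use, 'O' encodes to 16, leaving 17 (num_tags) for padding.
_sentences, _tags, _enc_tag = process_csv(data_csv)
print(len(_enc_tag.classes_))        # 17 tags in this dataset
print(_enc_tag.transform(["O"])[0])  # 16 -- the id given to [CLS]/[SEP] below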
def create_inputs_targets(data_csv):
dataset_dict = {
"input_ids": [],
"token_type_ids": [],
"attention_mask": [],
"tags": []
}
sentences, tags, tag_encoder = process_csv(data_csv)
for sentence, tag in zip(sentences, tags):
input_ids = []
target_tags = []
for idx, word in enumerate(sentence):
            ids = tokenizer.encode(word, add_special_tokens=False)
            input_ids.extend(ids.ids)
            num_tokens = len(ids.ids)
            # each sub-token inherits the tag of its original word
            target_tags.extend([tag[idx]] * num_tokens)
        # Truncate to max_len - 2 to leave room for the special tokens, then
        # wrap with [CLS] (id 101) and [SEP] (id 102), both tagged 'O' (16 for this dataset)
        input_ids = input_ids[:max_len - 2]
        target_tags = target_tags[:max_len - 2]
        input_ids = [101] + input_ids + [102]
        target_tags = [16] + target_tags + [16]
token_type_ids = [0] * len(input_ids)
attention_mask = [1] * len(input_ids)
padding_len = max_len - len(input_ids)
input_ids = input_ids + ([0] * padding_len)
attention_mask = attention_mask + ([0] * padding_len)
token_type_ids = token_type_ids + ([0] * padding_len)
        target_tags = target_tags + ([17] * padding_len)  # 17 = num_tags, the padding label
dataset_dict["input_ids"].append(input_ids)
dataset_dict["token_type_ids"].append(token_type_ids)
dataset_dict["attention_mask"].append(attention_mask)
dataset_dict["tags"].append(target_tags)
assert len(target_tags) == max_len, f'{len(input_ids)}, {len(target_tags)}'
for key in dataset_dict:
dataset_dict[key] = np.array(dataset_dict[key])
x = [
dataset_dict["input_ids"],
dataset_dict["token_type_ids"],
dataset_dict["attention_mask"],
]
y = dataset_dict["tags"]
return x, y, tag_encoder
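As a quick sanity check (illustrative, not part of the original post), each of the three input arrays and the target array returned above should have shape (num_sentences, max_len):
x, y, _ = create_inputs_targets(data_csv)
print([a.shape for a in x], y.shape)  # e.g. [(N, 384), (N, 384), (N, 384)] (N, 384)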
Create model
Use TPU if possible.
num_tags = pd.read_csv(data_csv, encoding="latin-1")["Tag"].nunique()
use_tpu = None
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    use_tpu = True
except ValueError:  # no TPU available
    use_tpu = False
if use_tpu:
# Create distribution strategy
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
strategy = tf.distribute.experimental.TPUStrategy(tpu)
# Create model
with strategy.scope():
model = create_model(num_tags)
else:
model = create_model(num_tags)
model.summary()
Output:
Model: "model"
__________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to
==================================================================================================
input_1 (InputLayer) [(None, 384)] 0
__________________________________________________________________________________________________
input_3 (InputLayer) [(None, 384)] 0
__________________________________________________________________________________________________
input_2 (InputLayer) [(None, 384)] 0
__________________________________________________________________________________________________
tf_bert_model (TFBertModel) ((None, 384, 768), ( 109482240 input_1[0][0]
__________________________________________________________________________________________________
dropout_37 (Dropout) (None, 384, 768) 0 tf_bert_model[0][0]
__________________________________________________________________________________________________
dense (Dense) (None, 384, 18) 13842 dropout_37[0][0]
==================================================================================================
Total params: 109,496,082
Trainable params: 109,496,082
Non-trainable params: 0
__________________________________________________________________________________________________
Train
x_train, y_train, tag_encoder = create_inputs_targets(data_csv)
bs = 64 if use_tpu else 16
model.fit(
x_train,
y_train,
epochs=1,
verbose=1,
batch_size=bs,
validation_split=0.1
)
Output:
675/675 [==============================] - 151s 223ms/step - accuracy: 0.9909 - loss: nan - val_accuracy: 0.9969 - val_loss: 2.3592
Inference
Check the predicted tags for a sample sentence.
def create_test_input_from_text(texts):
dataset_dict = {
"input_ids": [],
"token_type_ids": [],
"attention_mask": []
}
for sentence in texts:
input_ids = []
        for word in sentence.split():
            ids = tokenizer.encode(word, add_special_tokens=False)
            input_ids.extend(ids.ids)
        # Pad and create attention masks.
        # Skip if truncation is needed
        input_ids = input_ids[:max_len - 2]
        input_ids = [101] + input_ids + [102]
        n_tokens = len(input_ids)  # token count of the last sentence, incl. [CLS]/[SEP]
token_type_ids = [0] * len(input_ids)
attention_mask = [1] * len(input_ids)
padding_len = max_len - len(input_ids)
input_ids = input_ids + ([0] * padding_len)
attention_mask = attention_mask + ([0] * padding_len)
token_type_ids = token_type_ids + ([0] * padding_len)
dataset_dict["input_ids"].append(input_ids)
dataset_dict["token_type_ids"].append(token_type_ids)
dataset_dict["attention_mask"].append(attention_mask)
for key in dataset_dict:
dataset_dict[key] = np.array(dataset_dict[key])
x = [
dataset_dict["input_ids"],
dataset_dict["token_type_ids"],
dataset_dict["attention_mask"],
]
return x, n_tokens
test_inputs = ["alex lives in london"]
x_test, n_tokens = create_test_input_from_text(test_inputs)
print('input tokens')
print(x_test[0][0][:n_tokens])
pred_test = model.predict(x_test)
pred_tags = np.argmax(pred_test,2)[0][:n_tokens] # ignore predictions of padding tokens
# create a dictionary mapping encoded tag indexes back to tag names
le_dict = dict(zip(tag_encoder.transform(tag_encoder.classes_), tag_encoder.classes_))
print('predicted tags')
print([le_dict.get(_, '[pad]') for _ in pred_tags])
Output:
input tokens
[ 101 4074 3268 1999 2414 102]
predicted tags
['O', 'B-per', 'O', 'O', 'B-geo', 'O']
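Since a word may be split into several WordPiece tokens, you often want one tag per word rather than per token. Below is a minimal sketch (not from the original post; word_level_tags is a hypothetical helper) that keeps the tag of each word's first sub-token:
# Hypothetical helper: collapse sub-token predictions to one tag per word
# by keeping the tag of each word's first sub-token.
def word_level_tags(sentence, pred_tags):
    tags, i = [], 1  # position 0 is [CLS]
    for word in sentence.split():
        n = len(tokenizer.encode(word, add_special_tokens=False).ids)
        tags.append(le_dict.get(pred_tags[i], '[pad]'))
        i += n
    return tags

print(word_level_tags(test_inputs[0], pred_tags))
# e.g. ['B-per', 'O', 'O', 'B-geo'] for "alex lives in london"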
You can run this code with an NER dataset in the required format in this Kaggle kernel. Please enable TPU for faster training.