Entity Recognition with BERT
Introduction
This post uses BERT (from HuggingFace) and tf.keras to train an NER model. The data is expected in the following format:
    Sentence #   Word           Tag
0   Sentence: 1  Thousands      O
1   NaN          of             O
2   NaN          demonstrators  O
3   NaN          have           O
4   NaN          marched        O
5   NaN          through        O
6   NaN          London         B-geo
7   NaN          to             O
8   NaN          protest        O
...
24  Sentence: 2  Families       O
25  NaN          of             O
26  NaN          soldiers       O
...
Setup
import os
import re
import json
import string
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn import preprocessing
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel, BertConfig
max_len = 384
configuration = BertConfig()
data_csv = "ner_dataset.csv"
Setup Tokenizers
# Save the slow pretrained tokenizer
slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
save_path = "bert_base_uncased/"
if not os.path.exists(save_path):
os.makedirs(save_path)
slow_tokenizer.save_pretrained(save_path)
# Load the fast tokenizer from saved file
tokenizer = BertWordPieceTokenizer("bert_base_uncased/vocab.txt", lowercase=True)
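Before moving on, it can help to see what the fast tokenizer produces. A quick illustrative check (not part of the original pipeline) shows the WordPiece split of a word, and the special-token ids 101 ([CLS]) and 102 ([SEP]) that are hard-coded later:
enc = tokenizer.encode("differentiator", add_special_tokens=False)
print(enc.tokens)  # sub-word pieces; the exact split depends on the vocabulary
print(enc.ids)     # the corresponding vocabulary ids
print(tokenizer.encode("differentiator").ids)  # wrapped in [CLS] (101) and [SEP] (102)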
Define model
Model
Add a fully connected layer that takes the token embeddings from BERT as input and
predicts the probability of each token belonging to each of the possible tags.
The fully connected layer in the code below is a keras.layers.Dense layer
with num_tags + 1 units to accommodate a padding label.
Masked Loss
Each batch consists of variable-length token sequences, padded to max_len in
both the input and the target.
During loss calculation, we mask out the loss corresponding to the padding
tokens in the target.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=False, reduction=tf.keras.losses.Reduction.NONE
)

def masked_ce_loss(real, pred):
    # 17 is the padding label (num_tags); see create_inputs_targets below
    mask = tf.math.logical_not(tf.math.equal(real, 17))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask  # zero out the loss at padding positions
    return tf.reduce_mean(loss_)
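# A quick check (illustrative, not from the original post) that padding
# positions, labelled 17, contribute nothing to the masked loss:
real = tf.constant([[3, 5, 17, 17]])                       # two real tokens, two padding
pred = tf.random.uniform((1, 4, 18))                       # 18 = num_tags + 1 classes
pred = pred / tf.reduce_sum(pred, axis=-1, keepdims=True)  # rows sum to 1, like a softmax
print(masked_ce_loss(real, pred).numpy())                  # only positions 0 and 1 contribute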
def create_model(num_tags):
## BERT encoder
encoder = TFBertModel.from_pretrained("bert-base-uncased")
## NER Model
input_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
token_type_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
attention_mask = layers.Input(shape=(max_len,), dtype=tf.int32)
embedding = encoder(
input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
)[0]
embedding = layers.Dropout(0.3)(embedding)
    # num_tags + 1 output units: one per tag plus one for the padding label
    tag_logits = layers.Dense(num_tags + 1, activation='softmax')(embedding)
model = keras.Model(
inputs=[input_ids, token_type_ids, attention_mask],
outputs=[tag_logits],
)
    optimizer = keras.optimizers.Adam(learning_rate=3e-5)
model.compile(optimizer=optimizer, loss=masked_ce_loss, metrics=['accuracy'])
return model
Preprocess dataset
First, we convert the rows of our data file into sentences and lists of
tags. sklearn.preprocessing.LabelEncoder
encodes each tag as an integer.
Then, we tokenize each sentence using the BERT tokenizer from HuggingFace.
After tokenization, each sentence is represented by a set of input_ids,
attention_masks and token_type_ids. Note that a single word may be tokenized into
multiple tokens; in that case, each token gets the label of the original word,
e.g.
this is a differentiator -> ['this', 'is', 'a', 'different', '##ia', '##tor']
tag1 tag2 tag3 tag4      -> [tag1, tag2, tag3, tag4, tag4, tag4]
def process_csv(data_path):
    df = pd.read_csv(data_path, encoding="latin-1")
    # Forward-fill the sentence id so every word row knows its sentence
    df.loc[:, "Sentence #"] = df["Sentence #"].ffill()
    # Encode each tag string as an integer id
    enc_tag = preprocessing.LabelEncoder()
    df.loc[:, "Tag"] = enc_tag.fit_transform(df["Tag"])
    sentences = df.groupby("Sentence #")["Word"].apply(list).values
    tag = df.groupby("Sentence #")["Tag"].apply(list).values
    return sentences, tag, enc_tag
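# Sanity check (illustrative, not part of the original pipeline): confirm the
# label ids hard-coded below. With the 17-tag Kaggle "Entity Annotated Corpus"
# this post appears to use, 'O' encodes to 16, leaving 17 (num_tags) for padding.
_sentences, _tags, _enc_tag = process_csv(data_csv)
print(len(_enc_tag.classes_))        # 17 tags in this dataset
print(_enc_tag.transform(["O"])[0])  # 16 -- the id given to [CLS]/[SEP] below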
def create_inputs_targets(data_csv):
dataset_dict = {
"input_ids": [],
"token_type_ids": [],
"attention_mask": [],
"tags": []
}
sentences, tags, tag_encoder = process_csv(data_csv)
for sentence, tag in zip(sentences, tags):
input_ids = []
target_tags = []
for idx, word in enumerate(sentence):
            ids = tokenizer.encode(word, add_special_tokens=False)
            input_ids.extend(ids.ids)
            num_tokens = len(ids.ids)
            # each sub-token inherits the tag of its original word
            target_tags.extend([tag[idx]] * num_tokens)
        # Truncate to max_len - 2 to leave room for the special tokens, then
        # wrap with [CLS] (id 101) and [SEP] (id 102), both tagged 'O' (16 for this dataset)
        input_ids = input_ids[:max_len - 2]
        target_tags = target_tags[:max_len - 2]
        input_ids = [101] + input_ids + [102]
        target_tags = [16] + target_tags + [16]
token_type_ids = [0] * len(input_ids)
attention_mask = [1] * len(input_ids)
padding_len = max_len - len(input_ids)
input_ids = input_ids + ([0] * padding_len)
attention_mask = attention_mask + ([0] * padding_len)
token_type_ids = token_type_ids + ([0] * padding_len)
        target_tags = target_tags + ([17] * padding_len)  # 17 = num_tags, the padding label
dataset_dict["input_ids"].append(input_ids)
dataset_dict["token_type_ids"].append(token_type_ids)
dataset_dict["attention_mask"].append(attention_mask)
dataset_dict["tags"].append(target_tags)
assert len(target_tags) == max_len, f'{len(input_ids)}, {len(target_tags)}'
for key in dataset_dict:
dataset_dict[key] = np.array(dataset_dict[key])
x = [
dataset_dict["input_ids"],
dataset_dict["token_type_ids"],
dataset_dict["attention_mask"],
]
y = dataset_dict["tags"]
return x, y, tag_encoder
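As a quick sanity check (illustrative, not part of the original post), each of the three input arrays and the target array returned above should have shape (num_sentences, max_len):
x, y, _ = create_inputs_targets(data_csv)
print([a.shape for a in x], y.shape)  # e.g. [(N, 384), (N, 384), (N, 384)] (N, 384)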
Create model
Use TPU if possible.
num_tags = pd.read_csv(data_csv, encoding="latin-1")["Tag"].nunique()
use_tpu = None
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    use_tpu = True
except ValueError:  # no TPU available
    use_tpu = False
if use_tpu:
# Create distribution strategy
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
strategy = tf.distribute.experimental.TPUStrategy(tpu)
# Create model
with strategy.scope():
model = create_model(num_tags)
else:
model = create_model(num_tags)
model.summary()
Output:
Model: "model"
__________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to
==================================================================================================
input_1 (InputLayer) [(None, 384)] 0
__________________________________________________________________________________________________
input_3 (InputLayer) [(None, 384)] 0
__________________________________________________________________________________________________
input_2 (InputLayer) [(None, 384)] 0
__________________________________________________________________________________________________
tf_bert_model (TFBertModel) ((None, 384, 768), ( 109482240 input_1[0][0]
__________________________________________________________________________________________________
dropout_37 (Dropout) (None, 384, 768) 0 tf_bert_model[0][0]
__________________________________________________________________________________________________
dense (Dense) (None, 384, 18) 13842 dropout_37[0][0]
==================================================================================================
Total params: 109,496,082
Trainable params: 109,496,082
Non-trainable params: 0
__________________________________________________________________________________________________
Train
x_train, y_train, tag_encoder = create_inputs_targets(data_csv)
bs = 64 if use_tpu else 16
model.fit(
x_train,
y_train,
epochs=1,
verbose=1,
batch_size=bs,
validation_split=0.1
)
Output:
675/675 [==============================] - 151s 223ms/step - accuracy: 0.9909 - loss: nan - val_accuracy: 0.9969 - val_loss: 2.3592
Inference
Check the predicted tags for a sample sentence.
def create_test_input_from_text(texts):
dataset_dict = {
"input_ids": [],
"token_type_ids": [],
"attention_mask": []
}
for sentence in texts:
input_ids = []
        for word in sentence.split():
            ids = tokenizer.encode(word, add_special_tokens=False)
            input_ids.extend(ids.ids)
        # Pad and create attention masks.
        # Skip if truncation is needed
        input_ids = input_ids[:max_len - 2]
        input_ids = [101] + input_ids + [102]
        n_tokens = len(input_ids)  # token count of the last sentence, incl. [CLS]/[SEP]
token_type_ids = [0] * len(input_ids)
attention_mask = [1] * len(input_ids)
padding_len = max_len - len(input_ids)
input_ids = input_ids + ([0] * padding_len)
attention_mask = attention_mask + ([0] * padding_len)
token_type_ids = token_type_ids + ([0] * padding_len)
dataset_dict["input_ids"].append(input_ids)
dataset_dict["token_type_ids"].append(token_type_ids)
dataset_dict["attention_mask"].append(attention_mask)
for key in dataset_dict:
dataset_dict[key] = np.array(dataset_dict[key])
x = [
dataset_dict["input_ids"],
dataset_dict["token_type_ids"],
dataset_dict["attention_mask"],
]
return x, n_tokens
test_inputs = ["alex lives in london"]
x_test, n_tokens = create_test_input_from_text(test_inputs)
print('input tokens')
print(x_test[0][0][:n_tokens])
pred_test = model.predict(x_test)
pred_tags = np.argmax(pred_test,2)[0][:n_tokens] # ignore predictions of padding tokens
# create a dictionary mapping encoded tag indexes back to tag names
le_dict = dict(zip(tag_encoder.transform(tag_encoder.classes_), tag_encoder.classes_))
print('predicted tags')
print([le_dict.get(_, '[pad]') for _ in pred_tags])
Output:
input tokens
[ 101 4074 3268 1999 2414 102]
predicted tags
['O', 'B-per', 'O', 'O', 'B-geo', 'O']
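Since a word may be split into several WordPiece tokens, you often want one tag per word rather than per token. Below is a minimal sketch (not from the original post; word_level_tags is a hypothetical helper) that keeps the tag of each word's first sub-token:
# Hypothetical helper: collapse sub-token predictions to one tag per word
# by keeping the tag of each word's first sub-token.
def word_level_tags(sentence, pred_tags):
    tags, i = [], 1  # position 0 is [CLS]
    for word in sentence.split():
        n = len(tokenizer.encode(word, add_special_tokens=False).ids)
        tags.append(le_dict.get(pred_tags[i], '[pad]'))
        i += n
    return tags

print(word_level_tags(test_inputs[0], pred_tags))
# e.g. ['B-per', 'O', 'O', 'B-geo'] for "alex lives in london"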
You can run this code with an NER dataset in the required format in this Kaggle kernel. Please enable TPU for faster training.