Text Classification in TensorFlow

In this tutorial, we build a text classification model from scratch on raw text, using the IMDB sentiment classification dataset.

import os
os.environ["KERAS_BACKEND"] = "tensorflow"

import keras
import tensorflow as tf
import numpy as np
from keras import layers

Load the dataset. The extracted dataset folder contains train and test subfolders.

!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

!dir aclImdb

## Check contents of the test directory
!dir aclImdb\test

## Check contents of the train directory
!dir aclImdb\train

# The train and test folders each contain pos and neg subfolders for the positive and negative reviews. Let's look at an example positive review from the train folder.

!type aclImdb\train\pos\6248_7.txt
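
Note: besides pos and neg, the train directory also ships with an unsup folder of unlabeled reviews. Delete it before building the datasets, otherwise text_dataset_from_directory would pick it up as a third class. (The command below assumes a Unix-like shell; on Windows, rmdir /s /q aclImdb\train\unsup does the same.)

!rm -r aclImdb/train/unsup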

Now we can use keras.utils.text_dataset_from_directory to generate a labeled tf.data.Dataset object from a set of text files on disk, filed into class-specific folders. We use it to generate the training and validation datasets from the train directory with an 80:20 split, and the test dataset from the test directory.

import keras

batch_size = 32
raw_train_ds = keras.utils.text_dataset_from_directory(
    "aclImdb/train",
    batch_size=batch_size,
    validation_split=0.2,
    subset="training",
    seed=1000
)

raw_val_ds = keras.utils.text_dataset_from_directory(
    "aclImdb/train",
    batch_size=batch_size,
    validation_split=0.2,
    subset="validation",
    seed=1000,
)

raw_test_ds = keras.utils.text_dataset_from_directory(
    "aclImdb/test",
    batch_size=batch_size
)

print(f"Number of batches in train dataset: {raw_train_ds.cardinality()}")
print(f"Number of batches in validation dataset: {raw_val_ds.cardinality()}")
print(f"Number of batches in test dataset: {raw_test_ds.cardinality()}")

Let's preview a few samples of the data to make sure normalization and tokenization will work as expected. Since eager execution is enabled, we can inspect the tensors directly with .numpy().

for text_batch, label_batch in raw_train_ds.take(1):
    for i in range(2):
        print(f"Text: {text_batch.numpy()[i]}")
        print(f"Label: {label_batch.numpy()[i]}")

Let's prepare the data

We create a custom standardization function to handle the HTML break tag <br />, since it is not removed by the default standardizer.

import string
import re
import tensorflow as tf
from keras import layers

def custom_standardization(input_data):
    """
    - change to lower case
    - strip all html tags
    - remove punctuations
    """
    lower_case = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lower_case, "<br />", " ")
    return tf.strings.regex_replace(
        stripped_html, f"[{re.escape(string.punctuation)}]", ""
    )
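
As a quick, illustrative check, we can run the standardizer on a made-up review fragment and confirm that the break tag and punctuation are stripped:

print(custom_standardization(tf.constant("Great movie!<br />Loved it.")))
# expected: tf.Tensor(b'great movie loved it', shape=(), dtype=string)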

# Define model constants
max_features = 20000
sequence_length = 500  # set an explicit maximum sequence length so every vectorized review has the same shape

# Create a text vectorization layer to normalize, split, and map strings to integers.
# For this, we set 'output_mode' to 'int'.
vectorizer_layer = keras.layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length,
)

# First we extract the text-only dataset and call 'adapt' to create the vocabulary.
text_ds = raw_train_ds.map(lambda x, y: x)
# Call 'adapt' on text-only dataset
vectorizer_layer.adapt(text_ds)

from keras import layers

# Option 1: make the vectorization layer part of the model itself, so the model
# accepts raw strings. We use this approach later for the end-to-end export model.
embedding_dim = 16
text_input = keras.Input(shape=(1,), dtype=tf.string, name='text')
x = vectorizer_layer(text_input)
x = layers.Embedding(max_features + 1, embedding_dim)(x)

# Option 2: apply the vectorization layer to the dataset with 'map', so the model
# receives already-vectorized integer sequences. This is the approach we use for training.
def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorizer_layer(text), label

# Retrieve a batch of 32 reviews and labels from the training dataset
text_batch, label_batch = next(iter(raw_train_ds))
first_review, first_label = text_batch[0], label_batch[0]
print("Review", first_review)
print("Label", raw_train_ds.class_names[first_label])
print("Vectorized review", vectorize_text(first_review, first_label))

We can look up the token (string) that each integer corresponds to by calling .get_vocabulary() on the layer

print("1627 ---> ", vectorizer_layer.get_vocabulary()[1627])
print("313 ---> ", vectorizer_layer.get_vocabulary()[313])
print("Vocabulary ---> {}".format(len(vectorizer_layer.get_vocabulary())))

Apply to dataset

train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)
test_ds = raw_test_ds.map(vectorize_text)

# Do async prefetching / buffering of the data for best GPU performance
"""
.cache() keeps data in memory after it's loaded off disk. 
This will ensure the dataset does not become a bottleneck while training your model. 
If your dataset is too large to fit into memory, you can also use this method to create a performant on-disk cache, 
which is more efficient to read than many small files.

.prefetch() overlaps data preprocessing and model execution while training.
"""
train_ds = train_ds.cache().prefetch(buffer_size=10)
val_ds = val_ds.cache().prefetch(buffer_size=10)
test_ds = test_ds.cache().prefetch(buffer_size=10)
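
Instead of a fixed buffer size, you can also let tf.data pick the prefetch buffer automatically; the three lines above can be written with tf.data.AUTOTUNE instead:

# Equivalent alternative: use instead of (not in addition to) the fixed buffer_size above
AUTOTUNE = tf.data.AUTOTUNE
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)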

Build and Train model

from keras import losses, layers

embedding_dim = 16

# The Embedding layer takes the integer-encoded reviews and looks up an embedding
# vector for each word index; these vectors are learned during training and add a
# dimension to the output: (batch, sequence, embedding).
# GlobalAveragePooling1D then averages over the sequence dimension to return a
# fixed-length vector per example, the simplest way to handle variable-length input.
model_2 = keras.Sequential([
    layers.Embedding(max_features, embedding_dim),
    layers.Dropout(0.2),
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.2),
    layers.Dense(1, activation='sigmoid')
])

model_2.summary()
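
As an illustrative sanity check, we can push one vectorized batch through the untrained model and confirm the shape flow: (batch, 500) integer sequences are embedded to (batch, 500, 16), pooled down to (batch, 16), and mapped to a single probability per review.

# Shape check on one training batch (illustrative)
for x_batch, _ in train_ds.take(1):
    print("Input batch shape: ", x_batch.shape)           # (32, 500)
    print("Output batch shape:", model_2(x_batch).shape)  # (32, 1)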

model_2.compile(loss=losses.BinaryCrossentropy(),
                optimizer='adam',
                metrics=[tf.metrics.BinaryAccuracy(threshold=0.5)])

epochs = 10
history = model_2.fit(train_ds,
                      validation_data=val_ds,
                      epochs=epochs)

Evaluate the model

loss, accuracy = model_2.evaluate(test_ds)
print("Loss: ", loss)
print("Accuracy: ", accuracy)

To visualize the training history

history_dict = history.history
acc = history_dict["binary_accuracy"]
val_acc = history_dict["val_binary_accuracy"]
loss = history_dict["loss"]
val_loss = history_dict["val_loss"]

epochs = range(1, len(acc)+1)

import matplotlib.pyplot as plt

# "bo" is for "blue dot"
plt.plot(epochs, loss, 'bo', label='Training loss')
# b is for "solid blue line"
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')

plt.show()

Make an end-to-end model to export for inference on raw strings

# model_2 already ends with a sigmoid, so it outputs probabilities directly;
# we only need to put the vectorization layer in front of it.
export_model = keras.Sequential([
    vectorizer_layer,
    model_2,
])

export_model.compile(loss=losses.BinaryCrossentropy(from_logits=False),
                     optimizer="adam",
                     metrics=["accuracy"])

loss, accuracy = export_model.evaluate(raw_test_ds)
print("Accuracy on raw test data: ", accuracy)

Inference on new examples

examples = tf.constant([
    "The movie was great",
    "The movie was okay.",
    "The movie was awful."
])
export_model.predict(examples)
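
The model outputs the probability of the positive class for each example; a simple 0.5 threshold (illustrative, not part of the exported model) turns the scores into sentiment labels:

probs = export_model.predict(examples)
for text, p in zip(examples.numpy(), probs[:, 0]):
    print(f"{text.decode():25} -> {'positive' if p > 0.5 else 'negative'} ({p:.2f})")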