Lab 1: TensorFlow Tutorial and Some Useful Functions¶

Welcome to the first lab in this course. Here you will see and try out some basics of TensorFlow and get familiar with some of the useful functions that you will use across the assignments. If you already know TensorFlow well, feel free to skip this notebook.

For demonstration purposes you will use the IMDB reviews dataset, on which you will perform sentiment classification. The full dataset consists of 50,000 movie reviews from the Internet Movie Database (IMDB), but it has been shrunk down to 10,000 reviews (5,000 for training and validation, 5,000 for testing) to save space and ensure faster performance of the notebook.

A part of the code in this notebook is reused from the TensorFlow official tutorial.

Table of Contents¶

  • 1 - Import the Libraries
  • 2 - Load the Data
  • 3 - Prepare the Data
  • 4 - Create a Sequential Model
  • 5 - Create a Model Using Functional API
  • 6 - Train the Model
  • 7 - Predict on Your Own Data

1. Import the Libraries¶

In [1]:
# To silence the TensorFlow warnings, you can use the following code before you import the TensorFlow library.
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import layers
from tensorflow.keras import losses
import re
import string
import matplotlib.pyplot as plt

print("Imports successful!")
Imports successful!

Setting the random seed gives you control over the (pseudo)random numbers. This is a good idea when working with neural networks, because it makes your results reproducible (running the calculation twice will always produce the same "random" numbers). It is especially important not to change the random seeds in your assignments, as they are used to check whether your values are computed correctly.

In [2]:
# Select your favourite number for the random seed
seed = 42

# Sets the global random seed for numpy.
np.random.seed(seed)
# Sets the global random seed for TensorFlow.
tf.random.set_seed(seed)

print(f"Random seed set to {seed}")
Random seed set to 42

2. Load the Data¶

First you set the path to the directory where you store your data.

In [3]:
data_dir = './data/aclImdb'

Below, you will use the function tf.keras.utils.text_dataset_from_directory, which generates a tf.data.Dataset from text files in a directory. TensorFlow makes dataset creation very easy, provided your data follows the folder structure below.

main_directory/
... class_a/
...... a_text_1.txt
...... a_text_2.txt
... class_b/
...... b_text_1.txt
...... b_text_2.txt

Calling text_dataset_from_directory(...) will return a tf.data.Dataset that yields batches of texts from the subdirectories class_a and class_b, together with labels 0 and 1 (0 corresponding to class_a and 1 corresponding to class_b).

Only .txt files are supported at this time, but there are equivalent functions for different kinds of data, for example image_dataset_from_directory for images.

In your case you have two main directories: ./data/aclImdb/train/ and ./data/aclImdb/test/. Within each of these directories the data is separated into two classes: neg and pos. So your actual folder structure looks like this:

./data/aclImdb/train/
... neg/
...... text_1.txt
...... text_2.txt
...... ...
... pos/
...... text_1.txt
...... text_2.txt
...... ...

And the same for the test folder, with many .txt files in each subfolder.

You can explore the folders by clicking File and then Open in the menu above, or by clicking on the Jupyter symbol.
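If you prefer, you can also peek at the folder structure directly from Python. The snippet below is just an optional sketch (it assumes the folder layout described above and is not one of the lab's numbered cells):

import os
# List the class subfolders of the training data, e.g. ['neg', 'pos']
print(os.listdir(f'{data_dir}/train'))
# Count the review files in one of the class folders
print(len(os.listdir(f'{data_dir}/train/pos')))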

In the cell below, you will load the data. Note the different parameters that you can use when loading it. For example, the data originally comes split only into training and test sets, but you can very easily split it further using just a few parameters.

In [4]:
# Here you have two main directories: one for train and one for test data.
# You load files from each to create training and test datasets.

# Create the training set. Use 80% of the data and keep the remaining 20% for the validation.
raw_training_set = tf.keras.utils.text_dataset_from_directory(
    f'{data_dir}/train',
    labels='inferred',
    label_mode='int',
    batch_size=32, 
    validation_split=0.2, 
    subset='training', 
    seed=seed
)

# Create the validation set. Use 20% of the data that was not used for training.
raw_validation_set = tf.keras.utils.text_dataset_from_directory(
    f'{data_dir}/train',
    labels='inferred',
    label_mode='int',
    batch_size=32, 
    validation_split=0.2, 
    subset='validation', 
    seed=seed
)

# Create the test set.
raw_test_set = tf.keras.utils.text_dataset_from_directory(
    f'{data_dir}/test',
    labels='inferred',
    label_mode='int',
    batch_size=32,
)
Found 5000 files belonging to 2 classes.
Using 4000 files for training.
Found 5000 files belonging to 2 classes.
Using 1000 files for validation.
Found 5000 files belonging to 2 classes.

Check that the labels 0 and 1 correctly correspond to the negative and positive examples respectively.

In [5]:
print(f"Label 0 corresponds to {raw_training_set.class_names[0]}")
print(f"Label 1 corresponds to {raw_training_set.class_names[1]}")
Label 0 corresponds to neg
Label 1 corresponds to pos

If you want to look at a small subset of your dataset, you can use the .take() method, passing it the count parameter. The method returns a new dataset with at most count elements, where count is the number of batches. You can read more about tf.data.Dataset and the take method here.

In [6]:
# Take one batch from the dataset and print out the first three datapoints in the batch
for text_batch, label_batch in raw_training_set.take(1):
    for i in range(3):
        print(f"Review:\n {text_batch.numpy()[i]}")
        print(f"Label: {label_batch.numpy()[i]}\n")
Review:
 b'This is a reunion, a team, and a great episode of Justice. From hesitation to resolution, Clark has made a important leap from a troubled teenager who was afraid of a controlled destiny, to a Superman who, like Green Arrow, sets aside his emotions to his few loved ones, ready to save the whole planet. This is not just a thrilling story about teamwork, loyalty, and friendship; this is also about deciding what\'s more important in life, a lesson for Clark. I do not want the series to end, but I hope the ensuing episodes will strictly stick to what Justice shows without any "rewind" pushes and put a good end here of Smallville---and a wonderful beginning of Superman.<br /><br />In this episode, however, we should have seen more contrast between Lex and the Team. Nine stars should give it enough credit.'
Label: 1

Review:
 b'"Hey Babu Riba" is a film about a young woman, Mariana (nicknamed "Esther" after a famous American movie star), and four young men, Glenn, Sacha, Kicha, and Pop, all perhaps 15-17 years old in 1953 Belgrade, Yugoslavia. The five are committed friends and crazy about jazz, blue jeans, or anything American it seems.<br /><br />The very close relationship of the teenagers is poignant, and ultimately a sacrifice is willingly made to try to help one of the group who has fallen on unexpected difficulties. In the wake of changing communist politics, they go their separate ways and reunite in 1985 (the year before the film was made).<br /><br />I enjoyed the film with some reservations. The subtitles for one thing were difficult. Especially in the beginning, there were a number of dialogues which had no subtitles at all. Perhaps the conversational pace required it, but I couldn\'t always both read the text and absorb the scene, which caused me to not always understand which character was involved. I watched the movie (a video from our public library) with a friend, and neither of us really understood part of the story about acquiring streptomycin for a sick relative.<br /><br />This Yugoslavian coming of age film effectively conveyed the teenagers\' sense of invulnerability, idealism, and strong and loyal bonds to each other. There is a main flashforward, and it was intriguing, keeping me guessing until the end as to who these characters were vis-a-vis the 1953 cast, and what had actually happened.<br /><br />I would rate it 7 out of 10, and would like to see other films by the director, Jovan Acin (1941-1991).'
Label: 1

Review:
 b"No message. No symbolism. No dark undercurrents.Just a wonderful melange of music, nostalgia and good fun put to-gether by people who obviously had a great time doing it. It's a refreshing antidote to some of the pretentious garbage being ground out by the studios. Of course ANYTHING with the incomparable Judi Dench is worth watching. And Cleo Laine's brilliant jazz singing is a bonus. This lady is in the same league as the late Ella. This goes on my movie shelf to be pulled out again anytime I feel the need for a warm experience and a hearty good natured chuckle. Just a wonderful film!"
Label: 1

3. Prepare the Data¶

Now that you have seen what the dataset looks like, you need to prepare it in a format that a neural network understands. For this, you will use the tf.keras.layers.TextVectorization layer.

This layer converts text to vectors that can then be fed to a neural network. A very useful feature is that you can pass it another function that performs custom standardization of text. This can include lowercasing the text and removing punctuation, HTML elements, web links or certain tags. This is important, as every dataset requires different standardization depending on its contents. After the standardization, the layer tokenizes the text (splits it into words) and vectorizes it (converts words to numbers) so that it can be fed to the neural network. The output_sequence_length is set to 250, which means that the layer will pad shorter sequences and truncate longer ones, so they all have the same length. This is done so that all the input vectors have the same length and can be nicely stacked into matrices.

In [8]:
# Set the maximum number of words
max_features = 10000

# Define the custom standardization function
def custom_standardization(input_data):
    # Convert all text to lowercase
    lowercase = tf.strings.lower(input_data)
    # Remove HTML tags
    stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
    # Remove punctuation
    replaced = tf.strings.regex_replace(
        stripped_html,
        '[%s]' % re.escape(string.punctuation),
        ''
    )
    return replaced

# Create a layer that you can use to convert text to vectors
vectorize_layer = layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=250)

Next, you call adapt to fit the state of the preprocessing layer to the dataset. This will cause the layer to build a vocabulary (a mapping from strings to integers). If you want to access the vocabulary, you can call .get_vocabulary() on the layer.

In [9]:
# Build the vocabulary
train_text = raw_training_set.map(lambda x, y: x)
vectorize_layer.adapt(train_text)

# Print out the vocabulary size
print(f"Vocabulary size: {len(vectorize_layer.get_vocabulary())}")
Vocabulary size: 10000
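If you are curious what the vocabulary actually contains, you can inspect a few entries directly. This is just an optional check, not one of the lab's numbered cells: by convention the TextVectorization layer reserves index 0 for the padding token and index 1 for out-of-vocabulary words, followed by the most frequent words in the training set.

vocab = vectorize_layer.get_vocabulary()
# The first entries are '' (padding) and '[UNK]' (out-of-vocabulary),
# followed by the most common words in the reviews.
print(vocab[:10])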

Now you can define the final function that you will use to vectorize the text and see what it looks like.

Note that you need to call tf.expand_dims(). This adds another dimension to your data and is very commonly used when preprocessing data, to accommodate the extra batch dimension.

In [10]:
# Define the final function that you will use to vectorize the text.
def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label

# Get one batch and select the first datapoint
text_batch, label_batch = next(iter(raw_training_set))
first_review, first_label = text_batch[0], label_batch[0]

# Show the raw data
print(f"Review:\n{first_review}")
print(f"\nLabel: {raw_training_set.class_names[first_label]}")
# Show the vectorized data
print(f"\nVectorized review\n{vectorize_text(first_review, first_label)}")
Review:
b"Okay, so the plot is on shaky ground. Yeah, all right, so there are some randomly inserted song and/or dance sequences (for example: Adam's concert and Henri's stage act). And Leslie Caron can't really, um, you know... act.<br /><br />But somehow, 'An American In Paris' manages to come through it all as a polished, first-rate musical--largely on the basis of Gene Kelly's incredible dancing talent and choreography, and the truckloads of charm he seems to be importing into each scene with Caron. (He needs to, because she seems to have a... problem with emoting.) <br /><br />The most accomplished and technically awe-inspiring number in this musical is obviously the 16-minute ballet towards the end of the film. It's stunningly filmed, and Kelly and Caron dance beautifully. But my favourite number would have to be Kelly's character singing 'I Got Rhythm' with a bunch of French school-children, then breaking into an array of American dances. It just goes to prove how you don't need special effects when you've got some real *talent*.<br /><br />Not on the 'classics' level with 'Singin' In The Rain', but pretty high up there nonetheless. Worth the watch!"

Label: pos

Vectorized review
(<tf.Tensor: shape=(1, 250), dtype=int64, numpy=
array([[ 947,   38,    2,  112,    7,   20, 6022, 1754, 1438,   31,  201,
          38,   46,   24,   47, 6565, 8919,  603, 2928,  831,  858,   15,
         476, 3241, 3010,    4,    1,  892,  478,    4, 3553, 5885,  175,
          63, 6992,   21,  118,  478,   18,  813,   33,  329,    8, 1466,
        1029,    6,  227,  143,    9,   31,   14,    3, 6590, 9055,    1,
          20,    2, 3025,    5, 1996,    1, 1085,  914,  597,    4, 2733,
           4,    2,    1,    5, 1411,   27,  190,    6,   26,    1,   77,
         244,  130,   16, 5885,   27,  731,    6,   80,   53,  190,    6,
          25,    3,  425,   16,    1,    2,   85, 3622,    4, 2603,    1,
         593,    8,   10,  663,    7,  506,    2,    1, 4342, 1089,    2,
         121,    5,    2,   19,   29, 5994,  886,    4, 1561,    4, 5885,
         831, 1415,   18,   55, 1496,  593,   62,   25,    6,   26,    1,
         105,  965,   11,  186, 4687,   16,    3,  862,    5, 1001,    1,
          96, 2442,   77,   33, 7537,    5,  329, 4825,    9,   41,  264,
           6, 2131,   86,   21,   87,  333,  290,  317,   51,  699,  186,
          47,  144,  597,   23,   20,    2, 2008,  557,   16, 7714,    8,
           2, 2477,   18,  179,  307,   57,   46, 2878,  268,    2,  106,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0]])>, <tf.Tensor: shape=(), dtype=int32, numpy=1>)

Now you can apply the vectorization function to vectorize all three datasets.

In [11]:
train_ds = raw_training_set.map(vectorize_text)
val_ds = raw_validation_set.map(vectorize_text)
test_ds = raw_test_set.map(vectorize_text)

Configure the Dataset¶

There are two important methods that you should use when loading data to make sure that I/O does not become blocking.

.cache() keeps data in memory after it's loaded off disk. This will ensure the dataset does not become a bottleneck while training your model. If your dataset is too large to fit into memory, you can also use this method to create a performant on-disk cache, which is more efficient to read than many small files.

.prefetch() overlaps data preprocessing and model execution while training.

You can learn more about both methods, as well as how to cache data to disk in the data performance guide.

If you are very interested, you can read more about tf.data and AUTOTUNE in this paper, but be aware that this is quite advanced material about how TensorFlow works.

In [12]:
AUTOTUNE = tf.data.AUTOTUNE

# Cache and prefetch all three datasets so that data loading does not become a bottleneck
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

4. Create a Sequential Model¶

A Sequential model is appropriate for a simple stack of layers where each layer has exactly one input tensor and one output tensor (layers follow each other in a sequence and there are no additional connections).

Here you will use a Sequential model using only three layers:

  • An Embedding layer. This layer takes the integer-encoded reviews and looks up an embedding vector for each word-index. These vectors are learned as the model trains. The vectors add a dimension to the output array. The resulting dimensions are: (batch, sequence, embedding).
  • A GlobalAveragePooling1D layer returns a fixed-length output vector for each example by averaging over the sequence dimension. This allows the model to handle input of variable length, in the simplest way possible (see the small sketch after this list).
  • A Dense layer with a single output node.
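To make the pooling step concrete, here is a tiny, self-contained illustration (not part of the lab's cells) of what GlobalAveragePooling1D does: it averages the embedding vectors over the sequence dimension, producing one fixed-length vector per example.

# One example with sequence length 3 and embedding dimension 2
demo = tf.constant([[[1.0, 2.0],
                     [3.0, 4.0],
                     [5.0, 6.0]]])
# Averaging over the sequence dimension gives one vector per example: [[3. 4.]]
print(layers.GlobalAveragePooling1D()(demo).numpy())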
In [13]:
embedding_dim = 16

# Create the model by calling tf.keras.Sequential, where the layers are given in a list.
model_sequential = tf.keras.Sequential([
    layers.Embedding(max_features, embedding_dim),
    layers.GlobalAveragePooling1D(),
    layers.Dense(1, activation='sigmoid')
])

# Print out the summary of the model
model_sequential.summary()
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 embedding (Embedding)       (None, None, 16)          160000    
                                                                 
 global_average_pooling1d (  (None, 16)                0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 1)                 17        
                                                                 
=================================================================
Total params: 160017 (625.07 KB)
Trainable params: 160017 (625.07 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________

Compile the model. Choose the loss function, the optimizer and any additional metrics you want to calculate. Since this is a binary classification problem you can use the losses.BinaryCrossentropy loss function.

In [14]:
model_sequential.compile(loss=losses.BinaryCrossentropy(),
              optimizer='adam',
              metrics=['accuracy'])

5. Create a Model Using Functional API¶

You can use the functional API when you want to create more complex models, but it works just as well for simple models like the one above. The functional API can handle models with non-linear topology, shared layers, and even multiple inputs or outputs.

The biggest difference at first glance is that you need to explicitly state the input. Then you use the layers as functions, passing the output of the previous layer as a parameter into each function. In the end you build the model by passing it the input and the output of the neural network. All of the information about the layers in between (the hidden layers) is already contained in the output (remember how each layer takes the previous layer's output as a parameter).

In [15]:
# Define the inputs
inputs = tf.keras.Input(shape=(None,))

# Define the first layer
embedding = layers.Embedding(max_features, embedding_dim)
# Call the first layer with inputs as the parameter
x = embedding(inputs)

# Define the second layer
pooling = layers.GlobalAveragePooling1D()
# Call the second layer with the output of the previous layer as the parameter
x = pooling(x)

# Define and call in the same line. (The same thing took two lines of code above
# for the other layers. You can use whichever option you prefer.)
outputs = layers.Dense(1, activation='sigmoid')(x)
# The two-line alternative for this layer would be:
# dense = layers.Dense(1, activation='sigmoid')
# outputs = dense(x)


# Create the model
model_functional = tf.keras.Model(inputs=inputs, outputs=outputs)

# Print out the summary of the model
model_functional.summary()
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_1 (Embedding)     (None, None, 16)          160000    
                                                                 
 global_average_pooling1d_1  (None, 16)                0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
=================================================================
Total params: 160017 (625.07 KB)
Trainable params: 160017 (625.07 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________

Compile the model: choose the loss, optimizer and any additional metrics you want to calculate. This is the same as for the sequential model.

In [16]:
model_functional.compile(loss=losses.BinaryCrossentropy(),
              optimizer='adam',
              metrics=['accuracy'])

6. Train the Model¶

Above, you have defined two different models: one using the functional API and one Sequential model. From now on, you will use only one of them. Feel free to change which model you want to use in the next cell. The results should be the same, as both models have the same architecture.

In [17]:
# Select which model you want to use and train. The results should be the same.
model = model_functional # model = model_sequential

Now you will train the model. You will pass it the training and validation datasets, so it can compute the accuracy metric on both during training.

In [18]:
epochs = 25
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs,
    verbose=2
)
Epoch 1/25
125/125 - 2s - loss: 0.6911 - accuracy: 0.5502 - val_loss: 0.6883 - val_accuracy: 0.6290 - 2s/epoch - 18ms/step
Epoch 2/25
125/125 - 1s - loss: 0.6817 - accuracy: 0.6685 - val_loss: 0.6758 - val_accuracy: 0.7100 - 775ms/epoch - 6ms/step
Epoch 3/25
125/125 - 1s - loss: 0.6625 - accuracy: 0.7333 - val_loss: 0.6541 - val_accuracy: 0.7320 - 790ms/epoch - 6ms/step
Epoch 4/25
125/125 - 1s - loss: 0.6342 - accuracy: 0.7657 - val_loss: 0.6262 - val_accuracy: 0.7620 - 730ms/epoch - 6ms/step
Epoch 5/25
125/125 - 1s - loss: 0.6004 - accuracy: 0.7865 - val_loss: 0.5965 - val_accuracy: 0.7880 - 731ms/epoch - 6ms/step
Epoch 6/25
125/125 - 1s - loss: 0.5648 - accuracy: 0.8058 - val_loss: 0.5674 - val_accuracy: 0.7950 - 735ms/epoch - 6ms/step
Epoch 7/25
125/125 - 1s - loss: 0.5293 - accuracy: 0.8290 - val_loss: 0.5402 - val_accuracy: 0.8030 - 750ms/epoch - 6ms/step
Epoch 8/25
125/125 - 1s - loss: 0.4952 - accuracy: 0.8470 - val_loss: 0.5151 - val_accuracy: 0.8050 - 684ms/epoch - 5ms/step
Epoch 9/25
125/125 - 1s - loss: 0.4630 - accuracy: 0.8658 - val_loss: 0.4923 - val_accuracy: 0.8170 - 743ms/epoch - 6ms/step
Epoch 10/25
125/125 - 1s - loss: 0.4329 - accuracy: 0.8808 - val_loss: 0.4719 - val_accuracy: 0.8250 - 787ms/epoch - 6ms/step
Epoch 11/25
125/125 - 1s - loss: 0.4053 - accuracy: 0.8907 - val_loss: 0.4538 - val_accuracy: 0.8310 - 709ms/epoch - 6ms/step
Epoch 12/25
125/125 - 1s - loss: 0.3800 - accuracy: 0.9005 - val_loss: 0.4377 - val_accuracy: 0.8410 - 698ms/epoch - 6ms/step
Epoch 13/25
125/125 - 1s - loss: 0.3570 - accuracy: 0.9047 - val_loss: 0.4236 - val_accuracy: 0.8440 - 684ms/epoch - 5ms/step
Epoch 14/25
125/125 - 1s - loss: 0.3359 - accuracy: 0.9128 - val_loss: 0.4113 - val_accuracy: 0.8460 - 690ms/epoch - 6ms/step
Epoch 15/25
125/125 - 1s - loss: 0.3167 - accuracy: 0.9202 - val_loss: 0.4004 - val_accuracy: 0.8520 - 687ms/epoch - 5ms/step
Epoch 16/25
125/125 - 1s - loss: 0.2991 - accuracy: 0.9275 - val_loss: 0.3909 - val_accuracy: 0.8530 - 702ms/epoch - 6ms/step
Epoch 17/25
125/125 - 1s - loss: 0.2829 - accuracy: 0.9312 - val_loss: 0.3825 - val_accuracy: 0.8520 - 777ms/epoch - 6ms/step
Epoch 18/25
125/125 - 1s - loss: 0.2679 - accuracy: 0.9358 - val_loss: 0.3751 - val_accuracy: 0.8550 - 731ms/epoch - 6ms/step
Epoch 19/25
125/125 - 1s - loss: 0.2539 - accuracy: 0.9398 - val_loss: 0.3686 - val_accuracy: 0.8570 - 726ms/epoch - 6ms/step
Epoch 20/25
125/125 - 1s - loss: 0.2409 - accuracy: 0.9460 - val_loss: 0.3628 - val_accuracy: 0.8600 - 719ms/epoch - 6ms/step
Epoch 21/25
125/125 - 1s - loss: 0.2287 - accuracy: 0.9495 - val_loss: 0.3577 - val_accuracy: 0.8630 - 703ms/epoch - 6ms/step
Epoch 22/25
125/125 - 1s - loss: 0.2172 - accuracy: 0.9523 - val_loss: 0.3532 - val_accuracy: 0.8630 - 692ms/epoch - 6ms/step
Epoch 23/25
125/125 - 1s - loss: 0.2064 - accuracy: 0.9565 - val_loss: 0.3493 - val_accuracy: 0.8610 - 695ms/epoch - 6ms/step
Epoch 24/25
125/125 - 1s - loss: 0.1962 - accuracy: 0.9607 - val_loss: 0.3458 - val_accuracy: 0.8610 - 715ms/epoch - 6ms/step
Epoch 25/25
125/125 - 1s - loss: 0.1865 - accuracy: 0.9638 - val_loss: 0.3428 - val_accuracy: 0.8600 - 779ms/epoch - 6ms/step

Now you can use model.evaluate() to evaluate the model on the test dataset.

In [19]:
loss, accuracy = model.evaluate(test_ds)

print(f"Loss: {loss}")
print(f"Accuracy: {accuracy}")
157/157 [==============================] - 1s 6ms/step - loss: 0.3645 - accuracy: 0.8460
Loss: 0.36446624994277954
Accuracy: 0.8460000157356262

When you trained the model, you saved its history in the history variable. Through it you can access a dictionary that records everything that happened during training. In your case it stores the loss and the accuracy on both the training and validation sets. You can plot them to gain some insight into how the training progressed.
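For example, you can quickly list which metrics were recorded (an optional check, not one of the numbered cells):

# One list of values per metric, with one entry per epoch
print(history.history.keys())        # typically dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])
print(len(history.history['loss']))  # 25, one value per epoch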

In [20]:
def plot_metrics(history, metric):
    plt.plot(history.history[metric])
    plt.plot(history.history[f'val_{metric}'])
    plt.xlabel("Epochs")
    plt.ylabel(metric.title())
    plt.legend([metric, f'val_{metric}'])
    plt.show()
    
plot_metrics(history, "accuracy")
plot_metrics(history, "loss")
[Plot of training and validation accuracy per epoch]
[Plot of training and validation loss per epoch]

You can see that after 25 epochs the training accuracy still goes up, but the validation accuracy already plateaus at around 86%. On the other hand, both the training and validation losses are still decreasing, which means that while the model does not learn to predict more cases correctly, it does become more confident in its predictions. Here you used about the simplest model possible: an embedding layer, an averaging layer, and a dense layer with one output. You can try different architectures to see if the model improves. Perhaps you can add some dropout layers to reduce the chance of overfitting, or try a completely different architecture, such as convolutional or recurrent layers. You will learn a lot more about recurrent neural networks in the later weeks of this course.
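As a starting point for experimenting, a slightly richer variant might look like the sketch below. This is only a suggestion, not part of the lab: the dropout rate and the size of the extra hidden layer are arbitrary choices you would want to tune.

# A hypothetical variant of the model with dropout and an extra hidden layer
model_variant = tf.keras.Sequential([
    layers.Embedding(max_features, embedding_dim),
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.3),                     # randomly zero out 30% of the values during training
    layers.Dense(16, activation='relu'),     # a small extra hidden layer
    layers.Dense(1, activation='sigmoid')
])
model_variant.compile(loss=losses.BinaryCrossentropy(),
                      optimizer='adam',
                      metrics=['accuracy'])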

7. Predict on Your Own Data¶

Finally, you can use the model to predict on any new data you may have. You can use it to predict the sentiment of examples in the test dataset (which the model has not seen in the training process) or use some completely new examples.

Here you will expand your model to be able to predict on raw strings (rather than on vectorized examples). Previously, you applied the TextVectorization layer to the dataset before feeding it to the model. To simplify deploying the model, you can include the TextVectorization layer inside your model and then predict on raw strings. To do so, you can create a new sequential model where you merge the vectorization layer with your trained model using the weights you just trained.

In [21]:
# Make a new sequential model using the vectorization layer and the model you just trained.
export_model = tf.keras.Sequential([
  vectorize_layer,
  model
])

# Compile the model
export_model.compile(
    loss=losses.BinaryCrossentropy(from_logits=False), optimizer="adam", metrics=['accuracy']
)

Now you can use this model to predict on some of your own examples. You can do it simply by calling export_model.predict().

In [22]:
examples = ['this movie was very, very good', 'quite ok', 'the movie was not bad', 'bad', 'negative disappointed bad scary', 'this movie was stupid']

results = export_model.predict(examples, verbose=False)

for result, example in zip(results, examples):
    print(f'Result: {result[0]:.3f},   Label: {int(np.round(result[0]))},   Review: {example}')
Result: 0.625,   Label: 1,   Review: this movie was very, very good
Result: 0.541,   Label: 1,   Review: quite ok
Result: 0.425,   Label: 0,   Review: the movie was not bad
Result: 0.472,   Label: 0,   Review: bad
Result: 0.428,   Label: 0,   Review: negative disappointed bad scary
Result: 0.455,   Label: 0,   Review: this movie was stupid

Congratulations on finishing this lab. Do not worry if you did not understand everything; the videos and course material will cover these concepts in more depth. If you have a general understanding of the code in this lab, you are well prepared to start working on this week's programming assignment. There you will implement some of the things shown in this lab from scratch and then create and fit a model similar to the one in this notebook.

In [ ]: