import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import os
import numpy as np
import pandas as pd
import random as rnd
import tensorflow as tf

# Set random seeds
rnd.seed(34)

import w3_unittest

data = pd.read_csv("questions.csv")
N = len(data)
print('Number of question pairs: ', N)
data.head()

Number of question pairs:  404351

N_train = 300000
N_test = 10240
data_train = data[:N_train]
data_test = data[N_train:N_train + N_test]
print("Train set:", len(data_train), "Test set:", len(data_test))
del (data)  # remove to free memory

Train set: 300000 Test set: 10240

td_index = data_train['is_duplicate'] == 1
td_index = [i for i, x in enumerate(td_index) if x]
print('Number of duplicate questions: ', len(td_index))
print('Indexes of first ten duplicate questions:', td_index[:10])

Number of duplicate questions:  111486
Indexes of first ten duplicate questions: [5, 7, 11, 12, 13, 15, 16, 18, 20, 29]

print(data_train['question1'][5])
print(data_train['question2'][5])
print('is_duplicate: ', data_train['is_duplicate'][5])

Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?
I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?
is_duplicate:  1

Q1_train = np.array(data_train['question1'][td_index])
Q2_train = np.array(data_train['question2'][td_index])

Q1_test = np.array(data_test['question1'])
Q2_test = np.array(data_test['question2'])
y_test  = np.array(data_test['is_duplicate'])

print('TRAINING QUESTIONS:\n')
print('Question 1: ', Q1_train[0])
print('Question 2: ', Q2_train[0], '\n')
print('Question 1: ', Q1_train[5])
print('Question 2: ', Q2_train[5], '\n')

print('TESTING QUESTIONS:\n')
print('Question 1: ', Q1_test[0])
print('Question 2: ', Q2_test[0], '\n')
print('is_duplicate =', y_test[0], '\n')

TRAINING QUESTIONS:

Question 1:  Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?
Question 2:  I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me? 

Question 1:  What would a Trump presidency mean for current international master’s students on an F1 visa?
Question 2:  How will a Trump presidency affect the students presently in US or planning to study in US? 

TESTING QUESTIONS:

Question 1:  How do I prepare for interviews for cse?
Question 2:  What is the best way to prepare for cse? 

is_duplicate = 0

# Splitting the data
cut_off = int(len(Q1_train) * 0.8)
train_Q1, train_Q2 = Q1_train[:cut_off], Q2_train[:cut_off]
val_Q1, val_Q2 = Q1_train[cut_off:], Q2_train[cut_off:]
print('Number of duplicate questions: ', len(Q1_train))
print("The length of the training set is:  ", len(train_Q1))
print("The length of the validation set is: ", len(val_Q1))

Number of duplicate questions:  111486
The length of the training set is:   89188
The length of the validation set is:  22298

tf.random.set_seed(0)
text_vectorization = tf.keras.layers.TextVectorization(output_mode='int',split='whitespace', standardize='strip_punctuation')
text_vectorization.adapt(np.concatenate((Q1_train,Q2_train)))

print(f'Vocabulary size: {text_vectorization.vocabulary_size()}')

Vocabulary size: 36224

print('first question in the train set:\n')
print(Q1_train[0], '\n') 
print('encoded version:')
print(text_vectorization(Q1_train[0]),'\n')

print('first question in the test set:\n')
print(Q1_test[0], '\n')
print('encoded version:')
print(text_vectorization(Q1_test[0]) )

first question in the train set:

Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me? 

encoded version:
tf.Tensor(
[ 6984     6   178    10  8988  2442 35393   761    13  6636 28205    31
    28   483    45    98], shape=(16,), dtype=int64) 

first question in the test set:

How do I prepare for interviews for cse? 

encoded version:
tf.Tensor([    4     8     6   160    17  2079    17 11775], shape=(8,), dtype=int64)

# GRADED FUNCTION: Siamese

def Siamese(text_vectorizer, vocab_size=36224, d_feature=128):
    """Returns a Siamese model.

    Args:
        text_vectorizer (TextVectorization): TextVectorization instance, already adapted to your training data.
        vocab_size (int, optional): Length of the vocabulary. Defaults to 56400.
        d_model (int, optional): Depth of the model. Defaults to 128.
        
    Returns:
        tf.model.Model: A Siamese model. 
    
    """
    ### START CODE HERE ###

    branch = tf.keras.models.Sequential(name='sequential') 
    # Add the text_vectorizer layer. This is the text_vectorizer you instantiated and trained before 
    branch.add(text_vectorizer)
    # Add the Embedding layer. Remember to call it 'embedding' using the parameter `name`
    branch.add(tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=d_feature, name='embedding'))
    # Add the LSTM layer, recall from W2 that you want to the LSTM layer to return sequences, not just one value. 
    # Remember to call it 'LSTM' using the parameter `name`
    branch.add(tf.keras.layers.LSTM(units=d_feature, return_sequences=True, name='LSTM'))
    # Add the GlobalAveragePooling1D layer. Remember to call it 'mean' using the parameter `name`
    branch.add(tf.keras.layers.GlobalAveragePooling1D(name='mean'))
    # Add the normalizing layer using the Lambda function. Remember to call it 'out' using the parameter `name`
    branch.add(tf.keras.layers.Lambda(lambda x: tf.math.l2_normalize(x, axis=1), name='out'))
    
    # Define both inputs. Remember to call them 'input_1' and 'input_2' using the `name` parameter. 
    # Be mindful of the data type and size
    input1 = tf.keras.layers.Input(shape=(1), dtype=tf.string, name='input_1')
    input2 = tf.keras.layers.Input(shape=(1), dtype=tf.string, name='input_2')
    # Define the output of each branch of your Siamese network. Remember that both branches have the same coefficients, 
    # but they each receive different inputs.
    branch1 = branch(input1)
    branch2 = branch(input2)
    # Define the Concatenate layer. You should concatenate columns, you can fix this using the `axis` parameter. 
    # This layer is applied over the outputs of each branch of the Siamese network
    conc = tf.keras.layers.Concatenate(axis=1, name='conc_1_2')([branch1, branch2])
    
    ### END CODE HERE ###
    
    return tf.keras.models.Model(inputs=[input1, input2], outputs=conc, name="SiameseModel")

# check your model
model = Siamese(text_vectorization, vocab_size=text_vectorization.vocabulary_size())
model.build(input_shape=None)
model.summary()
model.get_layer(name='sequential').summary()

Model: "SiameseModel"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
==================================================================================================
 input_1 (InputLayer)        [(None, 1)]                  0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 1)]                  0         []                            
                                                                                                  
 sequential (Sequential)     (None, 128)                  4768256   ['input_1[0][0]',             
                                                                     'input_2[0][0]']             
                                                                                                  
 conc_1_2 (Concatenate)      (None, 256)                  0         ['sequential[0][0]',          
                                                                     'sequential[1][0]']          
                                                                                                  
==================================================================================================
Total params: 4768256 (18.19 MB)
Trainable params: 4768256 (18.19 MB)
Non-trainable params: 0 (0.00 Byte)
__________________________________________________________________________________________________
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 text_vectorization (TextVe  (None, None)              0         
 ctorization)                                                    
                                                                 
 embedding (Embedding)       (None, None, 128)         4636672   
                                                                 
 LSTM (LSTM)                 (None, None, 128)         131584    
                                                                 
 mean (GlobalAveragePooling  (None, 128)               0         
 1D)                                                             
                                                                 
 out (Lambda)                (None, 128)               0         
                                                                 
=================================================================
Total params: 4768256 (18.19 MB)
Trainable params: 4768256 (18.19 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________

tf.keras.utils.plot_model(
    model,
    to_file="model.png",
    show_shapes=True,
    show_dtype=True,
    show_layer_names=True,
    rankdir="TB",
    expand_nested=True)

# Test your function!
w3_unittest.test_Siamese(Siamese)

All tests passed!

# GRADED FUNCTION: TripletLossFn
def TripletLossFn(v1, v2, margin=0.25):
    """Custom Triplet Loss function.
    
    Args:
        v1 (numpy.ndarray or Tensor): Array with dimension (batch_size, model_dimension) associated to Q1.
        v2 (numpy.ndarray or Tensor): Array with dimension (batch_size, model_dimension) associated to Q2.
        margin (float, optional): Desired margin. Defaults to 0.25.

    Returns:
        triplet_loss (Tensor): Triplet Loss.
    """
   
    ### START CODE HERE ###

    # use `tf.linalg.matmul` to take the dot product of the two batches. 
    # Don't forget to transpose the second argument using `transpose_b=True`
    scores = tf.linalg.matmul(v2, v1, transpose_b=True)
    
    # calculate new batch size and cast it as the same datatype as scores. 
    batch_size = tf.cast(tf.shape(v1)[0], scores.dtype) 
    
    # use `tf.linalg.diag_part` to grab the cosine similarity of all positive examples
    positive = tf.linalg.diag_part(scores)
    
    # subtract the diagonal from scores. You can do this by creating a diagonal matrix with the values 
    # of all positive examples using `tf.linalg.diag`
    negative_zero_on_duplicate = scores - tf.linalg.diag(positive)
    
    # use `tf.math.reduce_sum` on `negative_zero_on_duplicate` for `axis=1` and divide it by `(batch_size - 1)`
    mean_negative = tf.reduce_sum(negative_zero_on_duplicate, axis=1) / (batch_size - 1)
    
    # create a composition of two masks: 
    # the first mask to extract the diagonal elements, 
    # the second mask to extract elements in the negative_zero_on_duplicate matrix that are larger than the elements in the diagonal 
    mask_exclude_positives = tf.cast(
        (tf.eye(batch_size, dtype=tf.bool) | (negative_zero_on_duplicate > tf.expand_dims(positive, axis=1))),
        scores.dtype
    )
    
    # multiply `mask_exclude_positives` with 2.0 and subtract it out of `negative_zero_on_duplicate`
    negative_without_positive = negative_zero_on_duplicate - 2.0 * tf.cast(mask_exclude_positives, scores.dtype)
    
    # take the row by row `max` of `negative_without_positive`. 
    # Hint: `tf.math.reduce_max(negative_without_positive, axis=None)`
    closest_negative = tf.reduce_max(negative_without_positive, axis=1)
    
    # compute `tf.maximum` among 0.0 and `A`
    # A = subtract `positive` from `margin` and add `closest_negative` 
    triplet_loss1 = tf.maximum(0.0, margin - positive + closest_negative)
    
    # compute `tf.maximum` among 0.0 and `B`
    # B = subtract `positive` from `margin` and add `mean_negative` 
    triplet_loss2 = tf.maximum(0.0, margin - positive + mean_negative)
    
    # add the two losses together and take the `tf.math.reduce_sum` of it
    triplet_loss = tf.reduce_sum(triplet_loss1 + triplet_loss2)
    
    ### END CODE HERE ###

    return triplet_loss

v1 = np.array([[0.26726124, 0.53452248, 0.80178373],[0.5178918 , 0.57543534, 0.63297887]])
v2 = np.array([[ 0.26726124,  0.53452248,  0.80178373],[-0.5178918 , -0.57543534, -0.63297887]])
print("Triplet Loss:", TripletLossFn(v1,v2).numpy())

Triplet Loss: 0.703507682515891

Triplet Loss: ~ 0.70

def TripletLoss(labels, out, margin=0.25):
    _, embedding_size = out.shape # get embedding size
    v1 = out[:,:int(embedding_size/2)] # Extract v1 from out
    v2 = out[:,int(embedding_size/2):] # Extract v2 from out
    return TripletLossFn(v1, v2, margin=margin)

# Test your function!
w3_unittest.test_TripletLoss(TripletLoss)

All tests passed!

train_dataset = tf.data.Dataset.from_tensor_slices(((train_Q1, train_Q2),tf.constant([1]*len(train_Q1))))
val_dataset = tf.data.Dataset.from_tensor_slices(((val_Q1, val_Q2),tf.constant([1]*len(val_Q1))))

# GRADED FUNCTION: train_model
def train_model(Siamese, TripletLoss, text_vectorizer, train_dataset, val_dataset, d_feature=128, lr=0.01, train_steps=5):
    """Training the Siamese Model

    Args:
        Siamese (function): Function that returns the Siamese model.
        TripletLoss (function): Function that defines the TripletLoss loss function.
        text_vectorizer: trained instance of `TextVecotrization` 
        train_dataset (tf.data.Dataset): Training dataset
        val_dataset (tf.data.Dataset): Validation dataset
        d_feature (int, optional) = size of the encoding. Defaults to 128.
        lr (float, optional): learning rate for optimizer. Defaults to 0.01
        train_steps (int): number of epochs
        
    Returns:
        tf.keras.Model
    """
    ## START CODE HERE ###

    # Instantiate your Siamese model
    model = Siamese(text_vectorizer,
                    vocab_size = text_vectorizer.vocabulary_size(), #set vocab_size accordingly to the size of your vocabulary
                    d_feature = d_feature)
    # Compile the model
    model.compile(loss=TripletLoss,
                  optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
            )
    # Train the model 
    model.fit(train_dataset,
              epochs = train_steps,
              validation_data = val_dataset,
             )
             
    ### END CODE HERE ###

    return model

train_steps = 2
batch_size = 256
train_generator = train_dataset.shuffle(len(train_Q1),
                                        seed=7, 
                                        reshuffle_each_iteration=True).batch(batch_size=batch_size)
val_generator = val_dataset.shuffle(len(val_Q1), 
                                   seed=7,
                                   reshuffle_each_iteration=True).batch(batch_size=batch_size)
model = train_model(Siamese, TripletLoss,text_vectorization, 
                                            train_generator, 
                                            val_generator, 
                                            train_steps=train_steps,)

Epoch 1/2
349/349 [==============================] - 33s 84ms/step - loss: 28.9937 - val_loss: 12.0744
Epoch 2/2
349/349 [==============================] - 9s 24ms/step - loss: 8.4336 - val_loss: 9.4537

# Test your function!
w3_unittest.test_train_model(train_model, Siamese, TripletLoss)

All tests passed!

model = tf.keras.models.load_model('model/trained_model.keras', safe_mode=False, compile=False)

# Show the model architecture
model.summary()

Model: "SiameseModel"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
==================================================================================================
 input_1 (InputLayer)        [(None, 1)]                  0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 1)]                  0         []                            
                                                                                                  
 sequential (Sequential)     (None, 128)                  4768256   ['input_1[0][0]',             
                                                                     'input_2[0][0]']             
                                                                                                  
 conc_1_2 (Concatenate)      (None, 256)                  0         ['sequential[0][0]',          
                                                                     'sequential[1][0]']          
                                                                                                  
==================================================================================================
Total params: 4768256 (18.19 MB)
Trainable params: 4768256 (18.19 MB)
Non-trainable params: 0 (0.00 Byte)
__________________________________________________________________________________________________

# GRADED FUNCTION: classify
def classify(test_Q1, test_Q2, y_test, threshold, model, batch_size=64, verbose=True):
    """Function to test the accuracy of the model.

    Args:
        test_Q1 (numpy.ndarray): Array of Q1 questions. Each element of the array would be a string.
        test_Q2 (numpy.ndarray): Array of Q2 questions. Each element of the array would be a string.
        y_test (numpy.ndarray): Array of actual target.
        threshold (float): Desired threshold
        model (tensorflow.Keras.Model): The Siamese model.
        batch_size (int, optional): Size of the batches. Defaults to 64.
        verbose (bool, optional): Whether to print accuracy and confusion matrix. Defaults to True.

    Returns:
        float: Accuracy of the model
        numpy.array: confusion matrix
    """
    y_pred = []
    test_gen = tf.data.Dataset.from_tensor_slices(((test_Q1, test_Q2), y_test)).batch(batch_size=batch_size)
    
    # Initialize variables to collect results
    accuracy = 0.0
    all_preds = []
    all_true = []
    
    for (q1, q2), y_true in test_gen:
        # Predict embeddings
        pred = model([q1, q2])
        v1, v2 = tf.split(pred, num_or_size_splits=2, axis=1)
        
        # Compute cosine similarity
        d = tf.reduce_sum(tf.multiply(v1, v2), axis=1)
        
        # Check if d > threshold to make predictions
        y_pred_batch = tf.cast(d > threshold, tf.int64)
        
        # Accumulate correct predictions to calculate accuracy
        accuracy += np.sum(y_pred_batch == y_true.numpy())
        
        # Collect all predictions and true labels for confusion matrix
        all_preds.extend(y_pred_batch.numpy())
        all_true.extend(y_true.numpy())
    
    # Calculate final accuracy
    accuracy = tf.reduce_mean(tf.cast(tf.equal(all_true, all_preds), tf.float32))
    
    # Compute confusion matrix
    cm = tf.math.confusion_matrix(all_true, all_preds)
    
    return accuracy, cm

# this takes around 1 minute
accuracy, cm = classify(Q1_test,Q2_test, y_test, 0.7, model,  batch_size = 512) 
print("Accuracy", accuracy.numpy())
print(f"Confusion matrix:\n{cm.numpy()}")

Accuracy 0.7259766
Confusion matrix:
[[4876 1506]
 [1300 2558]]

# Test your function!
w3_unittest.test_classify(classify, model)

All tests passed!

# GRADED FUNCTION: predict
def predict(question1, question2, threshold, model, verbose=False):
    """Function for predicting if two questions are duplicates.

    Args:
        question1 (str): First question.
        question2 (str): Second question.
        threshold (float): Desired threshold.
        model (tensorflow.keras.Model): The Siamese model.
        data_generator (function): Data generator function. Defaults to data_generator.
        verbose (bool, optional): If the results should be printed out. Defaults to False.

    Returns:
        bool: True if the questions are duplicates, False otherwise.
    """
    generator = tf.data.Dataset.from_tensor_slices((([question1], [question2]),None)).batch(batch_size=1)
    
    ### START CODE HERE ###
    
    # Call the predict method of your model and save the output into v1v2
    v1v2 = model.predict(generator)
    # Extract v1 and v2 from the model output
    v1,v2 =  tf.split(v1v2, num_or_size_splits=2, axis=1)
    # Take the dot product to compute cos similarity of each pair of entries, v1, v2
    # Since v1 and v2 are both vectors, use the function tf.math.reduce_sum instead of tf.linalg.matmul
    d = tf.reduce_sum(tf.multiply(v1, v2), axis=1)
    # Is d greater than the threshold?
    res = d > threshold

    ### END CODE HERE ###
    
    if(verbose):
        print("Q1  = ", question1, "\nQ2  = ", question2)
        print("d   = ", d.numpy())
        print("res = ", res.numpy())

    return res.numpy()

# Feel free to try with your own questions
question1 = "When will I see you?"
question2 = "When can I see you again?"
# 1 means it is duplicated, 0 otherwise
predict(question1 , question2, 0.7, model, verbose = True)

1/1 [==============================] - 0s 12ms/step
Q1  =  When will I see you? 
Q2  =  When can I see you again?
d   =  [0.8422112]
res =  [ True]

array([ True])

# Feel free to try with your own questions
question1 = "Where are you going?"
question2 = "Are you going to Dhaka?"
# 1 means it is duplicated, 0 otherwise
predict(question1 , question2, 0.7, model, verbose=True)

1/1 [==============================] - 0s 12ms/step
Q1  =  Where are you going? 
Q2  =  Are you going to Dhaka?
d   =  [0.28556922]
res =  [False]

array([False])

# Feel free to try with your own questions
question1 = "Do they enjoy eating the dessert?"
question2 = "Do they like hiking in the desert?"
# 1 means it is duplicated, 0 otherwise
predict(question1 , question2, 0.7, model, verbose=True)

1/1 [==============================] - 0s 12ms/step
Q1  =  Do they enjoy eating the dessert? 
Q2  =  Do they like hiking in the desert?
d   =  [0.12625802]
res =  [False]

array([False])

# Test your function!
w3_unittest.test_predict(predict, model)

1/1 [==============================] - 1s 535ms/step
1/1 [==============================] - 0s 12ms/step
1/1 [==============================] - 0s 13ms/step
1/1 [==============================] - 0s 12ms/step
1/1 [==============================] - 0s 12ms/step
All tests passed!

	id	qid1	qid2	question1	question2
0	0	1	2	What is the step by step guide to invest in sh...	What is the step by step guide to invest in sh...
1	1	3	4	What is the story of Kohinoor (Koh-i-Noor) Dia...	What would happen if the Indian government sto...
2	2	5	6	How can I increase the speed of my internet co...	How can Internet speed be increased by hacking...
3	3	7	8	Why am I mentally very lonely? How can I solve...	Find the remainder when [math]23^{24}[/math] i...
4	4	9	10	Which one dissolve in water quikly sugar, salt...	Which fish would survive in salt water?

Assignment 3: Question duplicates¶

Outline¶

Overview¶

TIPS FOR SUCCESSFUL GRADING OF YOUR ASSIGNMENT:¶

¶

Part 1: Importing the Data¶

1.1 Loading in the data¶

1.2 Learning question encoding¶

Part 2: Defining the Siamese model¶

2.1 Understanding the Siamese Network¶

Exercise 01¶

2.2 Hard Negative Mining¶

Exercise 02¶

Part 3: Training¶

3.1 Training the model¶

Exercise 03¶

Part 4: Evaluation¶

4.1 Evaluating your siamese network¶

4.2 Classify¶

Exercise 04¶

Expected Result¶

Part 5: Testing with your own questions¶

Exercise 05¶

Expected Output¶

Expected output¶

On Siamese networks¶

¶