My Approach to Tweet Preprocessing¶
By Mohammad Sayem Chowdhury
In this notebook, I share my personal workflow for preprocessing tweets, a crucial step in my sentiment analysis projects. I believe understanding and customizing each step of the pipeline helps me get the most out of my data. Here, I walk through how I use Python and NLTK to clean and prepare Twitter data for analysis.
Getting Started¶
For my sentiment analysis work, I rely on the Natural Language Toolkit (NLTK) to handle and process Twitter data. NLTK provides convenient modules for collecting, cleaning, and analyzing tweets. In this notebook, I use a sample Twitter dataset included with NLTK, which is already labeled for positive and negative sentiment. This helps me quickly test and refine my preprocessing pipeline.
import nltk # My go-to library for NLP tasks
from nltk.corpus import twitter_samples # Sample Twitter dataset from NLTK
import matplotlib.pyplot as plt # For visualizing data
import random # For selecting random samples
About the Twitter Dataset¶
The NLTK sample dataset contains 5,000 positive and 5,000 negative tweets, making it perfectly balanced for testing. While real-world data is rarely this balanced, I find it useful for developing and evaluating my preprocessing steps. Later, I can adapt these methods to more complex, imbalanced datasets.
You can download the dataset to your workspace (or your local machine) with:
# Download the sample Twitter dataset (if not already present)
nltk.download('twitter_samples')
[nltk_data] Downloading package twitter_samples to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
True
I load the positive and negative tweets using NLTK's strings() method. This gives me two lists of tweets, ready for exploration and cleaning.
# Load positive and negative tweets from the dataset
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
I like to check the number of tweets in each category and confirm the data structure before diving deeper.
print('Number of positive tweets:', len(positive_tweets))
print('Number of negative tweets:', len(negative_tweets))
print('\nType of positive_tweets:', type(positive_tweets))
print('Type of a tweet entry:', type(negative_tweets[0]))
Number of positive tweets: 5000
Number of negative tweets: 5000

Type of positive_tweets: <class 'list'>
Type of a tweet entry: <class 'str'>
The tweets are stored as lists of strings. To get a quick sense of the data balance, I visualize the counts using a pie chart. This is a simple but effective way to check class distribution before moving on.
# Create a pie chart to visualize class distribution
fig = plt.figure(figsize=(5, 5))
# labels for the two classes
labels = ['Positive', 'Negative']
# Sizes for each slice
sizes = [len(positive_tweets), len(negative_tweets)]
# Declare pie chart, where the slices will be ordered and plotted counter-clockwise:
plt.pie(sizes, labels=labels, autopct='%1.1f%%',
shadow=True, startangle=90)
# Equal aspect ratio ensures that pie is drawn as a circle.
plt.axis('equal')
# Display the chart
plt.show()
Exploring Raw Tweets¶
Before preprocessing, I always take a look at a few sample tweets. This helps me spot common patterns, quirks, or issues that might need special handling. Here, I print a random positive and negative tweet to get a feel for the data. (Note: Tweets are real and may contain explicit content.)
# Print a random positive tweet in green
print('\033[92m' + positive_tweets[random.randint(0, 4999)])
# Print a random negative tweet in red
print('\033[91m' + negative_tweets[random.randint(0, 4999)])
@steer_michael Dare you to run in the corridor :-)
@cooldigangana @DiganganaS I want to attend ur birthday plssssssssssssssss :(
Notice the emoticons and URLs that appear in many of the tweets; knowing these patterns will come in handy in the next steps.
My Preprocessing Pipeline for Sentiment Analysis¶
For any NLP project, I find that careful data preprocessing is essential. My typical steps include:
- Tokenizing the text
- Converting to lowercase
- Removing stop words and punctuation
- Stemming words to their root form
I'll walk through each step using a sample tweet from the dataset, showing how I transform the raw text into something ready for analysis.
# Select a sample tweet to demonstrate preprocessing steps
sample_tweet = positive_tweets[2277]
print(sample_tweet)
My beautiful sunflowers on a sunny Friday morning off :) #sunflowers #favourites #happy #Friday off… https://t.co/3tfYom0N1i
Next, I import a few more libraries to help with text cleaning and tokenization.
# download the stopwords from NLTK
nltk.download('stopwords')
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
True
import re # library for regular expression operations
import string # for string operations
from nltk.corpus import stopwords # module for stop words that come with NLTK
from nltk.stem import PorterStemmer # module for stemming
from nltk.tokenize import TweetTokenizer # module for tokenizing strings
Removing Twitter-Specific Text¶
Tweets often contain hashtags, retweet marks, and links. I use regular expressions to clean these out, making the text easier to analyze.
print('\033[92m' + sample_tweet)
print('\033[94m')
# Remove retweet text "RT"
cleaned_tweet = re.sub(r'^RT[\s]+', '', sample_tweet)
# Remove hyperlinks
cleaned_tweet = re.sub(r'https?://[^\s\n\r]+', '', cleaned_tweet)
# Remove hashtags (just the # symbol)
cleaned_tweet = re.sub(r'#', '', cleaned_tweet)
print(cleaned_tweet)
My beautiful sunflowers on a sunny Friday morning off :) #sunflowers #favourites #happy #Friday off… https://t.co/3tfYom0N1i

My beautiful sunflowers on a sunny Friday morning off :) sunflowers favourites happy Friday off…
Tokenizing the Text¶
Tokenization splits the tweet into individual words. I also convert everything to lowercase at this stage. NLTK's TweetTokenizer makes this process straightforward.
print()
print('\033[92m' + cleaned_tweet)
print('\033[94m')
# Initialize the tokenizer: lowercase tokens, strip @handles, and shorten repeated characters (e.g. 'plssssss' -> 'plsss')
my_tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
# Tokenize the cleaned tweet
tokens = my_tokenizer.tokenize(cleaned_tweet)
print()
print('Tokenized string:')
print(tokens)
My beautiful sunflowers on a sunny Friday morning off :) sunflowers favourites happy Friday off…

Tokenized string:
['my', 'beautiful', 'sunflowers', 'on', 'a', 'sunny', 'friday', 'morning', 'off', ':)', 'sunflowers', 'favourites', 'happy', 'friday', 'off', '…']
Removing Stop Words and Punctuation¶
Next, I filter out common stop words and punctuation. These words don't add much meaning and can clutter the analysis. NLTK provides a handy list of stop words, but I sometimes customize it for specific projects.
# Import the English stop word list from NLTK
stopwords_english = stopwords.words('english')
print('Stop words\n')
print(stopwords_english)
print('\nPunctuation\n')
print(string.punctuation)
Stop words
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
Punctuation
!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
The stop word list above contains words that can carry important meaning in some contexts, words like i, not, between, because, won, and against. For some projects I customize the list; for this walkthrough I use it in full.
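For example, if negation matters for a project (distinguishing 'not happy' from 'happy'), a small tweak keeps those words in the vocabulary. This is just an illustrative sketch, and the set of negation words below is my own choice, not something I apply later in this notebook:
# Keep negation words, which can flip the sentiment of a tweet
# (the exact set to keep is a hypothetical choice for this illustration)
negations = {'no', 'not', 'nor', "don't", "isn't", "wasn't", "won't"}
custom_stopwords = [w for w in stopwords.words('english') if w not in negations]
print(len(stopwords.words('english')), 'stop words ->', len(custom_stopwords), 'after keeping negations')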
For punctuation, I noted earlier that groupings like ':)' and '...' are worth keeping in tweets because they express emotion. In other domains, such as medical text, I would remove them along with the rest of the punctuation.
Time to clean up the tokenized tweet!
print()
print('\033[92m')
print(tokens)
print('\033[94m')
clean_tokens = []
for word in tokens:  # go through every word in the tokens list
    if (word not in stopwords_english and   # remove stop words
            word not in string.punctuation):  # remove punctuation
        clean_tokens.append(word)
print('Removed stop words and punctuation:')
print(clean_tokens)
['my', 'beautiful', 'sunflowers', 'on', 'a', 'sunny', 'friday', 'morning', 'off', ':)', 'sunflowers', 'favourites', 'happy', 'friday', 'off', '…']

Removed stop words and punctuation:
['beautiful', 'sunflowers', 'sunny', 'friday', 'morning', ':)', 'sunflowers', 'favourites', 'happy', 'friday', '…']
Notice that words like happy and sunny are preserved after cleaning. This step helps focus on the most meaningful parts of each tweet.
Stemming¶
Stemming reduces words to their root form, which helps group similar words together. For example, 'learning', 'learned', and 'learnt' all become 'learn'. I use NLTK's PorterStemmer for this step. Sometimes, the stemmed words aren't real words (like 'happi'), but they still help reduce vocabulary size and improve analysis.
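As a quick illustrative aside before stemming the tweet itself, here is how the Porter stemmer maps a few related word forms (the word list is my own choice for demonstration):
# Quick standalone check of how the Porter stemmer groups word forms
demo_stemmer = PorterStemmer()
for word in ['learning', 'learned', 'happy', 'sunflowers', 'beautiful']:
    print(word, '->', demo_stemmer.stem(word))
# 'learning' and 'learned' both map to 'learn'; 'happy' becomes the non-word 'happi'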
print()
print('\033[92m')
print(clean_tokens)
print('\033[94m')
# Initialize the stemmer
my_stemmer = PorterStemmer()
stemmed_tokens = []
for word in clean_tokens:
    stemmed_word = my_stemmer.stem(word)
    stemmed_tokens.append(stemmed_word)
print('Stemmed words:')
print(stemmed_tokens)
['beautiful', 'sunflowers', 'sunny', 'friday', 'morning', ':)', 'sunflowers', 'favourites', 'happy', 'friday', '…']

Stemmed words:
['beauti', 'sunflow', 'sunni', 'friday', 'morn', ':)', 'sunflow', 'favourit', 'happi', 'friday', '…']
That's it! Now I have a clean set of words ready for the next stage of my sentiment analysis project.
My process_tweet() Function¶
To streamline preprocessing, I use a helper function called process_tweet(), which combines all the steps above. You can find its implementation in my utils.py file. This function makes it easy to preprocess any tweet with a single call.
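The actual implementation lives in utils.py; as a rough sketch that simply combines the steps shown above (the real file may differ in details), it might look something like this:
import re
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

def process_tweet(tweet):
    """Clean, tokenize, de-noise, and stem a tweet; returns a list of tokens."""
    # Strip retweet marks, hyperlinks, and the '#' symbol (same regexes as above)
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet)
    tweet = re.sub(r'#', '', tweet)

    # Tokenize: lowercase, drop @handles, shorten repeated characters
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tokens = tokenizer.tokenize(tweet)

    # Remove stop words and punctuation, then stem what remains
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    return [stemmer.stem(word) for word in tokens
            if word not in stopwords_english and word not in string.punctuation]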
from utils import process_tweet # Import my custom tweet preprocessing function
# Use the same sample tweet
sample_tweet = positive_tweets[2277]
print()
print('\033[92m')
print(sample_tweet)
print('\033[94m')
# Call my helper function
tweet_processed = process_tweet(sample_tweet)
print('Preprocessed tweet:')
print(tweet_processed)
My beautiful sunflowers on a sunny Friday morning off :) #sunflowers #favourites #happy #Friday off… https://t.co/3tfYom0N1i

Preprocessed tweet:
['beauti', 'sunflow', 'sunni', 'friday', 'morn', ':)', 'sunflow', 'favourit', 'happi', 'friday', '…']
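From here, applying the function to the full dataset is a one-liner per class. A minimal sketch, assuming process_tweet is imported as above:
# Preprocess both classes in one pass (can take a moment for 10,000 tweets)
processed_positive = [process_tweet(t) for t in positive_tweets]
processed_negative = [process_tweet(t) for t in negative_tweets]

print(processed_positive[2277])  # the same sample tweet, now fully preprocessed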
Thank you for following along with my tweet preprocessing workflow! I hope this gives you insight into my approach and inspires you to customize your own pipeline.
Notebook by Mohammad Sayem Chowdhury, June 2025