# Author: Mohammad Sayem Chowdhury
# If you need to read Excel files, ensure openpyxl is installed.
# !pip install openpyxl==3.0.9

# Author: Mohammad Sayem Chowdhury
import numpy as np  # for numerical operations
import pandas as pd # for data manipulation
from PIL import Image # for image processing (used later)

# Author: Mohammad Sayem Chowdhury
# Read the 'Canada by Citizenship' sheet of the remote workbook into a DataFrame.
# The first 20 rows and the final 2 rows of the sheet are left out.
immigration_data = pd.read_excel(
    io='https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DV0101EN-SkillsNetwork/Data%20Files/Canada.xlsx',
    sheet_name='Canada by Citizenship',
    skiprows=range(20),
    skipfooter=2,
)

print('Data loaded into DataFrame.')

Data downloaded and read into a dataframe!

# Preview the first rows of the raw sheet (rendered as cell output in a notebook).
immigration_data.head()

# Author: Mohammad Sayem Chowdhury
# Report the frame's (rows, columns) dimensions before any cleaning is applied.
print(f"DataFrame shape: {immigration_data.shape}")

(195, 43)

# Cleaning and preparing the data for my own analysis and visualization experiments
# Author: Mohammad Sayem Chowdhury
# Remove columns that aren't needed
immigration_data.drop(['AREA', 'REG', 'DEV', 'Type', 'Coverage'], axis=1, inplace=True)

# Rename columns for clarity
immigration_data.rename(columns={'OdName': 'Country', 'AreaName': 'Continent', 'RegName': 'Region'}, inplace=True)

# Ensure all column labels are strings, so year columns can be selected with
# the string labels built in `years` below
immigration_data.columns = list(map(str, immigration_data.columns))

# Set country as index
immigration_data.set_index('Country', inplace=True)

# Add a total column for each country.
# numeric_only=True keeps the text columns (Continent, Region, DevName) out of
# the row sum; without it pandas emits a FutureWarning about "nuisance columns"
# and newer releases raise TypeError instead.
immigration_data['Total'] = immigration_data.sum(axis=1, numeric_only=True)

# Prepare list of years for analysis (string labels '1980' through '2013')
years = list(map(str, range(1980, 2014)))
print('Cleaned data shape:', immigration_data.shape)

data dimensions: (195, 38)

C:\Users\chysa\AppData\Local\Temp\ipykernel_10804\2754968886.py:14: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.
  df_can['Total'] =  df_can.sum (axis = 1)

# Author: Mohammad Sayem Chowdhury
%matplotlib inline

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches # for waffle charts

mpl.style.use('ggplot') # for a clean look

print('Matplotlib version:', mpl.__version__)

Matplotlib version:  3.5.1

# Author: Mohammad Sayem Chowdhury
# Keep only the rows for Denmark, Norway and Sweden (every column retained).
nordic_data = immigration_data.loc[['Denmark', 'Norway', 'Sweden']]
nordic_data

# Author: Mohammad Sayem Chowdhury
# Each country's share of the three-country combined total.
total_immigrants = nordic_data['Total'].sum()
country_proportions = nordic_data['Total'].div(total_immigrants)
country_proportions.to_frame(name='Proportion')

# Author: Mohammad Sayem Chowdhury
# Waffle grid size: 40 columns by 10 rows.
waffle_width, waffle_height = 40, 10
total_tiles = waffle_height * waffle_width
print(f'Total tiles: {total_tiles}')

Total number of tiles is 400.

# Author: Mohammad Sayem Chowdhury
# Scale each country's share to a whole number of tiles (rounded to nearest).
tiles_per_country = country_proportions.mul(total_tiles).round().astype(int)
tiles_per_country.to_frame(name='Tiles')

# Author: Mohammad Sayem Chowdhury
# Fill the grid column by column with 1-based category numbers: the first
# tiles_per_country[0] tiles get 1, the next tiles_per_country[1] tiles get 2,
# and so on.
waffle_matrix = np.zeros((waffle_height, waffle_width), dtype=int)

# Running tile totals, computed once: tile_bounds[k] is the number of the last
# tile that belongs to category k + 1.
tile_bounds = np.cumsum(np.asarray(tiles_per_country))
last_category = len(tile_bounds)

category_idx = 1
tile_count = 0
for col in range(waffle_width):
    for row in range(waffle_height):
        tile_count += 1
        # Move past every category whose quota is used up. The `while` skips
        # categories that were rounded down to zero tiles, and the bound on
        # last_category keeps leftover tiles (rounded counts summing to less
        # than the grid size) in the final category instead of inventing a new one.
        while category_idx < last_category and tile_count > tile_bounds[category_idx - 1]:
            category_idx += 1
        waffle_matrix[row, col] = category_idx
print('Waffle matrix created.')

Waffle chart populated!

# Display the populated grid (rendered as cell output in a notebook).
waffle_matrix

array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]],
      dtype=uint32)

# Author: Mohammad Sayem Chowdhury
# Render the category grid as a coloured image.
# plt.matshow() opens its own figure, so plt.figure() is not called first;
# doing so only leaves an empty "0 Axes" figure behind.
colormap = plt.cm.coolwarm
plt.matshow(waffle_matrix, cmap=colormap)
fig = plt.gcf()  # the figure matshow just created
plt.colorbar()
plt.show()

C:\Users\chysa\AppData\Local\Temp\ipykernel_10804\103890981.py:7: MatplotlibDeprecationWarning: Auto-removal of grids by pcolor() and pcolormesh() is deprecated since 3.5 and will be removed two minor releases later; please call grid(False) first.
  plt.colorbar()

<Figure size 432x288 with 0 Axes>

# Author: Mohammad Sayem Chowdhury
# Same image as before, now with white separators between tiles.
# plt.matshow() opens its own figure, so plt.figure() is not called first;
# doing so only leaves an empty "0 Axes" figure behind.
plt.matshow(waffle_matrix, cmap=colormap)
fig = plt.gcf()  # the figure matshow just created
plt.colorbar()
ax = plt.gca()
# Minor ticks sit on the half-integer cell edges; drawing the minor grid in
# white turns those edges into the gaps between tiles.
ax.set_xticks(np.arange(-.5, waffle_width, 1), minor=True)
ax.set_yticks(np.arange(-.5, waffle_height, 1), minor=True)
ax.grid(which='minor', color='w', linestyle='-', linewidth=2)
# Remove the major ticks and their labels.
plt.xticks([])
plt.yticks([])
plt.show()

C:\Users\chysa\AppData\Local\Temp\ipykernel_10804\1261046109.py:7: MatplotlibDeprecationWarning: Auto-removal of grids by pcolor() and pcolormesh() is deprecated since 3.5 and will be removed two minor releases later; please call grid(False) first.
  plt.colorbar()

<Figure size 432x288 with 0 Axes>

# Author: Mohammad Sayem Chowdhury
# Waffle image with tile separators and a per-country legend.
# plt.matshow() opens its own figure, so plt.figure() is not called first;
# doing so only leaves an empty "0 Axes" figure behind.
waffle_image = plt.matshow(waffle_matrix, cmap=colormap)
fig = plt.gcf()  # the figure matshow just created
plt.colorbar()
ax = plt.gca()
# Minor ticks sit on the half-integer cell edges; drawing the minor grid in
# white turns those edges into the gaps between tiles.
ax.set_xticks(np.arange(-.5, waffle_width, 1), minor=True)
ax.set_yticks(np.arange(-.5, waffle_height, 1), minor=True)
ax.grid(which='minor', color='w', linestyle='-', linewidth=2)
plt.xticks([])
plt.yticks([])

# Create legend
legend_handles = []
# .items() yields (country, total) pairs, avoiding positional [] lookups on a
# label-indexed Series.
for category_number, (country, country_total) in enumerate(nordic_data['Total'].items(), start=1):
    label = f"{country} ({country_total})"
    # Tiles of the k-th country hold the value k in waffle_matrix, so the
    # swatch colour comes from pushing k through the image's own norm and
    # colormap; this guarantees the legend matches the drawn tiles.
    color = waffle_image.cmap(waffle_image.norm(category_number))
    legend_handles.append(mpatches.Patch(color=color, label=label))
plt.legend(handles=legend_handles, loc='lower center', ncol=len(legend_handles), bbox_to_anchor=(0., -0.2, 0.95, .1))
plt.show()

C:\Users\chysa\AppData\Local\Temp\ipykernel_10804\2463873726.py:7: MatplotlibDeprecationWarning: Auto-removal of grids by pcolor() and pcolormesh() is deprecated since 3.5 and will be removed two minor releases later; please call grid(False) first.
  plt.colorbar()

<Figure size 432x288 with 0 Axes>

# Author: Mohammad Sayem Chowdhury
def create_waffle_chart(categories, values, height, width, colormap, value_sign=''):
    """Draw a waffle chart showing each category's share of the total.

    The chart is a ``height`` x ``width`` grid of tiles filled column by
    column. Category ``k`` (1-based, in the order given) is allotted
    ``round(share_k * height * width)`` tiles. If rounding makes the counts
    sum to less than the grid size, the leftover tiles go to the last
    category; if they sum to more, the last categories are cut short.

    Parameters
    ----------
    categories : sequence
        Category labels, used in the printed summary and in the legend.
    values : sequence of numbers
        One value per category, in the same order as ``categories``. May be a
        list, a NumPy array, or a pandas Series (labels on the Series are
        ignored; only the order matters).
    height : int
        Number of tile rows.
    width : int
        Number of tile columns.
    colormap : matplotlib colormap
        Colormap passed to ``plt.matshow`` and used for the legend swatches.
    value_sign : str, optional
        Text appended to each value in the legend labels.

    Returns
    -------
    None
        The tile counts are printed and the figure is shown with ``plt.show()``.
    """
    # Plain lists give positional access that works the same for lists,
    # arrays and label-indexed pandas Series.
    category_names = list(categories)
    category_values = list(values)

    total_values = sum(category_values)
    proportions = [float(value) / total_values for value in category_values]

    total_tiles = width * height
    print('Total tiles:', total_tiles)

    tiles_per_category = [round(p * total_tiles) for p in proportions]
    for name, tiles in zip(category_names, tiles_per_category):
        print(f"{name}: {tiles}")

    waffle = _build_waffle_grid(tiles_per_category, height, width)

    # plt.matshow() opens its own figure; calling plt.figure() beforehand
    # would only leave an empty extra figure behind.
    waffle_image = plt.matshow(waffle, cmap=colormap)
    plt.colorbar()

    ax = waffle_image.axes
    # Minor ticks sit on the half-integer cell edges; drawing the minor grid
    # in white turns those edges into the gaps between tiles.
    ax.set_xticks(np.arange(-.5, width, 1), minor=True)
    ax.set_yticks(np.arange(-.5, height, 1), minor=True)
    ax.grid(which='minor', color='w', linestyle='-', linewidth=2)
    ax.set_xticks([])
    ax.set_yticks([])

    legend_handles = []
    for category_number, (name, value) in enumerate(zip(category_names, category_values), start=1):
        label = f"{name} ({value}{value_sign})"
        # Tiles of category k hold the value k, so the swatch colour comes
        # from pushing k through the image's own norm and colormap; this
        # guarantees the legend matches the drawn tiles.
        color = waffle_image.cmap(waffle_image.norm(category_number))
        legend_handles.append(mpatches.Patch(color=color, label=label))
    ax.legend(handles=legend_handles, loc='lower center', ncol=len(category_names), bbox_to_anchor=(0., -0.2, 0.95, .1))
    plt.show()


def _build_waffle_grid(tiles_per_category, height, width):
    """Return a (height, width) float array of 1-based category numbers, filled column by column."""
    # Running tile totals, computed once: tile_bounds[k] is the number of the
    # last tile that belongs to category k + 1.
    tile_bounds = np.cumsum(tiles_per_category)
    last_category = len(tile_bounds)

    grid = np.zeros((height, width))
    category_idx = 1
    tile_idx = 0
    for col in range(width):
        for row in range(height):
            tile_idx += 1
            # Skip past exhausted (or zero-tile) categories, but never beyond
            # the final category.
            while category_idx < last_category and tile_idx > tile_bounds[category_idx - 1]:
                category_idx += 1
            grid[row, col] = category_idx
    return grid

# Author: Mohammad Sayem Chowdhury
# Inputs for the waffle chart of the three Nordic countries.
waffle_rows, waffle_cols = 10, 40             # grid of 10 rows by 40 columns

waffle_palette = plt.cm.coolwarm              # colormap used to colour the tiles

waffle_totals = nordic_data['Total']          # per-country immigrant totals
waffle_countries = nordic_data.index.values   # country names, in the same order

# Author: Mohammad Sayem Chowdhury






Now, I use my custom function to create a waffle chart for Denmark, Norway, and Sweden, visualizing their immigration proportions to Canada.

*By Mohammad Sayem Chowdhury*

```python
create_waffle_chart(
    categories=waffle_countries,
    values=waffle_totals,
    height=waffle_rows,
    width=waffle_cols,
    colormap=waffle_palette,
    value_sign=''
)
```

Total number of tiles is 400
Denmark: 129
Norway: 77
Sweden: 194

C:\Users\chysa\AppData\Local\Temp\ipykernel_10804\3286913405.py:45: MatplotlibDeprecationWarning: Auto-removal of grids by pcolor() and pcolormesh() is deprecated since 3.5 and will be removed two minor releases later; please call grid(False) first.
  plt.colorbar()

<Figure size 432x288 with 0 Axes>

# install wordcloud
# !pip3 install wordcloud==1.8.1

# import package and its set of stopwords
from wordcloud import WordCloud, STOPWORDS,ImageColorGenerator

print ('Wordcloud is installed and imported!')

Wordcloud is installed and imported!

# Import the submodule explicitly: a bare `import urllib` does not guarantee
# that `urllib.request` has been loaded.
import urllib.request

# open the file and read it into a variable alice_novel
# (the `with` block closes the HTTP response once the text has been read)
with urllib.request.urlopen('https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DV0101EN-SkillsNetwork/Data%20Files/alice_novel.txt') as response:
    alice_novel = response.read().decode("utf-8")

# Mutable copy of wordcloud's STOPWORDS, so extra words can be added later.
stopwords = set(STOPWORDS)

# Author: Mohammad Sayem Chowdhury
# Generate a word cloud from Alice in Wonderland (as a fun example)
# The cloud is configured with a white background, a cap of 2000 words, and
# the `stopwords` set built earlier.
alice_wc = WordCloud(
    background_color='white',
    max_words=2000,
    stopwords=stopwords
)
# Build the cloud from the full text of the novel.
alice_wc.generate(alice_novel)

<wordcloud.wordcloud.WordCloud at 0x1a792111190>

# Show the first cloud at the default figure size, without axes.
plt.imshow(alice_wc, interpolation='bilinear')
plt.axis('off')
plt.show()

# Add 'said' to the stopword set and regenerate the cloud from the same text,
# this time drawn on a larger 14 x 18 figure.
stopwords.add('said')
alice_wc.generate(alice_novel)
fig = plt.figure(figsize=(14, 18))
plt.imshow(alice_wc, interpolation='bilinear')
plt.axis('off')
plt.show()

# Load the mask image from the working directory as a NumPy array.
# NOTE(review): 'alice_mask.png' is not downloaded anywhere in the visible
# code; it must already exist locally.
alice_mask = np.array(Image.open('alice_mask.png'))

# Display the mask itself in grayscale.
fig = plt.figure(figsize=(14, 18))
plt.imshow(alice_mask, cmap=plt.cm.gray, interpolation='bilinear')
plt.axis('off')
plt.show()

# Rebuild the cloud with the mask supplied, then display it.
alice_wc = WordCloud(background_color='white', max_words=2000, mask=alice_mask, stopwords=stopwords)
alice_wc.generate(alice_novel)
fig = plt.figure(figsize=(14, 18))
plt.imshow(alice_wc, interpolation='bilinear')
plt.axis('off')
plt.show()

# Author: Mohammad Sayem Chowdhury
# Build a text in which each single-word country name is repeated in
# proportion to its share of total immigration, scaled so that a country
# holding 100% of the total would appear max_words times.
max_words = 90

# The grand total does not change inside the loop, so compute it once.
total_immigration = immigration_data['Total'].sum()

repeated_names = []
for country in immigration_data.index.values:
    # Only names without a space are used; multi-word names are skipped.
    if country.count(" ") == 0:
        repeat_num_times = int(immigration_data.loc[country, 'Total'] / total_immigration * max_words)
        repeated_names.append((country + ' ') * repeat_num_times)
# Join once at the end instead of growing the string with += on every pass.
word_string = ''.join(repeated_names)

wordcloud = WordCloud(background_color='white').generate(word_string)
print('Word cloud created!')

Word cloud created!

# Show the country-name cloud on a 14 x 18 figure, without axes.
plt.figure(figsize=(14, 18))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

# Load the flag image from the working directory and use it as the mask.
# NOTE(review): 'Flag-Canada.webp' is not downloaded anywhere in the visible
# code; it must already exist locally.
canada_mask = np.array(Image.open('Flag-Canada.webp'))
wordcloud_canada = WordCloud(stopwords=stopwords, background_color="white", max_words=90, mask=canada_mask).generate(word_string)
# Colour function built from the mask image, applied to the cloud via recolor().
image_colors = ImageColorGenerator(canada_mask)
plt.figure(figsize=(14, 18))
plt.imshow(wordcloud_canada.recolor(color_func=image_colors), interpolation="bilinear")
plt.axis("off")
plt.show()

# Author: Mohammad Sayem Chowdhury
# If seaborn is not installed, uncomment the next line:
# !pip install seaborn
import seaborn as sns
print('Seaborn ready!')

Seaborn installed and imported!

# Sum every country's count within each year column: one total per year.
total_per_year = immigration_data[years].sum(axis=0).to_frame()
# The year labels are strings; turn them into floats so they can be plotted
# as numeric x-values.
total_per_year.index = total_per_year.index.astype(float)
total_per_year.reset_index(inplace=True)
total_per_year.columns = ['year', 'total']
total_per_year.head()

# First regression plot of yearly totals, all default settings.
sns.regplot(data=total_per_year, x='year', y='total')
plt.show()

<AxesSubplot:xlabel='year', ylabel='total'>

# Same plot, drawn in green.
sns.regplot(x='year', y='total', data=total_per_year, color='green')
plt.show()

# Green, with '+' markers for the data points.
ax = sns.regplot(x='year', y='total', data=total_per_year, color='green', marker='+')
plt.show()

# As above, on a larger 15 x 10 figure.
plt.figure(figsize=(15, 10))
sns.regplot(x='year', y='total', data=total_per_year, color='green', marker='+')
plt.show()

# Larger markers (s=200), plus axis labels and a title.
plt.figure(figsize=(15, 10))
ax = sns.regplot(x='year', y='total', data=total_per_year, color='green', marker='+', scatter_kws={'s': 200})
ax.set(xlabel='Year', ylabel='Total Immigration')
ax.set_title('Total Immigration to Canada from 1980 - 2013')
plt.show()

# Same plot with the seaborn font scale raised to 1.5.
plt.figure(figsize=(15, 10))
sns.set(font_scale=1.5)
ax = sns.regplot(x='year', y='total', data=total_per_year, color='green', marker='+', scatter_kws={'s': 200})
ax.set(xlabel='Year', ylabel='Total Immigration')
ax.set_title('Total Immigration to Canada from 1980 - 2013')
plt.show()

# Same plot using the 'ticks' seaborn style.
plt.figure(figsize=(15, 10))
sns.set(font_scale=1.5)
sns.set_style('ticks')
ax = sns.regplot(x='year', y='total', data=total_per_year, color='green', marker='+', scatter_kws={'s': 200})
ax.set(xlabel='Year', ylabel='Total Immigration')
ax.set_title('Total Immigration to Canada from 1980 - 2013')
plt.show()

# Same plot using the 'whitegrid' seaborn style.
plt.figure(figsize=(15, 10))
sns.set(font_scale=1.5)
sns.set_style('whitegrid')
ax = sns.regplot(x='year', y='total', data=total_per_year, color='green', marker='+', scatter_kws={'s': 200})
ax.set(xlabel='Year', ylabel='Total Immigration')
ax.set_title('Total Immigration to Canada from 1980 - 2013')
plt.show()

# Author: Mohammad Sayem Chowdhury
# Year-by-country table for Denmark, Norway and Sweden (years become rows).
nordic_yearly = immigration_data.loc[['Denmark', 'Norway', 'Sweden'], years].T

# Combined total of the three countries for each year, as a two-column frame.
nordic_total = nordic_yearly.sum(axis=1).to_frame()
nordic_total.reset_index(inplace=True)
nordic_total.columns = ['year', 'total']
nordic_total['year'] = nordic_total['year'].astype(int)  # string labels -> ints

# Regression plot on a 15 x 10 figure with the 'whitegrid' style.
plt.figure(figsize=(15, 10))
sns.set(font_scale=1.5)
sns.set_style('whitegrid')
ax = sns.regplot(
    data=nordic_total,
    x='year',
    y='total',
    color='green',
    marker='+',
    scatter_kws={'s': 200},
)
ax.set_xlabel('Year')
ax.set_ylabel('Total Immigration')
ax.set_title('Immigration from Denmark, Sweden, and Norway to Canada (1980-2013)')
plt.show()

Text(0.5, 1.0, 'Total Immigrationn from Denmark, Sweden, and Norway to Canada from 1980 - 2013')

	Type	Coverage	OdName	AREA	AreaName	REG	RegName	DEV	DevName	1980	...	2004	2005	2006	2007	2008	2009	2010	2011	2012	2013
0	Immigrants	Foreigners	Afghanistan	935	Asia	5501	Southern Asia	902	Developing regions	16	...	2978	3436	3009	2652	2111	1746	1758	2203	2635	2004
1	Immigrants	Foreigners	Albania	908	Europe	925	Southern Europe	901	Developed regions	1	...	1450	1223	856	702	560	716	561	539	620	603
2	Immigrants	Foreigners	Algeria	903	Africa	912	Northern Africa	902	Developing regions	80	...	3616	3626	4807	3623	4005	5393	4752	4325	3774	4331
3	Immigrants	Foreigners	American Samoa	909	Oceania	957	Polynesia	902	Developing regions	0	...	0	0	1	0	0	0	0	0	0	0
4	Immigrants	Foreigners	Andorra	908	Europe	925	Southern Europe	901	Developed regions	0	...	0	0	1	1	0	0	0	0	1	1

	Continent	Region	DevName	1980	1981	1982	1983	1984	1985	1986	...	2005	2006	2007	2008	2009	2010	2011	2012	2013	Total
Country
Denmark	Europe	Northern Europe	Developed regions	272	293	299	106	93	73	93	...	62	101	97	108	81	92	93	94	81	3901
Norway	Europe	Northern Europe	Developed regions	116	77	106	51	31	54	56	...	57	53	73	66	75	46	49	53	59	2327
Sweden	Europe	Northern Europe	Developed regions	281	308	222	176	128	158	187	...	205	139	193	165	167	159	134	140	140	5866

	year	total
0	1980.0	99137
1	1981.0	110563
2	1982.0	104271
3	1983.0	75550
4	1984.0	73417

Visualizing Data: Waffle Charts, Word Clouds, and Regression Analysis¶

My Motivation¶

Project Roadmap¶

Exploring Data with pandas and Matplotlib¶

Downloading and Preparing Data¶

Visualizing Data with Matplotlib¶

Waffle Charts¶

Word Clouds¶

Regression Plots¶

My Own Exploration: Immigration from Denmark, Sweden, and Norway¶

Personal Reflections & Summary¶

	Category Proportion
Country
Denmark	0.322557
Norway	0.192409
Sweden	0.485034