import numpy as np  # For scientific computing (by Mohammad Sayem Chowdhury)
import pandas as pd # For data manipulation and analysis

# Location of the Canada.xlsx workbook used throughout this notebook.
canada_xlsx_url = (
    'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/'
    'IBMDeveloperSkillsNetwork-DV0101EN-SkillsNetwork/Data%20Files/Canada.xlsx'
)

# Read the 'Canada by Citizenship' sheet, ignoring the first 20 rows and the
# last 2 rows of the sheet (presumably non-data header/footer rows -- confirm
# against the workbook).
df_canada = pd.read_excel(
    canada_xlsx_url,
    sheet_name='Canada by Citizenship',
    skiprows=range(20),
    skipfooter=2,
)
print('Dataset loaded into my DataFrame!')

Data downloaded and read into a dataframe!

# Preview the first rows of the freshly loaded frame. As a bare expression this
# only renders when it is the last statement of a notebook cell.
df_canada.head()

# Checking the shape of my DataFrame
# (prints a (rows, columns) tuple)
print(df_canada.shape)

(195, 43)

# Remove the columns that are not used by any of the visualizations below.
unused_columns = ['AREA', 'REG', 'DEV', 'Type', 'Coverage']
df_canada.drop(columns=unused_columns, inplace=True)
print(df_canada.head())

# Give the remaining descriptive columns friendlier names.
friendly_names = {'OdName': 'Country', 'AreaName': 'Continent', 'RegName': 'Region'}
df_canada.rename(columns=friendly_names, inplace=True)
print(df_canada.head())

# Evaluates to True only when every column label is already a string.
all(isinstance(label, str) for label in df_canada.columns)

False

# Normalise every column label to str so columns can be selected with string keys.
df_canada.columns = [str(label) for label in df_canada.columns]

# Re-run the all-strings check on the converted labels.
all(isinstance(label, str) for label in df_canada.columns)

True

# Use the country name as the row label so rows can be selected with .loc[name].
df_canada.set_index('Country', inplace=True)
# Preview the DataFrame with country as index
print(df_canada.head())

# Add a per-country grand total. The frame still holds text columns
# (Continent, Region, DevName), so restrict the row-wise sum to numeric columns:
# relying on pandas to silently drop them is deprecated and raises TypeError
# in newer pandas versions.
df_canada['Total'] = df_canada.sum(axis=1, numeric_only=True)
# Check the updated DataFrame
print(df_canada.head())

C:\Users\chysa\AppData\Local\Temp\ipykernel_2848\2933561449.py:1: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.
  df_can['Total'] = df_can.sum(axis=1)

# Report the (rows, columns) shape now that the 'Total' column has been added.
print('DataFrame shape after adding Total:', df_canada.shape)

data dimensions: (195, 38)

# Year labels as strings, 1980 through 2013 inclusive, used to select the
# year columns for plotting.
years_list = [str(year) for year in range(1980, 2014)]

years_list

['1980',
 '1981',
 '1982',
 '1983',
 '1984',
 '1985',
 '1986',
 '1987',
 '1988',
 '1989',
 '1990',
 '1991',
 '1992',
 '1993',
 '1994',
 '1995',
 '1996',
 '1997',
 '1998',
 '1999',
 '2000',
 '2001',
 '2002',
 '2003',
 '2004',
 '2005',
 '2006',
 '2007',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013']

# Show plots inline in the notebook (by Mohammad Sayem Chowdhury)
%matplotlib inline

import matplotlib as mpl
import matplotlib.pyplot as plt

# Apply the 'ggplot' style sheet to the plots created after this call.
mpl.style.use('ggplot')  # I like the ggplot style for its clarity

# Check Matplotlib version
print('Matplotlib version:', mpl.__version__)

Matplotlib version:  3.5.1

# Rank countries by cumulative immigration and keep the five largest.
most_immigrants = df_canada.sort_values(by='Total', ascending=False).head(5)
# Keep only the year columns and flip the frame: years become rows,
# countries become columns (one plotted series per country).
top5_trend = most_immigrants.loc[:, years_list].T
print(top5_trend.head())

# The transposed top-5 frame is named `top5_trend` in this notebook; the old
# name `df_top5` is never defined here and raised NameError.
# Convert the year labels from str to int so the x-axis is numeric.
top5_trend.index = top5_trend.index.map(int)

# Unstacked area plot for top 5 countries
ax = top5_trend.plot(kind='area',
                     stacked=False,
                     figsize=(20, 10),  # pass a tuple (x, y) size
                     alpha=0.6)

plt.title('Top 5 Countries: Immigration Trend to Canada (1980-2013)')
plt.ylabel('Number of Immigrants')
plt.xlabel('Year')
plt.legend(title='Country')
plt.show()

# (by Mohammad Sayem Chowdhury)

# Same unstacked area chart, drawn with a lower alpha (0.35) so the
# overlapping country areas stay visible through each other.
top5_trend.plot.area(stacked=False, alpha=0.35, figsize=(20, 10))

plt.xlabel('Year')
plt.ylabel('Number of Immigrants')
plt.title('Top 5 Countries: Immigration Trend to Canada (with Transparency)')
plt.legend(title='Country')
plt.show()

# (by Mohammad Sayem Chowdhury)

    # Option 1: This is what we have been using so far
    df_top5.plot(kind='area', alpha=0.35, figsize=(20, 10)) 
    plt.title('Immigration trend of top 5 countries')
    plt.ylabel('Number of immigrants')
    plt.xlabel('Years')

# Option 2: preferred option with more flexibility -- keep the Axes object
# returned by .plot() and configure the chart through it.
# (`df_top5` was never defined in this notebook; the frame is `top5_trend`.)
ax = top5_trend.plot(kind='area', alpha=0.35, figsize=(20, 10))

ax.set_title('Immigration Trend of Top 5 Countries')
ax.set_ylabel('Number of Immigrants')
ax.set_xlabel('Years')

Text(0.5, 0, 'Years')

# Five countries with the smallest cumulative totals.
least_immigrants = df_canada.sort_values(by='Total', ascending=True).head(5)

# Years as rows, countries as columns; integer year labels for the x-axis.
least5_trend = least_immigrants.loc[:, years_list].T
least5_trend.index = least5_trend.index.map(int)

# Stacked area chart: each country's area is drawn on top of the previous one.
least5_trend.plot.area(stacked=True, alpha=0.45, figsize=(20, 10))

plt.xlabel('Year')
plt.ylabel('Number of Immigrants')
plt.title('Least 5 Countries: Immigration Trend to Canada (Stacked)')
plt.legend(title='Country')
plt.show()

# (by Mohammad Sayem Chowdhury)

    #The correct answer is:
    # get the 5 countries with the least contribution
    df_least5 = df_can.tail(5)
     
    # transpose the dataframe
    df_least5 = df_least5[years].transpose() 
    df_least5.head()

    df_least5.index = df_least5.index.map(int) # let's change the index values of df_least5 to type integer for plotting
    df_least5.plot(kind='area', alpha=0.45, figsize=(20, 10)) 

    plt.title('Immigration Trend of 5 Countries with Least Contribution to Immigration')
    plt.ylabel('Number of Immigrants')
    plt.xlabel('Years')

    plt.show()

# Unstacked variant of the least-5 chart, configured through the Axes object.
ax = least5_trend.plot.area(stacked=False, alpha=0.55, figsize=(20, 10))

ax.set_xlabel('Year')
ax.set_ylabel('Number of Immigrants')
ax.set_title('Least 5 Countries: Immigration Trend to Canada (Unstacked)')
ax.legend(title='Country')
# (by Mohammad Sayem Chowdhury)

Text(0.5, 0, 'Years')

    #The correct answer is:
    
    # get the 5 countries with the least contribution
    df_least5 = df_can.tail(5)

    # transpose the dataframe
    df_least5 = df_least5[years].transpose() 
    
    df_least5.head()

    df_least5.index = df_least5.index.map(int) # let's change the index values of df_least5 to type integer for plotting
    
    ax = df_least5.plot(kind='area', alpha=0.55, stacked=False, figsize=(20, 10))
    
    ax.set_title('Immigration Trend of 5 Countries with Least Contribution to Immigration')
    ax.set_ylabel('Number of Immigrants')
    ax.set_xlabel('Years')

# Quick look at 2013 immigration numbers
# (selects the '2013' column as a Series indexed by country and prints its first rows)
print(df_canada['2013'].head())

Country
India                                                   33087
China                                                   34129
United Kingdom of Great Britain and Northern Ireland     5827
Philippines                                             29544
Pakistan                                                12603
Name: 2013, dtype: int64

# Get frequency counts and bin edges for 2013 data
# np.histogram with its default settings returns a pair: the per-bin counts
# and the array of bin edges (one more edge than there are bins).
freq_counts, bin_edges = np.histogram(df_canada['2013'])

print(freq_counts)  # Frequency count
print(bin_edges)    # Bin ranges

[178  11   1   2   0   0   0   0   1   2]
[    0.   3412.9  6825.8 10238.7 13651.6 17064.5 20477.4 23890.3 27303.2
 30716.1 34129. ]

# First histogram: how many countries fall into each immigrant-count bin for 2013.
plt.figure(figsize=(8, 5))
df_canada['2013'].plot.hist()
plt.xlabel('Number of Immigrants')
plt.ylabel('Number of Countries')
plt.title('Distribution of Immigrants to Canada (2013)')
plt.show()
# (by Mohammad Sayem Chowdhury)

# 'bin_edges' is the array of bin boundaries computed by np.histogram.
# (The frame is named `df_canada` in this notebook; `df_can` is never defined
# and raised NameError.)
count, bin_edges = np.histogram(df_canada['2013'])

# Histogram with custom x-ticks for better clarity: placing the ticks on the
# bin edges makes each bar's range readable from the axis.
plt.figure(figsize=(8, 5))
df_canada['2013'].plot(kind='hist', xticks=bin_edges)
plt.title('Distribution of Immigrants to Canada (2013)')
plt.ylabel('Number of Countries')
plt.xlabel('Number of Immigrants')
plt.show()
# (by Mohammad Sayem Chowdhury)

# Select data for Denmark, Norway, and Sweden
# Rows are the three countries, columns are the year labels in years_list.
nordic_countries = df_canada.loc[['Denmark', 'Norway', 'Sweden'], years_list]
print(nordic_countries)

# Attempt to plot histogram (will show why transposing is needed)
# In this orientation the columns are years, so the histogram is built per
# year column rather than per country.
nordic_countries.plot.hist()
plt.show()

<AxesSubplot:ylabel='Frequency'>

# Flip the frame so each country becomes a column (one histogram series each)
# and the years become rows.
nordic_trend = nordic_countries.T
print(nordic_trend.head())

# With countries as columns the histogram shows one distribution per country.
nordic_trend.plot.hist(figsize=(10, 6))

plt.xlabel('Number of Immigrants')
plt.ylabel('Number of Years')
plt.title('Immigration from Denmark, Norway, and Sweden (1980-2013)')

plt.show()
# (by Mohammad Sayem Chowdhury)

# Compute the edges of 15 bins over all values in the frame so the x-ticks
# can be placed on the bar boundaries.
count, bin_edges = np.histogram(nordic_trend, bins=15)

# Overlapping (unstacked) histogram with one colour per country.
nordic_trend.plot.hist(
    bins=15,
    alpha=0.6,
    xticks=bin_edges,
    figsize=(10, 6),
    color=['coral', 'darkslateblue', 'mediumseagreen'],
)

plt.xlabel('Number of Immigrants')
plt.ylabel('Number of Years')
plt.title('Immigration from Denmark, Norway, and Sweden (1980-2013)')
plt.show()

# (by Mohammad Sayem Chowdhury)

import matplotlib

# Print every named colour Matplotlib knows together with its hex code.
# The loop variable is `hex_code` rather than `hex` so the builtin hex()
# is not shadowed for the rest of the session.
for name, hex_code in matplotlib.colors.cnames.items():
    print(name, hex_code)

# Bin edges for 15 bins, plus a 10-unit margin on both sides of the x-axis
# so the outermost bars do not touch the plot frame.
count, bin_edges = np.histogram(nordic_trend, bins=15)
xmin, xmax = bin_edges[0] - 10, bin_edges[-1] + 10

# Stacked histogram: the countries' counts are piled on top of each other per bin.
nordic_trend.plot.hist(
    bins=15,
    stacked=True,
    xticks=bin_edges,
    xlim=(xmin, xmax),
    figsize=(10, 6),
    color=['coral', 'darkslateblue', 'mediumseagreen'],
)
plt.xlabel('Number of Immigrants')
plt.ylabel('Number of Years')
plt.title('Immigration from Denmark, Norway, and Sweden (1980-2013)')
plt.show()
# (by Mohammad Sayem Chowdhury)

# Year-by-country frame for Greece, Albania and Bulgaria
# (years as rows, one column per country).
gab_countries = df_canada.loc[['Greece', 'Albania', 'Bulgaria'], years_list].T

# Edges of 15 bins over all values, used as x-tick positions.
count, bin_edges = np.histogram(gab_countries, bins=15)

# Overlapping histogram; the low alpha keeps all three distributions visible.
gab_countries.plot.hist(
    bins=15,
    alpha=0.35,
    xticks=bin_edges,
    figsize=(10, 6),
    color=['coral', 'darkslateblue', 'mediumseagreen'],
)

plt.xlabel('Number of Immigrants')
plt.ylabel('Number of Years')
plt.title('Immigration from Greece, Albania, and Bulgaria (1980-2013)')
plt.show()

# (by Mohammad Sayem Chowdhury)

    #The correct answer is:
    
    # create a dataframe of the countries of interest (cof)
    df_cof = df_can.loc[['Greece', 'Albania', 'Bulgaria'], years]

    # transpose the dataframe
    df_cof = df_cof.transpose() 

    # let's get the x-tick values
    count, bin_edges = np.histogram(df_cof, 15)

    # Un-stacked Histogram
    df_cof.plot(kind ='hist',
                figsize=(10, 6),
                bins=15,
                alpha=0.35,
                xticks=bin_edges,
                color=['coral', 'darkslateblue', 'mediumseagreen']
                )

    plt.title('Histogram of Immigration from Greece, Albania, and Bulgaria from 1980 - 2013')
    plt.ylabel('Number of Years')
    plt.xlabel('Number of Immigrants')

    plt.show()

# Get Iceland data for all years
# Selecting a single row label with .loc yields a Series indexed by the year labels.
iceland_trend = df_canada.loc['Iceland', years_list]
print(iceland_trend.head())

1980    17
1981    33
1982    10
1983     9
1984    13
Name: Iceland, dtype: object

# Step 2: plot the Iceland series as a vertical bar chart, one bar per year.
# (The series is named `iceland_trend` in this notebook; `df_iceland` is never
# defined and raised NameError.)
iceland_trend.plot(kind='bar', figsize=(10, 6))

plt.xlabel('Year')  # add x-label to the plot
plt.ylabel('Number of immigrants')  # add y-label to the plot
plt.title('Icelandic immigrants to Canada from 1980 to 2013')  # add title to the plot

plt.show()
# (by Mohammad Sayem Chowdhury)

# Bar chart of Icelandic immigration with the x tick labels rotated by 90 degrees.
iceland_trend.plot.bar(figsize=(10, 6), rot=90)

plt.title('Icelandic Immigration to Canada (1980-2013)')
plt.xlabel('Year')
plt.ylabel('Number of Immigrants')

# Draw a text-less arrow marking the financial-crisis period.
plt.annotate(
    '',                # empty string: arrow only, no label
    xy=(32, 70),       # arrow head at (year 2012, pop 70)
    xytext=(28, 20),   # arrow base at (year 2008, pop 20)
    xycoords='data',   # coordinates are in the data space of the plot
    arrowprops={'arrowstyle': '->', 'connectionstyle': 'arc3', 'color': 'blue', 'lw': 2},
)

plt.show()

# (by Mohammad Sayem Chowdhury)

# Same annotated bar chart as before, now with a caption running along the arrow.
iceland_trend.plot.bar(figsize=(10, 6), rot=90)

plt.title('Icelandic Immigration to Canada (1980-2013)')
plt.xlabel('Year')
plt.ylabel('Number of Immigrants')

# Arrow only (empty text) marking the financial-crisis period.
plt.annotate(
    '',
    xy=(32, 70),       # arrow head at (year 2012, pop 70)
    xytext=(28, 20),   # arrow base at (year 2008, pop 20)
    xycoords='data',   # coordinates are in the data space of the plot
    arrowprops={'arrowstyle': '->', 'connectionstyle': 'arc3', 'color': 'blue', 'lw': 2},
)

# Caption placed next to the arrow.
plt.annotate(
    '2008-2011 Financial Crisis',
    xy=(28, 30),       # text starts at (year 2008, pop 30)
    rotation=72.5,     # found by trial and error to match the arrow's slope
    va='bottom',       # vertically 'bottom' aligned
    ha='left',         # horizontally 'left' aligned
)

plt.show()

# (by Mohammad Sayem Chowdhury)

# Sort the frame by 'Total' in ASCENDING order so the largest contributors end
# up at the bottom of the frame; tail(15) then yields the top 15 countries
# ordered from smallest to largest.
# (The frame is named `df_canada` in this notebook; `df_can` is never defined
# and raised NameError.)
df_canada.sort_values(by='Total', ascending=True, inplace=True)

# 'Total' values of the 15 largest contributors, as a Series indexed by country.
df_top15 = df_canada['Total'].tail(15)
df_top15

Country
Romania                                                  93585
Viet Nam                                                 97146
Jamaica                                                 106431
France                                                  109091
Lebanon                                                 115359
Poland                                                  139241
Republic of Korea                                       142581
Sri Lanka                                               148358
Iran (Islamic Republic of)                              175923
United States of America                                241122
Pakistan                                                241600
Philippines                                             511391
United Kingdom of Great Britain and Northern Ireland    551500
China                                                   659962
India                                                   691904
Name: Total, dtype: int64

    #The correct answer is:
    
    # sort dataframe on 'Total' column (ascending, so the largest totals come last)
    df_can.sort_values(by='Total', ascending=True, inplace=True)

    # get top 15 countries
    df_top15 = df_can['Total'].tail(15)
    df_top15

# Horizontal bar chart of the 15 countries with the largest cumulative totals.
df_top15.plot(kind='barh', figsize=(12, 12), color='steelblue')
plt.xlabel('Number of Immigrants')
plt.title('Top 15 Countries Contributing to the Immigration to Canada between 1980 - 2013')

# Annotate each bar with its value.
for index, value in enumerate(df_top15):  # enumerate yields (bar position, total)
    label = format(int(value), ',')  # format int with thousands separators
    # Place the text near the end of the bar: subtract 47000 from x and 0.10
    # from y so the white label fits inside the bar.
    plt.annotate(label, xy=(value - 47000, index - 0.10), color='white')

plt.show()

    #The correct answer is:
    
    # generate plot
    df_top15.plot(kind='barh', figsize=(12, 12), color='steelblue')
    plt.xlabel('Number of Immigrants')
    plt.title('Top 15 Countries Contributing to the Immigration to Canada between 1980 - 2013')

    # annotate value labels to each country
    for index, value in enumerate(df_top15): 
        label = format(int(value), ',') # format int with commas
    
    # place text at the end of bar (subtracting 47000 from x, and 0.1 from y to make it fit within the bar)
        plt.annotate(label, xy=(value - 47000, index - 0.10), color='white')

    plt.show()

	Type	Coverage	OdName	AREA	AreaName	REG	RegName	DEV	DevName	1980	...	2004	2005	2006	2007	2008	2009	2010	2011	2012	2013
0	Immigrants	Foreigners	Afghanistan	935	Asia	5501	Southern Asia	902	Developing regions	16	...	2978	3436	3009	2652	2111	1746	1758	2203	2635	2004
1	Immigrants	Foreigners	Albania	908	Europe	925	Southern Europe	901	Developed regions	1	...	1450	1223	856	702	560	716	561	539	620	603
2	Immigrants	Foreigners	Algeria	903	Africa	912	Northern Africa	902	Developing regions	80	...	3616	3626	4807	3623	4005	5393	4752	4325	3774	4331
3	Immigrants	Foreigners	American Samoa	909	Oceania	957	Polynesia	902	Developing regions	0	...	0	0	1	0	0	0	0	0	0	0
4	Immigrants	Foreigners	Andorra	908	Europe	925	Southern Europe	901	Developed regions	0	...	0	0	1	1	0	0	0	0	1	1

	OdName	AreaName	RegName	DevName	1980	1981	1982	1983	1984	1985	...	2004	2005	2006	2007	2008	2009	2010	2011	2012	2013
0	Afghanistan	Asia	Southern Asia	Developing regions	16	39	39	47	71	340	...	2978	3436	3009	2652	2111	1746	1758	2203	2635	2004
1	Albania	Europe	Southern Europe	Developed regions	1	0	0	0	0	0	...	1450	1223	856	702	560	716	561	539	620	603
2	Algeria	Africa	Northern Africa	Developing regions	80	67	71	69	63	44	...	3616	3626	4807	3623	4005	5393	4752	4325	3774	4331
3	American Samoa	Oceania	Polynesia	Developing regions	0	1	0	0	0	0	...	0	0	1	0	0	0	0	0	0	0
4	Andorra	Europe	Southern Europe	Developed regions	0	0	0	0	0	0	...	0	0	1	1	0	0	0	0	1	1

	Country	Continent	Region	DevName	1980	1981	1982	1983	1984	1985	...	2004	2005	2006	2007	2008	2009	2010	2011	2012	2013
0	Afghanistan	Asia	Southern Asia	Developing regions	16	39	39	47	71	340	...	2978	3436	3009	2652	2111	1746	1758	2203	2635	2004
1	Albania	Europe	Southern Europe	Developed regions	1	0	0	0	0	0	...	1450	1223	856	702	560	716	561	539	620	603
2	Algeria	Africa	Northern Africa	Developing regions	80	67	71	69	63	44	...	3616	3626	4807	3623	4005	5393	4752	4325	3774	4331
3	American Samoa	Oceania	Polynesia	Developing regions	0	1	0	0	0	0	...	0	0	1	0	0	0	0	0	0	0
4	Andorra	Europe	Southern Europe	Developed regions	0	0	0	0	0	0	...	0	0	1	1	0	0	0	0	1	1

	Continent	Region	DevName	1980	1981	1982	1983	1984	1985	1986	...	2004	2005	2006	2007	2008	2009	2010	2011	2012	2013
Country
Afghanistan	Asia	Southern Asia	Developing regions	16	39	39	47	71	340	496	...	2978	3436	3009	2652	2111	1746	1758	2203	2635	2004
Albania	Europe	Southern Europe	Developed regions	1	0	0	0	0	0	1	...	1450	1223	856	702	560	716	561	539	620	603
Algeria	Africa	Northern Africa	Developing regions	80	67	71	69	63	44	69	...	3616	3626	4807	3623	4005	5393	4752	4325	3774	4331
American Samoa	Oceania	Polynesia	Developing regions	0	1	0	0	0	0	0	...	0	0	1	0	0	0	0	0	0	0
Andorra	Europe	Southern Europe	Developed regions	0	0	0	0	0	0	2	...	0	0	1	1	0	0	0	0	1	1

	Continent	Region	DevName	1980	1981	1982	1983	1984	1985	1986	...	2005	2006	2007	2008	2009	2010	2011	2012	2013	Total
Country
Afghanistan	Asia	Southern Asia	Developing regions	16	39	39	47	71	340	496	...	3436	3009	2652	2111	1746	1758	2203	2635	2004	58639
Albania	Europe	Southern Europe	Developed regions	1	0	0	0	0	0	1	...	1223	856	702	560	716	561	539	620	603	15699
Algeria	Africa	Northern Africa	Developing regions	80	67	71	69	63	44	69	...	3626	4807	3623	4005	5393	4752	4325	3774	4331	69439
American Samoa	Oceania	Polynesia	Developing regions	0	1	0	0	0	0	0	...	0	1	0	0	0	0	0	0	0	6
Andorra	Europe	Southern Europe	Developed regions	0	0	0	0	0	0	2	...	0	1	1	0	0	0	0	1	1	15

Area, Histogram, and Bar Chart Visualization Project¶

Table of Contents¶

Exploring Datasets with pandas and Matplotlib ¶

Getting to Know the Data¶

Downloading and Preparing the Data¶

Visualizing Data with Matplotlib ¶

Area Plots ¶

Area Plots¶

Two types of plotting¶

Histograms¶

Bar Charts¶

Vertical Bar Plot¶

Thank you for completing this lab!¶

© IBM Corporation 2020. All rights reserved.

Country	India	China	United Kingdom of Great Britain and Northern Ireland	Philippines	Pakistan
1980	8880	5123	22045	6051	978
1981	8670	6682	24796	5921	972
1982	8147	3308	20620	5249	1201
1983	7338	1863	10015	4562	900
1984	5704	1527	10170	3801	668

	1980	1981	1982	1983	1984	1985	1986	1987	1988	1989	...	2004	2005	2006	2007	2008	2009	2010	2011	2012	2013
Country
Denmark	272	293	299	106	93	73	93	109	129	129	...	89	62	101	97	108	81	92	93	94	81
Norway	116	77	106	51	31	54	56	80	73	76	...	73	57	53	73	66	75	46	49	53	59
Sweden	281	308	222	176	128	158	187	198	171	182	...	129	205	139	193	165	167	159	134	140	140

Area, Histogram, and Bar Chart Visualization Project¶

Table of Contents¶

Exploring Datasets with pandas and Matplotlib¶

Getting to Know the Data¶

Downloading and Preparing the Data¶

Visualizing Data with Matplotlib¶

Area Plots¶

Area Plots¶

Two types of plotting¶

Histograms¶

Bar Charts¶

Vertical Bar Plot¶

Thank you for completing this lab!¶

© IBM Corporation 2020. All rights reserved.

Exploring Datasets with pandas and Matplotlib ¶

Visualizing Data with Matplotlib ¶

Area Plots ¶