# !wget --no-check-certificate https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-SkillsNetwork/LargeData/m4_survey_data.sqlite

import sqlite3
conn = sqlite3.connect("m4_survey_data.sqlite") # open a database connection

import pandas as pd
import seaborn as sns
import numpy as np

# use the inline backend to generate the plots within the browser
%matplotlib inline 

import matplotlib as mpl
import matplotlib.pyplot as plt

mpl.style.use('ggplot') # optional: for ggplot-like style

# check for latest version of Matplotlib
print ('Matplotlib version: ', mpl.__version__) # >= 2.0.0

Matplotlib version:  3.5.1

# print how many rows are there in the table named 'master'
QUERY = """
SELECT COUNT(*)
FROM master
"""

# the read_sql_query runs the sql query and returns the data as a dataframe
df = pd.read_sql_query(QUERY,conn)
df.head()

# print all the tables names in the database
QUERY = """
SELECT name as Table_Name FROM
sqlite_master WHERE
type = 'table'
"""
# the read_sql_query runs the sql query and returns the data as a dataframe
pd.read_sql_query(QUERY,conn)

QUERY = """
SELECT Age,COUNT(*) as count
FROM master
group by age
order by age
"""
pd.read_sql_query(QUERY,conn)

table_name = 'master'  # the table you wish to describe

QUERY = """
SELECT sql FROM sqlite_master
WHERE name= '{}'
""".format(table_name)

df = pd.read_sql_query(QUERY,conn)
print(df.iat[0,0])

CREATE TABLE "master" (
"index" INTEGER,
  "Respondent" INTEGER,
  "MainBranch" TEXT,
  "Hobbyist" TEXT,
  "OpenSourcer" TEXT,
  "OpenSource" TEXT,
  "Employment" TEXT,
  "Country" TEXT,
  "Student" TEXT,
  "EdLevel" TEXT,
  "UndergradMajor" TEXT,
  "OrgSize" TEXT,
  "YearsCode" TEXT,
  "Age1stCode" TEXT,
  "YearsCodePro" TEXT,
  "CareerSat" TEXT,
  "JobSat" TEXT,
  "MgrIdiot" TEXT,
  "MgrMoney" TEXT,
  "MgrWant" TEXT,
  "JobSeek" TEXT,
  "LastHireDate" TEXT,
  "FizzBuzz" TEXT,
  "ResumeUpdate" TEXT,
  "CurrencySymbol" TEXT,
  "CurrencyDesc" TEXT,
  "CompTotal" REAL,
  "CompFreq" TEXT,
  "ConvertedComp" REAL,
  "WorkWeekHrs" REAL,
  "WorkRemote" TEXT,
  "WorkLoc" TEXT,
  "ImpSyn" TEXT,
  "CodeRev" TEXT,
  "CodeRevHrs" REAL,
  "UnitTests" TEXT,
  "PurchaseHow" TEXT,
  "PurchaseWhat" TEXT,
  "OpSys" TEXT,
  "BlockchainOrg" TEXT,
  "BlockchainIs" TEXT,
  "BetterLife" TEXT,
  "ITperson" TEXT,
  "OffOn" TEXT,
  "SocialMedia" TEXT,
  "Extraversion" TEXT,
  "ScreenName" TEXT,
  "SOVisit1st" TEXT,
  "SOVisitFreq" TEXT,
  "SOFindAnswer" TEXT,
  "SOTimeSaved" TEXT,
  "SOHowMuchTime" TEXT,
  "SOAccount" TEXT,
  "SOPartFreq" TEXT,
  "SOJobs" TEXT,
  "EntTeams" TEXT,
  "SOComm" TEXT,
  "WelcomeChange" TEXT,
  "Age" REAL,
  "Trans" TEXT,
  "Dependents" TEXT,
  "SurveyLength" TEXT,
  "SurveyEase" TEXT
)

# your code goes here

QUERY = """
SELECT ConvertedComp
FROM master
"""
df = pd.read_sql_query(QUERY,conn)
df["ConvertedComp"].plot.hist()
plt.title('Histogram of Salary converted to annual USD salaries using the exchange rate on 2019-02-01')
plt.ylabel('Number of Convertion')
plt.xlabel('Number of Salaries')

plt.show()

# your code goes here
QUERY = """
SELECT Age
FROM master
"""
df = pd.read_sql_query(QUERY,conn)

df["Age"].plot(kind='box', figsize=(10, 8))

plt.title('Box plot of Age and Number of People')
plt.ylabel('Number of People')

plt.show()

# your code goes here
QUERY = """
SELECT Age,WorkWeekHrs
FROM master
"""
df = pd.read_sql_query(QUERY,conn)

df.plot(kind='scatter', x='Age', y='WorkWeekHrs', figsize=(10, 6), color='darkblue')

plt.title('Work Week Hours of people by their age ')
plt.xlabel('Age')
plt.ylabel('Work Week Hours')

plt.show()

# your code goes here
QUERY = """
SELECT Age,WorkWeekHrs,CodeRevHrs
FROM master
"""
df = pd.read_sql_query(QUERY,conn)
df.head()

norm_age = (df['Age'] - df['Age'].min()) / (df['Age'].max() - df['Age'].min())

ax = df.plot(kind='scatter',
                    x='WorkWeekHrs',
                    y='CodeRevHrs',
                    figsize=(14, 8),
                    alpha=0.5,      # transparency
                    color='green',
                    s=norm_age *1000 # pass in weights 
        
                   )

ax.set_ylabel('CodeRevHrs')
ax.set_title('Work Week Hours and Code Revision Hours by Age of people')
ax.legend(['Age'], loc='upper right', fontsize='x-large')

<matplotlib.legend.Legend at 0x24809efa550>

# your code goes here

QUERY = """
SELECT count(DatabaseDesireNextYear) as Count,DatabaseDesireNextYear
FROM DatabaseDesireNextYear
Group BY DatabaseDesireNextYear
ORDER BY Count desc
LIMIT 5
"""
df = pd.read_sql_query(QUERY,conn)
explode_list = [ 0, 0, 0.1, 0.1, 0.2] # ratio for each database with which to offset each wedge.

df["Count"].plot(kind='pie',
                            figsize=(14,8),
                            autopct='%1.1f%%', 
                            startangle=90,    
                            shadow=True,       
                            labels=None,                 # turn off labels on pie chart
                            pctdistance=1.12,            # the ratio between the pie center and start of text label
                            explode=explode_list         # 'explode' lowest 3 databases
                            )
# scale the title up by 12% to match pctdistance
plt.title('Database Desire Next Year by StackOverflow Survey 2019', y=1.12) 

plt.axis('equal') 

# add legend
plt.legend(labels=df["DatabaseDesireNextYear"], loc='upper left')

<matplotlib.legend.Legend at 0x2480a0cb0d0>

QUERY="""
SELECT DatabaseWorkedWith,count(Respondent)
FROM DatabaseWorkedWith
group by Respondent having count(DatabaseWorkedWith)=1 and DatabaseWorkedWith='MySQL'"""
df2=pd.read_sql_query(QUERY,conn)
df2.count()

DatabaseWorkedWith    474
count(Respondent)     474
dtype: int64

QUERY = """
SELECT count(LanguageWorkedWith) as Count,LanguageWorkedWith
FROM LanguageWorkedWith
Group BY LanguageWorkedWith
ORDER BY Count desc
LIMIT 30
"""
df = pd.read_sql_query(QUERY,conn)
df.head(30)

# your code goes here

QUERY = """
SELECT count(LanguageDesireNextYear) as Count,LanguageDesireNextYear
FROM LanguageDesireNextYear
Group BY LanguageDesireNextYear
ORDER BY Count desc
LIMIT 5
"""
df = pd.read_sql_query(QUERY,conn)
explode_list = [ 0, 0, 0.1, 0.1, 0.2] # ratio for each database with which to offset each wedge.

df["Count"].plot(kind='pie',
                            figsize=(14, 8),
                            autopct='%1.1f%%', 
                            startangle=90,    
                            shadow=True,       
                            labels=None,                 # turn off labels on pie chart
                            pctdistance=1.12,            # the ratio between the pie center and start of text label
                            explode=explode_list         # 'explode' lowest 3 databases
                            )
# scale the title up by 12% to match pctdistance
plt.title('Language Desire Next Year by StackOverflow Survey 2019', y=1.12) 

plt.axis('equal') 

# add legend
plt.legend(labels=df["LanguageDesireNextYear"], loc='upper left')

<matplotlib.legend.Legend at 0x24809fcdaf0>

# your code goes here
QUERY = """
SELECT WorkWeekHrs,CodeRevHrs,Age
FROM Master
Where Age between 30 and 35

"""
df = pd.read_sql_query(QUERY,conn)
df = df.groupby("Age").median()
df.plot(kind='bar', stacked=True, figsize=(10, 6), rot=90) # rotate the xticks(labelled points on x-axis) by 90 degrees

plt.xlabel('Age')
plt.ylabel('Hours')
plt.title('Median Work Week Hours and Code Revision hours in 30-35 age group')

Text(0.5, 1.0, 'Median Work Week Hours and Code Revision hours in 30-35 age group')

# your code goes here
QUERY = """
SELECT ConvertedComp,Age
FROM Master
Where Age between 45 and 60

"""
df = pd.read_sql_query(QUERY,conn)
df = df.groupby("Age").median()
df.plot(kind='line',figsize=(10, 6))

plt.title('Median ConvertedComp in 45-60 age group')
plt.ylabel('ConvertedComp')
plt.xlabel('Age')

plt.show() # need this line to show the updates made to the figure

# your code goes here
QUERY = """
SELECT MainBranch,count(MainBranch) as count
FROM Master
Group by MainBranch
"""
df = pd.read_sql_query(QUERY,conn)

df = df.set_index("MainBranch")
df["count"]=df["count"].astype('int')

df.rename(columns={'count':'Total People Opinion'}, inplace=True)

fig, ax = plt.subplots(figsize=(12,6))
# df.rename(columns={'count':'Total People Opinion'}, inplace=True)

df.plot(kind='barh', legend = False, ax=ax, color='aquamarine')
ax.set_xlabel('Number of People',color='black',fontfamily="Montserrat",fontsize=14)
ax.set_ylabel('Main Branch',color='black',fontfamily="Montserrat",fontsize=14)
ax.set_xlabel('Number of People')
ax.set_title('People Opinion about MainBranch',color='black',fontfamily="Montserrat",fontsize=20,loc='left')

ax.set_facecolor('whitesmoke')

ax.tick_params(axis='x', colors='black')    #setting up X-axis tick color to black
ax.tick_params(axis='y', colors='black')    #setting up Y-axis tick color to black

plt.rcParams["font.family"] = "Montserrat"
plt.grid(visible=None)

# annotate value labels to each country
for i in range(len(df)): #enamurate returns tuple
    # print(df["Total People Opinion"][i])
    value = df["Total People Opinion"][i]
    label = format(int(value), ',') # format int with commas

# place text at the end of bar (subtracting 700 from x, and 0.1 from y to make it fit within the bar)
    plt.annotate(label, xy=(value-700, i-0.1), color='black')
plt.savefig("mainbranch.png",bbox_inches='tight')
plt.show()

# conn.close()

	Table_Name
0	EduOther
1	DevType
2	LastInt
3	JobFactors
4	WorkPlan
5	WorkChallenge
6	LanguageWorkedWith
7	LanguageDesireNextYear
8	DatabaseWorkedWith
9	DatabaseDesireNextYear
10	PlatformWorkedWith
11	PlatformDesireNextYear
12	WebFrameWorkedWith
13	WebFrameDesireNextYear
14	MiscTechWorkedWith
15	MiscTechDesireNextYear
16	DevEnviron
17	Containers
18	SOVisitTo
19	SONewContent
20	Gender
21	Sexuality
22	Ethnicity
23	master

	Age	count
0	NaN	287
1	16.0	3
2	17.0	6
3	18.0	29
4	19.0	78
5	20.0	109
6	21.0	203
7	22.0	406
8	23.0	581
9	24.0	679
10	25.0	738
11	26.0	720
12	27.0	724
13	28.0	787
14	29.0	697
15	30.0	651
16	31.0	531
17	32.0	489
18	33.0	483
19	34.0	395
20	35.0	393
21	36.0	308
22	37.0	280
23	38.0	279
24	39.0	232
25	40.0	187
26	41.0	136
27	42.0	162
28	43.0	100
29	44.0	95
30	45.0	85
31	46.0	66
32	47.0	68
33	48.0	64
34	49.0	66
35	50.0	57
36	51.0	29
37	52.0	41
38	53.0	32
39	54.0	26
40	55.0	13
41	56.0	16
42	57.0	11
43	58.0	12
44	59.0	11
45	60.0	2
46	61.0	10
47	62.0	5
48	63.0	7
49	65.0	2
50	66.0	1
51	67.0	1
52	69.0	1
53	71.0	2
54	72.0	1
55	99.0	1

	Count	LanguageWorkedWith
0	8687	JavaScript
1	7830	HTML/CSS
2	7106	SQL
3	4642	Bash/Shell/PowerShell
4	4542	Python
5	4506	Java
6	4288	C#
7	3232	TypeScript
8	2913	PHP
9	1946	C++
10	1578	C
11	1149	Ruby
12	1114	Go
13	840	Other(s):
14	751	Kotlin
15	707	Swift
16	628	VBA
17	585	R
18	518	Objective-C
19	492	Scala
20	437	Assembly
21	324	Rust
22	237	Dart
23	187	Elixir
24	164	Clojure
25	158	F#
26	133	WebAssembly
27	98	Erlang

Data Visualization¶

Data Visualization Lab¶

Objectives¶

Demo: How to work with database¶

Demo: How to run an sql query¶

Demo: How to list all tables¶

Demo: How to run a group by query¶

Demo: How to describe a table¶

Hands-on Lab¶

Visualizing distribution of data¶

Histograms¶

Box Plots¶

Visualizing relationships in data¶

Scatter Plots¶

Bubble Plots¶

Visualizing composition of data¶

Pie Charts¶

Stacked Charts¶

Visualizing comparison of data¶

Line Chart¶

Bar Chart¶