import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# Load the drinks dataset (one row per country) directly from remote storage.
df = pd.read_csv(
    filepath_or_buffer=(
        'https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/DA0101EN/edx/project/drinks.csv'
    )
)

# Notebook-style inspection: preview the first rows ...
df.head()

# ... and list the dtype of every column.
df.dtypes

country                          object
beer_servings                     int64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
continent                        object
dtype: object

# Total wine servings per continent; as_index=False keeps 'continent' as a
# regular column instead of promoting it to the index.
df_Wine = df.loc[:, ['continent', 'wine_servings']]
df_winegrp = df_Wine.groupby('continent', as_index=False).sum()
df_winegrp

# Summary statistics (count, mean, std, quartiles, ...) of beer servings,
# indexed by continent.
df_beer = df.loc[:, ['continent', 'beer_servings']]
df_beergrp = df_beer.groupby('continent', as_index=True).describe()
df_beergrp

import seaborn as sns

# One box per continent showing the spread of beer servings.
sns.boxplot(data=df_beer, x="continent", y="beer_servings")
plt.show()

import seaborn as sns

# Scatter plot of beer servings against wine servings with a fitted
# regression line.
sns.regplot(data=df, x="wine_servings", y="beer_servings")
plt.show()
# Observations from the plot:
# - Beer servings and wine servings appear to be positively correlated.
# - There also seem to be some places where only beer is served.

from sklearn.linear_model import LinearRegression

# Simple linear regression: predict total litres of pure alcohol from wine
# servings alone.
x = df[['wine_servings']]  # double brackets keep the predictor 2-D
y = df['total_litres_of_pure_alcohol']
lm = LinearRegression().fit(x, y)  # fit() returns the fitted estimator

# Predictions and R^2 are computed on the same data the model was fitted on.
yhat = lm.predict(x)
print(yhat[:5])
print("Intercept is ", lm.intercept_)
print("Slope is ", lm.coef_)
print("R^2 is ", lm.score(x, y))

[ 3.15407943  4.86088833  3.59658545 13.01564196  4.57642018]
Intercept is  3.1540794346874996
Slope is  [0.03160757]
R^2 is  0.4456875459787605

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Multiple linear regression on all three serving columns.
x_data = df[['beer_servings', 'spirit_servings', 'wine_servings']]
y_data = df['total_litres_of_pure_alcohol']

# Hold out 40% of the rows for testing; random_state fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.4,
    random_state=0,
)

lr = LinearRegression().fit(x_train, y_train)  # fit() returns the estimator
print("Train Data R^2:", lr.score(x_train, y_train))
print("Test Data R^2:", lr.score(x_test, y_test))

Train Data R^2: 0.9471204262013297
Test Data R^2: 0.7370737388267039

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Same 60/40 split as before (random_state=0), repeated so this cell can be
# run on its own once x_data / y_data exist.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.4, random_state=0)

# Pipeline stages: standardise the features, expand them to degree-2
# polynomial terms (no bias column; LinearRegression fits its own intercept),
# then fit an ordinary least-squares model.
# The list is named `steps` rather than `input` so the builtin input() is not
# shadowed for the rest of the session.
steps = [
    ('scale', StandardScaler()),
    ('polynomial', PolynomialFeatures(include_bias=False, degree=2)),
    ('model', LinearRegression()),
]
pipe = Pipeline(steps)
pipe.fit(x_train, y_train)  # every stage is fitted on the training split only

# Predictions for the full dataset (training and test rows alike).
yhat = pipe.predict(x_data)
print("R^2 using Test data is", pipe.score(x_test, y_test))
print("R^2 using Training data is", pipe.score(x_train, y_train))

R^2 using Test data is 0.7594556586231647
R^2 using Training data is 0.9555197146227157

/opt/conda/envs/Python36/lib/python3.6/site-packages/sklearn/preprocessing/data.py:645: DataConversionWarning: Data with input dtype int64 were all converted to float64 by StandardScaler.
  return self.partial_fit(X, y)
/opt/conda/envs/Python36/lib/python3.6/site-packages/sklearn/base.py:467: DataConversionWarning: Data with input dtype int64 were all converted to float64 by StandardScaler.
  return self.fit(X, y, **fit_params).transform(X)
/opt/conda/envs/Python36/lib/python3.6/site-packages/sklearn/pipeline.py:331: DataConversionWarning: Data with input dtype int64 were all converted to float64 by StandardScaler.
  Xt = transform.transform(Xt)
/opt/conda/envs/Python36/lib/python3.6/site-packages/sklearn/pipeline.py:511: DataConversionWarning: Data with input dtype int64 were all converted to float64 by StandardScaler.
  Xt = transform.transform(Xt)
/opt/conda/envs/Python36/lib/python3.6/site-packages/sklearn/pipeline.py:511: DataConversionWarning: Data with input dtype int64 were all converted to float64 by StandardScaler.
  Xt = transform.transform(Xt)

from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Same 60/40 split as the earlier cells (random_state=0).
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.4,
    random_state=0,
)

# Ridge regression with regularisation strength alpha=0.1.
RidgeModel = Ridge(alpha=0.1)
RidgeModel.fit(x_train, y_train)

# Predictions for the held-out rows, then R^2 on both splits.
yhat = RidgeModel.predict(x_test)
print("Test R^2:", RidgeModel.score(x_test, y_test))
print("Train R^2:", RidgeModel.score(x_train, y_train))

Test R^2: 0.7370737565866847
Train R^2: 0.9471204262013262

# Ridge regression on degree-2 polynomial features.
pr = PolynomialFeatures(degree=2)

# Fit the transformer on the training split only, then reuse that fitted
# transformer on the test split. Calling fit_transform on the test data would
# re-fit the transformer on held-out rows, which is the wrong pattern for any
# train/test workflow.
x_train_pr = pr.fit_transform(x_train)
x_test_pr = pr.transform(x_test)

RidgeModel_pr = Ridge(alpha=0.1)
RidgeModel_pr.fit(x_train_pr, y_train)
print("RidgeModel Test data R^2: ",RidgeModel_pr.score(x_test_pr, y_test))
print("RidgeModel Train data R^2: ",RidgeModel_pr.score(x_train_pr, y_train))

RidgeModel Test data R^2:  0.7594556764530759
RidgeModel Train data R^2:  0.9555197146226246

	country	beer_servings	spirit_servings	wine_servings	total_litres_of_pure_alcohol	continent
0	Afghanistan	0	0	0	0.0	Asia
1	Albania	89	132	54	4.9	Europe
2	Algeria	25	0	14	0.7	Africa
3	Andorra	245	138	312	12.4	Europe
4	Angola	217	57	45	5.9	Africa

	continent	wine_servings
0	Africa	862
1	Asia	399
2	Europe	6400
3	North America	564
4	Oceania	570
5	South America	749

	beer_servings
	count	mean	std	min	25%	50%	75%	max
continent
Africa	53.0	61.471698	80.557816	0.0	15.00	32.0	76.00	376.0
Asia	44.0	37.045455	49.469725	0.0	4.25	17.5	60.50	247.0
Europe	45.0	193.777778	99.631569	0.0	127.00	219.0	270.00	361.0
North America	23.0	145.434783	79.621163	1.0	80.00	143.0	198.00	285.0
Oceania	16.0	89.687500	96.641412	0.0	21.00	52.5	125.75	306.0
South America	12.0	175.083333	65.242845	93.0	129.50	162.5	198.00	333.0

Global Alcohol Consumption Patterns: Cross-Cultural Analysis

Project Overview

Research Objectives

Table of Contents

Executive Summary

Research Focus

Key Questions

Question 7