Automobile Price Prediction: Advanced Model Development & Engineering¶
By Mohammad Sayem Chowdhury
Last Updated: June 13, 2025
Project Overview¶
This notebook presents a comprehensive approach to developing sophisticated predictive models for automobile pricing. Using advanced machine learning techniques, feature engineering, and statistical modeling, I create robust prediction systems that can accurately estimate vehicle market values.
Business Objectives¶
- Primary Goal: Develop high-accuracy models for automobile price prediction
- Secondary Goal: Identify the most influential features affecting vehicle pricing
- Applied Goal: Create practical tools for market valuation and pricing strategies
Technical Approach¶
My methodology encompasses multiple modeling techniques including linear regression, polynomial regression, regularization methods, and ensemble approaches, with rigorous evaluation and cross-validation procedures.
Author: Mohammad Sayem Chowdhury
Project Type: Machine Learning Model Development
Domain: Automotive Price Analytics
Techniques: Regression Analysis, Feature Engineering, Model Evaluation
Table of Contents¶
Environment Setup & Data Preparation
- Library imports and configuration
- Dataset loading and preprocessing
- Feature engineering pipeline
Exploratory Model Analysis
- Baseline model establishment
- Feature selection methodology
- Initial performance benchmarks
Linear Regression Modeling
- Simple linear regression
- Multiple linear regression
- Model interpretation and diagnostics
Advanced Regression Techniques
- Polynomial regression
- Regularization methods (Ridge, Lasso)
- Cross-validation strategies
Model Evaluation & Validation
- Performance metrics analysis
- Residual analysis
- Model comparison framework
Production Model Selection
- Final model recommendation
- Business implementation guidelines
- Performance monitoring setup
Executive Summary¶
Key Research Questions¶
- Valuation Accuracy: How can we build models that provide reliable automobile price estimates?
- Feature Importance: Which vehicle characteristics have the strongest predictive power?
- Model Performance: What modeling approach yields the best balance of accuracy and interpretability?
Expected Outcomes¶
This analysis will deliver production-ready models capable of:
- Accurate price prediction within a 5-10% margin of error
- Clear feature importance rankings for business insights
- Scalable implementation for real-world pricing applications
Strategic Value: Enabling data-driven pricing decisions in automotive markets through robust predictive analytics.
Setup and Preparation¶
I'll use Python libraries like pandas, matplotlib, seaborn, scipy, and scikit-learn for all model development and evaluation tasks in this notebook.
If you need to install any libraries, use pip or conda as appropriate for your environment.¶
# Since I am running the lab in a browser, I will install the libraries using ``piplite``
# import piplite
# await piplite.install(['pandas'])
# await piplite.install(['matplotlib'])
# await piplite.install(['scipy'])
# await piplite.install(['seaborn'])
# await piplite.install(['scikit-learn'])
If you run the lab locally (for example with Anaconda), install any missing libraries and versions with pip or conda before proceeding:
# Uncomment and use pip or conda to install specific versions if needed.
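For example, a typical local install might look like the following (a sketch; pin versions as appropriate for your environment):
# !pip install pandas numpy matplotlib seaborn scipy scikit-learn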
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
This function can be used to download the dataset if needed. For my local analysis, I keep the data file in my working directory.
# This function will download the dataset into your browser
# from pyodide.http import pyfetch
# async def download(url, filename):
#     response = await pyfetch(url)
#     if response.status == 200:
#         with open(filename, "wb") as f:
#             f.write(await response.bytes())
The dataset for this project is loaded from the URL below. If you prefer to work offline, download the file once and point the path at your local copy; similar car price datasets are also available from public sources and repositories.
path = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DA0101EN-SkillsNetwork/labs/Data%20files/automobileEDA.csv'
If you're running this notebook locally, make sure the dataset is available in your working directory.
# you will need to download the dataset; if you are running locally, please comment out the following
# await download(path, "auto.csv")
# path="auto.csv"
Let's load the data and take a first look at the DataFrame:
df = pd.read_csv(path)
df.head()
| | symboling | normalized-losses | make | aspiration | num-of-doors | body-style | drive-wheels | engine-location | wheel-base | length | ... | compression-ratio | horsepower | peak-rpm | city-mpg | highway-mpg | price | city-L/100km | horsepower-binned | diesel | gas |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3 | 122 | alfa-romero | std | two | convertible | rwd | front | 88.6 | 0.811148 | ... | 9.0 | 111.0 | 5000.0 | 21 | 27 | 13495.0 | 11.190476 | Medium | 0 | 1 |
| 1 | 3 | 122 | alfa-romero | std | two | convertible | rwd | front | 88.6 | 0.811148 | ... | 9.0 | 111.0 | 5000.0 | 21 | 27 | 16500.0 | 11.190476 | Medium | 0 | 1 |
| 2 | 1 | 122 | alfa-romero | std | two | hatchback | rwd | front | 94.5 | 0.822681 | ... | 9.0 | 154.0 | 5000.0 | 19 | 26 | 16500.0 | 12.368421 | Medium | 0 | 1 |
| 3 | 2 | 164 | audi | std | four | sedan | fwd | front | 99.8 | 0.848630 | ... | 10.0 | 102.0 | 5500.0 | 24 | 30 | 13950.0 | 9.791667 | Medium | 0 | 1 |
| 4 | 2 | 164 | audi | std | four | sedan | 4wd | front | 99.4 | 0.848630 | ... | 8.0 | 115.0 | 5500.0 | 18 | 22 | 17450.0 | 13.055556 | Medium | 0 | 1 |
5 rows × 29 columns
1. Linear Regression and Multiple Linear Regression¶
I'll start by building simple and multiple linear regression models to predict car prices.
Linear Regression¶
One of the first models I'll use is simple linear regression. This method helps me understand the relationship between two variables:
- The predictor/independent variable (X)
- The response/dependent variable (Y)
The result is a linear function that predicts the response variable as a function of the predictor.
$$ Y: Response \ Variable \\ X: Predictor \ Variable $$
Linear Function $$ Y_{predicted} = a + bX $$
- a is the intercept (the value of Y when X is 0)
- b is the slope (how much Y changes when X increases by 1 unit)
Let's load the modules for linear regression:
from sklearn.linear_model import LinearRegression
Create the linear regression object:
lm = LinearRegression()
lm
LinearRegression()
How could "highway-mpg" help us predict car price?
For this example, we want to look at how highway-mpg can help us predict car price. Using simple linear regression, we will create a linear function with "highway-mpg" as the predictor variable and the "price" as the response variable.
X = df[['highway-mpg']]
Y = df['price']
Now, I'll fit the linear model using highway-mpg as the predictor variable.
lm.fit(X,Y)
LinearRegression()
We can output a prediction:
Yhat=lm.predict(X)
Yhat[0:5]
array([16236.50464347, 16236.50464347, 17058.23802179, 13771.3045085 ,
20345.17153508])
What is the value of the intercept (a)?
lm.intercept_
38423.305858157386
What is the value of the slope (b)?
lm.coef_
array([-821.73337832])
What is the final estimated linear model we get?
As we saw above, we should get a final linear model with the structure:
$$ Yhat = a + b X $$
Plugging in the actual values we get:
Price = 38423.31 - 821.73 x highway-mpg
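As a quick sanity check, the manual calculation a + bX and the predict method should agree. Here is a small sketch using an illustrative highway-mpg value of 30:
# Manual prediction: a + b * x for an illustrative highway-mpg of 30
mpg = 30
print(lm.intercept_ + lm.coef_[0] * mpg)
print(lm.predict(pd.DataFrame({'highway-mpg': [mpg]}))[0])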
Question #1 a):
Create a linear regression object called "lm1".
# Write your code below and press Shift+Enter to execute
lm1 = LinearRegression()
lm1
LinearRegression()
Click here for the solution
lm1 = LinearRegression()
lm1
Question #1 b):
Train the model using "engine-size" as the independent variable and "price" as the dependent variable.
# Write your code below and press Shift+Enter to execute
X1 = df[['engine-size']]
lm1.fit(X1,Y)
LinearRegression()
Click here for the solution
lm1.fit(df[['engine-size']], df[['price']])
lm1
Question #1 c):
Find the slope and intercept of the model.
Slope
# Write your code below and press Shift+Enter to execute
lm1.coef_
array([166.86001569])
Intercept
# Write your code below and press Shift+Enter to execute
lm1.intercept_
-7963.338906281049
Click here for the solution
# Slope
lm1.coef_
# Intercept
lm1.intercept_
Question #1 d):
What is the equation of the predicted line? You can write it in terms of x and yhat, or in terms of "engine-size" and "price".
# Write your code below and press Shift+Enter to execute
# using X and Y
Yhat=-7963.34 + 166.86*X
# equivalently, in terms of the original column names: Price = -7963.34 + 166.86 * engine-size
Click here for the solution
# using X and Y
Yhat=-7963.34 + 166.86*X
# Price = -7963.34 + 166.86 * engine-size
Multiple Linear Regression
What if we want to predict car price using more than one variable?
If we want to use more variables in our model to predict car price, we can use Multiple Linear Regression. Multiple Linear Regression is very similar to Simple Linear Regression, but this method is used to explain the relationship between one continuous response (dependent) variable and two or more predictor (independent) variables. Most real-world regression models involve multiple predictors. We will illustrate the structure using four predictor variables, but these results generalize to any number of predictors:
$$ Y: Response \ Variable \\ X_1: Predictor \ Variable \ 1 \\ X_2: Predictor \ Variable \ 2 \\ X_3: Predictor \ Variable \ 3 \\ X_4: Predictor \ Variable \ 4 $$
$$ a: intercept \\ b_1: coefficient \ of \ Variable \ 1 \\ b_2: coefficient \ of \ Variable \ 2 \\ b_3: coefficient \ of \ Variable \ 3 \\ b_4: coefficient \ of \ Variable \ 4 $$
The equation is given by:
$$ Yhat = a + b_1 X_1 + b_2 X_2 + b_3 X_3 + b_4 X_4 $$
From the previous section we know that other good predictors of price could be:
- Horsepower
- Curb-weight
- Engine-size
- Highway-mpg
Z = df[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']]
Fit the linear model using the four above-mentioned variables.
lm.fit(Z, df['price'])
LinearRegression()
What is the value of the intercept(a)?
lm.intercept_
-15806.624626329198
What are the values of the coefficients (b1, b2, b3, b4)?
lm.coef_
array([53.49574423, 4.70770099, 81.53026382, 36.05748882])
What is the final estimated linear model that we get?
As we saw above, we should get a final linear function with the structure:
$$ Yhat = a + b_1 X_1 + b_2 X_2 + b_3 X_3 + b_4 X_4 $$
What is the linear function we get in this example?
Price = -15806.62 + 53.4957 x horsepower + 4.7077 x curb-weight + 81.5303 x engine-size + 36.0575 x highway-mpg
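As with the simple model, a small spot check can confirm that the intercept and coefficients reproduce the fitted values. A sketch using the first row of Z:
# Reproduce the first fitted value by hand: a + b1*x1 + b2*x2 + b3*x3 + b4*x4
import numpy as np
first_car = Z.iloc[0]
print(lm.intercept_ + np.dot(lm.coef_, first_car))
print(lm.predict(Z.iloc[[0]])[0])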
2.1 Exploring Multiple Variable Relationships¶
Building on my single-variable analysis, I'm curious to investigate how combining multiple predictors might improve prediction accuracy. I want to explore the relationship between normalized losses and highway fuel efficiency in predicting automobile prices. This combination represents both risk assessment (normalized losses) and fuel economy considerations that likely influence pricing decisions.
# Write your code below and press Shift+Enter to execute
lm2 = LinearRegression()
lm2.fit(df[['normalized-losses' , 'highway-mpg']],df['price'])
LinearRegression()
Click here for the solution
lm2 = LinearRegression()
lm2.fit(df[['normalized-losses' , 'highway-mpg']],df['price'])
Question #2 b):
Find the coefficients of the model.
# Write your code below and press Shift+Enter to execute
lm2.coef_
array([ 1.49789586, -820.45434016])
Click here for the solution
lm2.coef_
2. Model Evaluation Using Visualization
Now that I've developed some models, how do I evaluate my models and choose the best one? One way to do this is by using a visualization.
Import the visualization package, seaborn:
# import the visualization package: seaborn
import seaborn as sns
%matplotlib inline
Regression Plot
When it comes to simple linear regression, an excellent way to visualize the fit of our model is by using regression plots.
This plot will show a combination of scattered data points (a scatterplot), as well as the fitted linear regression line going through the data. This gives us a reasonable estimate of the relationship between the two variables, the strength of the correlation, as well as the direction (positive or negative correlation).
Let's visualize highway-mpg as potential predictor variable of price:
width = 12
height = 10
plt.figure(figsize=(width, height))
sns.regplot(x="highway-mpg", y="price", data=df)
plt.ylim(0,)
(0.0, 48177.41357088331)
We can see from this plot that price is negatively correlated to highway-mpg since the regression slope is negative.
One thing to keep in mind when looking at a regression plot is to pay attention to how scattered the data points are around the regression line. This will give you a good indication of the variance of the data and whether a linear model would be the best fit or not. If the data is too far off from the line, this linear model might not be the best model for this data.
Let's compare this plot to the regression plot of "peak-rpm".
plt.figure(figsize=(width, height))
sns.regplot(x="peak-rpm", y="price", data=df)
plt.ylim(0,)
(0.0, 47414.1)
Comparing the regression plot of "peak-rpm" and "highway-mpg", we see that the points for "highway-mpg" are much closer to the generated line and, on average, decrease. The points for "peak-rpm" have more spread around the predicted line and it is much harder to determine if the points are decreasing or increasing as the "peak-rpm" increases.
Question #3:
Given the regression plots above, is "peak-rpm" or "highway-mpg" more strongly correlated with "price"? Use the method ".corr()" to verify your answer.
# Write your code below and press Shift+Enter to execute
df[["peak-rpm","highway-mpg","price"]].corr()
| peak-rpm | highway-mpg | price | |
|---|---|---|---|
| peak-rpm | 1.000000 | -0.058598 | -0.101616 |
| highway-mpg | -0.058598 | 1.000000 | -0.704692 |
| price | -0.101616 | -0.704692 | 1.000000 |
Click here for the solution
# The variable "highway-mpg" has a stronger correlation with "price", it is approximate -0.704692 compared to "peak-rpm" which is approximate -0.101616. You can verify it using the following command:
df[["peak-rpm","highway-mpg","price"]].corr()
Residual Plot
A good way to visualize the variance of the data is to use a residual plot.
What is a residual?
The difference between the observed value (y) and the predicted value (Yhat) is called the residual (e). When we look at a regression plot, the residual is the distance from the data point to the fitted regression line.
So what is a residual plot?
A residual plot is a graph that shows the residuals on the vertical y-axis and the independent variable on the horizontal x-axis.
What do we pay attention to when looking at a residual plot?
We look at the spread of the residuals:
- If the points in a residual plot are randomly spread out around the x-axis, then a linear model is appropriate for the data.
Why is that? Randomly spread out residuals means that the variance is constant, and thus the linear model is a good fit for this data.
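Since a residual is simply the observed value minus the predicted value, it can also be computed directly. A minimal sketch for the highway-mpg model (re-fitting a fresh estimator so it does not depend on the current state of lm):
# Residuals by hand: e = y - yhat for the simple highway-mpg model
from sklearn.linear_model import LinearRegression
slr = LinearRegression().fit(df[['highway-mpg']], df['price'])
residuals = df['price'] - slr.predict(df[['highway-mpg']])
residuals.head()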
width = 12
height = 10
plt.figure(figsize=(width, height))
sns.residplot(x=df['highway-mpg'], y=df['price'])
plt.show()
What is this plot telling us?
We can see from this residual plot that the residuals are not randomly spread around the x-axis, leading us to believe that maybe a non-linear model is more appropriate for this data.
Multiple Linear Regression
How do we visualize a model for Multiple Linear Regression? This gets a bit more complicated because you can't visualize it with a regression or residual plot.
One way to look at the fit of the model is by looking at the distribution plot. We can look at the distribution of the fitted values that result from the model and compare it to the distribution of the actual values.
First, let's make a prediction:
Y_hat = lm.predict(Z)
plt.figure(figsize=(width, height))
ax1 = sns.kdeplot(df['price'], color="r", label="Actual Value")
sns.kdeplot(Y_hat, color="b", label="Fitted Values", ax=ax1)
plt.legend()
plt.title('Actual vs Fitted Values for Price')
plt.xlabel('Price (in dollars)')
plt.ylabel('Proportion of Cars')
plt.show()
plt.close()
We can see that the fitted values are reasonably close to the actual values since the two distributions overlap a bit. However, there is definitely some room for improvement.
3. Polynomial Regression and Pipelines
Polynomial regression is a particular case of the general linear regression model or multiple linear regression models.
We get non-linear relationships by squaring or setting higher-order terms of the predictor variables.
There are different orders of polynomial regression:
- Quadratic (2nd order): Yhat = a + b_1 X + b_2 X^2
- Cubic (3rd order): Yhat = a + b_1 X + b_2 X^2 + b_3 X^3
- Higher-order fits add further powers of the predictor in the same way.
We saw earlier that a linear model did not provide the best fit while using "highway-mpg" as the predictor variable. Let's see if we can try fitting a polynomial model to the data instead.
We will use the following function to plot the data:
def PlotPolly(model, independent_variable, dependent_variable, Name):
    # Evaluate the fitted polynomial on a smooth grid for plotting
    x_new = np.linspace(15, 55, 100)
    y_new = model(x_new)

    plt.plot(independent_variable, dependent_variable, '.', x_new, y_new, '-')
    plt.title('Polynomial Fit for Price ~ ' + Name)
    ax = plt.gca()
    ax.set_facecolor((0.898, 0.898, 0.898))
    fig = plt.gcf()
    plt.xlabel(Name)
    plt.ylabel('Price of Cars')
    plt.show()
    plt.close()
Let's get the variables:
x = df['highway-mpg']
y = df['price']
Let's fit the polynomial using the function polyfit, then use the function poly1d to display the polynomial function.
# Here we use a polynomial of the 3rd order (cubic)
f = np.polyfit(x, y, 3)
p = np.poly1d(f)
print(p)
        3         2
-1.557 x + 204.8 x - 8965 x + 1.379e+05
Let's plot the function:
PlotPolly(p, x, y, 'highway-mpg')
np.polyfit(x, y, 3)
array([-1.55663829e+00, 2.04754306e+02, -8.96543312e+03, 1.37923594e+05])
We can already see from plotting that this polynomial model performs better than the linear model. This is because the generated polynomial function "hits" more of the data points.
4.3 High-Order Polynomial Analysis¶
The cubic polynomial shows improved performance over the linear model. I'm curious to explore how a higher-order polynomial might capture even more complex relationships. Let me create an 11th-order polynomial model to see if increased complexity yields better fit.
# Let me explore the complexity of an 11th-order polynomial model
f1 = np.polyfit(x, y, 11)
p1 = np.poly1d(f1)
print(p1)
PlotPolly(p1,x,y, 'Highway MPG')
            11             10             9           8         7
-1.243e-08 x  + 4.722e-06 x  - 0.0008028 x + 0.08056 x - 5.297 x
          6        5             4             3             2
 + 239.5 x - 7588 x + 1.684e+05 x - 2.565e+06 x + 2.551e+07 x - 1.491e+08 x + 3.879e+08
Analysis Insight: The 11th-order polynomial demonstrates the potential for overfitting - while it may fit the training data more closely, such high complexity can reduce generalization to new data. This exploration helps me understand the trade-off between model complexity and robustness.
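To make this concrete, we can compare the in-sample R^2 of the two fits (a sketch using r2_score; a higher-order polynomial always fits the training data at least as well, so in-sample R^2 alone cannot reveal overfitting):
from sklearn.metrics import r2_score
# In-sample fit of the cubic (p) and 11th-order (p1) polynomials defined above
print('Cubic fit R^2:      ', r2_score(y, p(x)))
print('11th-order fit R^2: ', r2_score(y, p1(x)))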
The analytical expression for a multivariate polynomial function gets complicated. For example, the expression for a second-order (degree=2) polynomial with two variables is given by:
$$ Yhat = a + b_1 X_1 + b_2 X_2 + b_3 X_1 X_2 + b_4 X_1^2 + b_5 X_2^2 $$
We can perform a polynomial transform on multiple features. First, we import the module:
from sklearn.preprocessing import PolynomialFeatures
We create a PolynomialFeatures object of degree 2:
pr=PolynomialFeatures(degree=2)
pr
PolynomialFeatures()
Z_pr=pr.fit_transform(Z)
In the original data, there are 201 samples and 4 features.
Z.shape
(201, 4)
After the transformation, there are 201 samples and 15 features: the bias term, the 4 original features, their 4 squares, and the 6 pairwise interaction terms.
Z_pr.shape
(201, 15)
Pipeline
Data Pipelines simplify the steps of processing the data. We use the module Pipeline to create a pipeline. We also use StandardScaler as a step in our pipeline.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
We create the pipeline by creating a list of tuples including the name of the model or estimator and its corresponding constructor.
Input=[('scale',StandardScaler()), ('polynomial', PolynomialFeatures(include_bias=False)), ('model',LinearRegression())]
We input the list as an argument to the pipeline constructor:
pipe=Pipeline(Input)
pipe
Pipeline(steps=[('scale', StandardScaler()),
('polynomial', PolynomialFeatures(include_bias=False)),
('model', LinearRegression())])
First, we convert the data type Z to type float to avoid conversion warnings that may appear as a result of StandardScaler taking float inputs.
Then, we can normalize the data, perform a transform and fit the model simultaneously.
Z = Z.astype(float)
pipe.fit(Z,y)
Pipeline(steps=[('scale', StandardScaler()),
('polynomial', PolynomialFeatures(include_bias=False)),
('model', LinearRegression())])
Similarly, we can normalize the data, perform a transform and produce a prediction simultaneously.
ypipe=pipe.predict(Z)
ypipe[0:4]
array([13102.74784201, 13102.74784201, 18225.54572197, 10390.29636555])
5.1 Streamlined Pipeline for Linear Regression¶
Now I want to create a simpler pipeline that focuses on standardization and linear regression without polynomial features. This will help me compare the performance of different approaches and understand how much the polynomial transformation contributes to model accuracy.
# Creating a streamlined pipeline with standardization and linear regression
Input1=[('scale',StandardScaler()),('model',LinearRegression())]
pipe1 =Pipeline(Input1)
pipe1
pipe1.fit(Z,y)
ypipe=pipe1.predict(Z)
ypipe[0:10]
array([13699.11161184, 13699.11161184, 19051.65470233, 10620.36193015,
15521.31420211, 13869.66673213, 15456.16196732, 15974.00907672,
17612.35917161, 10722.32509097])
Pipeline Analysis: This simpler pipeline provides a clean baseline for comparison. By standardizing the features before applying linear regression, I ensure that all variables contribute equally to the model regardless of their original scales.
Click here for the solution
pipe1.fit(Z,y)
ypipe = pipe1.predict(Z)
ypipe[0:10]
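To quantify how much the polynomial transformation contributes, we can compare the two pipelines on the training data (an in-sample sketch using r2_score):
from sklearn.metrics import r2_score
# In-sample R^2 of the polynomial pipeline (pipe) vs. the linear-only pipeline (pipe1)
print('Polynomial pipeline R^2: ', r2_score(y, pipe.predict(Z)))
print('Linear-only pipeline R^2:', r2_score(y, pipe1.predict(Z)))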
4. Measures for In-Sample Evaluation
When evaluating our models, not only do we want to visualize the results, but we also want a quantitative measure to determine how accurate the model is.
Two very important measures that are often used in Statistics to determine the accuracy of a model are:
- R^2 / R-squared
- Mean Squared Error (MSE)
R-squared
R squared, also known as the coefficient of determination, is a measure to indicate how close the data is to the fitted regression line.
The value of the R-squared is the percentage of variation of the response variable (y) that is explained by a linear model.
Mean Squared Error (MSE)
The Mean Squared Error measures the average of the squares of the errors, that is, the squared differences between the actual value (y) and the estimated value (ŷ).
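Written out, the two measures are:
$$ R^2 = 1 - \frac{\sum_{i}(y_i - \hat{y}_i)^2}{\sum_{i}(y_i - \bar{y})^2} $$
$$ MSE = \frac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)^2 $$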
Model 1: Simple Linear Regression
Let's calculate the R^2:
#highway_mpg_fit
lm.fit(X, Y)
# Find the R^2
print('The R-square is: ', lm.score(X, Y))
The R-square is: 0.4965911884339175
We can say that ~49.659% of the variation of the price is explained by this simple linear model using "highway-mpg".
Let's calculate the MSE:
We can predict the output i.e., "yhat" using the predict method, where X is the input variable:
Yhat=lm.predict(X)
print('The output of the first four predicted value is: ', Yhat[0:4])
The output of the first four predicted value is: [16236.50464347 16236.50464347 17058.23802179 13771.3045085 ]
Let's import the function mean_squared_error from the module metrics:
from sklearn.metrics import mean_squared_error
We can compare the predicted results with the actual results:
mse = mean_squared_error(df['price'], Yhat)
print('The mean square error of price and predicted value is: ', mse)
The mean square error of price and predicted value is: 31635042.944639895
Model 2: Multiple Linear Regression
Let's calculate the R^2:
# fit the model
lm.fit(Z, df['price'])
# Find the R^2
print('The R-square is: ', lm.score(Z, df['price']))
The R-square is: 0.8093562806577457
We can say that ~80.936% of the variation of price is explained by this multiple linear regression "multi_fit".
Let's calculate the MSE.
We produce a prediction:
Y_predict_multifit = lm.predict(Z)
We compare the predicted results with the actual results:
print('The mean square error of price and predicted value using multifit is: ', \
mean_squared_error(df['price'], Y_predict_multifit))
The mean square error of price and predicted value using multifit is: 11980366.87072649
Model 3: Polynomial Fit
Let's calculate the R^2.
Let's import the function r2_score from the module metrics, since the polynomial model is not a scikit-learn estimator and does not provide a .score() method.
from sklearn.metrics import r2_score
We apply the function to get the value of R^2:
r_squared = r2_score(y, p(x))
print('The R-square value is: ', r_squared)
The R-square value is: 0.674194666390652
We can say that ~67.419 % of the variation of price is explained by this polynomial fit.
MSE
We can also calculate the MSE:
mean_squared_error(df['price'], p(x))
20474146.426361218
5. Prediction and Decision Making
Prediction
In the previous section, we trained the model using the method fit. Now we will use the method predict to produce a prediction. Let's import pyplot for plotting; we will also be using some functions from numpy.
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
Create a new input:
new_input = pd.DataFrame({'highway-mpg': np.arange(1, 100, 1)})
Fit the model:
lm.fit(X, Y)
lm
LinearRegression()
Produce a prediction:
yhat=lm.predict(new_input)
yhat[0:5]
array([37601.57247984, 36779.83910151, 35958.10572319, 35136.37234487,
34314.63896655])
We can plot the data:
plt.plot(new_input, yhat)
plt.show()
Decision Making: Determining a Good Model Fit
Now that we have visualized the different models, and generated the R-squared and MSE values for the fits, how do we determine a good model fit?
- What is a good R-squared value?
When comparing models, the model with the higher R-squared value is a better fit for the data.
- What is a good MSE?
When comparing models, the model with the smallest MSE value is a better fit for the data.
Let's take a look at the values for the different models.
Simple Linear Regression: Using Highway-mpg as a Predictor Variable of Price.
- R-squared: 0.49659118843391759
- MSE: 3.16 x10^7
Multiple Linear Regression: Using Horsepower, Curb-weight, Engine-size, and Highway-mpg as Predictor Variables of Price.
- R-squared: 0.8093562806577457
- MSE: 1.2 x10^7
Polynomial Fit: Using Highway-mpg as a Predictor Variable of Price.
- R-squared: 0.6741946663906514
- MSE: 2.05 x 10^7
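For convenience, these in-sample results can be gathered into a single comparison table (a sketch; the numbers are copied from the outputs above):
# Summary of the in-sample results computed earlier in this notebook
import pandas as pd
comparison = pd.DataFrame({
    'Model': ['SLR (highway-mpg)', 'MLR (4 predictors)', 'Polynomial, degree 3 (highway-mpg)'],
    'R-squared': [0.4966, 0.8094, 0.6742],
    'MSE': [3.16e7, 1.20e7, 2.05e7]
})
comparison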
Simple Linear Regression Model (SLR) vs Multiple Linear Regression Model (MLR)
Usually, the more variables you have, the better your model is at predicting, but this is not always true. Sometimes you may not have enough data, you may run into numerical problems, or many of the variables may not be useful and even act as noise. As a result, you should always check the MSE and R^2.
In order to compare the results of the MLR vs SLR models, we look at a combination of both the R-squared and MSE to make the best conclusion about the fit of the model.
- MSE: The MSE of SLR is 3.16x10^7 while MLR has an MSE of 1.2 x10^7. The MSE of MLR is much smaller.
- R-squared: In this case, we can also see that there is a big difference between the R-squared of the SLR and the R-squared of the MLR. The R-squared for the SLR (~0.497) is very small compared to the R-squared for the MLR (~0.809).
This R-squared, in combination with the MSE, shows that MLR seems like the better model fit in this case compared to SLR.
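The comparison above rests on in-sample metrics. A minimal out-of-sample sketch using scikit-learn's cross_val_score (with an arbitrary choice of 4 folds) would compare the same feature sets as follows:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
# Mean out-of-fold R^2 for the SLR and MLR feature sets
slr_scores = cross_val_score(LinearRegression(), df[['highway-mpg']], df['price'], cv=4)
mlr_scores = cross_val_score(LinearRegression(), Z, df['price'], cv=4)
print('SLR mean R^2 across folds:', slr_scores.mean())
print('MLR mean R^2 across folds:', mlr_scores.mean())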
Simple Linear Model (SLR) vs. Polynomial Fit
- MSE: We can see that Polynomial Fit brought down the MSE, since this MSE is smaller than the one from the SLR.
- R-squared: The R-squared for the Polynomial Fit is larger than the R-squared for the SLR, so the Polynomial Fit also brought up the R-squared quite a bit.
Since the Polynomial Fit resulted in a lower MSE and a higher R-squared, we can conclude that this was a better fit model than the simple linear regression for predicting "price" with "highway-mpg" as a predictor variable.
Multiple Linear Regression (MLR) vs. Polynomial Fit
- MSE: The MSE for the MLR is smaller than the MSE for the Polynomial Fit.
- R-squared: The R-squared for the MLR is also much larger than for the Polynomial Fit.
Conclusion
Comparing these three models, we conclude that the MLR model is the best model for predicting price from our dataset. This result makes sense, since the dataset contains many candidate predictor variables and we know that more than one of them is a meaningful predictor of the final car price.
Thank you for completing this lab!¶
This notebook and all analysis were created by Mohammad Sayem Chowdhury as a personal data science showcase.
Thank you for exploring my approach to model development! If you have any feedback or suggestions, feel free to reach out.