import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(rc={'figure.figsize':(11.7,8.27)})
from sklearn import preprocessing
from sklearn.preprocessing import normalize
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
from sklearn import linear_model
# Read the CSV input file into a pandas DataFrame
data = pd.read_csv('Data_miniproject.csv')
data.columns
Index(['Cost', 'Weight', 'Weight1', 'Length', 'Height', 'Width'], dtype='object')
# Check the first 5 rows of the data
data.head(5)
EDA
# checking each column's data type and whether it accepts nulls
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Cost 159 non-null float64
1 Weight 159 non-null float64
2 Weight1 159 non-null float64
3 Length 159 non-null float64
4 Height 159 non-null float64
5 Width 159 non-null float64
dtypes: float64(6)
memory usage: 7.6 KB
# Generate descriptive statistics.
# Checking Min, Max, Mean, Std for all the columns
data.describe(percentiles=[.10, .25, .5, .75])
Check for null values
# checking column wise null values
data.isnull().any()
Cost False
Weight False
Weight1 False
Length False
Height False
Width False
dtype: bool
# checking column wise null value count
data.isnull().sum()
Cost 0
Weight 0
Weight1 0
Length 0
Height 0
Width 0
dtype: int64
# checking overall null value count
data.isnull().sum().sum()
0
sns.jointplot(x="Length", y="Cost",kind="reg", data=data)
sns.jointplot(x="Height", y="Cost",kind="reg",data=data)
# checking the frequency distribution
data.hist(figsize=(10,10))
# For each pair of features (columns) in the dataset, we can visualize the scatter plot for each pair
# along with the feature’s histogram along the diagonal
sns.pairplot(data)
Removing outliers
# The descriptive statistics show a minimum Cost of 0, so check which rows have Cost == 0
data[data['Cost']==0]
# Removing rows where cost is 0.
data.drop(data[data['Cost']==0].index,inplace=True)
# Finding outliers using boxplot
melted_df = pd.melt(data.drop('Cost', axis=1))
sns.boxplot(x="variable", y="value", data=melted_df)
data[(data['Weight'] >= 55)&(data['Length']>=64)]
# The points plotted beyond the whiskers in the boxplot are outliers, so remove those rows
data.drop(data[(data['Weight'] >= 55)&(data['Length']>=64)].index, inplace=True)
data.shape
(155, 6)
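The Weight >= 55 / Length >= 64 thresholds above were read off the boxplot by eye. A common, more systematic alternative is the 1.5 x IQR rule; a minimal sketch for comparison only (not used for the results below):
# Hypothetical alternative: flag rows outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] in any column
def iqr_outlier_mask(df, cols):
    mask = pd.Series(False, index=df.index)
    for col in cols:
        q1, q3 = df[col].quantile([0.25, 0.75])
        iqr = q3 - q1
        mask |= (df[col] < q1 - 1.5 * iqr) | (df[col] > q3 + 1.5 * iqr)
    return mask
# Inspection only, e.g.: data[iqr_outlier_mask(data, ['Weight', 'Length', 'Height', 'Width'])]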
groups = melted_df.groupby("variable")
for name, group in groups:
    plt.plot(group["variable"], group["value"], marker="o", label=name)
sns.scatterplot(data=data.drop('Cost', axis=1))
print(data.drop('Cost',axis=1).cumsum())
Weight Weight1 Length Height Width
0 23.2 25.4 30.0 11.5200 4.0200
1 47.2 51.7 61.2 24.0000 8.3256
2 71.1 78.2 92.3 36.3778 13.0217
3 97.4 107.2 125.8 49.1078 17.4772
4 123.9 136.2 159.8 61.5518 22.6112
.. ... ... ... ... ...
154 3932.5 4259.5 4687.6 1379.3888 672.7894
155 3944.2 4271.9 4701.1 1381.8188 674.0584
156 3956.3 4284.9 4714.9 1384.0958 675.3142
157 3969.5 4299.2 4730.1 1386.9686 677.3814
158 3983.3 4314.2 4746.3 1389.9008 679.2606
[155 rows x 5 columns]
# plotting the cumulative values across all the rows
data.drop('Cost',axis=1).cumsum().plot()
Correlations
corr = data.corr()
sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns, annot=True)
# displaying the 25 largest absolute correlations (the six self-correlations, and each off-diagonal pair twice)
print(data.corr().abs().unstack().sort_values(ascending = False).nlargest(25))
Width Width 1.000000
Height Height 1.000000
Weight Weight 1.000000
Weight1 Weight1 1.000000
Length Length 1.000000
Cost Cost 1.000000
Weight1 Weight 0.999417
Weight Weight1 0.999417
Weight1 Length 0.993026
Length Weight1 0.993026
Weight Length 0.990580
Length Weight 0.990580
Width Cost 0.924029
Cost Width 0.924029
Cost Length 0.907373
Length Cost 0.907373
Cost Weight1 0.899734
Weight1 Cost 0.899734
Width Length 0.896793
Length Width 0.896793
Weight Cost 0.895740
Cost Weight 0.895740
Width Weight1 0.895324
Weight1 Width 0.895324
Weight Width 0.888883
dtype: float64
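Because corr().unstack() lists the six self-correlations and every off-diagonal pair twice, a small helper that keeps only the upper triangle shows each pair once; a sketch (the helper name is illustrative):
# Hypothetical helper: unique off-diagonal correlation pairs, strongest first
def top_corr_pairs(df, n=10):
    c = df.corr().abs()
    upper = c.where(np.triu(np.ones(c.shape, dtype=bool), k=1))  # keep each pair once
    return upper.unstack().dropna().sort_values(ascending=False).head(n)
# top_corr_pairs(data)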
## Weight and Weight1 are almost perfectly correlated (0.999417), so one of them is redundant
## Weight1-Cost correlation (0.899734) is slightly higher than Weight-Cost (0.895740), so keeping Weight1 and dropping Weight
data.drop('Weight', axis=1,inplace=True)
data.head(3)
Feature Scaling
#data = normalize(data, axis=0)
# Scale the features using min-max scaling
minmax_scale = preprocessing.MinMaxScaler().fit(data.iloc[:,1:])
data.iloc[:,1:] = minmax_scale.transform(data.iloc[:,1:])
data.head(3)
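MinMaxScaler maps each feature to [0, 1] via x' = (x - min) / (max - min), with min and max learned during fit. A quick sanity check of that formula against the scaler (illustrative only; it re-reads the raw file, so it includes the dropped rows):
# Sanity check: manual min-max scaling should match MinMaxScaler's output
raw = pd.read_csv('Data_miniproject.csv').iloc[:, 1:]
manual = (raw - raw.min()) / (raw.max() - raw.min())
scaled = preprocessing.MinMaxScaler().fit_transform(raw)
print(np.allclose(manual.values, scaled))  # expect True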
X = data.iloc[:,1:].values
y = data.iloc[:,0:1].values
# Splitting the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
learning_rate = 0.5
max_iteration = 5000
s_learning_rate = 0.05
s_max_iteration = 1000
mb_learning_rate = 0.05
mb_max_iteration = 10000
batch_size = 30
# 5x1 parameter vectors (intercept + 4 feature weights; data.shape[1] counts Cost plus the 4 features)
theta = np.zeros((data.shape[1], 1))
s_theta = np.zeros((data.shape[1], 1))
mb_theta = np.zeros((data.shape[1], 1))
class GradientDescent:
    '''Gradient descent class containing batch, mini-batch and stochastic variants'''
    def __init__(self):
        '''Constructor of GradientDescent'''
        print('GradientDescent Class initiated')
    def get_metrics(self, x, y, theta):
        '''Calculate the metrics MAE, MSE, RMSE and R2'''
        y_pred = [(theta[1] * x1) + (theta[2] * x2) + (theta[3] * x3) + (theta[4] * x4) + theta[0]
                  for x1, x2, x3, x4 in x]
        mae = mean_absolute_error(y, y_pred)
        mse = mean_squared_error(y, y_pred)
        metrics = "MAE = {0}, MSE = {1}, RMSE = {2}, R2 = {3}".format(mae, mse, math.sqrt(mse), r2_score(y, y_pred))
        return metrics
    def h(self, theta, X):
        '''Hypothesis function: prepend a bias column of ones and compute X @ theta'''
        tempX = np.ones((X.shape[0], X.shape[1] + 1))
        tempX[:, 1:] = X
        res = np.matmul(tempX, theta)
        return res
    def loss(self, theta, X, Y):
        '''Loss function: half the mean squared difference between actual and predicted values'''
        return np.average(np.square(Y - self.h(theta, X))) / 2
    def gradient(self, theta, X, Y):
        '''Analytic gradient of the loss with respect to theta'''
        tempX = np.ones((X.shape[0], X.shape[1] + 1))
        tempX[:, 1:] = X
        d_theta = -np.average((Y - self.h(theta, X)) * tempX, axis=0)
        d_theta = d_theta.reshape((d_theta.shape[0], 1))
        return d_theta
    def batch_gradient_descent(self, theta, X, Y, learning_rate, max_iteration, gap):
        '''Batch gradient descent: compute the gradient over the whole dataset each iteration'''
        cost = np.zeros(max_iteration)
        for i in range(max_iteration):
            d_theta = self.gradient(theta, X, Y)
            theta = theta - learning_rate * d_theta
            cost[i] = self.loss(theta, X, Y)
            if i % gap == 0:
                print('iteration : ', i, ' loss : ', self.loss(theta, X, Y))
        return theta, cost
    def minibatch_gradient_descent(self, theta, X, Y, learning_rate, max_iteration, batch_size, gap):
        '''Mini-batch gradient descent: update theta once per batch of batch_size rows'''
        cost = np.zeros(max_iteration)
        for i in range(max_iteration):
            for j in range(0, X.shape[0], batch_size):
                d_theta = self.gradient(theta, X[j:j+batch_size, :], Y[j:j+batch_size, :])
                theta = theta - learning_rate * d_theta
            cost[i] = self.loss(theta, X, Y)
            if i % gap == 0:
                print('iteration : ', i, ' loss : ', self.loss(theta, X, Y))
        return theta, cost
    def stochastic_gradient_descent(self, theta, X, Y, learning_rate, max_iteration, gap):
        '''Stochastic gradient descent: update theta once per row'''
        cost = np.zeros(max_iteration)
        for i in range(max_iteration):
            for j in range(X.shape[0]):
                d_theta = self.gradient(theta, X[j, :].reshape(1, X.shape[1]), Y[j, :].reshape(1, 1))
                theta = theta - learning_rate * d_theta
            cost[i] = self.loss(theta, X, Y)
            if i % gap == 0:
                print('iteration : ', i, ' loss : ', self.loss(theta, X, Y))
        return theta, cost
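The analytic gradient above is d_theta = -mean((Y - h(theta, X)) * X~, axis=0), where X~ is X with a leading column of ones, and the loss is half the mean squared error. A quick finite-difference check of that gradient (a sketch, independent of the training runs below):
# Numerical gradient check: compare gradient() against finite differences of loss()
def numerical_gradient(gd, theta, X, Y, eps=1e-6):
    num = np.zeros_like(theta)
    for k in range(theta.shape[0]):
        t_plus, t_minus = theta.copy(), theta.copy()
        t_plus[k] += eps
        t_minus[k] -= eps
        num[k] = (gd.loss(t_plus, X, Y) - gd.loss(t_minus, X, Y)) / (2 * eps)
    return num
# e.g.: gd = GradientDescent(); t0 = np.random.randn(X_train.shape[1] + 1, 1)
# np.allclose(gd.gradient(t0, X_train, y_train), numerical_gradient(gd, t0, X_train, y_train), atol=1e-4)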
Batch Gradient Descent
gradient_obj = GradientDescent()
GradientDescent Class initiated
theta, cost = gradient_obj.batch_gradient_descent(theta, X_train, y_train, learning_rate, max_iteration, 1000)
print(theta)
iteration : 0 loss : 26337.652117479473
iteration : 1000 loss : 5610.284986828074
iteration : 2000 loss : 5594.9578737015345
iteration : 3000 loss : 5581.183507278225
iteration : 4000 loss : 5568.280949897843
[[-227.20878731]
[ 609.04391977]
[ -19.20415027]
[ 205.60804752]
[ 584.45550559]]
#plot the cost
fig, ax = plt.subplots()
ax.plot(np.arange(max_iteration), cost, 'r')
ax.legend(loc='upper right', labels=['batch gradient descent'])
ax.set_xlabel('Iterations')
ax.set_ylabel('Cost')
ax.set_title('Batch Gradient Descent - Error vs. Training Epoch', fontsize=16, fontweight='bold')
plt.show()
print('Batch Gradient Descent')
print(gradient_obj.get_metrics(X_train, y_train, theta))  # train metrics
print(gradient_obj.get_metrics(X_test, y_test, theta))  # test metrics
Batch Gradient Descent
MAE = 88.79709921177003, MSE = 11112.410017110149, RMSE = 105.41541641102665, R2 = 0.8859422707853275
MAE = 77.27555335889485, MSE = 9723.52560637414, RMSE = 98.60793886079426, R2 = 0.9118690650458746
Mini-Batch Gradient Descent
mb_theta, mb_cost = gradient_obj.minibatch_gradient_descent (mb_theta, X_train, y_train,mb_learning_rate, mb_max_iteration, batch_size, 1000)
print(mb_theta)
iteration : 0 loss : 68343.90926706592
iteration : 1000 loss : 5643.550424686975
iteration : 2000 loss : 5617.635873635339
iteration : 3000 loss : 5609.310338702081
iteration : 4000 loss : 5603.001382493756
iteration : 5000 loss : 5597.081134387655
iteration : 6000 loss : 5591.351527799153
iteration : 7000 loss : 5585.779399027338
iteration : 8000 loss : 5580.35487657774
iteration : 9000 loss : 5575.072604021426
#plot the cost
fig, ax = plt.subplots()
ax.plot(np.arange(mb_max_iteration), mb_cost, 'r')
ax.legend(loc='upper right', labels=['mini batch gradient descent'])
ax.set_xlabel('Iterations')
ax.set_ylabel('Cost')
ax.set_title('Mini Batch Gradient Descent - Error vs. Training Epoch', fontsize=16, fontweight='bold')
plt.show()
print('Mini Batch Gradient Descent')
print(gradient_obj.get_metrics(X_train, y_train, mb_theta))  # train metrics
print(gradient_obj.get_metrics(X_test, y_test, mb_theta))  # test metrics
Mini Batch Gradient Descent
MAE = 88.66784872143079, MSE = 11139.867009657384, RMSE = 105.54556840368706, R2 = 0.8856604523304488
MAE = 77.49573516074169, MSE = 9840.907489095162, RMSE = 99.20134822216461, R2 = 0.9108051530977127
Stochastic Gradient Descent
s_theta, s_cost = gradient_obj.stochastic_gradient_descent(s_theta, X_train, y_train, s_learning_rate, s_max_iteration, 1000)
iteration : 0 loss : 13402.40743175595
#plot the cost
fig, ax = plt.subplots()
ax.plot(np.arange(s_max_iteration), s_cost, 'r')
ax.legend(loc='upper right', labels=['stochastic gradient descent'])
ax.set_xlabel('Iterations')
ax.set_ylabel('Cost')
ax.set_title('Stochastic Gradient Descent Error vs. Training Epoch', fontsize=16, fontweight='bold')
plt.show()
print('Stochastic Gradient Descent')
print(gradient_obj.get_metrics(X_train, y_train, s_theta))  # train metrics
print(gradient_obj.get_metrics(X_test, y_test, s_theta))  # test metrics
Stochastic Gradient Descent
MAE = 84.55370657249655, MSE = 11392.599861030192, RMSE = 106.73612256883887, R2 = 0.8830664034174635
MAE = 79.3819756016456, MSE = 10935.721474038075, RMSE = 104.57399999061944, R2 = 0.9008821083092442
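Note that stochastic_gradient_descent visits the training rows in the same fixed order every epoch; classic SGD shuffles the order each epoch, which often improves convergence. A minimal variant along those lines (an assumed refinement, not what produced the numbers above):
# Hypothetical variant: shuffle the row order each epoch before the per-sample updates
def sgd_with_shuffle(gd, theta, X, Y, learning_rate, max_iteration, seed=0):
    rng = np.random.default_rng(seed)
    for i in range(max_iteration):
        for j in rng.permutation(X.shape[0]):
            d_theta = gd.gradient(theta, X[j:j+1, :], Y[j:j+1, :])
            theta = theta - learning_rate * d_theta
    return theta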
sklearn SGDRegressor
sgd_regressor = SGDRegressor(max_iter=10000, tol=0.005)
sgd_regressor.fit(X_train, y_train.ravel())
print(sgd_regressor.coef_)
print(sgd_regressor.intercept_)
y_pred = sgd_regressor.predict(X_test)
MAE = mean_absolute_error(y_test,y_pred)
MSE = mean_squared_error(y_test, y_pred)
RMSE = math.sqrt(MSE)
print('sklearn SGDRegressor')
print("MAE = {0}, MSE = {1}, RMSE = {2}, R2 = {3}".format(MAE, MSE, math.sqrt(MSE), sgd_regressor.score(X_train, y_train)))
[359.50742321 316.35108074 225.46361317 481.28038367]
[-228.71501955]
sklearn SGDRegressor
MAE = 75.52972443435446, MSE = 9571.755566316218, RMSE = 97.83534926761502, R2 = 0.8828668103782972
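SGDRegressor shuffles samples internally, so repeated runs can give slightly different coefficients than those shown above; pinning random_state makes the cell reproducible (an optional tweak, not what produced the output above):
# Reproducible variant of the fit above (the random_state value is an arbitrary choice)
sgd_repro = SGDRegressor(max_iter=10000, tol=0.005, random_state=42)
sgd_repro.fit(X_train, y_train.ravel())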
sklearn Linear Regression
# with sklearn
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train.ravel())
print(regr.coef_)
print('Intercept: \n', regr.intercept_)
y_pred = regr.predict(X_test)
MAE = mean_absolute_error(y_test,y_pred)
MSE = mean_squared_error(y_test, y_pred)
RMSE = math.sqrt(MSE)
print('sklearn linear regression')
print("MAE = {0}, MSE = {1}, RMSE = {2}, R2 = {3}".format(MAE, MSE, math.sqrt(MSE), regr.score(X_test, y_test)))
[ 2262.91575501 -1652.05396443 448.07520565 393.80065157]
Intercept:
-212.92015984545435
sklearn linear regression
MAE = 76.66002507824042, MSE = 9264.745150193983, RMSE = 96.25354616944762, R2 = 0.91602730478099
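As a closed-form cross-check on the iterative fits, ordinary least squares solves theta = (X~'X~)^-1 X~'y, where X~ is X with a bias column; a sketch using a least-squares solve:
# Closed-form OLS via least squares, for comparison with the coefficients above
Xb = np.hstack([np.ones((X_train.shape[0], 1)), X_train])
theta_ols, *_ = np.linalg.lstsq(Xb, y_train, rcond=None)
print('intercept:', theta_ols[0], 'coefficients:', theta_ols[1:].ravel())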
print('END OF NOTEBOOK - THANK YOU')