import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(rc={'figure.figsize':(11.7,8.27)})
from sklearn import preprocessing
from sklearn.preprocessing import normalize
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
from sklearn import linear_model
# Reading CSV Input file and storing as pandas dataframe object
data = pd.read_csv('Data_miniproject.csv')
data.columns
Index(['Cost', 'Weight', 'Weight1', 'Length', 'Height', 'Width'], dtype='object')
# Checking the top 5 rows of the data
data.head(5)

EDA

# checking each column's data type and whether it contains nulls
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Cost     159 non-null    float64
 1   Weight   159 non-null    float64
 2   Weight1  159 non-null    float64
 3   Length   159 non-null    float64
 4   Height   159 non-null    float64
 5   Width    159 non-null    float64
dtypes: float64(6)
memory usage: 7.6 KB
# Generate descriptive statistics.
# Checking Min, Max, Mean, Std for all the columns
data.describe(percentiles=[.10, .25, .5, .75])

Check whether there are any null values

# checking column-wise for null values
data.isnull().any()
Cost       False
Weight     False
Weight1    False
Length     False
Height     False
Width      False
dtype: bool
# checking column-wise null value counts
data.isnull().sum()
Cost       0
Weight     0
Weight1    0
Length     0
Height     0
Width      0
dtype: int64
# checking overall null value count
data.isnull().sum().sum()
0
sns.jointplot(x="Length", y="Cost",kind="reg", data=data)
<seaborn.axisgrid.JointGrid at 0x7f66b74c6210>

sns.jointplot(x="Height", y="Cost",kind="reg",data=data)
<seaborn.axisgrid.JointGrid at 0x7f66b513c310>

# checking the frequency distribution
data.hist(figsize=(10,10))
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f66b7521dd0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f66b4f85650>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f66b4f37e50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f66b4ef9690>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f66b4eace90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f66b4e6c6d0>]],
      dtype=object)

# For each pair of features (columns) in the dataset, visualize the pairwise scatter plot,
# with each feature's histogram along the diagonal
sns.pairplot(data)
<seaborn.axisgrid.PairGrid at 0x7f66b4b66250>

Removing Outliers

# From the descriptive statistics, the minimum Cost is 0, so checking how many rows have Cost equal to zero
data[data['Cost']==0]

# Removing rows where cost is 0.
data.drop(data[data['Cost']==0].index,inplace=True)
# Finding outliers using boxplot
melted_df = pd.melt(data.drop('Cost', axis=1))
sns.boxplot(x="variable", y="value", data=melted_df)
<matplotlib.axes._subplots.AxesSubplot at 0x7f66a79206d0>

data[(data['Weight'] >= 55)&(data['Length']>=64)]

# From the boxplot, the points beyond the whiskers are outliers, so removing these rows
data.drop(data[(data['Weight'] >= 55)&(data['Length']>=64)].index, inplace=True)
data.shape
(155, 6)
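The Weight >= 55 and Length >= 64 cut-offs above were read off the boxplot by eye. As a rough sketch of how outliers could be flagged programmatically instead (my addition, not part of the original notebook; iqr_outliers is a hypothetical helper name), the snippet below applies the 1.5*IQR rule that boxplot whiskers are based on.

# Sketch: flag values outside the 1.5*IQR whisker range, column by column
def iqr_outliers(series):
    q1, q3 = series.quantile([0.25, 0.75])
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    return series[(series < lower) | (series > upper)]

for col in data.columns.drop('Cost'):
    flagged = iqr_outliers(data[col])
    if not flagged.empty:
        print(col, flagged.index.tolist())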
# plotting each melted feature's values, grouped by variable
groups = melted_df.groupby("variable")
for name, group in groups:
    plt.plot(group["variable"], group["value"], marker="o",  label=name)

sns.scatterplot(data=data.drop('Cost', axis=1))
<matplotlib.axes._subplots.AxesSubplot at 0x7f66a5b37d10>

print(data.drop('Cost',axis=1).cumsum())
     Weight  Weight1  Length     Height     Width
0      23.2     25.4    30.0    11.5200    4.0200
1      47.2     51.7    61.2    24.0000    8.3256
2      71.1     78.2    92.3    36.3778   13.0217
3      97.4    107.2   125.8    49.1078   17.4772
4     123.9    136.2   159.8    61.5518   22.6112
..      ...      ...     ...        ...       ...
154  3932.5   4259.5  4687.6  1379.3888  672.7894
155  3944.2   4271.9  4701.1  1381.8188  674.0584
156  3956.3   4284.9  4714.9  1384.0958  675.3142
157  3969.5   4299.2  4730.1  1386.9686  677.3814
158  3983.3   4314.2  4746.3  1389.9008  679.2606

[155 rows x 5 columns]
# plotting the cumulative values for all the rows
data.drop('Cost',axis=1).cumsum().plot()
<matplotlib.axes._subplots.AxesSubplot at 0x7f66a5ab6e90>

Correlations

corr = data.corr()
sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns, annot=True)
<matplotlib.axes._subplots.AxesSubplot at 0x7f66a58ff150>

# displaying the 25 most highly correlated column pairs
print(data.corr().abs().unstack().sort_values(ascending = False).nlargest(25))
Width    Width      1.000000
Height   Height     1.000000
Weight   Weight     1.000000
Weight1  Weight1    1.000000
Length   Length     1.000000
Cost     Cost       1.000000
Weight1  Weight     0.999417
Weight   Weight1    0.999417
Weight1  Length     0.993026
Length   Weight1    0.993026
Weight   Length     0.990580
Length   Weight     0.990580
Width    Cost       0.924029
Cost     Width      0.924029
         Length     0.907373
Length   Cost       0.907373
Cost     Weight1    0.899734
Weight1  Cost       0.899734
Width    Length     0.896793
Length   Width      0.896793
Weight   Cost       0.895740
Cost     Weight     0.895740
Width    Weight1    0.895324
Weight1  Width      0.895324
Weight   Width      0.888883
dtype: float64
## Weight and Weight1 are almost perfectly correlated (0.999417), so one of the two can be dropped.
## The Weight1-Cost correlation (0.899734) is higher than Weight-Cost (0.895740), so keeping Weight1 and dropping Weight.
data.drop('Weight', axis=1,inplace=True)
data.head(3)
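A small generalisation of the manual check above (an illustrative sketch I am adding; correlated_pairs is a hypothetical helper name): list the feature pairs whose absolute correlation exceeds a threshold, so the drop candidate is easy to spot before any column is removed.

def correlated_pairs(df, threshold=0.98):
    corr = df.corr().abs()
    pairs = corr.unstack()
    # keep each unordered pair once and drop the self-correlations on the diagonal
    pairs = pairs[pairs.index.get_level_values(0) < pairs.index.get_level_values(1)]
    return pairs[pairs > threshold].sort_values(ascending=False)

# Run on the data before dropping Weight, this would surface the Weight/Weight1 pair (0.999417).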

Feature Scaling

#data = normalize(data, axis=0)
# Scaling the features using min-max scaling
minmax_scale = preprocessing.MinMaxScaler().fit(data.iloc[:,1:])
data.iloc[:,1:] = minmax_scale.transform(data.iloc[:,1:])
data.head(3)
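A quick sanity check on the scaling step (my addition): MinMaxScaler rescales each column as x' = (x - min) / (max - min), so every scaled feature should now span exactly [0, 1].

# After the transform, per-column minima should be 0 and maxima should be 1
print(data.iloc[:, 1:].min().round(3))
print(data.iloc[:, 1:].max().round(3))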

X = data.iloc[:,1:].values
y = data.iloc[:,0:1].values
# Splitting the dataset by train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
learning_rate = 0.5
max_iteration = 5000

s_learning_rate = 0.05
s_max_iteration = 1000

mb_learning_rate = 0.05
mb_max_iteration = 10000
batch_size = 30
# one theta vector per variant: data now has 5 columns (Cost + 4 features), so each theta holds the intercept plus 4 weights
theta = np.zeros((data.shape[1], 1))
s_theta = np.zeros((data.shape[1], 1))
mb_theta = np.zeros((data.shape[1], 1))
class GradientDescent:
    ''' Gradient Descent class which contains batch, mini-batch and stochastic gradient '''
    
    def __init__(self):
        ''' Constructor of Gradient Descent'''
        print('GradientDescent Class initiated')
        
    def get_metrics(self, x, y, theta):
        ''' To calculate the metrics MAE, MSE, RMSE and R2; assumes exactly four features, with theta[0] as the intercept '''
        y_pred = [(theta[1] * x1) + (theta[2] * x2)+ (theta[3] * x3)+ (theta[4] * x4) + theta[0]
                       for x1,x2,x3,x4 in x]
        mae = mean_absolute_error(y,y_pred)
        mse = mean_squared_error(y,y_pred)
        metrics = "MAE = {0}, MSE = {1}, RMSE = {2}, R2 = {3}".format(mae, mse, math.sqrt(mse), r2_score(y,y_pred))
        return metrics
    
    def h(self, theta, X):
        ''' hypothesis function: evaluates the linear model after prepending a bias column '''
        tempX = np.ones((X.shape[0], X.shape[1] + 1))
        tempX[:,1:] = X
        res = np.matmul(tempX, theta)
        return res
    
    def loss(self, theta, X, Y):
        ''' Loss function: half the mean squared error between actual and predicted values '''
        return np.average(np.square(Y - self.h(theta, X))) / 2
    
    def gradient(self, theta, X, Y):
        ''' Gradient of the loss with respect to theta (bias term included) '''
        tempX = np.ones((X.shape[0], X.shape[1] + 1))
        tempX[:,1:] = X
        d_theta = - np.average((Y - self.h(theta, X)) * tempX, axis= 0)
        d_theta = d_theta.reshape((d_theta.shape[0], 1))
        return d_theta
    
    def batch_gradient_descent(self, theta, X, Y, learning_rate, max_iteration, gap):
        ''' Batch gradient descent: each update uses the gradient computed over the whole training set '''
        cost = np.zeros(max_iteration)
        for i in range(max_iteration):
            d_theta = self.gradient (theta, X, Y)
            theta = theta - learning_rate * d_theta
            cost[i] = self.loss(theta, X, Y)
            if i % gap == 0 :
                print ('iteration : ', i, ' loss : ', self.loss(theta, X, Y))
        return theta, cost

    def minibatch_gradient_descent(self, theta, X, Y, learning_rate, max_iteration, batch_size, gap):
        ''' Mini-batch gradient descent: within each epoch, theta is updated once per batch of batch_size rows '''
        cost = np.zeros(max_iteration)
        for i in range(max_iteration) :
            for j in range(0, X.shape[0], batch_size):
                d_theta = self.gradient(theta, X[j:j+batch_size,:], Y[j:j+batch_size,:])
                theta = theta - learning_rate * d_theta
            cost[i] = self.loss(theta, X, Y)
            if i % gap == 0 :
                print ('iteration : ', i, ' loss : ', self.loss(theta, X, Y)) 
        return theta, cost
    
    def stochastic_gradient_descent(self, theta, X, Y, learning_rate, max_iteration, gap):
        ''' Stochastic gradient descent: within each epoch, theta is updated once per training row '''
        cost = np.zeros(max_iteration)
        for i in range(max_iteration) :
            for j in range(X.shape[0]):
                d_theta = self.gradient(theta, X[j,:].reshape(1, X.shape[1]), Y[j,:].reshape(1, 1))
                theta = theta - learning_rate * d_theta    
            cost[i] = self.loss(theta, X, Y)
            if i % gap == 0 :
                print ('iteration : ', i, ' loss : ', self.loss(theta, X, Y))
        return theta, cost
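Since the class implements its own loss and gradient, a gradient check is a cheap way to validate them. The sketch below is my addition (numerical_gradient is a hypothetical helper, not part of the original notebook): it compares the analytic gradient to a central-difference approximation, and the two should agree to several decimal places for any theta.

def numerical_gradient(gd, theta, X, Y, eps=1e-6):
    # central differences of gd.loss with respect to each entry of theta
    grad = np.zeros_like(theta)
    for k in range(theta.shape[0]):
        t_plus, t_minus = theta.copy(), theta.copy()
        t_plus[k] += eps
        t_minus[k] -= eps
        grad[k] = (gd.loss(t_plus, X, Y) - gd.loss(t_minus, X, Y)) / (2 * eps)
    return grad

# Example usage (commented out):
# gd = GradientDescent()
# t = np.random.randn(X_train.shape[1] + 1, 1)
# print(np.abs(gd.gradient(t, X_train, y_train) - numerical_gradient(gd, t, X_train, y_train)).max())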

Batch Gradient Descent

gradient_obj = GradientDescent()
GradientDescent Class initiated
theta, cost = gradient_obj.batch_gradient_descent(theta, X_train, y_train, learning_rate, max_iteration, 1000)
print(theta)
iteration :  0  loss :  26337.652117479473
iteration :  1000  loss :  5610.284986828074
iteration :  2000  loss :  5594.9578737015345
iteration :  3000  loss :  5581.183507278225
iteration :  4000  loss :  5568.280949897843
[[-227.20878731]
 [ 609.04391977]
 [ -19.20415027]
 [ 205.60804752]
 [ 584.45550559]]
#plot the cost
fig, ax = plt.subplots()  
ax.plot(np.arange(max_iteration), cost, 'r')  
ax.legend(loc='upper right', labels=['batch gradient descent'])
ax.set_xlabel('Iterations')  
ax.set_ylabel('Cost')  
ax.set_title('Batch Gradient Descent - Error vs. Training Epoch', fontsize=16, fontweight='bold')
plt.show()

print('Batch Gradient Descent')
print(gradient_obj.get_metrics(X_train,y_train,theta ))
print(gradient_obj.get_metrics(X_test,y_test,theta ))
Batch Gradient Descent
MAE = 88.79709921177003, MSE = 11112.410017110149, RMSE = 105.41541641102665, R2 = 0.8859422707853275
MAE = 77.27555335889485, MSE = 9723.52560637414, RMSE = 98.60793886079426, R2 = 0.9118690650458746
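As a rough cross-check (my addition, not part of the original run), ordinary least squares has a closed-form solution that the gradient-descent variants are iterating towards; solving it directly on the training split shows how close the learned theta is to that optimum.

# Prepend a bias column so the solution lines up with the theta layout used above
# (theta[0] = intercept, theta[1:] = feature weights)
Xb = np.hstack([np.ones((X_train.shape[0], 1)), X_train])
theta_exact, *_ = np.linalg.lstsq(Xb, y_train, rcond=None)
print(theta_exact.ravel())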

Mini-Batch Gradient Descent

mb_theta, mb_cost = gradient_obj.minibatch_gradient_descent (mb_theta, X_train, y_train,mb_learning_rate, mb_max_iteration, batch_size, 1000)
print(mb_theta)
iteration :  0  loss :  68343.90926706592
iteration :  1000  loss :  5643.550424686975
iteration :  2000  loss :  5617.635873635339
iteration :  3000  loss :  5609.310338702081
iteration :  4000  loss :  5603.001382493756
iteration :  5000  loss :  5597.081134387655
iteration :  6000  loss :  5591.351527799153
iteration :  7000  loss :  5585.779399027338
iteration :  8000  loss :  5580.35487657774
iteration :  9000  loss :  5575.072604021426
#plot the cost
fig, ax = plt.subplots()  
ax.plot(np.arange(mb_max_iteration), mb_cost, 'r')  
ax.legend(loc='upper right', labels=['mini batch gradient descent'])
ax.set_xlabel('Iterations')  
ax.set_ylabel('Cost')  
ax.set_title('Mini Batch Gradient Descent - Error vs. Training Epoch', fontsize=16, fontweight='bold')
plt.show()

print('Mini Batch Gradient Descent')
print(gradient_obj.get_metrics(X_train,y_train,mb_theta ))
print(gradient_obj.get_metrics(X_test,y_test,mb_theta ))
Mini Batch Gradient Descent
MAE = 88.66784872143079, MSE = 11139.867009657384, RMSE = 105.54556840368706, R2 = 0.8856604523304488
MAE = 77.49573516074169, MSE = 9840.907489095162, RMSE = 99.20134822216461, R2 = 0.9108051530977127

Stochastic Gradient Descent

s_theta, s_cost = gradient_obj.stochastic_gradient_descent(s_theta, X_train, y_train, s_learning_rate, s_max_iteration, 1000)
iteration :  0  loss :  13402.40743175595
#plot the cost
fig, ax = plt.subplots()  
ax.plot(np.arange(s_max_iteration), s_cost, 'r')  
ax.legend(loc='upper right', labels=['stochastic gradient descent'])
ax.set_xlabel('Iterations')  
ax.set_ylabel('Cost')  
ax.set_title('Stochastic Gradient Descent Error vs. Training Epoch', fontsize=16, fontweight='bold')
plt.show()

print('Stochastic Gradient Descent')
print(gradient_obj.get_metrics(X_train,y_train,s_theta ))
print(gradient_obj.get_metrics(X_test,y_test,s_theta ))
Stochastic Gradient Descent
MAE = 84.55370657249655, MSE = 11392.599861030192, RMSE = 106.73612256883887, R2 = 0.8830664034174635
MAE = 79.3819756016456, MSE = 10935.721474038075, RMSE = 104.57399999061944, R2 = 0.9008821083092442

sklearn SGDRegressor

sgd_regressor = SGDRegressor(max_iter=10000, tol=0.005)
sgd_regressor.fit(X_train, y_train.ravel())
print(sgd_regressor.coef_)
print(sgd_regressor.intercept_)

y_pred = sgd_regressor.predict(X_test)
MAE = mean_absolute_error(y_test,y_pred)
MSE = mean_squared_error(y_test, y_pred)
RMSE = math.sqrt(MSE)
print('sklearn SGDRegressor')
print("MAE = {0}, MSE = {1}, RMSE = {2}, R2 = {3}".format(MAE, MSE, math.sqrt(MSE), sgd_regressor.score(X_train, y_train)))
[359.50742321 316.35108074 225.46361317 481.28038367]
[-228.71501955]
sklearn SGDRegressor
MAE = 75.52972443435446, MSE = 9571.755566316218, RMSE = 97.83534926761502, R2 = 0.8828668103782972

sklearn Linear Regression

# with sklearn
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train.ravel())
print(regr.coef_)
print('Intercept: \n', regr.intercept_)

y_pred = regr.predict(X_test)
MAE = mean_absolute_error(y_test,y_pred)
MSE = mean_squared_error(y_test, y_pred)
RMSE = math.sqrt(MSE)
print('sklearn linear regression')
print("MAE = {0}, MSE = {1}, RMSE = {2}, R2 = {3}".format(MAE, MSE, math.sqrt(MSE), regr.score(X_test, y_test)))
[ 2262.91575501 -1652.05396443   448.07520565   393.80065157]
Intercept: 
 -212.92015984545435
sklearn linear regression
MAE = 76.66002507824042, MSE = 9264.745150193983, RMSE = 96.25354616944762, R2 = 0.91602730478099
print('END OF NOTEBOOK - THANK YOU')