In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt  # very commonly use plot library
import pandas as pd              # powerful library to deal with data
import numpy as np              # powerful library to deal with numbers, array, matrix.
import seaborn as sns
In [ ]:
df_mpg = sns.load_dataset('mpg')
df_mpg
Out[ ]:
mpg cylinders displacement horsepower weight acceleration model_year origin name
0 18.0 8 307.0 130.0 3504 12.0 70 usa chevrolet chevelle malibu
1 15.0 8 350.0 165.0 3693 11.5 70 usa buick skylark 320
2 18.0 8 318.0 150.0 3436 11.0 70 usa plymouth satellite
3 16.0 8 304.0 150.0 3433 12.0 70 usa amc rebel sst
4 17.0 8 302.0 140.0 3449 10.5 70 usa ford torino
... ... ... ... ... ... ... ... ... ...
393 27.0 4 140.0 86.0 2790 15.6 82 usa ford mustang gl
394 44.0 4 97.0 52.0 2130 24.6 82 europe vw pickup
395 32.0 4 135.0 84.0 2295 11.6 82 usa dodge rampage
396 28.0 4 120.0 79.0 2625 18.6 82 usa ford ranger
397 31.0 4 119.0 82.0 2720 19.4 82 usa chevy s-10

398 rows × 9 columns

In [ ]:
df_mpg.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
 8   name          398 non-null    object 
dtypes: float64(4), int64(3), object(2)
memory usage: 28.1+ KB
In [ ]:
sns.set_style('white')
sns.set_palette('gray')
# jointplot is for bi-variate plot
#sns.jointplot(x='mpg',y='acceleration',data=df_mpg,kind='scatter')
sns.scatterplot(x='mpg',y='horsepower',data=df_mpg)
#plt.plot(df_mpg['mpg'].mean(),df_mpg['acceleration'].mean(),'r.',markersize=20)
Out[ ]:
<Axes: xlabel='mpg', ylabel='horsepower'>
In [ ]:
x = df_mpg['mpg']
y = df_mpg['horsepower']
xt = np.log(x)
yt = np.log(y)
sns.set_style('white')
sns.set_palette('gray')
plt.plot(xt,yt,'k.')
Out[ ]:
[<matplotlib.lines.Line2D at 0x7f06c45e7150>]
In [ ]:
x = df_mpg['mpg']
y = df_mpg['horsepower']
xt = np.log(x)
yt = np.log(y)
plt.plot(xt,yt,'k.')
Out[ ]:
[<matplotlib.lines.Line2D at 0x7f06c40bebd0>]
In [ ]:
sns.set_style('dark')
sns.color_palette("husl")
# simulate a time series with length of 1000 following normal distribution.
A = np.random.normal(0,1,1000)
sns.histplot(A,fill=True,stat='count',bins=8)
#plt.plot(A,'k.')
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f06c45bb450>
In [ ]:
import statsmodels.api as sm
In [ ]:
# qq-plot
import scipy as sp
fig, ax = plt.subplots(figsize=(6,2.5))
_, (__, ___, r) = sp.stats.probplot(A, plot=ax, fit=True)
r**2
Out[ ]:
0.9980222981582595
In [ ]:
import statsmodels.api as sm
Y = df_mpg['acceleration']
X = df_mpg['mpg']
plt.plot(X,Y,'k.')
Out[ ]:
[<matplotlib.lines.Line2D at 0x7f4464961a20>]
In [ ]:
X = sm.add_constant(X)
X.head()
/usr/local/lib/python3.7/dist-packages/statsmodels/tsa/tsatools.py:142: FutureWarning: In a future version of pandas all arguments of concat except for the argument 'objs' will be keyword-only
  x = pd.concat(x[::order], 1)
Out[ ]:
const mpg
0 1.0 18.0
1 1.0 15.0
2 1.0 18.0
3 1.0 16.0
4 1.0 17.0
In [ ]:
model = sm.OLS(Y, X, missing='drop')
model_result = model.fit()
print(model_result.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:           acceleration   R-squared:                       0.177
Model:                            OLS   Adj. R-squared:                  0.175
Method:                 Least Squares   F-statistic:                     84.96
Date:                Tue, 04 Oct 2022   Prob (F-statistic):           1.82e-18
Time:                        15:03:43   Log-Likelihood:                -929.29
No. Observations:                 398   AIC:                             1863.
Df Residuals:                     396   BIC:                             1871.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         12.0811      0.399     30.308      0.000      11.297      12.865
mpg            0.1483      0.016      9.217      0.000       0.117       0.180
==============================================================================
Omnibus:                       15.853   Durbin-Watson:                   1.320
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               16.669
Skew:                           0.468   Prob(JB):                     0.000240
Kurtosis:                       3.359   Cond. No.                         78.8
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [ ]:
err = model_result.resid
plt.plot(err,'k.')
Out[ ]:
[<matplotlib.lines.Line2D at 0x7f06b9262dd0>]
In [ ]:
sns.histplot(err,fill=True,stat='count',bins=8)
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f06b91bcd50>
In [ ]:
import scipy as sp
fig, ax = plt.subplots(figsize=(6,2.5))
_, (__, ___, r) = sp.stats.probplot(err, plot=ax, fit=True)
r**2
Out[ ]:
0.9851747850982233
In [ ]:

Multiple Regression

In [ ]:
Y = df_mpg['acceleration']
X = df_mpg.drop(['acceleration','name','origin','model_year'],axis=1)
X
Out[ ]:
mpg cylinders displacement horsepower weight
0 18.0 8 307.0 130.0 3504
1 15.0 8 350.0 165.0 3693
2 18.0 8 318.0 150.0 3436
3 16.0 8 304.0 150.0 3433
4 17.0 8 302.0 140.0 3449
... ... ... ... ... ...
393 27.0 4 140.0 86.0 2790
394 44.0 4 97.0 52.0 2130
395 32.0 4 135.0 84.0 2295
396 28.0 4 120.0 79.0 2625
397 31.0 4 119.0 82.0 2720

398 rows × 5 columns

In [ ]:
corrMatrix = df_mpg.corr()
fig , ax = plt.subplots()
fig.set_size_inches(8,8)
sns.heatmap(corrMatrix, annot=True, annot_kws={"size": 14},cmap="YlGnBu",fmt='.2f')
plt.xticks(fontsize=14)
plt.yticks(fontsize=14,rotation=360)
Out[ ]:
(array([0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5]),
 <a list of 7 Text major ticklabel objects>)
In [ ]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

X = X.dropna()
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns

# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(X.values, i)
  for i in range(len(X.columns))]

print(vif_data)
        feature         VIF
0           mpg   10.284728
1     cylinders  103.113644
2  displacement   67.410615
3    horsepower   42.194276
4        weight   79.889825
In [ ]:
X = X.drop(['cylinders'],axis=1)

vif_data = pd.DataFrame()
vif_data["feature"] = X.columns

# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(X.values, i)
  for i in range(len(X.columns))]

print(vif_data)
        feature        VIF
0           mpg   7.411553
1  displacement  41.703556
2    horsepower  42.189117
3        weight  66.476453
In [ ]:
X = X.drop(['weight'],axis=1)

vif_data = pd.DataFrame()
vif_data["feature"] = X.columns

# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(X.values, i)
  for i in range(len(X.columns))]

print(vif_data)
        feature        VIF
0           mpg   3.829775
1  displacement  25.521138
2    horsepower  35.504242
In [ ]:
X = X.drop(['horsepower'],axis=1)

vif_data = pd.DataFrame()
vif_data["feature"] = X.columns

# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(X.values, i)
  for i in range(len(X.columns))]

print(vif_data)
        feature      VIF
0           mpg  2.04919
1  displacement  2.04919

The independent variables show highly correlated.

In [ ]:
df_mpg = df_mpg.dropna()
Y = df_mpg['acceleration']
X = df_mpg.drop(['acceleration','name','origin','model_year','weight','horsepower','cylinders'],axis=1)

model = sm.OLS(Y, X, missing='drop')
model_result = model.fit()
print(model_result.summary())
                                 OLS Regression Results                                
=======================================================================================
Dep. Variable:           acceleration   R-squared (uncentered):                   0.956
Model:                            OLS   Adj. R-squared (uncentered):              0.955
Method:                 Least Squares   F-statistic:                              4201.
Date:                Mon, 23 Sep 2024   Prob (F-statistic):                   1.48e-264
Time:                        08:02:16   Log-Likelihood:                         -1027.1
No. Observations:                 392   AIC:                                      2058.
Df Residuals:                     390   BIC:                                      2066.
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
================================================================================
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
mpg              0.4834      0.010     49.562      0.000       0.464       0.503
displacement     0.0201      0.001     18.397      0.000       0.018       0.022
==============================================================================
Omnibus:                        0.930   Durbin-Watson:                   1.139
Prob(Omnibus):                  0.628   Jarque-Bera (JB):                0.713
Skew:                           0.075   Prob(JB):                        0.700
Kurtosis:                       3.146   Cond. No.                         12.9
==============================================================================

Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.