%matplotlib inline
import matplotlib.pyplot as plt  # very commonly use plot library
import pandas as pd              # powerful library to deal with data
import numpy as np              # powerful library to deal with numbers, array, matrix.
import seaborn as sns


df_mpg = sns.load_dataset('mpg')
df_mpg


df_mpg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
 8   name          398 non-null    object 
dtypes: float64(4), int64(3), object(2)
memory usage: 28.1+ KB


sns.set_style('white')
sns.set_palette('gray')
# jointplot is for bi-variate plot
#sns.jointplot(x='mpg',y='acceleration',data=df_mpg,kind='scatter')
sns.scatterplot(x='mpg',y='horsepower',data=df_mpg)
#plt.plot(df_mpg['mpg'].mean(),df_mpg['acceleration'].mean(),'r.',markersize=20)

<Axes: xlabel='mpg', ylabel='horsepower'>


x = df_mpg['mpg']
y = df_mpg['horsepower']
xt = np.log(x)
yt = np.log(y)
sns.set_style('white')
sns.set_palette('gray')
plt.plot(xt,yt,'k.')

[<matplotlib.lines.Line2D at 0x7f06c45e7150>]


x = df_mpg['mpg']
y = df_mpg['horsepower']
xt = np.log(x)
yt = np.log(y)
plt.plot(xt,yt,'k.')

[<matplotlib.lines.Line2D at 0x7f06c40bebd0>]


sns.set_style('dark')
sns.color_palette("husl")
# simulate a time series with length of 1000 following normal distribution.
A = np.random.normal(0,1,1000)
sns.histplot(A,fill=True,stat='count',bins=8)
#plt.plot(A,'k.')

<matplotlib.axes._subplots.AxesSubplot at 0x7f06c45bb450>


import statsmodels.api as sm


# qq-plot
import scipy as sp
fig, ax = plt.subplots(figsize=(6,2.5))
_, (__, ___, r) = sp.stats.probplot(A, plot=ax, fit=True)
r**2

0.9980222981582595


import statsmodels.api as sm
Y = df_mpg['acceleration']
X = df_mpg['mpg']
plt.plot(X,Y,'k.')

[<matplotlib.lines.Line2D at 0x7f4464961a20>]


X = sm.add_constant(X)
X.head()

/usr/local/lib/python3.7/dist-packages/statsmodels/tsa/tsatools.py:142: FutureWarning: In a future version of pandas all arguments of concat except for the argument 'objs' will be keyword-only
  x = pd.concat(x[::order], 1)


model = sm.OLS(Y, X, missing='drop')
model_result = model.fit()
print(model_result.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:           acceleration   R-squared:                       0.177
Model:                            OLS   Adj. R-squared:                  0.175
Method:                 Least Squares   F-statistic:                     84.96
Date:                Tue, 04 Oct 2022   Prob (F-statistic):           1.82e-18
Time:                        15:03:43   Log-Likelihood:                -929.29
No. Observations:                 398   AIC:                             1863.
Df Residuals:                     396   BIC:                             1871.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         12.0811      0.399     30.308      0.000      11.297      12.865
mpg            0.1483      0.016      9.217      0.000       0.117       0.180
==============================================================================
Omnibus:                       15.853   Durbin-Watson:                   1.320
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               16.669
Skew:                           0.468   Prob(JB):                     0.000240
Kurtosis:                       3.359   Cond. No.                         78.8
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.


err = model_result.resid
plt.plot(err,'k.')

[<matplotlib.lines.Line2D at 0x7f06b9262dd0>]


sns.histplot(err,fill=True,stat='count',bins=8)

<matplotlib.axes._subplots.AxesSubplot at 0x7f06b91bcd50>


import scipy as sp
fig, ax = plt.subplots(figsize=(6,2.5))
_, (__, ___, r) = sp.stats.probplot(err, plot=ax, fit=True)
r**2

0.9851747850982233


Y = df_mpg['acceleration']
X = df_mpg.drop(['acceleration','name','origin','model_year'],axis=1)
X


corrMatrix = df_mpg.corr()
fig , ax = plt.subplots()
fig.set_size_inches(8,8)
sns.heatmap(corrMatrix, annot=True, annot_kws={"size": 14},cmap="YlGnBu",fmt='.2f')
plt.xticks(fontsize=14)
plt.yticks(fontsize=14,rotation=360)

(array([0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5]),
 <a list of 7 Text major ticklabel objects>)


from statsmodels.stats.outliers_influence import variance_inflation_factor

X = X.dropna()
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns

# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(X.values, i)
  for i in range(len(X.columns))]

print(vif_data)

        feature         VIF
0           mpg   10.284728
1     cylinders  103.113644
2  displacement   67.410615
3    horsepower   42.194276
4        weight   79.889825


X = X.drop(['cylinders'],axis=1)

vif_data = pd.DataFrame()
vif_data["feature"] = X.columns

# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(X.values, i)
  for i in range(len(X.columns))]

print(vif_data)

        feature        VIF
0           mpg   7.411553
1  displacement  41.703556
2    horsepower  42.189117
3        weight  66.476453


X = X.drop(['weight'],axis=1)

vif_data = pd.DataFrame()
vif_data["feature"] = X.columns

# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(X.values, i)
  for i in range(len(X.columns))]

print(vif_data)

        feature        VIF
0           mpg   3.829775
1  displacement  25.521138
2    horsepower  35.504242


X = X.drop(['horsepower'],axis=1)

vif_data = pd.DataFrame()
vif_data["feature"] = X.columns

# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(X.values, i)
  for i in range(len(X.columns))]

print(vif_data)

        feature      VIF
0           mpg  2.04919
1  displacement  2.04919


df_mpg = df_mpg.dropna()
Y = df_mpg['acceleration']
X = df_mpg.drop(['acceleration','name','origin','model_year','weight','horsepower','cylinders'],axis=1)

model = sm.OLS(Y, X, missing='drop')
model_result = model.fit()
print(model_result.summary())

                                 OLS Regression Results                                
=======================================================================================
Dep. Variable:           acceleration   R-squared (uncentered):                   0.956
Model:                            OLS   Adj. R-squared (uncentered):              0.955
Method:                 Least Squares   F-statistic:                              4201.
Date:                Mon, 23 Sep 2024   Prob (F-statistic):                   1.48e-264
Time:                        08:02:16   Log-Likelihood:                         -1027.1
No. Observations:                 392   AIC:                                      2058.
Df Residuals:                     390   BIC:                                      2066.
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
================================================================================
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
mpg              0.4834      0.010     49.562      0.000       0.464       0.503
displacement     0.0201      0.001     18.397      0.000       0.018       0.022
==============================================================================
Omnibus:                        0.930   Durbin-Watson:                   1.139
Prob(Omnibus):                  0.628   Jarque-Bera (JB):                0.713
Skew:                           0.075   Prob(JB):                        0.700
Kurtosis:                       3.146   Cond. No.                         12.9
==============================================================================

Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.

	mpg	cylinders	displacement	horsepower	weight	acceleration	model_year	origin	name
0	18.0	8	307.0	130.0	3504	12.0	70	usa	chevrolet chevelle malibu
1	15.0	8	350.0	165.0	3693	11.5	70	usa	buick skylark 320
2	18.0	8	318.0	150.0	3436	11.0	70	usa	plymouth satellite
3	16.0	8	304.0	150.0	3433	12.0	70	usa	amc rebel sst
4	17.0	8	302.0	140.0	3449	10.5	70	usa	ford torino
...	...	...	...	...	...	...	...	...	...
393	27.0	4	140.0	86.0	2790	15.6	82	usa	ford mustang gl
394	44.0	4	97.0	52.0	2130	24.6	82	europe	vw pickup
395	32.0	4	135.0	84.0	2295	11.6	82	usa	dodge rampage
396	28.0	4	120.0	79.0	2625	18.6	82	usa	ford ranger
397	31.0	4	119.0	82.0	2720	19.4	82	usa	chevy s-10

	const	mpg
0	1.0	18.0
1	1.0	15.0
2	1.0	18.0
3	1.0	16.0
4	1.0	17.0

	mpg	cylinders	displacement	horsepower	weight
0	18.0	8	307.0	130.0	3504
1	15.0	8	350.0	165.0	3693
2	18.0	8	318.0	150.0	3436
3	16.0	8	304.0	150.0	3433
4	17.0	8	302.0	140.0	3449
...	...	...	...	...	...
393	27.0	4	140.0	86.0	2790
394	44.0	4	97.0	52.0	2130
395	32.0	4	135.0	84.0	2295
396	28.0	4	120.0	79.0	2625
397	31.0	4	119.0	82.0	2720