%matplotlib inline
import matplotlib.pyplot as plt # very commonly used plotting library
import pandas as pd # powerful library for working with tabular data
import numpy as np # powerful library for numbers, arrays, and matrices
import seaborn as sns
df_mpg = sns.load_dataset('mpg')
df_mpg
|     | mpg  | cylinders | displacement | horsepower | weight | acceleration | model_year | origin | name |
| --- | ---- | --------- | ------------ | ---------- | ------ | ------------ | ---------- | ------ | ---- |
| 0   | 18.0 | 8 | 307.0 | 130.0 | 3504 | 12.0 | 70 | usa    | chevrolet chevelle malibu |
| 1   | 15.0 | 8 | 350.0 | 165.0 | 3693 | 11.5 | 70 | usa    | buick skylark 320 |
| 2   | 18.0 | 8 | 318.0 | 150.0 | 3436 | 11.0 | 70 | usa    | plymouth satellite |
| 3   | 16.0 | 8 | 304.0 | 150.0 | 3433 | 12.0 | 70 | usa    | amc rebel sst |
| 4   | 17.0 | 8 | 302.0 | 140.0 | 3449 | 10.5 | 70 | usa    | ford torino |
| ... | ...  | ... | ... | ... | ... | ... | ... | ...    | ... |
| 393 | 27.0 | 4 | 140.0 | 86.0  | 2790 | 15.6 | 82 | usa    | ford mustang gl |
| 394 | 44.0 | 4 | 97.0  | 52.0  | 2130 | 24.6 | 82 | europe | vw pickup |
| 395 | 32.0 | 4 | 135.0 | 84.0  | 2295 | 11.6 | 82 | usa    | dodge rampage |
| 396 | 28.0 | 4 | 120.0 | 79.0  | 2625 | 18.6 | 82 | usa    | ford ranger |
| 397 | 31.0 | 4 | 119.0 | 82.0  | 2720 | 19.4 | 82 | usa    | chevy s-10 |

398 rows × 9 columns
df_mpg.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        398 non-null    int64
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64
 7   origin        398 non-null    object
 8   name          398 non-null    object
dtypes: float64(4), int64(3), object(2)
memory usage: 28.1+ KB
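The info() output shows that horsepower has 6 missing values (392 non-null out of 398). A minimal sketch of two common ways to handle this before modeling; the OLS fits below use missing='drop', which amounts to the first option (df_clean is a hypothetical name, for illustration only):

# Option 1: drop the rows with missing horsepower
df_clean = df_mpg.dropna(subset=['horsepower'])
print(df_clean.shape)  # (392, 9)
# Option 2 (alternative, not used in this notebook): impute the column median
# df_mpg['horsepower'] = df_mpg['horsepower'].fillna(df_mpg['horsepower'].median())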
sns.set_style('white')
sns.set_palette('gray')
# jointplot draws a bivariate plot with marginal distributions; scatterplot is the plain scatter
#sns.jointplot(x='mpg',y='acceleration',data=df_mpg,kind='scatter')
sns.scatterplot(x='mpg',y='horsepower',data=df_mpg)
#plt.plot(df_mpg['mpg'].mean(),df_mpg['acceleration'].mean(),'r.',markersize=20)
<Axes: xlabel='mpg', ylabel='horsepower'>
x = df_mpg['mpg']
y = df_mpg['horsepower']
xt = np.log(x)
yt = np.log(y)
sns.set_style('white')
sns.set_palette('gray')
plt.plot(xt,yt,'k.')
[<matplotlib.lines.Line2D at 0x7f06c45e7150>]
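A rough numeric check, as a sketch not in the original: if the log-log transform straightens the relationship, the Pearson correlation of the transformed variables should be closer to -1 than that of the raw ones.

# Correlation before and after the transform (NaNs in horsepower are
# excluded pairwise by Series.corr).
print(x.corr(y))    # raw mpg vs horsepower
print(xt.corr(yt))  # log(mpg) vs log(horsepower)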
sns.set_style('dark')
sns.set_palette("husl") # set_palette applies the palette; color_palette alone only returns it
# simulate 1000 independent draws from a standard normal distribution
A = np.random.normal(0,1,1000)
sns.histplot(A,fill=True,stat='count',bins=8)
#plt.plot(A,'k.')
<matplotlib.axes._subplots.AxesSubplot at 0x7f06c45bb450>
import statsmodels.api as sm
# Q-Q plot of A against the theoretical normal quantiles
import scipy as sp
fig, ax = plt.subplots(figsize=(6,2.5))
_, (__, ___, r) = sp.stats.probplot(A, plot=ax, fit=True)
r**2
0.9980222981582595
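A complementary normality check (a sketch, not in the original): the Shapiro-Wilk test from scipy. For the simulated normal sample A we expect a large p-value, i.e. no evidence against normality.

# Shapiro-Wilk test of A; a statistic near 1 and a large p-value are
# consistent with the high Q-Q plot r**2 above.
stat, p = sp.stats.shapiro(A)
print(stat, p)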
import statsmodels.api as sm
Y = df_mpg['acceleration']
X = df_mpg['mpg']
plt.plot(X,Y,'k.')
[<matplotlib.lines.Line2D at 0x7f4464961a20>]
X = sm.add_constant(X)
X.head()
|   | const | mpg  |
| --- | ----- | ---- |
| 0 | 1.0   | 18.0 |
| 1 | 1.0   | 15.0 |
| 2 | 1.0   | 18.0 |
| 3 | 1.0   | 16.0 |
| 4 | 1.0   | 17.0 |
model = sm.OLS(Y, X, missing='drop')
model_result = model.fit()
print(model_result.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:           acceleration   R-squared:                       0.177
Model:                            OLS   Adj. R-squared:                  0.175
Method:                 Least Squares   F-statistic:                     84.96
Date:                Tue, 04 Oct 2022   Prob (F-statistic):           1.82e-18
Time:                        15:03:43   Log-Likelihood:                -929.29
No. Observations:                 398   AIC:                             1863.
Df Residuals:                     396   BIC:                             1871.
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         12.0811      0.399     30.308      0.000      11.297      12.865
mpg            0.1483      0.016      9.217      0.000       0.117       0.180
==============================================================================
Omnibus:                       15.853   Durbin-Watson:                   1.320
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               16.669
Skew:                           0.468   Prob(JB):                     0.000240
Kurtosis:                       3.359   Cond. No.                         78.8
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
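As a sketch (not in the original), the fitted line from the summary, acceleration ≈ 12.0811 + 0.1483 · mpg, can be overlaid on the scatter:

# Scatter of the data plus the estimated regression line.
plt.plot(df_mpg['mpg'], Y, 'k.')
xs = np.linspace(df_mpg['mpg'].min(), df_mpg['mpg'].max(), 100)
plt.plot(xs, model_result.params['const'] + model_result.params['mpg'] * xs, 'r-')
plt.xlabel('mpg')
plt.ylabel('acceleration')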
err = model_result.resid
plt.plot(err,'k.')
[<matplotlib.lines.Line2D at 0x7f06b9262dd0>]
sns.histplot(err,fill=True,stat='count',bins=8)
<matplotlib.axes._subplots.AxesSubplot at 0x7f06b91bcd50>
import scipy as sp
fig, ax = plt.subplots(figsize=(6,2.5))
_, (__, ___, r) = sp.stats.probplot(err, plot=ax, fit=True)
r**2
0.9851747850982233
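The summary above reported Durbin-Watson ≈ 1.32; it can also be computed directly (a sketch). Values well below 2 hint at positive autocorrelation in the residuals, though these rows are not a true time series.

from statsmodels.stats.stattools import durbin_watson
print(durbin_watson(err))  # ~1.32, matching the summary table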
Multiple Regression
Y = df_mpg['acceleration']
X = df_mpg.drop(['acceleration','name','origin','model_year'],axis=1)
X
|     | mpg  | cylinders | displacement | horsepower | weight |
| --- | ---- | --------- | ------------ | ---------- | ------ |
| 0   | 18.0 | 8 | 307.0 | 130.0 | 3504 |
| 1   | 15.0 | 8 | 350.0 | 165.0 | 3693 |
| 2   | 18.0 | 8 | 318.0 | 150.0 | 3436 |
| 3   | 16.0 | 8 | 304.0 | 150.0 | 3433 |
| 4   | 17.0 | 8 | 302.0 | 140.0 | 3449 |
| ... | ...  | ... | ... | ... | ... |
| 393 | 27.0 | 4 | 140.0 | 86.0  | 2790 |
| 394 | 44.0 | 4 | 97.0  | 52.0  | 2130 |
| 395 | 32.0 | 4 | 135.0 | 84.0  | 2295 |
| 396 | 28.0 | 4 | 120.0 | 79.0  | 2625 |
| 397 | 31.0 | 4 | 119.0 | 82.0  | 2720 |

398 rows × 5 columns
corrMatrix = df_mpg.corr() # note: newer pandas versions require numeric_only=True here
fig , ax = plt.subplots()
fig.set_size_inches(8,8)
sns.heatmap(corrMatrix, annot=True, annot_kws={"size": 14},cmap="YlGnBu",fmt='.2f')
plt.xticks(fontsize=14)
plt.yticks(fontsize=14, rotation=0) # keep y tick labels horizontal
(array([0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5]), <a list of 7 Text major ticklabel objects>)
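As a numeric complement to the heatmap (a sketch, not in the original), the strongest pairwise correlations can be listed directly:

# Flatten the correlation matrix, drop the diagonal (|r| == 1), and
# show the largest entries; each pair appears twice (the matrix is symmetric).
pairs = corrMatrix.abs().unstack().sort_values(ascending=False)
print(pairs[pairs < 1.0].head(6))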
from statsmodels.stats.outliers_influence import variance_inflation_factor
X = X.dropna()
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(X.values, i)
for i in range(len(X.columns))]
print(vif_data)
        feature         VIF
0           mpg   10.284728
1     cylinders  103.113644
2  displacement   67.410615
3    horsepower   42.194276
4        weight   79.889825
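A side note, as an assumption about common practice rather than something in the original: variance_inflation_factor regresses column i on all the remaining columns, so many references add an explicit constant first and ignore its VIF row. A sketch:

# VIF with an explicit intercept column; the 'const' row itself is
# ignored and only the feature rows are interpreted.
Xc = sm.add_constant(X)
vif_c = pd.DataFrame({"feature": Xc.columns,
                      "VIF": [variance_inflation_factor(Xc.values, i)
                              for i in range(Xc.shape[1])]})
print(vif_c)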
X = X.drop(['cylinders'],axis=1)
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(X.values, i)
for i in range(len(X.columns))]
print(vif_data)
        feature        VIF
0           mpg   7.411553
1  displacement  41.703556
2    horsepower  42.189117
3        weight  66.476453
X = X.drop(['weight'],axis=1)
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(X.values, i)
for i in range(len(X.columns))]
print(vif_data)
        feature        VIF
0           mpg   3.829775
1  displacement  25.521138
2    horsepower  35.504242
X = X.drop(['horsepower'],axis=1)
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(X.values, i)
for i in range(len(X.columns))]
print(vif_data)
        feature      VIF
0           mpg  2.04919
1  displacement  2.04919
The independent variables are highly correlated. Dropping cylinders, weight, and horsepower one at a time brings the remaining VIFs down to about 2, an acceptable level. (With exactly two predictors the two VIFs coincide, since each equals 1/(1 − R²) from the same pairwise regression.)
df_mpg = df_mpg.dropna()
Y = df_mpg['acceleration']
X = df_mpg.drop(['acceleration','name','origin','model_year','weight','horsepower','cylinders'],axis=1)
model = sm.OLS(Y, X, missing='drop')
model_result = model.fit()
print(model_result.summary())
                                 OLS Regression Results
=======================================================================================
Dep. Variable:           acceleration   R-squared (uncentered):                   0.956
Model:                            OLS   Adj. R-squared (uncentered):              0.955
Method:                 Least Squares   F-statistic:                              4201.
Date:                Mon, 23 Sep 2024   Prob (F-statistic):                   1.48e-264
Time:                        08:02:16   Log-Likelihood:                         -1027.1
No. Observations:                 392   AIC:                                      2058.
Df Residuals:                     390   BIC:                                      2066.
Df Model:                           2
Covariance Type:            nonrobust
================================================================================
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
mpg              0.4834      0.010     49.562      0.000       0.464       0.503
displacement     0.0201      0.001     18.397      0.000       0.018       0.022
==============================================================================
Omnibus:                        0.930   Durbin-Watson:                   1.139
Prob(Omnibus):                  0.628   Jarque-Bera (JB):                0.713
Skew:                           0.075   Prob(JB):                        0.700
Kurtosis:                       3.146   Cond. No.                         12.9
==============================================================================

Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.
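Because this model has no constant, the reported R² is uncentered and not comparable to the 0.177 of the earlier fit. A sketch of refitting with an intercept via sm.add_constant:

# Same regression with an explicit intercept, so the summary reports
# the usual centered R-squared.
Xc = sm.add_constant(X)
model_c = sm.OLS(Y, Xc, missing='drop').fit()
print(model_c.summary())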