import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
sns.get_dataset_names()
['anagrams', 'anscombe', 'attention', 'brain_networks', 'car_crashes', 'diamonds', 'dots', 'dowjones', 'exercise', 'flights', 'fmri', 'geyser', 'glue', 'healthexp', 'iris', 'mpg', 'penguins', 'planets', 'seaice', 'taxis', 'tips', 'titanic']
df = sns.load_dataset('mpg')
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        398 non-null    int64
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64
 7   origin        398 non-null    object
 8   name          398 non-null    object
dtypes: float64(4), int64(3), object(2)
memory usage: 28.1+ KB
df = df.dropna()
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 392 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   mpg           392 non-null    float64
 1   cylinders     392 non-null    int64
 2   displacement  392 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        392 non-null    int64
 5   acceleration  392 non-null    float64
 6   model_year    392 non-null    int64
 7   origin        392 non-null    object
 8   name          392 non-null    object
dtypes: float64(4), int64(3), object(2)
memory usage: 30.6+ KB
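# Quick check (an added sketch, not in the original run): the six dropped rows
# are exactly the ones with missing horsepower values (398 - 392 = 6).
print(sns.load_dataset('mpg')['horsepower'].isna().sum())  # expect: 6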
train_dataset = df.sample(frac=0.8, random_state=0)
test_dataset = df.drop(train_dataset.index)
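# Sanity check (an added sketch): sample(frac=0.8) and drop(...) yield disjoint
# index sets, so this is a clean 80/20 split of the 392 remaining rows.
print(len(train_dataset), len(test_dataset))                        # expect: 314 78
print(train_dataset.index.intersection(test_dataset.index).empty)   # expect: True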
train_dataset
|     | mpg  | cylinders | displacement | horsepower | weight | acceleration | model_year | origin | name |
|-----|------|-----------|--------------|------------|--------|--------------|------------|--------|------|
| 146 | 28.0 | 4 | 90.0 | 75.0 | 2125 | 14.5 | 74 | usa | dodge colt |
| 282 | 22.3 | 4 | 140.0 | 88.0 | 2890 | 17.3 | 79 | usa | ford fairmont 4 |
| 69  | 12.0 | 8 | 350.0 | 160.0 | 4456 | 13.5 | 72 | usa | oldsmobile delta 88 royale |
| 378 | 38.0 | 4 | 105.0 | 63.0 | 2125 | 14.7 | 82 | usa | plymouth horizon miser |
| 331 | 33.8 | 4 | 97.0 | 67.0 | 2145 | 18.0 | 80 | japan | subaru dl |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 281 | 19.8 | 6 | 200.0 | 85.0 | 2990 | 18.2 | 79 | usa | mercury zephyr 6 |
| 229 | 16.0 | 8 | 400.0 | 180.0 | 4220 | 11.1 | 77 | usa | pontiac grand prix lj |
| 150 | 26.0 | 4 | 108.0 | 93.0 | 2391 | 15.5 | 74 | japan | subaru |
| 145 | 32.0 | 4 | 83.0 | 61.0 | 2003 | 19.0 | 74 | japan | datsun 710 |
| 182 | 28.0 | 4 | 107.0 | 86.0 | 2464 | 15.5 | 76 | europe | fiat 131 |
314 rows × 9 columns
sns.pairplot(train_dataset[['mpg', 'cylinders', 'displacement', 'weight']], diag_kind='kde')
<seaborn.axisgrid.PairGrid at 0x7cbcc0308df0>
train_features = train_dataset.copy()
test_features = test_dataset.copy()
train_labels = train_features.pop('mpg')
test_labels = test_features.pop('mpg')
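# Added check: after pop(), 'mpg' is the label vector and is no longer a
# column of the feature frames.
print('mpg' in train_features.columns)        # expect: False
print(train_labels.shape, test_labels.shape)  # expect: (314,) (78,)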
import statsmodels.api as sm
Y = train_dataset['displacement']
X = train_dataset['mpg']
plt.plot(X, Y, 'k.')
print(type(X))
<class 'pandas.core.series.Series'>
X = sm.add_constant(X)
X.head()
|     | const | mpg  |
|-----|-------|------|
| 146 | 1.0   | 28.0 |
| 282 | 1.0   | 22.3 |
| 69  | 1.0   | 12.0 |
| 378 | 1.0   | 38.0 |
| 331 | 1.0   | 33.8 |
model = sm.OLS(Y, X, missing='drop')
model_result = model.fit()
print(model_result.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:           displacement   R-squared:                       0.641
Model:                            OLS   Adj. R-squared:                  0.639
Method:                 Least Squares   F-statistic:                     556.1
Date:                Mon, 11 Nov 2024   Prob (F-statistic):           2.62e-71
Time:                        06:05:54   Log-Likelihood:                -1743.7
No. Observations:                 314   AIC:                             3491.
Df Residuals:                     312   BIC:                             3499.
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        447.1798     11.250     39.750      0.000     425.045     469.315
mpg          -10.8046      0.458    -23.583      0.000     -11.706      -9.903
==============================================================================
Omnibus:                        6.356   Durbin-Watson:                   2.134
Prob(Omnibus):                  0.042   Jarque-Bera (JB):                6.109
Skew:                           0.316   Prob(JB):                       0.0472
Kurtosis:                       3.260   Cond. No.                         78.3
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
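# The fitted coefficients and fit statistics can be pulled from the results
# object directly instead of copying numbers out of the printed summary
# (a small sketch using the statsmodels results API):
b0, b1 = model_result.params['const'], model_result.params['mpg']
print(b0, b1)                 # expect: ~447.18 and ~-10.80
print(model_result.rsquared)  # expect: ~0.641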
xt = np.linspace(10, 48, 50)
Xt = train_dataset['mpg']
# evaluate the fitted line using the estimated coefficients
# (intercept 447.1798, slope -10.8046)
yt = model_result.params['const'] + model_result.params['mpg'] * xt
plt.plot(Xt, Y, 'k.')
plt.plot(xt, yt, 'b--')
[<matplotlib.lines.Line2D at 0x7fcd885728f0>]
from sklearn import svm, metrics
print(X.shape)
print(X)
(314, 2)
     const   mpg
146    1.0  28.0
282    1.0  22.3
69     1.0  12.0
378    1.0  38.0
331    1.0  33.8
..     ...   ...
281    1.0  19.8
229    1.0  16.0
150    1.0  26.0
145    1.0  32.0
182    1.0  28.0

[314 rows x 2 columns]
svr_model = svm.SVR(C=100, epsilon=1, kernel='rbf')
# train the SVR model on the training features.
svr_model.fit(X, Y)
# generate predictions for the training features with the fitted model.
predicted = svr_model.predict(X)
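# Aside (a sketch, not part of the original fit): RBF SVR is sensitive to
# feature scale, and the 'const' column adds nothing for a kernel model.
# A more robust variant standardizes mpg inside a pipeline, reusing the
# hyperparameters chosen above.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
svr_scaled = make_pipeline(StandardScaler(), svm.SVR(C=100, epsilon=1, kernel='rbf'))
svr_scaled.fit(X[['mpg']], Y)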
print('R-square of SVR is', svr_model.score(X, Y))
print('It is better than simple regression (0.641)')
plt.plot(X['mpg'], Y, 'k.')
plt.plot(X['mpg'], predicted, 'b.')
R-square of SVR is 0.8017585387171895
It is better than simple regression (0.641)
[<matplotlib.lines.Line2D at 0x7c11e4fd51e0>]
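# Both R-square values above are computed on the training data, which tends to
# flatter the more flexible SVR. A follow-up sketch (an addition, assuming the
# earlier test split is unchanged) scores both models on the held-out rows,
# using the metrics module imported above; the test features must carry the
# same const + mpg column layout the models were fit on.
X_test = sm.add_constant(test_dataset['mpg'])
Y_test = test_dataset['displacement']
print('OLS test R-square:', metrics.r2_score(Y_test, model_result.predict(X_test)))
print('SVR test R-square:', svr_model.score(X_test, Y_test))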