In [ ]:
# import libraries we need
# this line is to make sure the figure will show below the code
%matplotlib inline
import matplotlib.pyplot as plt  # very commonly use plot library
import pandas as pd              # powerful library to deal with data
import numpy as np              # powerful library to deal with numbers, array, matrix.
import seaborn as sns           # a powerful libaray to make plot with dataframe
#from scipy import stats       # very commonly used library for data analysis
In [ ]:
# show datasets with sns library
print(sns.get_dataset_names())
['anagrams', 'anscombe', 'attention', 'brain_networks', 'car_crashes', 'diamonds', 'dots', 'dowjones', 'exercise', 'flights', 'fmri', 'geyser', 'glue', 'healthexp', 'iris', 'mpg', 'penguins', 'planets', 'seaice', 'taxis', 'tips', 'titanic']
In [ ]:
# load dataset from sns
df = sns.load_dataset('mpg')
df
Out[ ]:
mpg cylinders displacement horsepower weight acceleration model_year origin name
0 18.0 8 307.0 130.0 3504 12.0 70 usa chevrolet chevelle malibu
1 15.0 8 350.0 165.0 3693 11.5 70 usa buick skylark 320
2 18.0 8 318.0 150.0 3436 11.0 70 usa plymouth satellite
3 16.0 8 304.0 150.0 3433 12.0 70 usa amc rebel sst
4 17.0 8 302.0 140.0 3449 10.5 70 usa ford torino
... ... ... ... ... ... ... ... ... ...
393 27.0 4 140.0 86.0 2790 15.6 82 usa ford mustang gl
394 44.0 4 97.0 52.0 2130 24.6 82 europe vw pickup
395 32.0 4 135.0 84.0 2295 11.6 82 usa dodge rampage
396 28.0 4 120.0 79.0 2625 18.6 82 usa ford ranger
397 31.0 4 119.0 82.0 2720 19.4 82 usa chevy s-10

398 rows × 9 columns

In [ ]:
df.head(10)
Out[ ]:
mpg cylinders displacement horsepower weight acceleration model_year origin name
0 18.0 8 307.0 130.0 3504 12.0 70 usa chevrolet chevelle malibu
1 15.0 8 350.0 165.0 3693 11.5 70 usa buick skylark 320
2 18.0 8 318.0 150.0 3436 11.0 70 usa plymouth satellite
3 16.0 8 304.0 150.0 3433 12.0 70 usa amc rebel sst
4 17.0 8 302.0 140.0 3449 10.5 70 usa ford torino
5 15.0 8 429.0 198.0 4341 10.0 70 usa ford galaxie 500
6 14.0 8 454.0 220.0 4354 9.0 70 usa chevrolet impala
7 14.0 8 440.0 215.0 4312 8.5 70 usa plymouth fury iii
8 14.0 8 455.0 225.0 4425 10.0 70 usa pontiac catalina
9 15.0 8 390.0 190.0 3850 8.5 70 usa amc ambassador dpl
In [ ]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
 8   name          398 non-null    object 
dtypes: float64(4), int64(3), object(2)
memory usage: 28.1+ KB
In [ ]:
df.describe()
Out[ ]:
mpg cylinders displacement horsepower weight acceleration model_year
count 398.000000 398.000000 398.000000 392.000000 398.000000 398.000000 398.000000
mean 23.514573 5.454774 193.425879 104.469388 2970.424623 15.568090 76.010050
std 7.815984 1.701004 104.269838 38.491160 846.841774 2.757689 3.697627
min 9.000000 3.000000 68.000000 46.000000 1613.000000 8.000000 70.000000
25% 17.500000 4.000000 104.250000 75.000000 2223.750000 13.825000 73.000000
50% 23.000000 4.000000 148.500000 93.500000 2803.500000 15.500000 76.000000
75% 29.000000 8.000000 262.000000 126.000000 3608.000000 17.175000 79.000000
max 46.600000 8.000000 455.000000 230.000000 5140.000000 24.800000 82.000000
In [ ]:
df_mpg = sns.load_dataset('mpg')
df_mpg
Out[ ]:
mpg cylinders displacement horsepower weight acceleration model_year origin name
0 18.0 8 307.0 130.0 3504 12.0 70 usa chevrolet chevelle malibu
1 15.0 8 350.0 165.0 3693 11.5 70 usa buick skylark 320
2 18.0 8 318.0 150.0 3436 11.0 70 usa plymouth satellite
3 16.0 8 304.0 150.0 3433 12.0 70 usa amc rebel sst
4 17.0 8 302.0 140.0 3449 10.5 70 usa ford torino
... ... ... ... ... ... ... ... ... ...
393 27.0 4 140.0 86.0 2790 15.6 82 usa ford mustang gl
394 44.0 4 97.0 52.0 2130 24.6 82 europe vw pickup
395 32.0 4 135.0 84.0 2295 11.6 82 usa dodge rampage
396 28.0 4 120.0 79.0 2625 18.6 82 usa ford ranger
397 31.0 4 119.0 82.0 2720 19.4 82 usa chevy s-10

398 rows × 9 columns

In [ ]:
df_mpg.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
 8   name          398 non-null    object 
dtypes: float64(4), int64(3), object(2)
memory usage: 28.1+ KB
In [ ]:
# set style of sns, also "dark"
sns.set_style('dark')

# set palette of sns
# supported values are 'Accent', 'Accent_r', 'Blues', 'Blues_r', 'BrBG', 'BrBG_r', 'BuGn', 'BuGn_r',
# 'BuPu', 'BuPu_r', 'CMRmap', 'CMRmap_r', 'Dark2', 'Dark2_r', 'GnBu', 'GnBu_r', 'Greens', 'Greens_r',
# 'Greys', 'Greys_r', 'OrRd', 'OrRd_r', 'Oranges', 'Oranges_r', 'PRGn', 'PRGn_r', 'Paired', 'Paired_r',
#'Pastel1', 'Pastel1_r', 'Pastel2', 'Pastel2_r', 'PiYG', 'PiYG_r', 'PuBu', 'PuBuGn', 'PuBuGn_r', 'PuBu_r',
# 'PuOr', 'PuOr_r', 'PuRd', 'PuRd_r', 'Purples', 'Purples_r', 'RdBu', 'RdBu_r', 'RdGy', 'RdGy_r', 'RdPu',
#'RdPu_r', 'RdYlBu', 'RdYlBu_r', 'RdYlGn', 'RdYlGn_r', 'Reds', 'Reds_r', 'Set1', 'Set1_r', 'Set2', 'Set2_r',
#'Set3', 'Set3_r', 'Spectral', 'Spectral_r', 'Wistia', 'Wistia_r', 'YlGn', 'YlGnBu', 'YlGnBu_r', 'YlGn_r',
#'YlOrBr', 'YlOrBr_r', 'YlOrRd', 'YlOrRd_r', 'afmhot', 'afmhot_r', 'autumn', 'autumn_r', 'binary', 'binary_r',
# 'bone', 'bone_r', 'brg', 'brg_r', 'bwr', 'bwr_r', 'cividis', 'cividis_r', 'cool', 'cool_r', 'coolwarm',
#'coolwarm_r', 'copper', 'copper_r', 'crest', 'crest_r', 'cubehelix', 'cubehelix_r', 'flag', 'flag_r', 'flare',
# 'flare_r', 'gist_earth', 'gist_earth_r', 'gist_gray', 'gist_gray_r', 'gist_heat', 'gist_heat_r', 'gist_ncar',
#'gist_ncar_r', 'gist_rainbow', 'gist_rainbow_r', 'gist_stern', 'gist_stern_r', 'gist_yarg', 'gist_yarg_r',
#'gnuplot', 'gnuplot2', 'gnuplot2_r', 'gnuplot_r', 'gray', 'gray_r', 'hot', 'hot_r', 'hsv', 'hsv_r', 'icefire',
#'icefire_r', 'inferno', 'inferno_r', 'jet', 'jet_r', 'magm...
#sns.set_palette('gray')
sns.set_palette('gist_rainbow')
# plot distribution
# kde is for kernel dis
sns.distplot(df['horsepower'],kde=False,bins=10)
<ipython-input-11-b81c662db852>:23: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(df['horsepower'],kde=False,bins=10)
Out[ ]:
<Axes: xlabel='horsepower'>
In [ ]:
sns.set_style('white')
sns.set_palette('gray')
# jointplot is for bi-variate plot
sns.jointplot(x='horsepower',y='mpg',data=df_mpg,kind='scatter')
#sns.jointplot(x='horsepower',y='mpg',data=df_mpg,kind='scatter')
# more kinds, such as hex, scatter, reg, hist
Out[ ]:
<seaborn.axisgrid.JointGrid at 0x7e26e554bcd0>

For more details about color palette. https://seaborn.pydata.org/tutorial/color_palettes.html

For more details about jointplot: https://seaborn.pydata.org/generated/seaborn.jointplot.html

In [ ]:
sns.set_palette('Set2')
sns.jointplot(x='horsepower',y='mpg',data=df_mpg,hue='origin')
Out[ ]:
<seaborn.axisgrid.JointGrid at 0x7f40c5d312d0>
In [ ]: