How to know my data is following which distribution?¶
Part 1. Get Data¶
For more information about the statios, please check the following site.¶
https://e-service.cwa.gov.tw/wdps/obs/state.htm¶
In [157]:
import pandas as pd
AMR = pd.read_excel('AMR_1979-2020.xlsx')
AMR.head()
Out[157]:
stno | 466900 | 466910 | 466920 | 466930 | 466940 | 467571 | 467490 | 467530 | 467550 | 467650 | 467410 | 467350 | 467440 | 467590 | 467660 | 467620 | 467610 | 466990 | 467080 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | lon | 121.449997 | 121.529999 | 121.510002 | 121.540001 | 121.739998 | 121.010002 | 120.680000 | 120.809998 | 120.959999 | 120.910004 | 120.199997 | 119.559998 | 120.320000 | 120.750000 | 121.150002 | 121.559998 | 121.370003 | 121.610001 | 121.760002 |
1 | lat | 25.160000 | 25.180000 | 25.040001 | 25.160000 | 25.129999 | 24.830000 | 24.150000 | 23.510000 | 23.490000 | 23.879999 | 22.990000 | 23.570000 | 22.570000 | 22.000000 | 22.750000 | 22.040001 | 23.100000 | 23.980000 | 24.760000 |
2 | 1979 | 218.500000 | 419.500031 | 157.000000 | 489.899963 | 154.000000 | 0.000000 | 127.499992 | 556.200012 | 266.400024 | 254.399994 | 120.500008 | 105.600006 | 146.399994 | 247.699997 | 186.000000 | 123.900009 | 222.200012 | 115.199997 | 181.000000 |
3 | 1980 | 103.100006 | 227.699997 | 223.000000 | 423.700012 | 399.599976 | 0.000000 | 221.199997 | 591.500000 | 347.400024 | 383.399963 | 63.100002 | 63.899998 | 65.899994 | 160.400009 | 91.799995 | 208.200012 | 128.900009 | 215.100006 | 243.300003 |
4 | 1981 | 328.500031 | 409.100006 | 321.500000 | 429.899994 | 179.199997 | 0.000000 | 233.100006 | 493.699982 | 333.000000 | 262.200012 | 247.600006 | 168.300003 | 281.500000 | 263.500031 | 206.399994 | 141.599991 | 267.800018 | 261.500000 | 193.999985 |
Part 2. Simple statistics of the data¶
In [162]:
# Get 466900 淡水, TAMSUI
import matplotlib.pyplot as plt
import numpy as np
st = AMR['466910']
st_sorted = np.sort(st)
cdf = np.arange(1, len(st) + 1) / len(st)
In [163]:
stats = st.describe()
if stats.min() <=0:
print('The data is not larger then zero, it may cause problem when fitting log-normal distribution.')
else:
print(stats.min())
st.describe()
25.18000030517578
Out[163]:
466910 | |
---|---|
count | 44.000000 |
mean | 366.079770 |
std | 172.266071 |
min | 25.180000 |
25% | 264.625000 |
50% | 372.750000 |
75% | 451.324959 |
max | 882.300049 |
Part 3. Data Visualization¶
In [164]:
# Data Visualization
fig = plt.figure()
plt.plot(st, color='k', zorder=3)
plt.xlabel('Year')
plt.ylabel('Annual Max Rainfall (mm/day)')
plt.grid()
plt.show()
In [165]:
# Data Visualization
fig = plt.figure(figsize=(4,6))
ax1 = plt.subplot2grid((3,1), (0, 0))
ax1.hist(st, bins=5, color='skyblue', edgecolor='black', zorder=3)
ax1.set_xlabel('Year')
ax1.set_ylabel('Annual Max Rainfall (mm/day)')
ax1.grid()
ax2 = plt.subplot2grid((3,1), (1, 0))
ax2.plot(st_sorted, cdf, color='blue', marker='o', zorder=3)
ax2.set_xlabel('Rainfall (mm/day)')
ax2.set_ylabel('Frequency')
ax2.grid()
ax3 = plt.subplot2grid((3,1), (2, 0))
ax3.boxplot(st,vert=False,patch_artist=True,boxprops=dict(facecolor='skyblue'),
flierprops=dict(marker='o', color='red', markerfacecolor='red', markersize=8), # Red outliers
)
ax3.set_xlabel('Rainfall (mm/day)')
ax3.set_ylabel('Cumulative Probability')
ax3.grid(zorder=0)
ax3.set_axisbelow(True)
ax3.set_yticklabels(['466900'], rotation=90, va='center')
plt.tight_layout()
plt.show()
In [166]:
# How to fit a theoretical distribution. For example, Gumbel distribution
from scipy.stats import gumbel_r
# Fit a Gumbel distribution to the data (for maxima)
params = gumbel_r.fit(st) # Fit the Gumbel distribution
loc_g, scale_g = params[0], params[1] # Extract location and scale parameters
print('loc is:',loc_g)
print('scale is:',scale_g)
# Generate the theoretical PDF using the fitted parameters
# Generate a series from min value to max value
x = np.linspace(min(st), max(st), 100)
print(x.shape)
print(x[:5])
#
pdf = gumbel_r.pdf(x, loc_g, scale_g)
loc is: 286.2636026530529 scale is: 146.6922141961499 (100,) [25.18000031 33.83777857 42.49555684 51.15333511 59.81111338]
In [176]:
# Fit the Gumbel distribution using Method of Moments (MM)
params_mm = gumbel_r.fit(st, method='MM')
print('Parameters estimated by Method of Moments (MM):')
print(f"Location (loc): {params_mm[0]:.3f}, Scale (scale): {params_mm[1]:.3f}")
# Fit the Gumbel distribution using Maximum Likelihood Estimation (MLE)
params_mle = gumbel_r.fit(st, method='MLE')
print('Parameters estimated by Maximum Likelihood Method (MLE):')
print(f"Location (loc): {params_mle[0]:.3f}, Scale (scale): {params_mle[1]:.3f}")
Parameters estimated by Method of Moments (MM): Location (loc): 289.437, Scale (scale): 132.780 Parameters estimated by Maximum Likelihood Method (MLE): Location (loc): 286.264, Scale (scale): 146.692
In [167]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gumbel_r
params = gumbel_r.fit(st) # Fit the Gumbel distribution
loc_g, scale_g = params[0], params[1] # Extract location and scale parameters
x = np.linspace(min(st), max(st), 100)
pdf = gumbel_r.pdf(x, loc_g, scale_g)
cdf = gumbel_r.cdf(x, loc_g, scale_g)
max_pdf = np.max(pdf)
fig = plt.figure(figsize=(6,6))
# First subplot: Histogram with fitted Gumbel PDF
ax1 = plt.subplot2grid((2, 1), (0, 0))
ax1.hist(st, bins=5, density=True, color='skyblue', edgecolor='black', alpha=0.6, label='Data Histogram')
ax1.vlines(loc_g, 0, max_pdf, color='black', linewidth=2, label='loc')
ax1.plot(x, pdf, 'r-', label=f'Fitted Gumbel\n loc={loc_g:.2f}, scale={scale_g:.2f}', linewidth=2)
ax1.set_xlabel('Rainfall (mm/day)')
ax1.set_ylabel('Density')
ax1.legend(loc='upper right')
ax1.grid(True)
# Second subplot: Empirical CDF with fitted Gumbel CDF
ax2 = plt.subplot2grid((2, 1), (1, 0))
ax2.plot(np.sort(st), np.arange(1, len(st) + 1) / len(st), 'o', label='Empirical CDF')
ax2.plot(x, cdf, 'r-', label='Fitted Gumbel CDF', linewidth=2)
ax2.set_xlabel('Rainfall (mm/day)')
ax2.set_ylabel('Cumulative Probability')
ax2.legend(loc='lower right')
ax2.grid(True)
plt.tight_layout()
plt.show()
In [168]:
# Try other distributions
from scipy.stats import lognorm
# Fit a Log-Normal distribution to the data
params = lognorm.fit(st, floc=0) # Fit with location parameter fixed at 0
shape, loc, scale = params # Extract the parameters
# Generate the theoretical PDF using the fitted parameters
x = np.linspace(min(st), max(st), 100)
pdf = lognorm.pdf(x, shape, loc, scale)
cdf = lognorm.cdf(x, shape, loc, scale)
print(shape, loc, scale)
0.5975669959179677 0 319.2227631788498
In [169]:
fig = plt.figure(figsize=(6, 6))
ax1 = plt.subplot2grid((2, 1), (0, 0))
ax1.hist(st, bins=5, density=True, color='skyblue', edgecolor='black', alpha=0.6, label='Data Histogram')
ax1.plot(x, pdf, 'r-', label=f'Fitted Log-Normal\nshape={shape:.2f}, scale={scale:.2f}', linewidth=2)
ax1.set_xlabel('Rainfall (mm/day)')
ax1.set_ylabel('Density')
ax1.legend(loc='upper right')
ax1.grid(True)
ax2 = plt.subplot2grid((2, 1), (1, 0))
ax2.plot(np.sort(st), np.arange(1, len(st) + 1) / len(st), 'o', label='Empirical CDF')
ax2.plot(x, cdf, 'r-', label='Fitted Log-Normal CDF', linewidth=2)
ax2.set_xlabel('Rainfall (mm/day)')
ax2.set_ylabel('Cumulative Probability')
ax2.legend(loc='lower right')
ax2.grid(True)
# Adjust layout and show the plot
plt.tight_layout()
plt.show()
In [170]:
# Fit Gumbel distribution to the data
params_gumbel = gumbel_r.fit(st)
loc_g, scale_g = params_gumbel[0], params_gumbel[1]
x = np.linspace(min(st), max(st), 100)
pdf_gumbel = gumbel_r.pdf(x, loc_g, scale_g)
cdf_gumbel = gumbel_r.cdf(x, loc_g, scale_g)
max_pdf = np.max(pdf_gumbel)
# Fit Log-Normal distribution to the data
params_lognorm = lognorm.fit(st, floc=0)
shape, loc, scale = params_lognorm
pdf_lognorm = lognorm.pdf(x, shape, loc, scale)
cdf_lognorm = lognorm.cdf(x, shape, loc, scale)
# Create a 2x2 subplot grid
fig = plt.figure(figsize=(10, 8))
# First subplot: Histogram with fitted Gumbel PDF
ax1 = plt.subplot2grid((2, 2), (0, 0))
ax1.hist(st, bins=5, density=True, color='skyblue', edgecolor='black', alpha=0.6, label='Data Histogram')
ax1.vlines(loc_g, 0, max_pdf, color='black', linewidth=2, label='loc')
ax1.plot(x, pdf_gumbel, 'r-', label=f'Fitted Gumbel\nloc={loc_g:.2f}, scale={scale_g:.2f}', linewidth=2)
ax1.set_xlabel('Rainfall (mm/day)')
ax1.set_ylabel('Density')
ax1.legend(loc='upper right')
ax1.grid(True)
# Second subplot: Empirical CDF with fitted Gumbel CDF
ax2 = plt.subplot2grid((2, 2), (0, 1))
ax2.plot(np.sort(st), np.arange(1, len(st) + 1) / len(st), 'o', label='Empirical CDF')
ax2.plot(x, cdf_gumbel, 'r-', label='Fitted Gumbel CDF', linewidth=2)
ax2.set_xlabel('Rainfall (mm/day)')
ax2.set_ylabel('Cumulative Probability')
ax2.legend(loc='lower right')
ax2.grid(True)
# Third subplot: Histogram with fitted Log-Normal PDF
ax3 = plt.subplot2grid((2, 2), (1, 0))
ax3.hist(st, bins=5, density=True, color='skyblue', edgecolor='black', alpha=0.6, label='Data Histogram')
ax3.plot(x, pdf_lognorm, 'r-', label=f'Fitted Log-Normal\nshape={shape:.2f}, scale={scale:.2f}', linewidth=2)
ax3.set_xlabel('Rainfall (mm/day)')
ax3.set_ylabel('Density')
ax3.legend(loc='upper right')
ax3.grid(True)
# Fourth subplot: Empirical CDF with fitted Log-Normal CDF
ax4 = plt.subplot2grid((2, 2), (1, 1))
ax4.plot(np.sort(st), np.arange(1, len(st) + 1) / len(st), 'o', label='Empirical CDF')
ax4.plot(x, cdf_lognorm, 'r-', label='Fitted Log-Normal CDF', linewidth=2)
ax4.set_xlabel('Rainfall (mm/day)')
ax4.set_ylabel('Cumulative Probability')
ax4.legend(loc='lower right')
ax4.grid(True)
plt.tight_layout()
plt.show()
Part 4. Goodness-of-Fit Tests¶
In [171]:
# Perform a KS test for goodness of fit to the Gumbel distribution
ks_statistic_g, p_value_g = kstest(st, 'gumbel_r', args=(loc_g, scale_g))
# Perform a KS test for goodness of fit to the Log-Normal distribution
ks_statistic_ln, p_value_ln = kstest(st, 'lognorm', args=(shape, loc, scale))
# Print the results with 3 digits precision for both distributions
print(f"Gumbel Distribution:")
print(f"KS Statistic: {ks_statistic_g:.4f}")
print(f"P-value: {p_value_g:.4f}\n")
print(f"Log-Normal Distribution:")
print(f"KS Statistic: {ks_statistic_ln:.4f}")
print(f"P-value: {p_value_ln:.4f}")
print('From the p-values, it suggested that the Gumbel distribution fitted bettern than Log-normal distribution.')
Gumbel Distribution: KS Statistic: 0.0932 P-value: 0.8051 Log-Normal Distribution: KS Statistic: 0.1532 P-value: 0.2286 From the p-values, it suggested that the Gumbel distribution fitted bettern than Log-normal distribution.
The p-value indicates how well the distribution fits the data. If the p-value is high, it suggests a good fit.