How to know my data is following which distribution?¶

Part 1. Get Data¶

For more information about the statios, please check the following site.¶

https://e-service.cwa.gov.tw/wdps/obs/state.htm¶

In [157]:
import pandas as pd

AMR = pd.read_excel('AMR_1979-2020.xlsx')
AMR.head()
Out[157]:
stno 466900 466910 466920 466930 466940 467571 467490 467530 467550 467650 467410 467350 467440 467590 467660 467620 467610 466990 467080
0 lon 121.449997 121.529999 121.510002 121.540001 121.739998 121.010002 120.680000 120.809998 120.959999 120.910004 120.199997 119.559998 120.320000 120.750000 121.150002 121.559998 121.370003 121.610001 121.760002
1 lat 25.160000 25.180000 25.040001 25.160000 25.129999 24.830000 24.150000 23.510000 23.490000 23.879999 22.990000 23.570000 22.570000 22.000000 22.750000 22.040001 23.100000 23.980000 24.760000
2 1979 218.500000 419.500031 157.000000 489.899963 154.000000 0.000000 127.499992 556.200012 266.400024 254.399994 120.500008 105.600006 146.399994 247.699997 186.000000 123.900009 222.200012 115.199997 181.000000
3 1980 103.100006 227.699997 223.000000 423.700012 399.599976 0.000000 221.199997 591.500000 347.400024 383.399963 63.100002 63.899998 65.899994 160.400009 91.799995 208.200012 128.900009 215.100006 243.300003
4 1981 328.500031 409.100006 321.500000 429.899994 179.199997 0.000000 233.100006 493.699982 333.000000 262.200012 247.600006 168.300003 281.500000 263.500031 206.399994 141.599991 267.800018 261.500000 193.999985

Part 2. Simple statistics of the data¶

In [162]:
# Get 466900 淡水, TAMSUI
import matplotlib.pyplot as plt
import numpy as np


st = AMR['466910']



st_sorted = np.sort(st)
cdf = np.arange(1, len(st) + 1) / len(st)
In [163]:
stats = st.describe()

if stats.min() <=0:
  print('The data is not larger then zero, it may cause problem when fitting log-normal distribution.')
else:
  print(stats.min())
st.describe()
25.18000030517578
Out[163]:
466910
count 44.000000
mean 366.079770
std 172.266071
min 25.180000
25% 264.625000
50% 372.750000
75% 451.324959
max 882.300049

Part 3. Data Visualization¶

In [164]:
# Data Visualization
fig = plt.figure()
plt.plot(st, color='k', zorder=3)
plt.xlabel('Year')
plt.ylabel('Annual Max Rainfall (mm/day)')
plt.grid()
plt.show()
No description has been provided for this image
In [165]:
# Data Visualization
fig = plt.figure(figsize=(4,6))

ax1 = plt.subplot2grid((3,1), (0, 0))
ax1.hist(st, bins=5, color='skyblue', edgecolor='black', zorder=3)
ax1.set_xlabel('Year')
ax1.set_ylabel('Annual Max Rainfall (mm/day)')
ax1.grid()

ax2 = plt.subplot2grid((3,1), (1, 0))
ax2.plot(st_sorted, cdf, color='blue', marker='o', zorder=3)
ax2.set_xlabel('Rainfall (mm/day)')
ax2.set_ylabel('Frequency')
ax2.grid()

ax3 = plt.subplot2grid((3,1), (2, 0))
ax3.boxplot(st,vert=False,patch_artist=True,boxprops=dict(facecolor='skyblue'),
    flierprops=dict(marker='o', color='red', markerfacecolor='red', markersize=8),  # Red outliers
)
ax3.set_xlabel('Rainfall (mm/day)')
ax3.set_ylabel('Cumulative Probability')
ax3.grid(zorder=0)
ax3.set_axisbelow(True)
ax3.set_yticklabels(['466900'], rotation=90, va='center')

plt.tight_layout()
plt.show()
No description has been provided for this image
In [166]:
# How to fit a theoretical distribution. For example, Gumbel distribution

from scipy.stats import gumbel_r
# Fit a Gumbel distribution to the data (for maxima)
params = gumbel_r.fit(st)  # Fit the Gumbel distribution

loc_g, scale_g = params[0], params[1]  # Extract location and scale parameters
print('loc is:',loc_g)
print('scale is:',scale_g)

# Generate the theoretical PDF using the fitted parameters
# Generate a series from min value to max value
x = np.linspace(min(st), max(st), 100)
print(x.shape)
print(x[:5])
#
pdf = gumbel_r.pdf(x, loc_g, scale_g)
loc is: 286.2636026530529
scale is: 146.6922141961499
(100,)
[25.18000031 33.83777857 42.49555684 51.15333511 59.81111338]
In [176]:
# Fit the Gumbel distribution using Method of Moments (MM)
params_mm = gumbel_r.fit(st, method='MM')
print('Parameters estimated by Method of Moments (MM):')
print(f"Location (loc): {params_mm[0]:.3f}, Scale (scale): {params_mm[1]:.3f}")

# Fit the Gumbel distribution using Maximum Likelihood Estimation (MLE)
params_mle = gumbel_r.fit(st, method='MLE')
print('Parameters estimated by Maximum Likelihood Method (MLE):')
print(f"Location (loc): {params_mle[0]:.3f}, Scale (scale): {params_mle[1]:.3f}")
Parameters estimated by Method of Moments (MM):
Location (loc): 289.437, Scale (scale): 132.780
Parameters estimated by Maximum Likelihood Method (MLE):
Location (loc): 286.264, Scale (scale): 146.692
In [167]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gumbel_r

params = gumbel_r.fit(st)  # Fit the Gumbel distribution
loc_g, scale_g = params[0], params[1]  # Extract location and scale parameters
x = np.linspace(min(st), max(st), 100)
pdf = gumbel_r.pdf(x, loc_g, scale_g)
cdf = gumbel_r.cdf(x, loc_g, scale_g)
max_pdf = np.max(pdf)
fig = plt.figure(figsize=(6,6))
# First subplot: Histogram with fitted Gumbel PDF
ax1 = plt.subplot2grid((2, 1), (0, 0))
ax1.hist(st, bins=5, density=True, color='skyblue', edgecolor='black', alpha=0.6, label='Data Histogram')
ax1.vlines(loc_g, 0, max_pdf, color='black', linewidth=2, label='loc')
ax1.plot(x, pdf, 'r-', label=f'Fitted Gumbel\n loc={loc_g:.2f}, scale={scale_g:.2f}', linewidth=2)
ax1.set_xlabel('Rainfall (mm/day)')
ax1.set_ylabel('Density')
ax1.legend(loc='upper right')
ax1.grid(True)
# Second subplot: Empirical CDF with fitted Gumbel CDF
ax2 = plt.subplot2grid((2, 1), (1, 0))
ax2.plot(np.sort(st), np.arange(1, len(st) + 1) / len(st), 'o', label='Empirical CDF')
ax2.plot(x, cdf, 'r-', label='Fitted Gumbel CDF', linewidth=2)
ax2.set_xlabel('Rainfall (mm/day)')
ax2.set_ylabel('Cumulative Probability')
ax2.legend(loc='lower right')
ax2.grid(True)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [168]:
# Try other distributions
from scipy.stats import lognorm
# Fit a Log-Normal distribution to the data
params = lognorm.fit(st, floc=0)  # Fit with location parameter fixed at 0
shape, loc, scale = params  # Extract the parameters

# Generate the theoretical PDF using the fitted parameters
x = np.linspace(min(st), max(st), 100)
pdf = lognorm.pdf(x, shape, loc, scale)
cdf = lognorm.cdf(x, shape, loc, scale)
print(shape, loc, scale)
0.5975669959179677 0 319.2227631788498
In [169]:
fig = plt.figure(figsize=(6, 6))


ax1 = plt.subplot2grid((2, 1), (0, 0))
ax1.hist(st, bins=5, density=True, color='skyblue', edgecolor='black', alpha=0.6, label='Data Histogram')
ax1.plot(x, pdf, 'r-', label=f'Fitted Log-Normal\nshape={shape:.2f}, scale={scale:.2f}', linewidth=2)
ax1.set_xlabel('Rainfall (mm/day)')
ax1.set_ylabel('Density')
ax1.legend(loc='upper right')
ax1.grid(True)


ax2 = plt.subplot2grid((2, 1), (1, 0))
ax2.plot(np.sort(st), np.arange(1, len(st) + 1) / len(st), 'o', label='Empirical CDF')
ax2.plot(x, cdf, 'r-', label='Fitted Log-Normal CDF', linewidth=2)
ax2.set_xlabel('Rainfall (mm/day)')
ax2.set_ylabel('Cumulative Probability')
ax2.legend(loc='lower right')
ax2.grid(True)

# Adjust layout and show the plot
plt.tight_layout()
plt.show()
No description has been provided for this image
In [170]:
# Fit Gumbel distribution to the data
params_gumbel = gumbel_r.fit(st)
loc_g, scale_g = params_gumbel[0], params_gumbel[1]
x = np.linspace(min(st), max(st), 100)
pdf_gumbel = gumbel_r.pdf(x, loc_g, scale_g)
cdf_gumbel = gumbel_r.cdf(x, loc_g, scale_g)
max_pdf = np.max(pdf_gumbel)
# Fit Log-Normal distribution to the data
params_lognorm = lognorm.fit(st, floc=0)
shape, loc, scale = params_lognorm
pdf_lognorm = lognorm.pdf(x, shape, loc, scale)
cdf_lognorm = lognorm.cdf(x, shape, loc, scale)
# Create a 2x2 subplot grid
fig = plt.figure(figsize=(10, 8))

# First subplot: Histogram with fitted Gumbel PDF
ax1 = plt.subplot2grid((2, 2), (0, 0))
ax1.hist(st, bins=5, density=True, color='skyblue', edgecolor='black', alpha=0.6, label='Data Histogram')
ax1.vlines(loc_g, 0, max_pdf, color='black', linewidth=2, label='loc')
ax1.plot(x, pdf_gumbel, 'r-', label=f'Fitted Gumbel\nloc={loc_g:.2f}, scale={scale_g:.2f}', linewidth=2)
ax1.set_xlabel('Rainfall (mm/day)')
ax1.set_ylabel('Density')
ax1.legend(loc='upper right')
ax1.grid(True)

# Second subplot: Empirical CDF with fitted Gumbel CDF
ax2 = plt.subplot2grid((2, 2), (0, 1))
ax2.plot(np.sort(st), np.arange(1, len(st) + 1) / len(st), 'o', label='Empirical CDF')
ax2.plot(x, cdf_gumbel, 'r-', label='Fitted Gumbel CDF', linewidth=2)
ax2.set_xlabel('Rainfall (mm/day)')
ax2.set_ylabel('Cumulative Probability')
ax2.legend(loc='lower right')
ax2.grid(True)

# Third subplot: Histogram with fitted Log-Normal PDF
ax3 = plt.subplot2grid((2, 2), (1, 0))
ax3.hist(st, bins=5, density=True, color='skyblue', edgecolor='black', alpha=0.6, label='Data Histogram')
ax3.plot(x, pdf_lognorm, 'r-', label=f'Fitted Log-Normal\nshape={shape:.2f}, scale={scale:.2f}', linewidth=2)
ax3.set_xlabel('Rainfall (mm/day)')
ax3.set_ylabel('Density')
ax3.legend(loc='upper right')
ax3.grid(True)

# Fourth subplot: Empirical CDF with fitted Log-Normal CDF
ax4 = plt.subplot2grid((2, 2), (1, 1))
ax4.plot(np.sort(st), np.arange(1, len(st) + 1) / len(st), 'o', label='Empirical CDF')
ax4.plot(x, cdf_lognorm, 'r-', label='Fitted Log-Normal CDF', linewidth=2)
ax4.set_xlabel('Rainfall (mm/day)')
ax4.set_ylabel('Cumulative Probability')
ax4.legend(loc='lower right')
ax4.grid(True)
plt.tight_layout()
plt.show()
No description has been provided for this image

Part 4. Goodness-of-Fit Tests¶

In [171]:
# Perform a KS test for goodness of fit to the Gumbel distribution
ks_statistic_g, p_value_g = kstest(st, 'gumbel_r', args=(loc_g, scale_g))

# Perform a KS test for goodness of fit to the Log-Normal distribution
ks_statistic_ln, p_value_ln = kstest(st, 'lognorm', args=(shape, loc, scale))

# Print the results with 3 digits precision for both distributions
print(f"Gumbel Distribution:")
print(f"KS Statistic: {ks_statistic_g:.4f}")
print(f"P-value: {p_value_g:.4f}\n")

print(f"Log-Normal Distribution:")
print(f"KS Statistic: {ks_statistic_ln:.4f}")
print(f"P-value: {p_value_ln:.4f}")
print('From the p-values, it suggested that the Gumbel distribution fitted bettern than Log-normal distribution.')
Gumbel Distribution:
KS Statistic: 0.0932
P-value: 0.8051

Log-Normal Distribution:
KS Statistic: 0.1532
P-value: 0.2286
From the p-values, it suggested that the Gumbel distribution fitted bettern than Log-normal distribution.

The p-value indicates how well the distribution fits the data. If the p-value is high, it suggests a good fit.