import pandas as pd

AMR = pd.read_excel('AMR_1979-2020.xlsx')
AMR.head()

# Get 466900 淡水, TAMSUI
import matplotlib.pyplot as plt
import numpy as np


st = AMR['466910']



st_sorted = np.sort(st)
cdf = np.arange(1, len(st) + 1) / len(st)

stats = st.describe()

if stats.min() <=0:
  print('The data is not larger then zero, it may cause problem when fitting log-normal distribution.')
else:
  print(stats.min())
st.describe()

25.18000030517578

# Data Visualization
fig = plt.figure()
plt.plot(st, color='k', zorder=3)
plt.xlabel('Year')
plt.ylabel('Annual Max Rainfall (mm/day)')
plt.grid()
plt.show()

# Data Visualization
fig = plt.figure(figsize=(4,6))

ax1 = plt.subplot2grid((3,1), (0, 0))
ax1.hist(st, bins=5, color='skyblue', edgecolor='black', zorder=3)
ax1.set_xlabel('Year')
ax1.set_ylabel('Annual Max Rainfall (mm/day)')
ax1.grid()

ax2 = plt.subplot2grid((3,1), (1, 0))
ax2.plot(st_sorted, cdf, color='blue', marker='o', zorder=3)
ax2.set_xlabel('Rainfall (mm/day)')
ax2.set_ylabel('Frequency')
ax2.grid()

ax3 = plt.subplot2grid((3,1), (2, 0))
ax3.boxplot(st,vert=False,patch_artist=True,boxprops=dict(facecolor='skyblue'),
    flierprops=dict(marker='o', color='red', markerfacecolor='red', markersize=8),  # Red outliers
)
ax3.set_xlabel('Rainfall (mm/day)')
ax3.set_ylabel('Cumulative Probability')
ax3.grid(zorder=0)
ax3.set_axisbelow(True)
ax3.set_yticklabels(['466900'], rotation=90, va='center')

plt.tight_layout()
plt.show()

# How to fit a theoretical distribution. For example, Gumbel distribution

from scipy.stats import gumbel_r
# Fit a Gumbel distribution to the data (for maxima)
params = gumbel_r.fit(st)  # Fit the Gumbel distribution

loc_g, scale_g = params[0], params[1]  # Extract location and scale parameters
print('loc is:',loc_g)
print('scale is:',scale_g)

# Generate the theoretical PDF using the fitted parameters
# Generate a series from min value to max value
x = np.linspace(min(st), max(st), 100)
print(x.shape)
print(x[:5])
#
pdf = gumbel_r.pdf(x, loc_g, scale_g)

loc is: 286.2636026530529
scale is: 146.6922141961499
(100,)
[25.18000031 33.83777857 42.49555684 51.15333511 59.81111338]

# Fit the Gumbel distribution using Method of Moments (MM)
params_mm = gumbel_r.fit(st, method='MM')
print('Parameters estimated by Method of Moments (MM):')
print(f"Location (loc): {params_mm[0]:.3f}, Scale (scale): {params_mm[1]:.3f}")

# Fit the Gumbel distribution using Maximum Likelihood Estimation (MLE)
params_mle = gumbel_r.fit(st, method='MLE')
print('Parameters estimated by Maximum Likelihood Method (MLE):')
print(f"Location (loc): {params_mle[0]:.3f}, Scale (scale): {params_mle[1]:.3f}")

Parameters estimated by Method of Moments (MM):
Location (loc): 289.437, Scale (scale): 132.780
Parameters estimated by Maximum Likelihood Method (MLE):
Location (loc): 286.264, Scale (scale): 146.692

import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gumbel_r

params = gumbel_r.fit(st)  # Fit the Gumbel distribution
loc_g, scale_g = params[0], params[1]  # Extract location and scale parameters
x = np.linspace(min(st), max(st), 100)
pdf = gumbel_r.pdf(x, loc_g, scale_g)
cdf = gumbel_r.cdf(x, loc_g, scale_g)
max_pdf = np.max(pdf)
fig = plt.figure(figsize=(6,6))
# First subplot: Histogram with fitted Gumbel PDF
ax1 = plt.subplot2grid((2, 1), (0, 0))
ax1.hist(st, bins=5, density=True, color='skyblue', edgecolor='black', alpha=0.6, label='Data Histogram')
ax1.vlines(loc_g, 0, max_pdf, color='black', linewidth=2, label='loc')
ax1.plot(x, pdf, 'r-', label=f'Fitted Gumbel\n loc={loc_g:.2f}, scale={scale_g:.2f}', linewidth=2)
ax1.set_xlabel('Rainfall (mm/day)')
ax1.set_ylabel('Density')
ax1.legend(loc='upper right')
ax1.grid(True)
# Second subplot: Empirical CDF with fitted Gumbel CDF
ax2 = plt.subplot2grid((2, 1), (1, 0))
ax2.plot(np.sort(st), np.arange(1, len(st) + 1) / len(st), 'o', label='Empirical CDF')
ax2.plot(x, cdf, 'r-', label='Fitted Gumbel CDF', linewidth=2)
ax2.set_xlabel('Rainfall (mm/day)')
ax2.set_ylabel('Cumulative Probability')
ax2.legend(loc='lower right')
ax2.grid(True)
plt.tight_layout()
plt.show()

# Try other distributions
from scipy.stats import lognorm
# Fit a Log-Normal distribution to the data
params = lognorm.fit(st, floc=0)  # Fit with location parameter fixed at 0
shape, loc, scale = params  # Extract the parameters

# Generate the theoretical PDF using the fitted parameters
x = np.linspace(min(st), max(st), 100)
pdf = lognorm.pdf(x, shape, loc, scale)
cdf = lognorm.cdf(x, shape, loc, scale)
print(shape, loc, scale)

0.5975669959179677 0 319.2227631788498

fig = plt.figure(figsize=(6, 6))


ax1 = plt.subplot2grid((2, 1), (0, 0))
ax1.hist(st, bins=5, density=True, color='skyblue', edgecolor='black', alpha=0.6, label='Data Histogram')
ax1.plot(x, pdf, 'r-', label=f'Fitted Log-Normal\nshape={shape:.2f}, scale={scale:.2f}', linewidth=2)
ax1.set_xlabel('Rainfall (mm/day)')
ax1.set_ylabel('Density')
ax1.legend(loc='upper right')
ax1.grid(True)


ax2 = plt.subplot2grid((2, 1), (1, 0))
ax2.plot(np.sort(st), np.arange(1, len(st) + 1) / len(st), 'o', label='Empirical CDF')
ax2.plot(x, cdf, 'r-', label='Fitted Log-Normal CDF', linewidth=2)
ax2.set_xlabel('Rainfall (mm/day)')
ax2.set_ylabel('Cumulative Probability')
ax2.legend(loc='lower right')
ax2.grid(True)

# Adjust layout and show the plot
plt.tight_layout()
plt.show()

# Fit Gumbel distribution to the data
params_gumbel = gumbel_r.fit(st)
loc_g, scale_g = params_gumbel[0], params_gumbel[1]
x = np.linspace(min(st), max(st), 100)
pdf_gumbel = gumbel_r.pdf(x, loc_g, scale_g)
cdf_gumbel = gumbel_r.cdf(x, loc_g, scale_g)
max_pdf = np.max(pdf_gumbel)
# Fit Log-Normal distribution to the data
params_lognorm = lognorm.fit(st, floc=0)
shape, loc, scale = params_lognorm
pdf_lognorm = lognorm.pdf(x, shape, loc, scale)
cdf_lognorm = lognorm.cdf(x, shape, loc, scale)
# Create a 2x2 subplot grid
fig = plt.figure(figsize=(10, 8))

# First subplot: Histogram with fitted Gumbel PDF
ax1 = plt.subplot2grid((2, 2), (0, 0))
ax1.hist(st, bins=5, density=True, color='skyblue', edgecolor='black', alpha=0.6, label='Data Histogram')
ax1.vlines(loc_g, 0, max_pdf, color='black', linewidth=2, label='loc')
ax1.plot(x, pdf_gumbel, 'r-', label=f'Fitted Gumbel\nloc={loc_g:.2f}, scale={scale_g:.2f}', linewidth=2)
ax1.set_xlabel('Rainfall (mm/day)')
ax1.set_ylabel('Density')
ax1.legend(loc='upper right')
ax1.grid(True)

# Second subplot: Empirical CDF with fitted Gumbel CDF
ax2 = plt.subplot2grid((2, 2), (0, 1))
ax2.plot(np.sort(st), np.arange(1, len(st) + 1) / len(st), 'o', label='Empirical CDF')
ax2.plot(x, cdf_gumbel, 'r-', label='Fitted Gumbel CDF', linewidth=2)
ax2.set_xlabel('Rainfall (mm/day)')
ax2.set_ylabel('Cumulative Probability')
ax2.legend(loc='lower right')
ax2.grid(True)

# Third subplot: Histogram with fitted Log-Normal PDF
ax3 = plt.subplot2grid((2, 2), (1, 0))
ax3.hist(st, bins=5, density=True, color='skyblue', edgecolor='black', alpha=0.6, label='Data Histogram')
ax3.plot(x, pdf_lognorm, 'r-', label=f'Fitted Log-Normal\nshape={shape:.2f}, scale={scale:.2f}', linewidth=2)
ax3.set_xlabel('Rainfall (mm/day)')
ax3.set_ylabel('Density')
ax3.legend(loc='upper right')
ax3.grid(True)

# Fourth subplot: Empirical CDF with fitted Log-Normal CDF
ax4 = plt.subplot2grid((2, 2), (1, 1))
ax4.plot(np.sort(st), np.arange(1, len(st) + 1) / len(st), 'o', label='Empirical CDF')
ax4.plot(x, cdf_lognorm, 'r-', label='Fitted Log-Normal CDF', linewidth=2)
ax4.set_xlabel('Rainfall (mm/day)')
ax4.set_ylabel('Cumulative Probability')
ax4.legend(loc='lower right')
ax4.grid(True)
plt.tight_layout()
plt.show()

# Perform a KS test for goodness of fit to the Gumbel distribution
ks_statistic_g, p_value_g = kstest(st, 'gumbel_r', args=(loc_g, scale_g))

# Perform a KS test for goodness of fit to the Log-Normal distribution
ks_statistic_ln, p_value_ln = kstest(st, 'lognorm', args=(shape, loc, scale))

# Print the results with 3 digits precision for both distributions
print(f"Gumbel Distribution:")
print(f"KS Statistic: {ks_statistic_g:.4f}")
print(f"P-value: {p_value_g:.4f}\n")

print(f"Log-Normal Distribution:")
print(f"KS Statistic: {ks_statistic_ln:.4f}")
print(f"P-value: {p_value_ln:.4f}")
print('From the p-values, it suggested that the Gumbel distribution fitted bettern than Log-normal distribution.')

Gumbel Distribution:
KS Statistic: 0.0932
P-value: 0.8051

Log-Normal Distribution:
KS Statistic: 0.1532
P-value: 0.2286
From the p-values, it suggested that the Gumbel distribution fitted bettern than Log-normal distribution.

	stno	466900	466910	466920	466930	466940	467571	467490	467530	467550	467650	467410	467350	467440	467590	467660	467620	467610	466990	467080
0	lon	121.449997	121.529999	121.510002	121.540001	121.739998	121.010002	120.680000	120.809998	120.959999	120.910004	120.199997	119.559998	120.320000	120.750000	121.150002	121.559998	121.370003	121.610001	121.760002
1	lat	25.160000	25.180000	25.040001	25.160000	25.129999	24.830000	24.150000	23.510000	23.490000	23.879999	22.990000	23.570000	22.570000	22.000000	22.750000	22.040001	23.100000	23.980000	24.760000
2	1979	218.500000	419.500031	157.000000	489.899963	154.000000	0.000000	127.499992	556.200012	266.400024	254.399994	120.500008	105.600006	146.399994	247.699997	186.000000	123.900009	222.200012	115.199997	181.000000
3	1980	103.100006	227.699997	223.000000	423.700012	399.599976	0.000000	221.199997	591.500000	347.400024	383.399963	63.100002	63.899998	65.899994	160.400009	91.799995	208.200012	128.900009	215.100006	243.300003
4	1981	328.500031	409.100006	321.500000	429.899994	179.199997	0.000000	233.100006	493.699982	333.000000	262.200012	247.600006	168.300003	281.500000	263.500031	206.399994	141.599991	267.800018	261.500000	193.999985

	466910
count	44.000000
mean	366.079770
std	172.266071
min	25.180000
25%	264.625000
50%	372.750000
75%	451.324959
max	882.300049

How to know my data is following which distribution?¶

Part 1. Get Data¶

For more information about the statios, please check the following site.¶

https://e-service.cwa.gov.tw/wdps/obs/state.htm ¶

Part 2. Simple statistics of the data¶

Part 3. Data Visualization¶

Part 4. Goodness-of-Fit Tests¶

How to know my data is following which distribution?¶

Part 1. Get Data¶

For more information about the statios, please check the following site.¶

https://e-service.cwa.gov.tw/wdps/obs/state.htm¶

Part 2. Simple statistics of the data¶

Part 3. Data Visualization¶

Part 4. Goodness-of-Fit Tests¶

https://e-service.cwa.gov.tw/wdps/obs/state.htm ¶