OEN Method. Reference: Harris & Cook, "The parent wind-speed distribution: Why Weibull?", http://www.sciencedirect.com/science/article/pii/S0167610514001056
Gaussian Mixture Models, http://scikit-learn.org/stable/modules/mixture.html
# Notebook environment setup: inline plotting, auto-reload of the project
# helper modules, and global matplotlib defaults used by every figure below.
%matplotlib inline
%load_ext autoreload
%autoreload 2
from import_file import *
from helpers.parallel_helper import *
load_libs()
# Snap axis limits to round numbers and remove the default data margins,
# so histograms start exactly at the axis.
plt.rcParams['axes.autolimit_mode'] = 'round_numbers'
plt.rcParams['axes.xmargin'] = 0.
plt.rcParams['axes.ymargin'] = 0.
# Draw visible edges on histogram/bar patches.
mpl.rcParams['patch.force_edgecolor'] = True
# file_path= './data/NCDC/jerez/dat.txt' # time shift
# file_path= './data/NCDC/almeria/dat.txt'
# Greece
# file_path= './data/NCDC/eleftherios_intl/dat.txt'
# file_path= './data/NCDC/elefsis/dat.txt' # bad dataset
# file_path= './data/NCDC/malaga/dat.txt'
# file_path= './data/NCDC/gibraltar/dat.txt' # bad fit
# Turkey
# file_path= './data/NCDC/turkey/konya/dat.txt'
# file_path= './data/NCDC/turkey/sivas/dat.txt' # bad dataset
# file_path= './data/NCDC/turkey/balikesir/dat.txt' # bad dataset
# file_path= './data/NCDC/turkey/bartin/dat.txt' # bad dataset
# Iran
# file_path= './data/NCDC/iran/chahbahar/dat.txt'
# file_path= './data/NCDC/iran/zabol/dat.txt' # Problematic data
# file_path= './data/NCDC/iran/torbat_heydarieh/dat.txt' # Unusable
# UAE
# file_path= './data/NCDC/al_maktoum/dat.txt'
# file_path= './data/NCDC/abu_dhabi_intl/dat.txt' # Time shift
# file_path= './data/NCDC/bateen/dat.txt' # Time shift
# file_path= './data/NCDC/buraimi/dat.txt' # not good dataset
# file_path= './data/NCDC/uk/marham/dat.txt'
# file_path= './data/NCDC/uk/tiree/dat.txt' # try 4
# file_path= './data/NCDC/uk/boscombe_down/dat.txt' # 4?, numpy bug
# file_path= './data/NCDC/uk/middle_wallop/dat.txt'
# file_path= './data/NCDC/uk/southhamption/dat.txt' # high 0, trend
# file_path= './data/NCDC/uk/bournemouth/dat.txt' # 4?
# file_path= "./data/NCDC/uk/weybourne/dat.txt"
# file_path= "./data/NCDC/uk/skye_lusa/dat.txt" #
# file_path= "./data/NCDC/uk/wattisham/dat.txt"
# file_path= "./data/NCDC/uk/south_uist_range/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/uk/holbeach/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/uk/cambridge/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/us/baltimore/dat.txt" # time too short
# file_path= "./data/NCDC/uk/bealach_na_ba/dat.txt" # time too short
# file_path= "./data/NCDC/uk/benbecula/dat.txt" # truncate (untruncate in m/s), 4?
# file_path, NUMBER_OF_GAUSSIAN = "./data/NCDC/europe/landsberg_lech/dat.txt", 4 # very good, can try 4
# Active dataset for this run (all alternatives above are kept, commented,
# with per-station review notes). Trailing note: this station has a unit
# shift, works with NUMBER_OF_GAUSSIAN = 2, and should be forced to knots.
file_path= './data/NCDC/europe/pau_pyrenees/dat.txt' # unit shift, 2; force using knot
# file_path= "./data/NCDC/europe/vatry/dat.txt" # double peak, initial speed (should be good with m/s), mixed report type
# file_path= "./data/NCDC/europe/neuburg/dat.txt"
# file_path= "./data/NCDC/europe/valladolid/dat.txt"
# file_path= "./data/NCDC/europe/laupheim/dat.txt" # double peak, 4; very good, trend
# file_path= "./data/NCDC/europe/avord/dat.txt" # try 4, initial speed (should be good with m/s)
# file_path= './data/NCDC/europe/ciampino/dat.txt' # try 4, bandwidth?
# file_path= "./data/NCDC/europe/holzdorf/dat.txt" # 2008 year
# file_path= "./data/NCDC/europe/huspel_aws/dat.txt" # integer, 4?
# file_path= "./data/NCDC/europe/barayas/dat.txt" # numpy problem
# file_path= './data/NCDC/europe/tenerife_sur/dat.txt' # some directions are blocked
# file_path= './data/NCDC/europe/nantes/dat.txt' # some dir R square / K-S differs big, unit detect fails
# file_path= "./data/NCDC/cn/shanghai/hongqiao_intl/dat.txt" # care for the sampling time
# file_path= "./data/NCDC/cn/shanghai/pudong/dat.txt"
# file_path= "./data/NCDC/cn/hefei_luogang/dat.txt" # few 0, trend, try 2
# file_path= "./data/NCDC/cn/nanjing_lukou/dat.txt"
# file_path= "./data/NCDC/cn/zhengzhou_xinzheng/dat.txt"
# file_path= "./data/NCDC/cn/tianjin/binhai/dat.txt" # few 0, trend, stationary speed, unstationary direction
# file_path= "./data/NCDC/cn/tianjin/tianjing/dat.txt" # 16 sectors
# file_path= "./data/NCDC/cn/shijiazhuang_zhengding/dat.txt"
# file_path= "./data/NCDC/cn/henan_gushi/dat.txt" # 16 sectors, fit not very good
# file_path= "./data/NCDC/cn/nanning_wuxu/dat.txt" # numpy priblem, unstationary speed
# file_path= './data/NCDC/cn/macau/dat.txt'
# file_path= "./data/NCDC/cn/hk_intl/dat.txt" # few 0
# file_path= './data/NCDC/southeast_asia/malaysia/mersing/dat.txt' # 2 mode, paper comparison
# file_path= './data/NCDC/southeast_asia/malaysia/penang/dat.txt'
# file_path= './data/NCDC/southeast_asia/malaysia/butterworth/dat.txt' # 2 mode
# file_path= "./data/NCDC/southeast_asia/malaysia/bsultan_mahmud/dat.txt" # stable
# file_path= "./data/NCDC/southeast_asia/malaysia/bsultan_ismail/dat.txt" #
# file_path= "./data/NCDC/southeast_asia/singapore/changi/dat.txt" # trend, no 0, questionary data
# file_path= "./data/NCDC/southeast_asia/singapore/paya_lebar/dat.txt" # questionary data
# file_path= "./data/NCDC/southeast_asia/singapore/seletar/dat.txt"
# file_path= "./data/NCDC/east_asia/cheongju_intl/dat.txt" # 2005-2009 may have problem, fit is good; numpy problem
# file_path= "./data/NCDC/east_asia/daegu_ab/dat.txt" # recent 5 year may have problem, but fit is generally good; numpy problem
# file_path= "./data/NCDC/oceania/auckland_intl/dat.txt" # Good data, Weird KDE shape, might be blocked?
# file_path= "./data/NCDC/oceania/brisbane_archerfield/dat.txt" # high 0, few data
# file_path= "./data/NCDC/oceania/narrandera/dat.txt" # high 0, few data
# file_path= "./data/NCDC/oceania/canberra/dat.txt" # high 0, numpy problem
# file_path= './data/NCDC/us/boston_16nm/dat.txt' # Offshore
# file_path = './data/asos/denver/hr_avg.csv'
# file_path = './data/asos/bismarck_ND/hr_avg.csv' # try 4
# file_path = './data/asos/aberdeen_SD/hr_avg.csv' # only to 2012, good fit, try 2
# file_path = './data/asos/minneapolis/hr_avg.csv'
# file_path = './data/asos/lincoln_NE/hr_avg.csv'
# file_path = './data/asos/des_moines_IA/hr_avg.csv'
# file_path = './data/asos/springfield_IL/hr_avg.csv' # good fit
# file_path = './data/asos/topeka/hr_avg.csv' # High 0
# file_path = './data/NDAWN/baker/hr_avg.csv' # 4 might be better
# file_path = './data/NDAWN/dickinson/hr_avg.csv'
# file_path = './data/NDAWN/rugby/hr_avg.csv'
# file_path = './data/NDAWN/bowman/hr_avg.csv'
# file_path = './data/NDAWN/grand_forks/hr_avg.csv'
# file_path = './data/NDAWN/williston/hr_avg.csv'
# file_path = './data/NDAWN/jamestown/hr_avg.csv'
if "cn_database" in file_path:
df = read_cn_database(file_path)
elif 'NCDC' in file_path:
df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
df.rename(columns={'Date':'date','Dir':'dir','Spd':'speed','Type':'type','I.1':'wind_type'}, inplace=True)
df = df[['date','HrMn','type','dir','speed','wind_type' ]]
df.dropna(subset=['dir','speed'], inplace=True)
integer_data = True
elif 'NDAWN' in file_path:
df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
df['type']='default'
df['wind_type']='default'
df = df.dropna()
integer_data = False
knot_unit = False
else:
# ASOS
df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
df['type']='default'
df['wind_type']='default'
df = df.dropna()
integer_data = False
knot_unit = True
df
df['time']=pd.to_datetime(df["date"].astype(str).map(str) + df["HrMn"], format='%Y%m%d%H%M')
df.set_index(['time'], inplace=True)
df['HrMn']=df['HrMn'].astype(int)
df = df.query("(dir <= 999) & (speed < 100) & \
(date >= 19700000) & (date < 20170000) ")
plot_speed_and_angle_distribution(df.speed, df.dir)
# Dir [10,360]=> [0,350]
df['dir'] = df['dir'].apply(lambda x: x%360 if x < 999 else x)
df['month'] = df['date']%10000//100
# Convert Windrose coordianates to Polar Cooridinates
df['dir_windrose'] = df['dir']
df['dir'] = df['dir'].apply(lambda x: (90 - x)%360 if x < 999 else x)
df.describe()
df.plot(y='speed',legend=True,figsize=(20,5))
df['decimal'] = df.speed % 1
df.decimal.hist(alpha=0.5, label='m/s', figsize=(4, 3))
if 'knot_unit' not in globals():
# knot_unit = True if len(df.query('decimal >= 0.2')) / len(df) > 0.3 else False
knot_unit=True
if knot_unit:
df['speed'] = df['speed'] * 1.943845
df['decimal'] = df.speed % 1
df.decimal.hist(alpha=0.5, label='knot')
# need more elaboration, some is not near an integer
df['speed'] = df['speed'].apply(lambda x: int(round(x)))
plt_configure(xlabel='Decimal', ylabel='Frequency', legend={'loc': 'best'}, title='Decimal Distribution')
df.drop(['decimal'], 1,inplace=True)
print(knot_unit)
dir_unit_text = ' (degree)'
if knot_unit == True:
speed_unit_text = ' (knot)'
else:
speed_unit_text = ' (m/s)'
# Keep only rows of the most frequent report type (sampled from year >= 2000),
# since mixed report types can use different measurement conventions.
sample_type = df.query('date > 20000000')['type']
sample_type.value_counts().plot(
    kind = 'bar', title = 'Report Types Comprisement', figsize=(4,3))
# Use idxmax() to get the most frequent report-type LABEL. The original
# Series.argmax returns a positional integer in modern pandas, which would
# make the label-based query below silently match nothing.
report_type_most_used = sample_type.value_counts().idxmax()
df = df.query("type==@report_type_most_used")
# Midpoint year of the record, used throughout for before/after comparisons.
MID_YEAR = (min(df.date)//10000+max(df.date)//10000)//2
# Compare the sampling-minute distribution overall vs. the recent half.
df['HrMn'].value_counts().sort_index().plot(kind='bar', alpha=0.5,label='Overall')
df.query('date > @MID_YEAR * 10000')['HrMn'].value_counts().sort_index().plot(
    kind='bar', alpha=0.5, label='> %s' % MID_YEAR )
plt_configure(xlabel='Sampling Time', ylabel='Frequency', legend={'loc':'best'}, figsize=(8, 4),
              title = 'Sampling Time Distribution, Overall and > %s ' % MID_YEAR)
# Keep only the dominant sampling minute (e.g. observations taken at HH:00),
# chosen from minutes with > 2000 recent occurrences.
df['sample_time'] = df.HrMn % 100
sample_time = df.query('date > 20000000')['sample_time']
sample_times = sample_time.value_counts()[sample_time.value_counts() > 2000]
sample_times = sample_times.index.tolist()
# df = df.query("sample_time in @sample_times")
df = df.query("sample_time == @sample_times[0]")
df.drop(['sample_time'], 1,inplace=True)
print(sample_times)
df['HrMn'].value_counts().sort_index().plot(kind='bar', alpha=0.5, figsize=(10, 4))
# Integer-degree sources should only contain directions at multiples of 10
# (or the 999 missing sentinel); show and drop the violators.
if integer_data:
    display(df.query("(dir % 10 >= 0.1) & (dir != 999)"))
    df = df.query('(dir % 10 <= 0.1) | (dir == 999)')
# Sudden-increase (spike) detection: a record whose speed jumps up sharply
# relative to BOTH neighbours is treated as a sensor glitch.
df['incre'] = df.speed.diff(1)
df['incre'].fillna(0, inplace=True)
df['incre_reverse'] = df.speed.diff(-1)
df['incre_reverse'].fillna(0, inplace=True)
display(df.sort_values(by='speed',ascending=False).head(10))
df['incre'].plot(kind='hist', bins=arange(-15, 15), legend=True, figsize=(8, 3))
incre_threshold = 20 if knot_unit else 10
print('sudden increase number', len(df.query('(incre > @incre_threshold )&(incre_reverse > @incre_threshold )')))
df = df.query('(incre < @incre_threshold )|(incre_reverse < @incre_threshold )')
# Check the max speed after spike removal.
display(df.sort_values(by='speed',ascending=False).head(10))
df.drop(['incre', 'incre_reverse'], 1, inplace=True)
# Stations with an implausibly large calm (zero-speed) fraction get their
# zeros deleted before fitting.
with_too_many_zero, null_wind_frequency = is_with_too_many_zero(df.query("(date >= 20050000)"))
delete_zero = with_too_many_zero
if delete_zero:
    df = df.query('(speed > 0)')
print(delete_zero, null_wind_frequency)
For some datasets, the 16 direction sectors are not recorded properly —
e.g. the recorded sectors are [0, 20, 30, 50, ...] — so the angles need to be redistributed into 22.5° sectors.
# Determine the effective direction resolution of the station and prepare
# the direction histogram bins. (Indentation restored from notebook export.)
display(df['dir'].value_counts().sort_index())
# Directions that actually occur (>30 observations, excluding the 999 sentinel).
effective_column = df.query('dir < 999')['dir'].value_counts()[df['dir'].value_counts() > 30].sort_index()
if integer_data:
    # e.g. 16 recorded sectors -> 22.5 degrees per sector.
    SECTOR_LENGTH = 360/len(effective_column)
else:
    SECTOR_LENGTH = 10
print(len(effective_column), SECTOR_LENGTH)
df.query('dir == 999')['speed'].value_counts()
# Project helpers: snap misaligned sectors, then spread the 999 (variable
# direction) records across sectors.
df=realign_direction(df, effective_column)
df=fill_direction_999(df, SECTOR_LENGTH)
DIR_REDISTRIBUTE = 'even'
if DIR_REDISTRIBUTE == 'even':
    # Bins centred on the recorded directions (…, -5..5, 5..15, …).
    DIR_BIN = arange(-5, 360, 10)
elif DIR_REDISTRIBUTE == 'round_up':
    DIR_BIN = arange(0, 360+10, 10)
# Comparison between mid_year, looking for:
# 1. Odd Even Bias
# 2. Time Shift of Wind Speed Distribution
# Overlay the speed histograms of the first and second halves of the record.
bins = arange(0, df.speed.max() + 1)
df.query('date < @MID_YEAR * 10000')['speed'].plot(
kind='hist', alpha=0.5,bins=bins, label='< %s' % MID_YEAR)
df.query('date > @MID_YEAR * 10000')['speed'].plot(
kind='hist', alpha=0.5,bins=bins, label='> %s' % MID_YEAR)
plt.suptitle('Speed Comparison between year < %s, > %s ' % (MID_YEAR, MID_YEAR), fontsize = 14)
plt_configure(xlabel='Speed', ylabel='Frequency', legend=True, figsize=(8, 3))
# Same comparison for direction.
df.query('date < @MID_YEAR * 10000')['dir'].plot(
kind='hist', alpha=0.5,bins=DIR_BIN, label='< %s' % MID_YEAR)
df.query('date > @MID_YEAR * 10000')['dir'].plot(
kind='hist', alpha=0.5,bins=DIR_BIN, label='> %s' % MID_YEAR)
plt.suptitle('Dir Comparison between year < %s, and > %s ' % (MID_YEAR, MID_YEAR), fontsize = 14)
plt_configure(xlabel='Dir', ylabel='Frequency', legend={'loc':'best'}, figsize=(8, 3), tight='x')
# Inspect the time shift of speed and degree distribution, and odd-even bias
check_time_shift(df, speed_unit_text=speed_unit_text, dir_unit_text=dir_unit_text)
# Annual ('A') and monthly ('M') mean-speed trends.
df.resample('A').mean().plot(y='speed')
plt.gca().set_ylim(bottom=0)
df.resample('M').mean().plot(y='speed', figsize=(20,4))
plt.gca().set_ylim(bottom=0)
# Drop records with missing direction before further analysis.
display(df[df['dir'].isnull()])
df.dropna(subset=['dir'], inplace=True)
# For each variable, overlay each single year's histogram (if it has enough
# data) on the all-years histogram, to spot drifting years visually.
# (Indentation restored from notebook export.)
for column in ['speed', 'dir']:
    if column == 'speed':
        bins = arange(0, df[column].max()+1, 1)
    else:
        bins = arange(0, 361, 10)
    # Common y-limit so the per-year panels are comparable.
    den, _ = np.histogram(df[column], bins=bins, density=True)
    y_top=max(den)*1.2
    for year in arange(1980, 2016):
        end_year = year
        sub_df = df[str(year):str(end_year)]
        if len(sub_df) > 5000:
            plt.figure()
            df[column].hist(bins=bins, alpha=0.3, normed=True)
            sub_df[column].hist(bins=bins, alpha=0.5, figsize=(3,1.5), normed=True)
            plt.gca().set_ylim(top=y_top)
            plt_configure(title=str(year))
align_figures()
# Quantify year-to-year stationarity: R-square of each 3-year window's
# histogram against the all-years histogram, plotted over time.
# (Indentation restored from notebook export.)
for column in ['speed', 'dir']:
    if column == 'speed':
        bins = arange(0, df[column].max()+1, 1)
    else:
        bins = arange(0, 361, 10)
    density_all, _ = np.histogram(df[column], bins=bins, density=True)
    df[column].hist(bins=bins, figsize=(5,3))
    R_squares = []
    years = []
    for year in arange(1980, 2016):
        # 3-year window centred on `year`.
        start_year, end_year = year-1, year+1
        sub_df = df[str(start_year):str(end_year)]
        if len(sub_df) > 5000:
            density, _ = np.histogram(sub_df[column], bins=bins, density=True)
            y_mean = np.mean(density_all)
            SS_tot = np.sum(np.power(density_all - y_mean, 2))
            SS_res = np.sum(np.power(density_all - density, 2))
            R_square = 1 - SS_res / SS_tot
            R_squares.append(R_square)
            years.append(year)
    plt.figure()
    plot(years, R_squares)
    # Clamp the lower y-limit to [0, 0.85] so the trend is readable.
    ylim = max(min(plt.gca().get_ylim()[0],0.85),0)
    plt.gca().set_ylim(bottom=ylim, top=1)
    plt_configure(figsize=(5,3))
align_figures()
e.g. spread Dir 50 uniformly over 45–55, to make the KDE result better
# For integer-recorded data, jitter the discrete direction and speed values
# within their recording bins so the KDE is not dominated by grid artifacts.
# (Indentation restored from notebook export.)
if integer_data:
    df = randomize_angle(df, DIR_REDISTRIBUTE, SECTOR_LENGTH)
if integer_data:
    # Zero records were deleted -> redistribute speeds downward into the
    # emptied bin; otherwise upward.
    if delete_zero:
        redistribute_method = 'down'
    else:
        redistribute_method = 'up'
    df, speed_redistribution_info = randomize_speed(df, redistribute_method)
# Cook orientation
# df['dir']= (df['dir'] + 180)%360
# There might be a small dot in the centre, which is due to too many zero (more than 1 speed) in center
# Scatter plot in matplot has performance issue, the speed is very slow
# Cartesian wind components from polar speed/direction (dir is in degrees).
df['x'] = df['speed'] * cos(df['dir'] * pi / 180.0)
df['y'] = df['speed'] * sin(df['dir'] * pi / 180.0)
## Summary of the data selection
print('Knot unit?', knot_unit)
print('Report type used:', report_type_most_used)
print('Sampling time used:', sample_times)
if 'speed_redistribution_info' in globals():
    print('Speed redistribution info:', speed_redistribution_info )
# Keep the full record for across-year comparisons, then restrict the
# working frame to 2010-2014 for the fitting below.
df_all_years = df # for later across-year comparison
df = df_all_years.query('(date >= 20100000) & (date < 20150000)')
# df = df.query('(HrMn == 0) and (speed >= 0.5) and (date%10000 > 900) and (date%10000 < 1000)' )
df.describe()
df.plot(y='speed',legend=True,figsize=(20,5))
# Accumulation by month
df.resample('M').count().plot(y='date', kind='bar',figsize=(20,4))
# Windrose of the selected period; in windrose coordinates 90 degrees is east.
ax = WindroseAxes.from_ax()
viridis = plt.get_cmap('viridis')
ax.bar(df.dir_windrose, df.speed, normed=True, opening=0.8, edgecolor='white', nsector=36, cmap=viridis)
ax.set_legend()
# Very large selections are down-sampled to 500k rows to keep later plots
# and fits tractable; compare the direction histogram before/after to check
# the resampling is unbiased. (Indentation restored from notebook export.)
if len(df) > 1000000:
    bins=arange(0,362)
    df['dir'].hist(bins=bins, normed=True,alpha=0.5,label='min')
    df = df_all_years.sample(n=500000, replace=True)
    df['dir'].hist(bins=bins, normed=True,alpha=0.5,label='min resmapled')
    plt_configure(legend=True, figsize=(20,4))
# Fit a Weibull distribution and the empirical CDF to the overall speed data.
x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(df.speed)
# 1. Histogram comparison
fig = plt.figure()
df['speed'].hist(bins=arange(0, df.speed.max()), alpha=0.5, label='Data', normed=True)
plot(x, y_weibull, '-', color='black',label='Weibull')
plt_configure(figsize=(4,3),xlabel='V',ylabel='PDF', legend=True)
# 2. CDF comparison on Weibull axes: a Weibull distribution is a straight
# line in ln(V) vs ln(-ln(1-P)).
fig = plt.figure()
plot(log(x), log(-log(1-y_ecdf)),'o', label='ECDF')
plot(log(x), log(-log(1-y_cdf_weibull)),'-', label='Weibull')
# NOTE(review): the ylabel string is missing its closing ')'.
plt_configure(xlabel="ln(V)", ylabel="ln(-ln(1-P)",legend={'loc':'best'}, figsize=(4,3))
align_figures()
# Scatter of the Cartesian components (alpha/s tuned for the large N).
df.plot(kind='scatter', x='x', y='y', alpha=0.05, s=2)
plt.gca().set_aspect('equal')
plt_configure(figsize=(3.2,3.2),xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
# Re-bin directions for per-sector inspection: 16-sector stations keep 22.5
# degree sectors, others use 10. (Indentation restored from notebook export.)
if len(effective_column) == 16:
    rebinned_angle = 22.5
else:
    rebinned_angle = 10
original_incre, incre = SECTOR_LENGTH, rebinned_angle
# Offset the first sector so bins stay centred on recorded directions.
start, end = -original_incre/2 + incre/2, 360
max_speed = df.speed.max()
# Shared axis limits so all sector panels are comparable.
max_count = max_count_for_angles(df, start, end, incre)
plot_range = [0, max_speed, 0, max_count*1.05]
for angle in arange(start, end, incre):
    start_angle, end_angle = angle-incre/2, angle+incre/2
    # Select data from observation
    sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)
    fig = plt.figure()
    sub_df['speed'].hist(bins=arange(0, max_speed), alpha=0.5, label='Data')
    title ='%s (%s - %s), %s' % (angle, start_angle, end_angle, len(sub_df))
    plt.axis(plot_range)
    plt_configure(figsize=(3,1.5), title=title)
align_figures()
# One windrose per month (or per month_incre-month group) to inspect
# seasonality; calm records (speed < 1) are excluded.
# (Indentation restored from notebook export.)
month_incre = 1
current_df = df.query('speed>=1')
for month in arange(1, 12+month_incre, month_incre):
    end_month = month+month_incre
    sub_df = current_df.query('(month >= @month) and (month < @end_month)')
    if len(sub_df) > 0:
        if month_incre == 1:
            title = 'Month: %s' % (month)
        else:
            title = 'Month: %s - %s ' % (month, end_month-1)
        ax = WindroseAxes.from_ax()
        ax.bar(sub_df.dir_windrose, sub_df.speed, normed=True, opening=0.8, edgecolor='white', nsector=36, cmap=plt.get_cmap('viridis'))
        plt_configure(figsize=(3,3), title=title)
align_figures()
# Constants and grids for the KDE / GMM fitting below.
# SPEED_SET: the (x, y) sample matrix fed to KDE and GMM fitting.
SPEED_SET = array(list(zip(df.x, df.y)))
if 'NUMBER_OF_GAUSSIAN' not in globals():
    # Default component count unless a per-station value was set with file_path.
    NUMBER_OF_GAUSSIAN = 3
FIT_METHOD = 'square_error'
# Fallback KDE bandwidth if the grid search below is skipped.
DEFAULT_BANDWDITH = 1.5 if knot_unit else 0.7
fig_list = []
# Fit on an integer grid covering the central 95% of the speeds.
fit_limit = ceil(df['speed'].quantile(.95))
fitting_axis_range = arange(-fit_limit, fit_limit+1, 1)
print(fitting_axis_range)
# All (x, y) grid points the densities are evaluated on.
FITTING_RANGE = [[i, j] for i in fitting_axis_range for j in fitting_axis_range]
plot_limit = ceil(df['speed'].quantile(.95))
PLOT_AXIS_RANGE = arange(-plot_limit, plot_limit+1, 1)
sample = SPEED_SET
KDE_KERNEL = 'gaussian'
# KDE_KERNEL, bandwidth = 'tophat', 1
%%time
from sklearn.grid_search import GridSearchCV
# from sklearn.model_selection import GridSearchCV ## too slow
# The bandwidth value sometimes would be too radical
if knot_unit:
bandwidth_range = arange(0.7,2,0.2)
else:
bandwidth_range = arange(0.4,1,0.1)
# Grid search is unable to deal with too many data (a long time is needed)
if len(sample) > 50000:
df_resample=df.sample(n=50000, replace=True)
bandwidth_search_sample = array(list(zip(df_resample.x, df_resample.y)))
else:
bandwidth_search_sample = sample
grid = GridSearchCV(neighbors.KernelDensity(kernel = KDE_KERNEL),
{'bandwidth': bandwidth_range}, n_jobs=-1, cv=4)
grid.fit(bandwidth_search_sample)
bandwidth = grid.best_params_['bandwidth']
print(bandwidth)
# Fit the KDE; fall back to the default bandwidth if the grid-search cell
# above was not run. (Indentation restored from notebook export.)
if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWDITH
kde = neighbors.KernelDensity(bandwidth=bandwidth, kernel = KDE_KERNEL).fit(sample)
points = FITTING_RANGE
# very slow if the dataset is too large, e.g. 100,000
# kde.score_samples returns log-probability, so exponentiate to get density.
kde_result = exp(kde.score_samples(points))
print('bandwidth:', bandwidth, len(kde_result))
print(kde_result[:5])
# Plot the joint PDF estimated by the KDE, in 3D and 2D, next to the raw
# 2D histogram. (Indentation restored from notebook export.)
X = Y = PLOT_AXIS_RANGE
# Can't work if pass as generate_Z_from_X_Y(X,Y, exp(kde.score_samples())), need to use lambda
# see http://stackoverflow.com/questions/21035437/passing-a-function-as-an-argument-in-python
kde_Z = generate_Z_from_X_Y(X,Y, lambda coords: exp(kde.score_samples(coords)))
# Shared colorbar range for the comparison plots further below.
colorbar_lim = 0, kde_Z.max()
plot_3d_prob_density(X,Y,kde_Z)
fig_kde,ax1 = plt.subplots(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, ax=ax1)
with sns.axes_style({'axes.grid' : False}):
    from matplotlib import ticker
    fig_hist,ax2 = plt.subplots(figsize=(3.5,2.5))
    _,_,_,image = ax2.hist2d(df.x, df.y, bins=PLOT_AXIS_RANGE, cmap='viridis',)
    ax2.set_aspect('equal')
    cb = plt.colorbar(image)
    tick_locator = ticker.MaxNLocator(nbins=6)
    cb.locator = tick_locator
    cb.update_ticks()
    plt_configure(ax=ax2, xlabel='x'+speed_unit_text,ylabel='y'+speed_unit_text)
align_figures()
# Discrete CDF derived from the KDE PDF (project helper).
kde_cdf = cdf_from_pdf(kde_result)
# Fit a Gaussian Mixture Model with EM as the starting point for the
# constrained optimisation further below.
sample= SPEED_SET
clf = mixture.GaussianMixture(n_components=NUMBER_OF_GAUSSIAN, covariance_type='full')
clf.fit(sample)
print(clf.converged_)
# Convert the sklearn estimator into the project's flat parameter format.
gmm_em_result = read_gmm_em_result(clf)
pretty_print_gmm(gmm_em_result)
fig,ax = plt.subplots(figsize=(3.5,3.5))
plot_gmm_ellipses(gmm_em_result, ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
X = Y = PLOT_AXIS_RANGE
# score_samples returns log-density; exponentiate on the plotting grid.
pdf_Z = generate_Z_from_X_Y(X,Y, lambda coords: exp(clf.score_samples(coords)))
def residule_between_kde_and_gmm(points):
    """Pointwise difference between the KDE density and the EM-fitted GMM
    density, evaluated at the given (x, y) sample points."""
    return exp(kde.score_samples(points)) - exp(clf.score_samples(points))
# Residual surface (KDE minus EM-GMM) and side-by-side density plots.
residual_Z = generate_Z_from_X_Y(X,Y, residule_between_kde_and_gmm)
plot_3d_prob_density(X,Y,pdf_Z)
plot_3d_prob_density(X,Y,residual_Z)
align_figures()
# 2D comparisons share colorbar_lim so KDE and GMM panels are comparable.
fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar_lim=colorbar_lim)
fig_em = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,pdf_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar_lim=colorbar_lim)
fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,residual_Z,
xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
align_figures()
# Goodness-of-fit of the EM-fitted GMM against the KDE reference.
points = FITTING_RANGE
gmm_pdf_result = exp(clf.score_samples(points))
gof_df(gmm_pdf_result, kde_result)
# Refine the GMM by directly minimising the discrepancy to the KDE density
# (FIT_METHOD, e.g. square error), starting from the EM solution.
sample = SPEED_SET
points = FITTING_RANGE
max_speed = df.speed.max()
print(FIT_METHOD)
# Initial guess: the flat parameter array from GMM/EM.
# GMM format: weight, meanx, meany, sigx, sigy, rho
x0 = gmm_em_result
cons = [
# sum of every 6th element, which is the fraction of each gaussian
# (component weights must sum to 1)
{'type': 'eq', 'fun': lambda x: sum(x[::6]) - 1},
# # limit the width/height ratio of ellipse, optional
# {'type': 'ineq', 'fun': lambda x: width_height_ratios_set(x) - 1/3},
# {'type': 'ineq', 'fun': lambda x: 3 - width_height_ratios_set(x)},
]
# Per-component bounds: weight in (0, 0.99), means within the fit grid,
# sigmas non-negative, correlation rho in (-0.99, 0.99).
# NOTE(review): 'bonds' is a misspelling of 'bounds' (local to this cell).
bonds = [(0., 0.99),(-fit_limit, fit_limit),(-fit_limit, fit_limit),
(0., fit_limit),(0., fit_limit),(-0.99, 0.99)]*(len(x0)//6)
result = sp.optimize.minimize(
lambda x0: GMM_fit_score(x0, kde_result, points, FIT_METHOD),
x0,
bounds = bonds,
constraints=cons,
tol = 0.000000000001,
options = {"maxiter": 500})
result
# Re-group the optimised flat array into per-component parameters and build
# a callable mixture PDF used by all the comparisons below.
gmm = group_gmm_param_from_gmm_param_array(result.x, sort_group = True)
mixed_model_pdf = generate_gmm_pdf_from_grouped_gmm_param(gmm)
gmm_pdf_result = mixed_model_pdf(points)
pretty_print_gmm(gmm)
fig_gmm, ax = plt.subplots(figsize=(3.5,3.5))
plot_gmm_ellipses(gmm, ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
gof_df(gmm_pdf_result, kde_result)
# Density and residual plots for the OPTIMISED GMM (same layout as the EM
# version above). (Indentation restored from notebook export.)
X = Y = PLOT_AXIS_RANGE
pdf_Z = generate_Z_from_X_Y(X,Y, mixed_model_pdf)# passing a function as an argument
def residule_between_kde_and_gmm(points):
    """Pointwise difference between the KDE density and the optimised GMM
    density (rebinds the name used by the EM comparison above)."""
    kde_vals = exp(kde.score_samples(points))
    gmm_vals = mixed_model_pdf(points)
    return kde_vals - gmm_vals
residual_Z = generate_Z_from_X_Y(X,Y, residule_between_kde_and_gmm)
plot_3d_prob_density(X,Y,pdf_Z)
plot_3d_prob_density(X,Y,residual_Z)
align_figures()
fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
fig_gmm = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,pdf_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,residual_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
align_figures()
def f(V,theta):
    """Joint PDF of (V, theta) in polar form: the Cartesian mixture density
    evaluated at (V cos(theta), V sin(theta)) times the Jacobian V.
    Used as the integrand for the nquad integrations below."""
    return V * mixed_model_pdf([[V*cos(theta), V*sin(theta)]])
# Compare the empirical, Weibull, and GMM speed CDFs on Weibull axes.
x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(df.speed)
# 3. GMM distribution: P(V <= x) by integrating the polar joint PDF over
# all directions; nquad returns (value, error), keep the values.
y_ = [integrate.nquad(f, [[0, x_val],[0, 2*pi]]) for x_val in x]
y_cdf_gmm = array(list(zip(*y_))[0])
plot(log(x), log(-log(1-y_ecdf)),'o', label = 'Empirical')
plot(log(x), log(-log(1-y_cdf_weibull)),'--', label = 'Weibull')
plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black', label = 'GMM')
plt_configure(xlabel='ln(V)',ylabel='ln(-ln(1-P))',legend={'loc':'best'})
# Calculate Speed Distribution
# 1. GMM Model: marginal speed PDF by integrating the polar joint PDF over a
# narrow speed band (width 0.02) and all directions, scaled to counts.
x = arange(0, max_speed, 0.5)
y_ =[integrate.nquad(f, [[x_-0.01, x_+0.01],[0, 2*pi]]) for x_ in x]
y_gmm = array(list(zip(*y_))[0])*len(df.speed)/0.02
# 2. Weibull marginal PDF from the fitted parameters.
y_weibul = sp.stats.weibull_min.pdf(x, *weibull_params)
# Overlay both model PDFs (as expected counts) on the data histogram.
df['speed'].hist(bins=arange(0, df.speed.max()), alpha=0.5, label='Data')
plot(x, y_gmm,'-', color='black', label='GMM')
plot(x, y_weibul*len(df.speed), '--', color='black', label='Weibull')
print('Speed Distribution Comparison')
plt_configure(xlabel='Speed'+speed_unit_text,
ylabel='Frequency',legend=True, figsize=(4, 2))
plt.gca().set_ylim(bottom = 0)
plt.tight_layout()
plt.locator_params(axis='y', nbins=5)
# Calculate Angle Distribution
# Marginal direction PDF from the GMM: integrate the polar joint PDF over all
# speeds within each 10-degree (pi/18 rad) direction band, scaled to counts.
x = linspace(0,2*pi, num=36+1)
y_ =[integrate.nquad(f, [[0, inf],[x_-pi/36, x_+pi/36]]) for x_ in x]
y = array(list(zip(*y_))[0])*len(df['dir'])
df['dir'].hist(bins=DIR_BIN, alpha=0.5, label='Data')
# Convert the radian grid to degrees for plotting against the histogram.
plot(x/pi*180, y,'-', color='black', label='GMM')
title='Direction Distribution Comparison'
plt_configure(xlabel='Direction'+dir_unit_text, ylabel='Frequency',
legend={'loc': 'best'} ,tight='xtight',figsize = (4,2))
plt.tight_layout()
dir_fig = plt.gcf()
print(title)
# %%time
# Sector-wise model-vs-data comparison, parallelised across sectors.
incre = max(SECTOR_LENGTH, 10)
density_collection=Parallel(n_jobs=-1)(delayed(direction_compare)(gmm, df, angle, incre)
for angle in arange(0, 360, incre))
# This R square is computed as in the paper:
# "Comparison of bivariate distribution construction approaches for
# analysing wind speed and direction data"
# http://onlinelibrary.wiley.com/doi/10.1002/we.400/full
print(true_R_square(density_collection))
# Calculate Speed Distribution
def model_data_comparison(df, original_incre = 10, incre = 10):
    """Compare the GMM and Weibull fits to observed speeds, sector by sector.

    For each direction sector of width ``incre`` degrees, plot the speed
    histogram with both fitted PDFs and the ECDF with both fitted CDFs,
    and accumulate goodness-of-fit statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Wind records with 'speed' and 'dir' columns.
    original_incre : float
        Sector width (degrees) of the original binning; used to offset the
        start angle so rebinned sectors stay centred on the raw bins.
    incre : float
        Sector width (degrees) used for this comparison.

    Returns
    -------
    max_diff_array : list of lists
        Per sector: [data size, angle, max CDF diff (GMM), speed at that
        diff, max CDF diff (Weibull), speed at that diff, R^2 (GMM),
        R^2 (Weibull)].
    curve_collection : list of dict
        Per-sector fitted curves (x grid, PDFs, CDFs, ECDF, sector weight).

    NOTE(review): relies on module-level globals, in particular the fitted
    joint pdf ``f`` and helpers such as ``select_df_by_angle`` — confirm
    they are defined before calling.
    """
    start, end = -original_incre/2 + incre/2, 360
    max_diff_array = []
    curve_collection = []
    max_speed = df.speed.max()
    # Find a max count for plotting histograms so all sectors share a y-scale
    max_count = max_count_for_angles(df, start, end, incre)
    plot_range = [0, max_speed, 0, max_count*1.05]
    for angle in arange(start, end, incre):
        angle_radian, incre_radian = radians(angle), radians(incre)
        start_angle, end_angle = angle-incre/2, angle+incre/2
        # Select data from observation
        sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)
        data_size = len(sub_df.speed)
        # 1. Get Weibull and ECDF
        x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(sub_df.speed)
        # 2. Get GMM PDF, CDF conditioned on this direction sector
        _, y_gmm, y_cdf_gmm, direction_prob = gmm_integration_in_direction(f, angle_radian-incre_radian/2, angle_radian+incre_radian/2, x)
        # 3. Make Plots
        fig = plt.figure(figsize=(10,1.9))
        # 3.1. Frequency Comparison
        ax1 = fig.add_subplot(1,3,1)
        sub_df['speed'].hist(bins=arange(0, sub_max_speed), alpha=0.5, label='Data')
        plot(x, y_gmm*data_size,'-', color='black', label='GMM')
        plot(x, y_weibull*data_size, '--', color='black',label='Weibull')
        plt_configure(xlabel = "V", ylabel='Frequency', legend=True)
        plt.axis(plot_range)
        # 3.2. CDF Comparison
        ax2 = fig.add_subplot(1,3,2)
        plot(x, y_ecdf,'o', alpha=0.8, label='Data')
        plot(x, y_cdf_gmm,'-', color='black',label='GMM')
        plot(x, y_cdf_weibull,'--', color='black',label='Weibull')
        plt.gca().set_xlim(right = max_speed)
        plt_configure(xlabel = "V", ylabel='P', legend=True)
        # Expected per-bin densities for R^2: the GMM is integrated over
        # unit-speed bins and the sector, then normalised by the sector
        # probability so it is conditional on this direction sector.
        bins = arange(0, sub_df.speed.max()+1)
        # `normed` was removed from numpy.histogram; density=True is equivalent.
        density, _ = np.histogram(sub_df['speed'], bins=bins, density=True)
        density_expected_ =[integrate.nquad(f, [[x_, x_+1],[angle_radian-incre_radian/2, angle_radian+incre_radian/2]])
                            for x_ in bins[:-1]]
        density_expected_gmm = array(list(zip(*density_expected_ ))[0])/direction_prob
        R_square_gmm = sector_r_square(density, density_expected_gmm)
        density_expected_weibull = sp.stats.weibull_min.cdf(bins[1:], *weibull_params) - sp.stats.weibull_min.cdf(bins[:-1], *weibull_params)
        R_square_weibull = sector_r_square(density, density_expected_weibull)
        # Kolmogorov-Smirnov style statistic: max |ECDF - model CDF|.
        diff, diff_weibull= np.abs(y_ecdf - y_cdf_gmm), np.abs(y_ecdf - y_cdf_weibull)
        max_diff_array.append([len(sub_df), angle, diff.max(), x[diff.argmax()],
                               diff_weibull.max(), x[diff_weibull.argmax()], R_square_gmm, R_square_weibull])
        curves = {'angle': angle, 'data_size': data_size, 'weight': direction_prob,
                  'x': x, 'gmm_pdf': y_gmm, 'gmm_cdf': y_cdf_gmm,
                  'weibull_pdf': y_weibull, 'weibull_cdf': y_cdf_weibull, 'ecdf': y_ecdf}
        curve_collection.append(curves)
        plt.tight_layout()
        plt.show()
        print('%s (%s - %s) degree' % (angle, start_angle, end_angle))
        print('data size:', len(sub_df), 'weight', len(sub_df)/len(df))
        print('GMM', 'Weibull')
        print('R square', R_square_gmm, R_square_weibull)
        print('max diff:', diff.max(), diff_weibull.max(), 'speed value:', x[diff.argmax()], x[diff_weibull.argmax()], 'y gmm', y_cdf_gmm[diff.argmax()])
        print(' ')
    return max_diff_array, curve_collection
%%time
if len(effective_column) == 16:
rebinned_angle = 22.5
else:
rebinned_angle = 20
max_diff_array, curve_collection = model_data_comparison(df, SECTOR_LENGTH, rebinned_angle)
diff_df = pd.DataFrame(max_diff_array,columns=['datasize','direction', 'gmm', 'speed_gmm',
'weibull', 'speed_weibull', 'r_square_gmm', 'r_square_weibull'])
gmm_mean, weibull_mean = plot_sectoral_comparison(diff_df.r_square_gmm, diff_df.r_square_weibull, diff_df.direction, diff_df.datasize)
plt_configure(ylabel="$\ R^2$", xlabel='Direction'+dir_unit_text)
ylim = min(plt.gca().get_ylim()[0],0.75)
plt.gca().set_ylim(top=1, bottom=ylim)
plt.tight_layout()
print(gmm_mean, weibull_mean)
gmm_mean, weibull_mean = plot_sectoral_comparison(diff_df.gmm, diff_df.weibull, diff_df.direction, diff_df.datasize)
plt_configure(ylabel="K-S", xlabel='Direction'+dir_unit_text)
ylim = max(plt.gca().get_ylim()[1],0.25)
plt.gca().set_ylim(top=ylim, bottom=0)
plt.tight_layout()
print(gmm_mean, weibull_mean)
# Compare direction weight with previous figure
display(dir_fig)
# Zoom in on the sector where the GMM CDF deviates most from the ECDF
# (column 2 of each max_diff_array row is the GMM K-S statistic).
max_diff_element = max(max_diff_array, key=lambda x: x[2])
angle = max_diff_angle = max_diff_element[1]
incre = rebinned_angle
FRACTION = 1
# Select data from observation
start_angle, end_angle = angle-incre/2, angle+incre/2
angle_radian, incre_radian = radians(angle), radians(incre)
sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)
x = arange(0, sub_max_speed, 0.5)
_, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(sub_df.speed, x)
_, y_gmm, y_cdf_gmm, direction_prob = gmm_integration_in_direction(f, angle_radian-incre_radian/2, angle_radian+incre_radian/2, x)
fig = plt.figure(figsize=(10,1.9))
ax1 = fig.add_subplot(1,3,1)
ax2 = fig.add_subplot(1,3,2)
ax3 = fig.add_subplot(1,3,3)
# 1. Data
bins=arange(0, sub_max_speed)
# `normed` was removed from the hist API; density=True is the equivalent.
sub_df['speed'].hist(ax=ax1, bins=bins, alpha=0.5, label='Data', density=True)
# 2. GMM
ax1.plot(x, y_gmm,'-', color='black', label='GMM')
ax2.plot(x, y_cdf_gmm,'-', color = 'black', label='GMM')
ax3.plot(log(x), log(-log(1-y_cdf_gmm)),'-', color = 'black',label='GMM')
# 3. Weibull
ax1.plot(x, y_weibull,'--',color='black',label='Weibull')
ax2.plot(x, y_cdf_weibull,'--',label='Weibull')
ax3.plot(log(x), log(-log(1-y_cdf_weibull)),'--',label='Weibull')
# 4. Data Resampled: bootstrap the sector to visualise sampling variability.
count_collection = []
for i in range(1,100):
    sub_df_resampled = sub_df.sample(frac=FRACTION, replace=True)
    # `normed` was removed from numpy.histogram; density=True is equivalent.
    resampled_count, _ = np.histogram(sub_df_resampled['speed'], bins=bins, density=True)
    count_collection.append(resampled_count)
    ecdf = sm.distributions.ECDF(sub_df_resampled.speed)
    y_ecdf = ecdf(x)
    ax2.plot(x, y_ecdf,':', label='Data Resampled')
    ax3.plot(log(x), log(-log(1-y_ecdf)),':', label='Data Resampled')
    # Configure legends only once so repeated labels do not pile up.
    if i == 1:
        plt_configure(ax=ax2, xlabel = "V", ylabel='P', legend={'loc':'best'})
        plt_configure(ax=ax3, xlabel="ln(V)", ylabel="ln(-ln(1-P)",legend={'loc':'best'})
print('%s (%s - %s) Degree Speed Distribution' % (angle, start_angle, end_angle))
# Envelope of the bootstrap histograms: per-bin min and max.
count_collection = np.array(count_collection)
mx, mn = np.max(count_collection,0), np.min(count_collection,0)
ax1.plot(bins[1:]-0.5, mx, ':', color='blue')
ax1.plot(bins[1:]-0.5, mn, ':', color='blue', label='Resample limit')
ax1.set_ylim(bottom = 0)
plt_configure(ax=ax1, xlabel='V', ylabel='Frequency',legend={'loc':'best'})
ax1.locator_params(axis='y', nbins=5)
ax2.locator_params(axis='y', nbins=5)
ax3.locator_params(axis='y', nbins=5)
plt.tight_layout()
diff = abs(y_ecdf - y_cdf_gmm)
print(diff.max(), x[diff.argmax()], y_cdf_gmm[diff.argmax()])
# Year-by-year variability of the speed distribution in the worst sector.
fig_time_variability_3d = plt.figure()
# Figure.gca(projection=...) was removed in matplotlib 3.6; use add_subplot.
ax1 = fig_time_variability_3d.add_subplot(projection='3d')
fig_time_variability_cdf,ax2 = plt.subplots(figsize=(3,1.8))
fig_time_variability_weibull, ax3 = plt.subplots(figsize=(3,1.8))
ax2.plot(x, y_cdf_gmm,'-', color='black', label = 'GMM')
ax2.plot(x, y_cdf_weibull,'--', label='Weibull')
ax3.plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black',label='GMM')
ax3.plot(log(x), log(-log(1-y_cdf_weibull)), '--', label='Weibull')
# 3. Data
# 'axes.color_cycle' was removed in matplotlib 2.0; read the colors out of
# the 'axes.prop_cycle' cycler instead.
prop_cycle = iter(mpl.rcParams['axes.prop_cycle'].by_key()['color'])
# Iterate over 5-year windows (dates encoded as YYYYMMDD integers).
for start_time in range(20000000, 20150000, 50000):
    end_time = start_time + 50000
    time_label = start_time//10000
    df_other_years = df_all_years.query('(date >= @start_time) & (date < @end_time)')
    df_other_years_at_angle, sub_max_speed_other_year = select_df_by_angle(df_other_years, start_angle, end_angle)
    if len(df_other_years_at_angle) > 0 :
        ecdf = sm.distributions.ECDF(df_other_years_at_angle.speed)
        y_ecdf = ecdf(x)
        ax2.plot(x, y_ecdf,':', label = time_label)
        ax3.plot(log(x), log(-log(1-y_ecdf)),':', label = time_label)
        title = '%s - %s' %(time_label, time_label+4)
        # `normed` was removed from numpy.histogram; density=True is equivalent.
        count, division = np.histogram(df_other_years_at_angle['speed'], density=True,
                                       bins=arange(0, sub_max_speed_other_year))
        # Pass bar positions positionally: the `left` keyword was renamed.
        ax1.bar(division[:-1], count, zs=time_label, zdir='x',
                color=next(prop_cycle), alpha=0.8)
        x_3d = time_label*np.ones_like(x)
        # Label the model curves only once (on the 2010 slice).
        ax1.plot(x_3d, x, y_gmm, '-', color='black', label='GMM' if time_label == 2010 else '')
        ax1.plot(x_3d, x, y_weibull, '--', color='blue', label='Weibull' if time_label == 2010 else '')
print('%s (%s - %s) Degree Speed Distribution' % (angle, start_angle, end_angle))
ax1.set_ylim(bottom = 0)
ax1.set_zlabel('Frequency')
plt_configure(ax=ax1, xlabel='Time',ylabel='V', legend=True)
plt_configure(ax=ax2, xlabel = "V", ylabel='P', legend={'loc':'best'})
plt_configure(ax=ax3, xlabel="ln(V)", ylabel="ln(-ln(1-P)", legend={'loc':'best'})
ax1.set_zlim(bottom = 0)
align_figures()
# Compare the worst sector with its two neighbouring sectors.
incre = rebinned_angle
angle_group = [max_diff_angle-incre, max_diff_angle, max_diff_angle+incre]
fig_adjecent_variability_3d = plt.figure()
# Figure.gca(projection=...) was removed in matplotlib 3.6; use add_subplot.
ax1 = fig_adjecent_variability_3d.add_subplot(projection='3d')
fig_adjecent_variability_cdf, ax2 = plt.subplots(figsize=(3,1.8))
fig_adjecent_variability_weibull, ax3 = plt.subplots(figsize=(3,1.8))
legend_3d = False
# 'axes.color_cycle' was removed in matplotlib 2.0; use 'axes.prop_cycle'.
prop_cycle = iter(mpl.rcParams['axes.prop_cycle'].by_key()['color'])
curve_df = pd.DataFrame(curve_collection)
for angle in angle_group:
    # Wrap negative/overflowing angles into [0, 360) before the lookup,
    # then take the first (only) matching row as a plain dict.
    curves = curve_df.query('angle == @angle%360').T.to_dict()
    curves = curves[list(curves)[0]]
    data_size, x = curves['data_size'], curves['x']
    y_gmm, y_cdf_gmm = curves['gmm_pdf'], curves['gmm_cdf']
    y_weibull, y_cdf_weibull, y_cdf = curves['weibull_pdf'], curves['weibull_cdf'], curves['ecdf']
    # Emphasise the worst sector; de-emphasise its neighbours.
    linestyle = '-' if angle == max_diff_angle else ':'
    alpha = 0.7 if angle == max_diff_angle else 0.3
    ax2.plot(x, y_gmm*data_size, linestyle, label=angle)
    ax3.plot(x, y_weibull*data_size, linestyle, label=angle)
    start_angle, end_angle = angle-incre/2, angle+incre/2
    sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)
    x_3d = angle*np.ones_like(x)
    ax1.plot(x_3d, x, y_gmm*data_size, color='black', label='GMM')
    ax1.plot(x_3d, x, y_weibull*data_size, color='blue', linestyle='--',label='Weibull')
    count, division = np.histogram(sub_df['speed'], bins=arange(0, sub_max_speed))
    # Pass bar positions positionally: the `left` keyword was renamed.
    ax1.bar(division[:-1], count, zs=angle, zdir='x', color=next(prop_cycle), alpha=0.8)
    # Add the 3D legend only once.
    if not legend_3d:
        ax1.legend()
        legend_3d = True
plt_configure(ax=ax1, xlabel='Direction', ylabel='Speed')
plt_configure(ax=ax2, xlabel='V',ylabel='Frequency',legend={'loc':'best'})
plt_configure(ax=ax3, xlabel='V',ylabel='Frequency',legend={'loc':'best'})
ax1.set_zlabel('Frequency')
ax1.set_zlim(bottom = 0)
# Share a y-scale between the two 2D panels.
ylim = max(ax1.get_ylim()[1],ax3.get_ylim()[1])
ax2.set_ylim(bottom = 0, top=ylim)
ax3.set_ylim(bottom = 0, top=ylim)
print(max_diff_angle)
print('GMM, Weibull, Histogram')
align_figures()
# Fall back to defaults for any tuning knobs not set by an earlier cell.
try:
    bandwidth
except NameError:
    bandwidth = DEFAULT_BANDWDITH
try:
    FIT_METHOD
except NameError:
    FIT_METHOD = 'square_error'
try:
    KDE_KERNEL
except NameError:
    KDE_KERNEL = 'gaussian'
# Bundle the fitting configuration passed to the fitting helpers.
config = dict(bandwidth=bandwidth,
              fitting_range=FITTING_RANGE,
              fit_limit=fit_limit,
              kde_kernel=KDE_KERNEL)
print(bandwidth, FIT_METHOD)
%%time
# Refit the GMM on 10 bootstrap resamples of df (in parallel) to gauge
# the stability of the fitted parameters.
results = Parallel(n_jobs=-1)(delayed(resampled_fitting)(df, FIT_METHOD, NUMBER_OF_GAUSSIAN, config) for i in range(10))
for result in results:
    display(pretty_print_gmm(result['gmm']))
    fig,ax = plt.subplots(figsize=(3.5,3.5))
    plot_gmm_ellipses(result['gmm'],ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
    plt.show()
    # Goodness of fit against the resample's own KDE and the full-sample KDE.
    display(gof_df(result['gmm_pdf_result'], result['kde_result']))
    display(gof_df(result['gmm_pdf_result'], kde_result))
    print('')
# df = df_all_years.query('(date >= 20100000) & (date < 20150000)')
# df = df.query('(HrMn%400 == 0)')
%%time
from sklearn.cross_validation import train_test_split, KFold
## 5-fold cross validation
gaussian_number_range = arange(1,6)
CV_result_train_all,CV_result_test_all =[],[]
number_of_fold = 4
print('Number of train/test dataset', len(df)*(number_of_fold-1)/number_of_fold, len(df)/number_of_fold)
for number_of_gaussian in gaussian_number_range:
print( ' ')
print('Number of gaussian', number_of_gaussian)
kf = KFold(len(df), n_folds=number_of_fold, shuffle=True)
CV_result = Parallel(n_jobs=-1)(delayed(fit_per_fold)(df, train_index, test_index, FIT_METHOD, number_of_gaussian, config) for train_index, test_index in kf)
CV_result_train, CV_result_test = list(zip(*CV_result))
CV_result_train, CV_result_test = list(CV_result_train), list(CV_result_test)
CV_result_train_all.append(CV_result_train)
CV_result_test_all.append(CV_result_test)
print('Train')
pretty_pd_display(CV_result_train)
print('Test')
pretty_pd_display(CV_result_test)
# Summarise the cross-validation goodness-of-fit scores and plot
# train/test curves against the number of Gaussian components.
train_scores_mean, train_scores_std = generate_mean_std_gof(CV_result_train_all)
print('Train gof mean, std')
display(train_scores_mean)
test_scores_mean, test_scores_std = generate_mean_std_gof(CV_result_test_all)
print('Test gof mean, std')
display(test_scores_mean)
# 'axes.color_cycle' was removed in matplotlib 2.0; take the color list
# out of the 'axes.prop_cycle' cycler instead.
prop_cycle = mpl.rcParams['axes.prop_cycle'].by_key()['color']
gaussian_number_range = train_scores_mean.index
# Raw strings avoid invalid-escape warnings in the TeX labels.
for column, column_name in zip(['R_square','K_S','Chi_square'],[r"$\ R^2$", "K-S", r"$\widetilde{\chi^2} $"]):
    plot(gaussian_number_range, train_scores_mean[column],
         '--', label = 'training', color=prop_cycle[0])
    # Shade +-1 std around the mean curves.
    plt.fill_between(gaussian_number_range,
                     train_scores_mean[column] - train_scores_std[column],
                     train_scores_mean[column] + train_scores_std[column],
                     alpha=0.2, color=prop_cycle[0])
    plot(gaussian_number_range, test_scores_mean[column],
         '-', label = 'test',color=prop_cycle[1])
    plt.fill_between(gaussian_number_range,
                     test_scores_mean[column] - test_scores_std[column],
                     test_scores_mean[column] + test_scores_std[column],
                     alpha=0.2,color=prop_cycle[1])
    plt.xticks(gaussian_number_range)
    print(column)
    plt.locator_params(axis='y', nbins=5)
    plt_configure(xlabel='Number of Gaussian Distributions', ylabel=column_name,
                  figsize=(3,2), legend={'loc':'best'})
    # R^2 is bounded above by 1; K-S and chi^2 are bounded below by 0.
    if column == 'R_square':
        plt.gca().set_ylim(top=1)
    if column == 'K_S' or column == 'Chi_square':
        plt.gca().set_ylim(bottom=0)
    plt.show()
# fig = plt.figure(figsize=(4.3,2.4))
# Side-by-side 2D probability density: KDE estimate (left) vs fitted GMM (right).
fig = plt.figure(figsize=(5,2.5))
ax1 = fig.add_subplot(1,2,1)
plot_2d_prob_density(X, Y, kde_Z, ax=ax1,
                     xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar=False)
ax1.grid(False)
ax2 = fig.add_subplot(1,2,2)
plot_2d_prob_density(X, Y, pdf_Z, ax=ax2,
                     xlabel='x'+speed_unit_text, ylabel='', colorbar=False)
ax2.grid(False)
# Hide the duplicate y axis on the right-hand panel.
ax2.get_yaxis().set_visible(False)
# Re-display the main fitting figures, then the variability figures,
# in the same order the two original loops produced them.
for fig in (fig_hist, fig_kde, fig_em, fig_gmm,
            fig_time_variability_3d, fig_time_variability_cdf, fig_time_variability_weibull,
            fig_adjecent_variability_3d, fig_adjecent_variability_cdf, fig_adjecent_variability_weibull):
    display(fig)
import time
# Persist the notebook, wait briefly for the save to complete, then export
# the executed notebook as an HTML report named after the station location.
save_notebook()
time.sleep(3)
location_name = get_location_name(file_path)
print(location_name)
current_file = 'GMM.ipynb'
output_file = './output_HTML/'+location_name+'.html'
output_HTML(current_file, output_file)