Modeling the Joint Distribution of Wind Speed and Direction using Gaussian Mixture Models

OEN method: Harris & Cook, "The parent wind speed distribution: Why Weibull?", http://www.sciencedirect.com/science/article/pii/S0167610514001056

Gaussian Mixture Models, http://scikit-learn.org/stable/modules/mixture.html
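
The approach used throughout this notebook: convert each (speed, direction) record into Cartesian wind-vector components (x, y) and fit a bivariate Gaussian Mixture Model to the resulting point cloud, with a kernel density estimate as the reference density. A minimal, self-contained sketch of that idea on synthetic data (the notebook itself tunes the number of components and the KDE bandwidth per station):

import numpy as np
from sklearn.mixture import GaussianMixture

# Synthetic (speed, direction) records, only to illustrate the shape of the pipeline
rng = np.random.RandomState(0)
speed = rng.weibull(2.0, size=1000) * 6        # speeds, arbitrary units
direction = rng.uniform(0, 360, size=1000)     # directions in degrees

# Convert to Cartesian wind-vector components
x = speed * np.cos(np.radians(direction))
y = speed * np.sin(np.radians(direction))
sample = np.column_stack([x, y])

# Fit a bivariate GMM to the wind-vector cloud (3-4 components are used below)
gmm = GaussianMixture(n_components=3, covariance_type='full').fit(sample)
print(gmm.weights_)
print(gmm.means_)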

1. Set up

1.1 Environment

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

from import_file import *
from helpers.parallel_helper import *
load_libs()

plt.rcParams['axes.autolimit_mode'] = 'round_numbers'
plt.rcParams['axes.xmargin'] = 0.
plt.rcParams['axes.ymargin'] = 0.
mpl.rcParams['patch.force_edgecolor'] = True

1.2 Read Data

In [2]:
# file_path, bandwidth= './data/NCDC/europe/uk/marham/dat.txt', 1.7
# file_path, bandwidth= './data/NCDC/europe/uk/tiree/dat.txt', 1.9
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/NCDC/europe/uk/boscombe_down/dat.txt', 1.5, 4
# file_path, bandwidth= './data/NCDC/europe/uk/middle_wallop/dat.txt', 1.3
# file_path, bandwidth= './data/NCDC/europe/uk/bournemouth/dat.txt',1.3 # 4?
# file_path= "./data/NCDC/europe/uk/weybourne/dat.txt"
# file_path= "./data/NCDC/europe/uk/skye_lusa/dat.txt" # 
# file_path= "./data/NCDC/europe/uk/wattisham/dat.txt"
# file_path= "./data/NCDC/europe/uk/south_uist_range/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/europe/uk/holbeach/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/europe/uk/cambridge/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/europe/us/baltimore/dat.txt" # time too short
# file_path= "./data/NCDC/europe/uk/bealach_na_ba/dat.txt" # time too short
# file_path= "./data/NCDC/europe/uk/benbecula/dat.txt" # truncate (untruncate in m/s), 4?
# file_path= './data/NCDC/europe/uk/southhamption/dat.txt' # high 0, trend

file_path, bandwidth, NUMBER_OF_GAUSSIAN = "./data/NCDC/europe/germany/landsberg_lech/dat.txt", 0.9, 4 
# file_path, bandwidth= "./data/NCDC/europe/germany/neuburg/dat.txt", 0.7
# file_path, bandwidth= "./data/NCDC/europe/germany/laupheim/dat.txt", 0.7 # double peak, 4?, trend
# file_path, bandwidth= './data/NCDC/europe/germany/niederstetten/dat.txt', 0.9 # get the peak
# file_path, bandwidth= "./data/NCDC/europe/germany/holzdorf/dat.txt", 0.9 # 2008 year
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= './data/NCDC/europe/france/nantes/dat.txt', 0.9, 4 # unit shift, one direction deviates strongly
# file_path, convert_to_knot= './data/NCDC/europe/france/pau_pyrenees/dat.txt', True # unit shift, 2; force using knot 
# file_path= "./data/NCDC/europe/france/avord/dat.txt" # try 4, initial speed (should be good with m/s), incompete dataset
# file_path= "./data/NCDC/europe/france/vatry/dat.txt"  # double peak, initial speed, incompete dataset
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= "./data/NCDC/europe/spain/valladolid/dat.txt", 1.1, 4
# file_path= './data/NCDC/europe/spain/jerez/dat.txt' # high 0
# file_path, bandwidth= "./data/NCDC/europe/spain/barayas/dat.txt", 0.7 # not good fit
# file_path, bandwidth= './data/NCDC/europe/spain/malaga/dat.txt', 0.7 # directions blocked?
# file_path, bandwidth= './data/NCDC/europe/spain/tenerife_sur/dat.txt', 0.7 # directions blocked?
# file_path, bandwidth= './data/NCDC/europe/spain/almeria/dat.txt', 0.7 # negative dimensions?
# file_path, bandwidth= './data/NCDC/europe/greece/eleftherios_intl/dat.txt',0.7 # some direction might be blocked
# file_path= './data/NCDC/europe/ciampino/dat.txt' # try 4, bandwidth?
# file_path= "./data/NCDC/europe/huspel_aws/dat.txt"  # integer, 4?
# file_path= './data/NCDC/gibraltar/dat.txt' # bad fit

# MidEast
# file_path, bandwidth= './data/NCDC/mideast/uae/al_maktoum/dat.txt', 1.1
# file_path= './data/NCDC/mideast/uae/sharjah_intl/dat.txt' 
# file_path= './data/NCDC/mideast/uae/dubai_intl/dat.txt' 
# file_path= './data/NCDC/mideast/uae/abu_dhabi_intl/dat.txt' # Time shift
# file_path= './data/NCDC/mideast/uae/bateen/dat.txt' # Time shift
# file_path= './data/NCDC/mideast/buraimi/dat.txt' # not good dataset
# file_path= './data/NCDC/mideast/turkey/konya/dat.txt' 
# file_path= './data/NCDC/mideast/turkey/sivas/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/turkey/balikesir/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/turkey/bartin/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/iran/chahbahar/dat.txt'
# file_path= './data/NCDC/mideast/iran/zabol/dat.txt' # Problematic data
# file_path= './data/NCDC/mideast/iran/torbat_heydarieh/dat.txt' # Unusable

# file_path, bandwidth = "./data/NCDC/cn/shanghai/hongqiao_intl/dat.txt", 0.6
# file_path, bandwidth= "./data/NCDC/cn/shanghai/pudong/dat.txt", 0.8
# file_path, bandwidth= "./data/NCDC/cn/hefei_luogang/dat.txt", 0.6 # few 0, trend
# file_path, bandwidth= "./data/NCDC/cn/nanjing_lukou/dat.txt", 0.5
# file_path= "./data/NCDC/cn/zhengzhou_xinzheng/dat.txt" 
# file_path= "./data/NCDC/cn/tianjin/binhai/dat.txt" # few 0, trend, stationary speed, unstationary direction
# file_path= "./data/NCDC/cn/tianjin/tianjing/dat.txt" # 16 sectors
# file_path= "./data/NCDC/cn/shijiazhuang_zhengding/dat.txt" 
# file_path= "./data/NCDC/cn/henan_gushi/dat.txt" # 16 sectors, fit not very good
# file_path= "./data/NCDC/cn/nanning_wuxu/dat.txt" # numpy priblem, unstationary speed
# file_path= './data/NCDC/cn/macau/dat.txt'  
# file_path= "./data/NCDC/cn/hk_intl/dat.txt" # few 0
# file_path= './data/NCDC/cn/gaoqi/dat.txt' 

# file_path= './data/NCDC/southeast_asia/malaysia/mersing/dat.txt' # 2 mode, paper comparison
# file_path= './data/NCDC/southeast_asia/malaysia/penang/dat.txt'
# file_path= './data/NCDC/southeast_asia/malaysia/butterworth/dat.txt' # 2 mode 
# file_path= "./data/NCDC/southeast_asia/malaysia/bsultan_mahmud/dat.txt" # stable
# file_path= "./data/NCDC/southeast_asia/malaysia/bsultan_ismail/dat.txt" # 
# file_path= "./data/NCDC/southeast_asia/singapore/changi/dat.txt" # trend, no 0, questionary data
# file_path= "./data/NCDC/southeast_asia/singapore/paya_lebar/dat.txt" # questionary data
# file_path= "./data/NCDC/southeast_asia/singapore/seletar/dat.txt"
# file_path= "./data/NCDC/east_asia/cheongju_intl/dat.txt" # 2005-2009  may have problem, fit is good; numpy problem
# file_path= "./data/NCDC/east_asia/daegu_ab/dat.txt" # recent 5 year may have problem, but fit is generally good; numpy problem

# file_path, bandwidth= "./data/NCDC/oceania/auckland_intl/dat.txt", 0.9  # Good data, double mode
# file_path= "./data/NCDC/oceania/brisbane_archerfield/dat.txt" # high 0, few data 
# file_path= "./data/NCDC/oceania/narrandera/dat.txt" # high 0, few data
# file_path, bandwidth= "./data/NCDC/oceania/canberra/dat.txt", 0.7 # high 0, bad fit
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= './data/NCDC/oceania/horsham/dat.txt', 0.9, 4 # get the peak

# file_path, bandwidth= './data/NCDC/us/boston_16nm/dat.txt', 0.9 # Offshore, mixed type

# file_path, bandwidth, NUMBER_OF_GAUSSIAN  = './data/asos/bismarck_ND/hr_avg.csv', 1.1, 4
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/asos/aberdeen_SD/hr_avg.csv', 1.7, 2 # only to 2012
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/asos/minneapolis/hr_avg.csv', 1.1, 4
# file_path, bandwidth = './data/asos/lincoln_NE/hr_avg.csv', 0.9
# file_path, bandwidth = './data/asos/des_moines_IA/hr_avg.csv', 1.3
# file_path, bandwidth = './data/asos/springfield_IL/hr_avg.csv', 1.1 
# file_path, bandwidth = './data/asos/topeka/hr_avg.csv', 0.7 # High 0
# file_path, bandwidth = './data/asos/denver/hr_avg.csv', 1.3
# file_path, bandwidth= './data/asos/olympia/hr_avg.csv', 0.5 # some directions might be blocked

# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/NDAWN/baker/hr_avg.csv', 0.7, 4 
# file_path, bandwidth = './data/NDAWN/dickinson/hr_avg.csv', 0.6
# file_path = './data/NDAWN/rugby/hr_avg.csv'
# file_path = './data/NDAWN/bowman/hr_avg.csv'
# file_path = './data/NDAWN/grand_forks/hr_avg.csv'
# file_path = './data/NDAWN/williston/hr_avg.csv'
# file_path = './data/NDAWN/jamestown/hr_avg.csv'

# file_path, bandwidth, NUMBER_OF_GAUSSIAN = 'data/ECMWF/usa/47N123W/dat.csv', 0.7, 4 #good 
# file_path, bandwidth = 'data/ECMWF/venezuela/8N67W/dat.csv', 0.7 # good, but the data might be problematic.
# file_path, bandwidth = 'data/ECMWF/chile/52S75W/dat.csv', 1.9 # good
# file_path, bandwidth= 'data/ECMWF/iceland/65N17W/dat.csv', 1.9 # good
# file_path, bandwidth, NUMBER_OF_GAUSSIAN  = 'data/ECMWF/germany/49N9E/dat.csv', 0.9, 4 # good
# file_path, bandwidth = 'data/ECMWF/sudan/18N32E/dat.csv', 1.1 # good
# file_path, bandwidth = 'data/ECMWF/china/24N121E/dat.csv', 0.9 # good
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = 'data/ECMWF/australia/37S142E/dat.csv', 0.7, 4 # miss the peak, force bandwidth 0.7, default 0.9
In [3]:
if "cn_database" in file_path: 
    df = read_cn_database(file_path)
elif 'NCDC' in file_path:
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df.rename(columns={'Date':'date','Dir':'dir','Spd':'speed','Type':'type','I.1':'wind_type'}, inplace=True)
    df = df[['date','HrMn','type','dir','speed','wind_type' ]]
    df.dropna(subset=['dir','speed'], inplace=True)
    integer_data = True
elif 'NDAWN' in file_path:
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df['type']='default'
    df['wind_type']='default'
    df = df.dropna()
    convert_to_knot = False
    integer_data = False
elif 'asos' in file_path:
    # ASOS
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df['type']='default'
    df['wind_type']='default'
    df = df.dropna()
    convert_to_knot = False
    integer_data = False
    knot_unit = True
else:
    df = pd.read_csv(file_path, header=0, skipinitialspace=True)
    df.rename(columns={'U':'x','V':'y'}, inplace=True)
    df.x=-df.x
    df.y=-df.y
    df['speed']=np.sqrt(df.x**2+df.y**2)
    df['dir']=np.degrees(np.arctan2(df.y, df.x))%360
    df['time']=pd.to_datetime('1979-01-01T00:00:00Z')+pd.to_timedelta(df['time'], unit='h')
    df['date']=df['time'].dt.strftime('%Y%m%d')
    df['date']=df['date'].astype(int)
    df['HrMn']=df['time'].dt.strftime('%H00')
    df['type']='default'
    df['wind_type']='default'
    convert_to_knot = True
    integer_data = False
    cartesian = True
In [4]:
df
Out[4]:
date HrMn type dir speed wind_type
0 19900101 0000 FM-12 999 0.0 C
1 19900101 0050 FM-15 999 0.0 C
2 19900101 0150 FM-15 999 0.0 C
3 19900101 0300 SY-MT 999 0.0 C
4 19900101 0400 FM-15 999 0.0 C
5 19900101 0450 FM-15 999 0.0 C
6 19900101 0600 FM-15 999 0.0 C
7 19900101 0650 FM-15 999 0.0 C
8 19900101 0750 FM-15 999 0.0 C
9 19900101 0900 FM-15 999 0.0 C
10 19900101 1050 FM-15 999 0.0 C
11 19900101 1150 FM-15 360 0.5 N
12 19900101 1250 FM-15 20 1.5 N
13 19900101 1400 FM-15 30 1.0 N
14 19900101 1450 FM-15 30 0.5 N
15 19900101 1600 FM-15 999 0.0 C
16 19900101 1630 FM-15 999 0.0 C
17 19900101 1650 FM-15 60 0.5 N
18 19900101 1750 FM-15 190 0.5 N
19 19900101 1900 FM-15 180 0.5 N
20 19900101 2000 FM-15 170 1.0 N
21 19900101 2100 SY-MT 170 0.5 N
22 19900101 2200 FM-15 180 0.5 N
23 19900101 2250 FM-15 190 1.0 N
24 19900101 2350 FM-15 180 0.5 N
25 19900102 0000 SY-MT 190 1.0 N
26 19900102 0150 FM-15 260 0.5 N
27 19900102 0250 FM-15 190 1.5 N
28 19900102 0300 FM-12 190 1.5 N
29 19900102 0400 FM-15 200 1.0 N
... ... ... ... ... ... ...
333361 20160131 2320 FM-15 250 9.8 N
333362 20160131 2346 FM-16 240 11.8 N
333363 20160201 0020 FM-15 240 12.9 N
333364 20160201 0120 FM-15 240 13.4 N
333365 20160201 0220 FM-15 240 13.4 N
333366 20160201 0320 FM-15 240 13.4 N
333367 20160201 0420 FM-15 240 13.4 N
333368 20160201 0520 FM-15 240 13.9 N
333369 20160201 0620 FM-15 250 14.9 N
333370 20160201 0625 FM-16 250 13.9 N
333371 20160201 0720 FM-15 250 14.9 N
333372 20160201 0820 FM-15 240 11.3 N
333373 20160201 0920 FM-15 240 11.8 N
333374 20160201 1020 FM-15 240 11.8 N
333375 20160201 1048 FM-16 240 11.8 N
333376 20160201 1120 FM-15 240 11.8 N
333377 20160201 1211 FM-16 240 9.3 N
333378 20160201 1220 FM-15 240 12.4 N
333379 20160201 1320 FM-15 250 12.4 N
333380 20160201 1420 FM-15 250 12.9 N
333381 20160201 1520 FM-15 240 12.4 N
333382 20160201 1604 FM-16 240 9.3 N
333383 20160201 1620 FM-15 240 10.3 N
333384 20160201 1720 FM-15 240 6.7 N
333385 20160201 1820 FM-15 270 7.2 N
333386 20160201 1920 FM-15 260 6.7 N
333387 20160201 2020 FM-15 260 8.8 N
333388 20160201 2120 FM-15 250 9.8 N
333389 20160201 2220 FM-15 250 8.8 N
333390 20160201 2320 FM-15 240 9.8 N

333391 rows × 6 columns

In [5]:
if 'NCDC' in file_path:
    lat, long = get_lat_long(file_path)
    print(lat,long)
    map_osm = folium.Map(location=[lat, long], zoom_start=4)
    folium.Marker([lat, long]).add_to(map_osm)
    display(map_osm)
48.071 10.906
In [6]:
df['time']=pd.to_datetime(df["date"].astype(str).map(str) + df["HrMn"], format='%Y%m%d%H%M')
df.set_index(['time'], inplace=True)
df['HrMn']=df['HrMn'].astype(int)
df = df.query("(dir <= 999) & (speed < 100) ")['1970':'2016']
In [7]:
plot_speed_and_angle_distribution(df.speed, df.dir)
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
In [8]:
# Dir [10,360]=> [0,350]
df['dir'] = df['dir'].apply(lambda x: x%360 if x < 999 else x) 
# Convert windrose coordinates to polar coordinates
if 'cartesian' in globals():
    df['dir_windrose'] = df['dir'].apply(lambda x: (90 - x)%360 if x < 999 else x)
else:
    df['dir_windrose'] = df['dir']
    df['dir'] = df['dir'].apply(lambda x: (90 - x)%360 if x < 999 else x)
display(df.describe())
df.plot(y='speed',legend=True,figsize=(20,5))
date HrMn dir speed dir_windrose
count 3.332600e+05 333260.000000 333260.000000 333260.000000 333260.000000
mean 2.003340e+07 1166.548146 206.404069 3.225488 208.309536
std 7.020275e+04 687.794956 169.049772 2.319920 167.356881
min 1.990010e+07 0.000000 0.000000 0.000000 0.000000
25% 1.998072e+07 600.000000 110.000000 1.500000 110.000000
50% 2.004020e+07 1120.000000 200.000000 2.600000 210.000000
75% 2.009082e+07 1720.000000 250.000000 4.100000 250.000000
max 2.016020e+07 2359.000000 999.000000 34.500000 999.000000
Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0xcc40390>

1.3 General Data Info

1.3.1 Unit Detection

In [9]:
df['decimal'] = df.speed % 1
df.decimal.hist(alpha=0.5, label='m/s', figsize=(4, 3))

if 'convert_to_knot' not in globals():
    convert_to_knot = True if len(df.query('decimal >= 0.2')) / len(df) > 0.3 else False
    
if convert_to_knot:
    knot_unit = True
    df['speed'] = df['speed'] * 1.943845
    df['decimal'] = df.speed % 1
    df.decimal.hist(alpha=0.5, label='knot')
    # needs more elaboration: some values are not near an integer
    if integer_data:
        df['speed'] = df['speed'].apply(lambda x: int(round(x)))
    plt_configure(xlabel='Decimal', ylabel='Frequency', legend={'loc': 'best'}, title='Decimal Distribution')
else:
    if 'knot_unit' not in globals():
        knot_unit = False
    
df.drop(['decimal'], 1,inplace=True)
print(knot_unit)
True
In [10]:
dir_unit_text = ' (degree)'
if knot_unit == True:
    speed_unit_text = ' (knot)'
else: 
    speed_unit_text = ' (m/s)'

1.3.2 Sampling Type Selection

In [11]:
sample_type = df.query('date > 20000000')['type']
sample_type.value_counts().plot(
    kind = 'bar', title = 'Report Type Composition', figsize=(4,3))

report_type_most_used = sample_type.value_counts().idxmax()
df = df.query("type==@report_type_most_used")

1.3.3 Sampling Time Selection

In [12]:
MID_YEAR = int(np.average(df.index.year))

df['HrMn'].value_counts().sort_index().plot(kind='bar', alpha=0.5,label='Overall')
df[str(MID_YEAR):]['HrMn'].value_counts().sort_index().plot(
    kind='bar', alpha=0.5, label='>= %s' %  MID_YEAR )

plt_configure(xlabel='Sampling Time', ylabel='Frequency', legend={'loc':'best'}, figsize=(8, 4), 
              title = 'Sampling Time Distribution, Overall and > %s ' %  MID_YEAR)
In [13]:
df['sample_time'] = df.HrMn % 100 
sample_time = df['2000':]['sample_time']
sample_times = sample_time.value_counts()[sample_time.value_counts() > 2000]
sample_times = sample_times.index.tolist()
# df = df.query("sample_time in @sample_times")
df = df.query("sample_time == @sample_times[0]")
df.drop(['sample_time'], 1,inplace=True)
print(sample_times)

df['HrMn'].value_counts().sort_index().plot(kind='bar', alpha=0.5, figsize=(10, 4))
[20]
Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0xce09dd8>

1.4 Error Data Handling and Adjustment

1.4.1 Artefacts

wrong direction record

In [14]:
if integer_data:
    display(df.query("(dir % 10 >= 0.1) & (dir != 999)"))
    df = df.query('(dir % 10 <= 0.1) | (dir == 999)')
date HrMn type dir speed wind_type dir_windrose
time
1994-03-22 18:20:00 19940322 1820 FM-15 176 4 N 274
1996-04-18 17:20:00 19960418 1720 FM-15 88 4 N 2
1998-02-05 13:20:00 19980205 1320 FM-15 87 2 N 3
1998-11-11 00:20:00 19981111 20 FM-15 159 14 N 291
2000-02-20 15:20:00 20000220 1520 FM-15 157 7 N 293
2000-05-20 08:20:00 20000520 820 FM-15 193 13 N 257
2000-10-27 00:20:00 20001027 20 FM-15 229 7 N 221
2000-12-06 01:20:00 20001206 120 FM-15 85 3 N 5
2001-03-02 12:20:00 20010302 1220 FM-15 221 4 N 229
2001-05-21 08:20:00 20010521 820 FM-15 29 34 N 61
2002-03-16 16:20:00 20020316 1620 FM-15 32 8 N 58
2002-04-06 03:20:00 20020406 320 FM-15 83 5 N 7
2002-06-09 17:20:00 20020609 1720 FM-15 206 4 N 244
2002-08-13 09:20:00 20020813 920 FM-15 159 2 N 291
2002-12-30 02:20:00 20021230 220 FM-15 198 25 N 252
2003-01-27 05:20:00 20030127 520 FM-15 209 2 N 241
2003-02-09 03:20:00 20030209 320 FM-15 85 3 N 5
2003-02-21 16:20:00 20030221 1620 FM-15 1 4 N 89
2003-12-06 08:20:00 20031206 820 FM-15 163 14 N 287
2005-07-24 10:20:00 20050724 1020 FM-15 195 5 N 255
2005-11-25 15:20:00 20051125 1520 FM-15 239 8 N 211
2005-12-31 11:20:00 20051231 1120 FM-15 216 14 N 234
2006-01-13 05:20:00 20060113 520 FM-15 229 2 N 221
2006-03-10 02:20:00 20060310 220 FM-15 209 67 N 241
2006-03-10 03:20:00 20060310 320 FM-15 209 67 N 241
2006-12-04 00:20:00 20061204 20 FM-15 178 28 N 272

sudden increase in speed

In [15]:
# sudden increase
df['incre'] = df.speed.diff(1)
df['incre'].fillna(0, inplace=True)
df['incre_reverse'] = df.speed.diff(-1)
df['incre_reverse'].fillna(0, inplace=True)

display(df.sort_values(by='speed',ascending=False).head(10))
df['incre'].plot(kind='hist', bins=arange(-15, 15), legend=True, figsize=(8, 3))
date HrMn type dir speed wind_type dir_windrose incre incre_reverse
time
2006-10-13 22:20:00 20061013 2220 FM-15 60 53 N 30 50.0 48.0
2015-11-15 11:20:00 20151115 1120 FM-15 200 52 N 250 27.0 22.0
2001-06-17 00:20:00 20010617 20 FM-15 270 47 N 180 42.0 39.0
1998-02-14 17:20:00 19980214 1720 FM-15 40 46 N 50 40.0 43.0
2007-01-25 03:20:00 20070125 320 FM-15 150 45 N 300 40.0 42.0
2002-10-03 03:20:00 20021003 320 FM-15 350 44 N 100 40.0 39.0
2006-06-14 06:20:00 20060614 620 FM-15 250 44 N 200 40.0 41.0
2006-04-29 00:20:00 20060429 20 FM-15 170 42 N 280 40.0 39.0
2005-12-21 17:20:00 20051221 1720 FM-15 270 42 N 180 40.0 38.0
1995-01-26 22:20:00 19950126 2220 FM-15 200 40 N 250 18.0 13.0
Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0xcd463c8>
In [16]:
incre_threshold = 20 if knot_unit else 10
print('sudden increase number', len(df.query('(incre > @incre_threshold )&(incre_reverse > @incre_threshold )')))
df = df.query('(incre < @incre_threshold )|(incre_reverse < @incre_threshold )')

# Check the max speed
display(df.sort_values(by='speed',ascending=False).head(10))
df.drop(['incre', 'incre_reverse'], 1, inplace=True)
sudden increase number 17
date HrMn type dir speed wind_type dir_windrose incre incre_reverse
time
1995-01-26 22:20:00 19950126 2220 FM-15 200 40 N 250 18.0 13.0
1999-12-26 13:20:00 19991226 1320 FM-15 190 39 N 260 6.0 1.0
2015-03-31 09:20:00 20150331 920 FM-15 200 39 N 250 1.0 4.0
2015-03-31 11:20:00 20150331 1120 FM-15 190 39 N 260 4.0 2.0
2015-03-31 08:20:00 20150331 820 FM-15 210 38 N 240 1.0 -1.0
2005-12-16 14:20:00 20051216 1420 FM-15 210 38 N 240 6.0 4.0
1999-12-26 14:20:00 19991226 1420 FM-15 190 38 N 260 -1.0 8.0
2007-01-18 22:20:00 20070118 2220 FM-15 210 38 N 240 3.0 6.0
2015-03-31 14:20:00 20150331 1420 FM-15 200 38 N 250 2.0 6.0
2003-01-02 20:20:00 20030102 2020 FM-15 200 38 N 250 5.0 8.0

1.4.2 Direction Re-alignment

For some datasets, the 16 sectors are not recorded properly,

e.g. the recorded sectors are [0, 20, 50, ...] and the angles need to be redistributed onto the regular 22.5° grid, e.g. [0, 22.5, 45, ...]
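
realign_direction is a project helper (loaded via import_file) whose implementation is not shown here; a minimal sketch of the kind of re-mapping it presumably performs, assuming the goal is to snap each recorded direction onto the nearest centre of a regular sector grid while leaving the 999 calm/variable marker untouched (the actual helper may redistribute counts differently; this is only an illustration):

def realign_direction_sketch(df, sector_length=22.5):
    # Snap each recorded direction to the nearest multiple of sector_length,
    # keeping the 999 calm/variable marker unchanged.
    def snap(d):
        if d == 999:
            return d
        return (round(d / sector_length) * sector_length) % 360
    df = df.copy()
    df['dir'] = df['dir'].apply(snap)
    return df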

In [17]:
display(df['dir'].value_counts().sort_index())
effective_column = df.query('dir < 999')['dir'].value_counts()[df['dir'].value_counts() > 30].sort_index()
if integer_data:
    SECTOR_LENGTH = 360/len(effective_column) 
else: 
    SECTOR_LENGTH = 10
print(len(effective_column), SECTOR_LENGTH)
0       2604
10      3772
20      4141
30      4787
40      3730
50      3786
60      3238
70      3294
80      2445
90      2218
100     1621
110     1844
120     1687
130     1865
140     1811
150     2383
160     2799
170     4118
180     4730
190     6997
200     9256
210    12291
220     9079
230     7033
240     4321
250     4491
260     4051
270     4781
280     3931
290     4075
300     3375
310     3279
320     2623
330     2466
340     2004
350     2434
999     4194
Name: dir, dtype: int64
36 10.0
In [18]:
df=realign_direction(df, effective_column)

1.4.3 0 Speed
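
is_with_too_many_zero is another project helper; a minimal sketch of the check it presumably performs, assuming it returns the calm-wind (zero-speed) frequency together with a flag set when that frequency exceeds some threshold (the 0.5 used here is an assumed value, not necessarily the helper's):

def is_with_too_many_zero_sketch(df, threshold=0.5):
    # Fraction of records reported with zero wind speed (calms)
    null_wind_frequency = (df['speed'] == 0).mean()
    return null_wind_frequency > threshold, null_wind_frequency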

In [19]:
with_too_many_zero, null_wind_frequency = is_with_too_many_zero(df['2005':])
delete_zero = with_too_many_zero
if delete_zero:
    df = df.query('(speed > 0)')
print(delete_zero, null_wind_frequency)
False 0.00677323785894
In [20]:
print(df.query('dir == 999')['speed'].value_counts())
df=fill_direction_999(df, SECTOR_LENGTH)
2    1718
1    1129
0     945
3     395
4       5
9       1
5       1
Name: speed, dtype: int64

1.5 Time Shift Comparison

In [21]:
DIR_REDISTRIBUTE = 'even'
if DIR_REDISTRIBUTE == 'even':
    DIR_BIN = arange(-5, 360, 10) 
elif DIR_REDISTRIBUTE == 'round_up':
    DIR_BIN = arange(0, 360+10, 10) 

# Comparison between mid_year, looking for: 
# 1. Odd Even Bias
# 2. Time Shift of Wind Speed Distribution
bins = arange(0, df.speed.max() + 1)
df[:str(MID_YEAR)]['speed'].plot(
    kind='hist', alpha=0.5,bins=bins, label='< %s' % MID_YEAR)

df[str(MID_YEAR+1):]['speed'].plot(
    kind='hist', alpha=0.5,bins=bins, label='> %s' % MID_YEAR)

plt.suptitle('Speed Comparison between year < %s, > %s ' % (MID_YEAR, MID_YEAR), fontsize = 14)
plt_configure(xlabel='Speed', ylabel='Frequency', legend=True, figsize=(8, 3))
In [22]:
df[:str(MID_YEAR)]['dir'].plot(
    kind='hist', alpha=0.5,bins=DIR_BIN, label='< %s' % MID_YEAR)

df[str(MID_YEAR+1):]['dir'].plot(
    kind='hist', alpha=0.5,bins=DIR_BIN, label='> %s' % MID_YEAR)

plt.suptitle('Dir Comparison between year < %s, and > %s ' % (MID_YEAR, MID_YEAR), fontsize = 14)
plt_configure(xlabel='Dir', ylabel='Frequency', legend={'loc':'best'}, figsize=(8, 3), tight='x')
In [23]:
display(df[df['dir'].isnull()])
df.dropna(subset=['dir'], inplace=True)
date HrMn type dir speed wind_type dir_windrose
time
In [24]:
# Inspect the time shift of the speed and direction distributions, and the odd-even bias
check_time_shift(df, speed_unit_text=speed_unit_text, dir_unit_text=dir_unit_text)
1993 - 1995
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
1996 - 2000
2001 - 2005
2006 - 2010
2011 - 2015
In [25]:
df.resample('A').mean().plot(y='speed', figsize=(4,3))
plt.gca().set_ylim(bottom=0)
df.resample('M').mean().plot(y='speed', figsize=(20,4))
plt.gca().set_ylim(bottom=0)
Out[25]:
(0, 22.5)
In [26]:
%%time
for column in ['speed', 'dir']:
    if column == 'speed':
        bins = arange(0, df[column].max()+1, 1)
    else:
        bins = arange(0, 361, 10)
    den, _ = np.histogram(df[column], bins=bins, density=True)
    y_top=max(den)*1.2
    for year in arange(1980, 2016):
        end_year = year
        sub_df = df[str(year):str(end_year)]
        if len(sub_df) > 1000:
            plt.figure()
            df[column].hist(bins=bins, alpha=0.3, normed=True)
            sub_df[column].hist(bins=bins, alpha=0.5, figsize=(3,1.5), normed=True)
            plt.gca().set_ylim(top=y_top)
            plt_configure(title=str(year))
    align_figures()
Wall time: 13.7 s
In [27]:
for column in ['speed', 'dir']:
    if column == 'speed':
        bins = arange(0, df[column].max()+1, 1)
    else:
        bins = arange(0, 361, 10)
    density_all, _ = np.histogram(df[column], bins=bins, density=True)
    df[column].hist(bins=bins, figsize=(5,3))

    R_squares = []
    years = []
    for year in arange(1980, 2016):
        start_year, end_year = year-1, year+1
        sub_df = df[str(start_year):str(end_year)]
        if len(sub_df) > 1000:
            density, _ = np.histogram(sub_df[column], bins=bins, density=True)
            y_mean = np.mean(density_all)
            SS_tot = np.sum(np.power(density_all - y_mean, 2))
            SS_res = np.sum(np.power(density_all - density, 2))

            R_square = 1 - SS_res / SS_tot
            R_squares.append(R_square)
            years.append(year)

    plt.figure()
    plot(years, R_squares)
    ylim = max(min(plt.gca().get_ylim()[0],0.85),0)
    plt.gca().set_ylim(bottom=ylim, top=1)
    plt_configure(figsize=(5,3))
    align_figures()

1.6 Re-distribute Direction and Speed (Optional)

e.g. Dir 50 -> 45 ~ 55, to make the KDE result smoother
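
randomize_angle and randomize_speed are project helpers; a minimal sketch of the idea, assuming each integer-recorded value is spread uniformly over its reporting bin (directions over one sector around the recorded value; speeds either upward, 0 -> [0, 1), or downward, 1 -> (0, 1], matching the message printed below):

import numpy as np

def randomize_angle_sketch(df, sector_length=10):
    # Spread each recorded direction uniformly over its sector, e.g. 50 -> [45, 55)
    df = df.copy()
    jitter = np.random.uniform(-sector_length / 2, sector_length / 2, size=len(df))
    df['dir'] = (df['dir'] + jitter) % 360
    return df

def randomize_speed_sketch(df, method='up'):
    # Spread each integer speed uniformly over its bin
    df = df.copy()
    u = np.random.uniform(0, 1, size=len(df))
    df['speed'] = df['speed'] + u if method == 'up' else df['speed'] - u
    return df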

In [28]:
if integer_data:
    df = randomize_angle(df, DIR_REDISTRIBUTE, SECTOR_LENGTH)
In [29]:
if integer_data:
    if delete_zero:
        redistribute_method = 'down'
    else:
        redistribute_method = 'up'

    df, speed_redistribution_info = randomize_speed(df, redistribute_method)
Redistribute upward, e.g. 0 -> [0,1]

1.7 Generate (x,y) from (speed,dir)

In [30]:
# Cook orientation
# df['dir']= (df['dir'] + 180)%360
In [31]:
# There might be a small dot in the centre, caused by the many zero-speed records that all map to the origin
# Scatter plots in matplotlib have performance issues; rendering is very slow for large datasets
df['x'] = df['speed'] * cos(df['dir'] * pi / 180.0)
df['y'] = df['speed'] * sin(df['dir'] * pi / 180.0)

2. Re-select Data and Overview

2.1 Data Overview

In [32]:
## Summary of the data selection
print('Knot unit?', knot_unit)
print('Report type used:', report_type_most_used)
print('Sampling time used:', sample_times)
if 'speed_redistribution_info' in globals():
    print('Speed redistribution info:', speed_redistribution_info )

df_all_years = df # for later across-year comparison
df = df_all_years['2006':'2015']
# df = df.query('(HrMn == 0) and (speed >= 0.5) and (date%10000 > 900) and (date%10000 < 1000)' )
df.describe()
Knot unit? True
Report type used: FM-15
Sampling time used: [20]
Speed redistribution info: Redistribute upward, e.g. 0 -> [0,1]
Out[32]:
date HrMn dir speed dir_windrose x y
count 8.640000e+04 86400.000000 86400.000000 86400.000000 86400.000000 86400.000000 86400.000000
mean 2.010585e+07 1171.596065 184.688148 6.791369 206.554780 -1.946480 -1.379277
std 2.864623e+04 690.812702 95.187640 4.161544 175.873185 6.016332 4.642646
min 2.006010e+07 20.000000 -4.986791 0.001042 0.000000 -38.398653 -25.648915
25% 2.008071e+07 620.000000 103.095277 3.877681 100.000000 -5.593108 -4.447410
50% 2.011011e+07 1220.000000 206.829021 5.772452 210.000000 -1.098342 -1.852899
75% 2.013070e+07 1820.000000 252.098174 8.575966 250.000000 2.346723 2.113300
max 2.015123e+07 2320.000000 354.986789 39.430212 999.000000 20.796761 19.250923
In [33]:
df.plot(y='speed',legend=True,figsize=(20,5))
Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0x115cc908>
In [34]:
# Accumulation by month
df.resample('M').count().plot(y='date', kind='bar',figsize=(20,4))
Out[34]:
<matplotlib.axes._subplots.AxesSubplot at 0x19aaa208>
In [35]:
# 90 degree is east
ax = WindroseAxes.from_ax()
viridis = plt.get_cmap('viridis')
ax.bar(df.dir_windrose, df.speed, normed=True, opening=0.8, edgecolor='white', nsector=36, cmap=viridis)
ax.set_legend()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)
In [36]:
if len(df) > 1000000:
    bins=arange(0,362)
    df['dir'].hist(bins=bins, normed=True,alpha=0.5,label='min')
    
    df = df_all_years.sample(n=500000, replace=True)    
    df['dir'].hist(bins=bins, normed=True,alpha=0.5,label='min resampled')
    plt_configure(legend=True, figsize=(20,4))
In [37]:
x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(df.speed)

# 1. Histogram comparison
fig = plt.figure()
df['speed'].hist(bins=arange(0, df.speed.max()), alpha=0.5, label='Data', normed=True)             
plot(x, y_weibull, '-', color='black',label='Weibull')   
plt_configure(figsize=(4,3),xlabel='V',ylabel='PDF', legend=True)

# 2. CDF comparison
fig = plt.figure()
plot(log(x), log(-log(1-y_ecdf)),'o', label='ECDF')
plot(log(x), log(-log(1-y_cdf_weibull)),'-', label='Weibull')
plt_configure(xlabel="ln(V)", ylabel="ln(-ln(1-P)",legend={'loc':'best'}, figsize=(4,3))
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:11: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:12: RuntimeWarning: divide by zero encountered in log
In [38]:
df.plot(kind='scatter', x='x', y='y', alpha=0.05, s=2)
plt.gca().set_aspect('equal')
plt_configure(figsize=(3.2,3.2),xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)

2.2 Overview by Direction

In [39]:
if len(effective_column) == 16:
    rebinned_angle = 22.5
else: 
    rebinned_angle = 10
In [40]:
%%time
original_incre, incre = SECTOR_LENGTH, rebinned_angle
start, end = -original_incre/2 + incre/2, 360

max_speed = df.speed.max()
max_count = max_count_for_angles(df, start, end, incre)
plot_range = [0, max_speed, 0, max_count*1.05]

for angle in arange(start, end, incre):
    start_angle, end_angle = angle-incre/2, angle+incre/2
    sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)   
    
    fig = plt.figure()
    sub_df['speed'].hist(bins=arange(0, max_speed), alpha=0.5, label='Data')
    title ='%s (%s - %s), %s' % (angle, start_angle, end_angle, len(sub_df)) 
    plt.axis(plot_range)
    plt_configure(figsize=(3,1.5), title=title)
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
Wall time: 8.35 s

2.3 Overview by Month

In [41]:
%%time
current_df = df.query('speed>=1')
for month in arange(1, 13): 
    sub_df = current_df[current_df.index.month == month]
    ax = WindroseAxes.from_ax()
    ax.bar(sub_df.dir_windrose, sub_df.speed, normed=True, opening=0.8, edgecolor='white', nsector=36, cmap=plt.get_cmap('viridis'))
    plt_configure(figsize=(3,3), title='Month: %s'%(month))
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)
Wall time: 18.2 s
In [42]:
df.describe()
Out[42]:
date HrMn dir speed dir_windrose x y
count 8.640000e+04 86400.000000 86400.000000 86400.000000 86400.000000 86400.000000 86400.000000
mean 2.010585e+07 1171.596065 184.688148 6.791369 206.554780 -1.946480 -1.379277
std 2.864623e+04 690.812702 95.187640 4.161544 175.873185 6.016332 4.642646
min 2.006010e+07 20.000000 -4.986791 0.001042 0.000000 -38.398653 -25.648915
25% 2.008071e+07 620.000000 103.095277 3.877681 100.000000 -5.593108 -4.447410
50% 2.011011e+07 1220.000000 206.829021 5.772452 210.000000 -1.098342 -1.852899
75% 2.013070e+07 1820.000000 252.098174 8.575966 250.000000 2.346723 2.113300
max 2.015123e+07 2320.000000 354.986789 39.430212 999.000000 20.796761 19.250923

3. Create input data and configuration

In [43]:
SPEED_SET = array(list(zip(df.x, df.y)))
if 'NUMBER_OF_GAUSSIAN' not in globals():
    NUMBER_OF_GAUSSIAN = 3
FIT_METHOD = 'square_error'
DEFAULT_BANDWIDTH = 1.5 if knot_unit else 0.7
fig_list = []
In [44]:
fit_limit = ceil(df['speed'].quantile(.95))
fitting_axis_range = arange(-fit_limit, fit_limit+1, 1)
print(fitting_axis_range)

FITTING_RANGE = []
for i in fitting_axis_range:
    for j in fitting_axis_range:
        FITTING_RANGE.append([i,j])
        
plot_limit = ceil(df['speed'].quantile(.95))
PLOT_AXIS_RANGE = arange(-plot_limit, plot_limit+1, 1)
[-16 -15 -14 -13 -12 -11 -10  -9  -8  -7  -6  -5  -4  -3  -2  -1   0   1
   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16]

4. Kernel Density Estimation

In [45]:
sample = SPEED_SET
KDE_KERNEL = 'gaussian'
# KDE_KERNEL, bandwidth = 'tophat', 1
In [46]:
%%time
if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWIDTH
    from sklearn.grid_search import GridSearchCV
    # from sklearn.model_selection import GridSearchCV  ## too slow

    # The selected bandwidth can sometimes be too extreme
    if knot_unit:
        bandwidth_range = arange(0.7,2,0.2)
    else:
        bandwidth_range = arange(0.4,1.1,0.1)

    # Grid search cannot handle too much data (it takes a long time)
    if len(sample) > 50000:    
        df_resample=df.sample(n=50000, replace=True)
        bandwidth_search_sample = array(list(zip(df_resample.x, df_resample.y)))
    else:
        bandwidth_search_sample = sample

    grid = GridSearchCV(neighbors.KernelDensity(kernel = KDE_KERNEL),
                    {'bandwidth': bandwidth_range}, n_jobs=-1, cv=4) 

    grid.fit(bandwidth_search_sample)
    bandwidth = grid.best_params_['bandwidth']
    
print(bandwidth)
0.9
Wall time: 0 ns
In [47]:
if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWIDTH

kde = neighbors.KernelDensity(bandwidth=bandwidth, kernel = KDE_KERNEL).fit(sample)

points = FITTING_RANGE
# very slow if the dataset is too large, e.g. 100,000
# kde returns log prob, need to convert it
kde_result = exp(kde.score_samples(points))
print('bandwidth:', bandwidth, len(kde_result))
print(kde_result[:5])
bandwidth: 0.9 1089
[  8.00485424e-05   1.24002059e-04   1.89606224e-04   2.49645743e-04
   3.08579248e-04]
In [48]:
# Plot jPDF
X = Y = PLOT_AXIS_RANGE
# Passing exp(kde.score_samples(...)) directly as generate_Z_from_X_Y(X, Y, ...) does not work; a lambda (or function) must be passed instead
# see http://stackoverflow.com/questions/21035437/passing-a-function-as-an-argument-in-python
kde_Z = generate_Z_from_X_Y(X,Y, lambda coords: exp(kde.score_samples(coords)))
colorbar_lim = 0, kde_Z.max()

plot_3d_prob_density(X,Y,kde_Z)

fig_kde,ax1 = plt.subplots(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, ax=ax1)

with sns.axes_style({'axes.grid' : False}):
    from matplotlib import ticker
    fig_hist,ax2 = plt.subplots(figsize=(3.5,2.5))
    _,_,_,image = ax2.hist2d(df.x, df.y, bins=PLOT_AXIS_RANGE, cmap='viridis',)
    ax2.set_aspect('equal')
    cb = plt.colorbar(image)
    tick_locator = ticker.MaxNLocator(nbins=6)
    cb.locator = tick_locator
    cb.update_ticks()
    plt_configure(ax=ax2, xlabel='x'+speed_unit_text,ylabel='y'+speed_unit_text)
align_figures()

4.1 Bootstrap GOF Limit
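
resampled_kde is a project helper; a minimal sketch of the bootstrap idea it implements, assuming each iteration resamples the records with replacement, refits a KDE with the same bandwidth and kernel, and scores its density on the fitting grid against the full-data KDE (the R² and K-S stand-ins below are generic; the notebook's own GOF metrics come from its helpers):

import numpy as np
from sklearn import neighbors

def resampled_kde_sketch(df, kde_result, config):
    # Bootstrap resample, refit the KDE, and compare against the reference density
    resampled = df.sample(n=len(df), replace=True)
    sample_xy = np.column_stack([resampled.x, resampled.y])
    kde = neighbors.KernelDensity(bandwidth=config['bandwidth'],
                                  kernel=config['kde_kernel']).fit(sample_xy)
    density = np.exp(kde.score_samples(config['fitting_range']))

    kde_result = np.asarray(kde_result)
    r_square = 1 - np.sum((kde_result - density) ** 2) / np.sum((kde_result - kde_result.mean()) ** 2)
    k_s = np.abs(np.cumsum(kde_result) / kde_result.sum()
                 - np.cumsum(density) / density.sum()).max()
    return {'R_square': r_square, 'K_S': k_s}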

In [49]:
kde_cdf = cdf_from_pdf(kde_result)
config = {'bandwidth': bandwidth, 
          'fitting_range': FITTING_RANGE,
          'fit_limit': fit_limit,
          'kde_kernel': KDE_KERNEL}
In [50]:
%%time
gof_kde=Parallel(n_jobs=-1)(delayed(resampled_kde)(df, kde_result, config) 
                                       for i in arange(20)) 
Wall time: 29.7 s
In [51]:
for gof_name in [ 'R_square', 'K_S','Chi_square']:
    plt.figure(figsize=(4,3))
    pd.DataFrame(gof_kde)[gof_name].hist()
    plt_configure(title=gof_name)
align_figures()

4.2 Bivariate Empirical Limit

In [52]:
%%time
gofs_mean_set_bivar = []
fig1, ax1 = plt.subplots(figsize=(3,2.5))
fig2, ax2 = plt.subplots(figsize=(3,2.5))

for year_length in [5, 10]:
    start_year, end_year = df_all_years.index.year[0], 2015-year_length+1
    df_standard = df_all_years[str(2015-year_length+1):'2015']
    speed_ = array(list(zip(df_standard.x, df_standard.y)))
    kde_current = neighbors.KernelDensity(bandwidth=bandwidth, kernel=KDE_KERNEL).fit(speed_)
    kde_result_standard = exp(kde_current.score_samples(points))
    gofs_bivar=Parallel(n_jobs=-1)(delayed(kde_gofs)(df_all_years, start_year, start_year+year_length-1, kde_result_standard, config) 
                                   for start_year in arange(start_year, end_year+1)) 
    gofs_bivar=pd.DataFrame(gofs_bivar, index=arange(start_year, end_year+1))
    if len(gofs_bivar)>0:
        gofs_bivar.plot(y='R_square', ax=ax1, label=year_length)
        gofs_bivar.plot(y='K_S', ax=ax2, label=year_length)
        year_lim = end_year-year_length-5, end_year-year_length
        gofs_mean = gofs_bivar.query('index >= @year_lim[0] & index <= @year_lim[1]').mean().to_dict()
        gofs_mean['year_lim']=year_lim
        gofs_mean_set_bivar.append(gofs_mean)
    
plt_configure(ax=ax1, ylabel='$\ R^2$', xlabel='start year')
plt_configure(ax=ax2, ylabel='K-S', xlabel='start year')
align_figures()
display(pd.DataFrame(gofs_mean_set_bivar).set_index('year_lim'))
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
year_lim
(2001, 2006) 9.139418e+06 0.034562 4.967776e-08 0.020889 0.245696 0.981110
(1991, 1996) 9.410803e+04 0.049701 1.276477e-07 0.033806 0.396092 0.948317
Wall time: 38.7 s

4.3 Univariate GOF Limit

In [53]:
def yearly_gof(df_all_years, start_year, end_year, density, y_ecdf, x, density_dir):
    df_previous = df_all_years[str(start_year):str(end_year)]
    # 1. Speed
    density_expected, _ = np.histogram(df_previous['speed'], bins=x, density=True)
    r_square = sector_r_square(density, density_expected)
    
    y_ecdf_previous = sm.distributions.ECDF(df_previous['speed'])(x)
    k_s = max(np.abs(y_ecdf - y_ecdf_previous))
    
    # 2. Direction
    density_dir_expected, _ = dir_hist(df_previous['dir'], bins=arange(-5,370,10), density=True)
    r_square_dir = sector_r_square(density_dir, density_dir_expected)
    return {'year': start_year, 'r_square': r_square, 'k_s': k_s, 'r_square_dir': r_square_dir}
In [54]:
%%time
x = arange(0, df.speed.max() + 1)
fig1, ax1 = plt.subplots(figsize=(2.8,2.5))
fig2, ax2 = plt.subplots(figsize=(2.8,2.5))
fig3, ax3 = plt.subplots(figsize=(2.8,2.5))
gofs_mean_set = []

for year_length in [5, 7, 10]:
    start_year, end_year =df_all_years.index.year[0], 2015-year_length+1
    df_standard = df_all_years[str(2015-year_length+1):str(2015)]
    density, _ = np.histogram(df_standard['speed'], bins=x, density=True)
    density_dir, _ = dir_hist(df_standard['dir'], bins=arange(-5,370,10), density=True)
    y_ecdf = sm.distributions.ECDF(df_standard.speed)(x)
    gofs = [yearly_gof(df_all_years, start_year, start_year+year_length-1, density, y_ecdf, x, density_dir) 
            for start_year in arange(start_year, end_year+1)]
    if len(gofs)>0:
        gofs = pd.DataFrame(gofs)
        gofs.set_index(['year'], inplace=True)  
        ax1.plot(gofs.r_square, label=year_length)
        ax2.plot(gofs.k_s, label=year_length)
        ax3.plot(gofs.r_square_dir, label=year_length)
        year_lim = end_year-year_length-5, end_year-year_length
        gofs_mean = gofs.query('index >= @year_lim[0] & index <= @year_lim[1]').mean().to_dict()
        gofs_mean['year_lim']=year_lim
        gofs_mean_set.append(gofs_mean)
plt.tight_layout()

plt_configure(ax=ax1, ylabel='$\ R^2$', tight='xtight', legend=True)
plt_configure(ax=ax2, ylabel='K-S', tight='xtight', legend=True)
plt_configure(ax=ax3, ylabel='$\ R^2$', tight='xtight', legend=True)
align_figures()
display(pd.DataFrame(gofs_mean_set).set_index('year_lim'))
k_s r_square r_square_dir
year_lim
(2001, 2006) 0.045466 0.991445 0.911881
(1997, 2002) 0.042175 0.992234 0.721662
(1991, 1996) 0.021687 0.996530 0.576464
Wall time: 1.32 s

5. GMM by Expectation-maximization

In [55]:
sample=SPEED_SET
clf = mixture.GaussianMixture(n_components=NUMBER_OF_GAUSSIAN, covariance_type='full')
clf.fit(sample)
print(clf.converged_)
True
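
read_gmm_em_result and pretty_print_gmm are project helpers; a minimal sketch of the parameter extraction the first one presumably performs on a fitted scikit-learn GaussianMixture with covariance_type='full', producing the (weight, mean_x, mean_y, sig_x, sig_y, corr) values shown in the table below:

import numpy as np

def read_gmm_em_result_sketch(clf):
    # Flatten a fitted GaussianMixture into [weight, mean_x, mean_y, sig_x, sig_y, corr] per component
    params = []
    for weight, mean, cov in zip(clf.weights_, clf.means_, clf.covariances_):
        sig_x, sig_y = np.sqrt(cov[0, 0]), np.sqrt(cov[1, 1])
        corr = cov[0, 1] / (sig_x * sig_y)
        params.extend([weight, mean[0], mean[1], sig_x, sig_y, corr])
    return np.array(params)
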
In [56]:
gmm_em_result = read_gmm_em_result(clf)
pretty_print_gmm(gmm_em_result)
Out[56]:
weight mean_x mean_y sig_x sig_y corr
1 0.319 0.626 -3.233 2.820 2.183 0.325
2 0.277 2.987 3.638 3.704 2.685 0.070
3 0.233 -5.384 -1.683 2.882 3.792 0.191
4 0.171 -10.041 -5.640 5.468 4.517 0.257
In [57]:
fig,ax = plt.subplots(figsize=(3.5,3.5))
plot_gmm_ellipses(gmm_em_result, ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
GMM Plot Result
0.318577216602 [[ 0.62630225 -3.23285053]] [ 1.9489722   2.98585641] -64.2518494058
0.277016421177 [[ 2.98730136  3.63822642]] [ 2.67163829  3.71338702] -83.9770661341
0.233422494675 [[-5.38385875 -1.68271483]] [ 2.76801562  3.87597629] 162.772003755
0.170983867547 [[-10.04084984  -5.64047207]] [ 4.14922288  5.75151438] -63.3888448453
In [58]:
X = Y = PLOT_AXIS_RANGE
pdf_Z = generate_Z_from_X_Y(X,Y, lambda coords: exp(clf.score_samples(coords)))

def residual_between_kde_and_gmm(points):
    kde_vals = exp(kde.score_samples(points))
    gmm_vals = exp(clf.score_samples(points))
    return kde_vals - gmm_vals

residual_Z = generate_Z_from_X_Y(X,Y, residual_between_kde_and_gmm)

plot_3d_prob_density(X,Y,pdf_Z)
plot_3d_prob_density(X,Y,residual_Z)
align_figures()

fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar_lim=colorbar_lim)
fig_em = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,pdf_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar_lim=colorbar_lim)
fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,residual_Z,
                     xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
align_figures()

Goodness-of-fit Statistics
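
The table below comes from the gof_df helper, which compares the fitted GMM density with the KDE density evaluated on the same FITTING_RANGE grid. With grid densities $k_i$ (KDE) and $\hat p_i$ (GMM), plausible definitions consistent with the R_square and MSE columns are shown below; K_S is presumably the maximum absolute difference between the corresponding discretised CDFs, and the exact formulas live in the helper.

$$ R^2 = 1 - \frac{\sum_i (k_i - \hat p_i)^2}{\sum_i (k_i - \bar k)^2}, \qquad \mathrm{MSE} = \frac{1}{N}\sum_i (k_i - \hat p_i)^2 $$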

In [59]:
points = FITTING_RANGE
gmm_pdf_result = exp(clf.score_samples(points))
gof_df(gmm_pdf_result, kde_result)
Out[59]:
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.976 0.013 0.090 5.862592e-08 0.023 0.269
In [60]:
gmm_em = group_gmm_param_from_gmm_param_array(gmm_em_result, sort_group = True)
mixed_model_pdf_em = generate_gmm_pdf_from_grouped_gmm_param(gmm_em)

6. GMM by Optimization
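
Instead of EM, the GMM parameters are fitted here by constrained optimisation against the KDE reference: starting from the EM solution x0, sp.optimize.minimize adjusts the flattened (weight, mean_x, mean_y, sig_x, sig_y, rho) vector to minimise GMM_fit_score, subject to the component weights summing to one. GMM_fit_score is a project helper; with FIT_METHOD = 'square_error' it presumably behaves like the sketch below (log of the mean squared error against the KDE densities, which would be consistent with the result.fun value reported in In [62] and the MSE in section 6.2):

import numpy as np
from scipy.stats import multivariate_normal

def gmm_fit_score_sketch(params, kde_result, points, fit_method='square_error'):
    # Hypothetical stand-in for the notebook's GMM_fit_score helper:
    # evaluate the candidate mixture on the fitting grid and score it against the KDE
    points = np.asarray(points, dtype=float)
    density = np.zeros(len(points))
    for w, mx, my, sx, sy, rho in np.asarray(params).reshape(-1, 6):
        cov = [[sx**2, rho*sx*sy], [rho*sx*sy, sy**2]]
        density += w * multivariate_normal.pdf(points, mean=[mx, my], cov=cov)
    if fit_method == 'square_error':
        return np.log(np.mean((density - np.asarray(kde_result)) ** 2))
    raise ValueError(fit_method)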

In [61]:
sample = SPEED_SET
points = FITTING_RANGE
max_speed = df.speed.max()
print(FIT_METHOD)
square_error
In [62]:
# Initial guess taken from the GMM fitted by EM
# GMM format: weight, meanx, meany, sigx, sigy, rho
x0 = gmm_em_result

cons = [
        # sum of every 6th element, which is the fraction of each gaussian
        {'type': 'eq', 'fun': lambda x: sum(x[::6]) - 1},
        # # limit the width/height ratio of the ellipse, optional
#         {'type': 'ineq', 'fun': lambda x: width_height_ratios_set(x) - 1/3},
#         {'type': 'ineq', 'fun': lambda x: 3 - width_height_ratios_set(x)},
]

param_bounds = [(0., 0.99),(-fit_limit, fit_limit),(-fit_limit, fit_limit),
                (0., fit_limit),(0., fit_limit),(-0.99, 0.99)]*(len(x0)//6)

result = sp.optimize.minimize(
    lambda params: GMM_fit_score(params, kde_result, points, FIT_METHOD),
    x0,
    bounds = param_bounds,
    constraints=cons,
    tol = 0.000000000001,
    options = {"maxiter": 500})
result
Out[62]:
     fun: -17.459085085829351
     jac: array([  3.06992292e+00,   4.76837158e-07,   4.76837158e-07,
         2.38418579e-07,   2.38418579e-07,  -2.38418579e-07,
         3.06992459e+00,   2.38418579e-07,   0.00000000e+00,
         2.38418579e-07,  -2.38418579e-07,  -2.38418579e-07,
         3.06991625e+00,   2.38418579e-07,  -2.38418579e-07,
         4.76837158e-07,   7.15255737e-07,   0.00000000e+00,
         3.06992579e+00,   0.00000000e+00,   0.00000000e+00,
        -2.38418579e-07,   0.00000000e+00,  -4.76837158e-07,
         0.00000000e+00])
 message: 'Optimization terminated successfully.'
    nfev: 1739
     nit: 66
    njev: 66
  status: 0
 success: True
       x: array([ 0.14615741,  1.58485346, -2.73907552,  1.92697508,  1.75687609,
        0.38770181,  0.37896714, -5.34074838, -4.62861409,  6.32093165,
        3.31964506,  0.36410206,  0.2688106 ,  2.79332753,  3.65136353,
        3.54455637,  2.58694109, -0.16396335,  0.20606485, -3.93590089,
       -1.18692852,  3.18843986,  3.55531158, -0.33038242])

6.1 GMM Result

In [63]:
gmm = group_gmm_param_from_gmm_param_array(result.x, sort_group = True)
mixed_model_pdf = generate_gmm_pdf_from_grouped_gmm_param(gmm)
gmm_pdf_result = mixed_model_pdf(points)
pretty_print_gmm(gmm)
Out[63]:
weight mean_x mean_y sig_x sig_y corr
1 0.379 -5.341 -4.629 6.321 3.320 0.364
2 0.269 2.793 3.651 3.545 2.587 -0.164
3 0.206 -3.936 -1.187 3.188 3.555 -0.330
4 0.146 1.585 -2.739 1.927 1.757 0.388
In [64]:
fig_gmm, ax = plt.subplots(figsize=(3.5,3.5))
plot_gmm_ellipses(gmm, ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
GMM Plot Result
0.378967142322 [[-5.34074838 -4.62861409]] [ 3.02102627  6.46897369] -76.0807538496
0.268810596731 [[ 2.79332753  3.65136353]] [ 2.5158854  3.5953393] -103.558835983
0.206064854237 [[-3.93590089 -1.18692852]] [ 2.73111714  3.91757429] -144.139281083
0.146157406709 [[ 1.58485346 -2.73907552]] [ 1.43195659  2.17929964] -51.7127366459

6.2 Bivariate Goodness-of-fit statistics

In [65]:
gof_df(gmm_pdf_result, kde_result)
Out[65]:
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.989 0.013 0.669 2.615867e-08 0.015 0.180
In [66]:
pd.DataFrame(gofs_mean_set_bivar).set_index('year_lim')
Out[66]:
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
year_lim
(2001, 2006) 9.139418e+06 0.034562 4.967776e-08 0.020889 0.245696 0.981110
(1991, 1996) 9.410803e+04 0.049701 1.276477e-07 0.033806 0.396092 0.948317
In [67]:
X = Y = PLOT_AXIS_RANGE
pdf_Z = generate_Z_from_X_Y(X,Y, mixed_model_pdf)# passing a function as an argument

def residual_between_kde_and_gmm(points):
    kde_vals = exp(kde.score_samples(points))
    gmm_vals = mixed_model_pdf(points)
    return kde_vals - gmm_vals

residual_Z = generate_Z_from_X_Y(X,Y, residual_between_kde_and_gmm)

plot_3d_prob_density(X,Y,pdf_Z)
plot_3d_prob_density(X,Y,residual_Z)
align_figures()

fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
fig_gmm = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,pdf_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,residual_Z,  xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
align_figures()

6.3 Univariate Goodness-of-fit
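
The functions f and f_em below express the fitted Cartesian joint pdf in polar coordinates; the extra factor $V$ is the Jacobian of the change of variables from $(x, y)$ to $(V, \theta)$:

$$ f_{V,\Theta}(V,\theta) = f_{X,Y}(V\cos\theta,\, V\sin\theta)\, V, \qquad f_V(V) = \int_0^{2\pi} f_{X,Y}(V\cos\theta,\, V\sin\theta)\, V\, d\theta $$

The nquad integrations in the following cells evaluate exactly these integrals over intervals of $V$ and sectors of $\theta$.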

In [68]:
def f(V,theta):
    return (mixed_model_pdf([[V*cos(theta),V*sin(theta)]]))*V

def f_em(V,theta):
    return (mixed_model_pdf_em([[V*cos(theta),V*sin(theta)]]))*V
In [69]:
%%time
x = arange(0, max_speed, 0.5)
# 1. Fit Weibull
_, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(df.speed, x=x)

# 2. GMM Model
y_ =[integrate.nquad(f, [[x_-0.01, x_+0.01],[0, 2*pi]]) for x_ in x]
y_gmm = array(list(zip(*y_))[0])/0.02

# 3. Plot Comparison
df['speed'].hist(bins=arange(0, df.speed.max()), alpha=0.5, label='Data')
plot(x, y_gmm*len(df.speed),'-', color='black', label='GMM')
plot(x, y_weibull*len(df.speed), '--', color='black', label='Weibull') 
print('Speed Distribution Comparison')
plt_configure(xlabel='Speed'+speed_unit_text,
              ylabel='Frequency',legend=True, figsize=(4, 2))
plt.gca().set_ylim(bottom = 0)
plt.tight_layout()
plt.locator_params(axis='y', nbins=5)

# 4. R square for GMM, Weibull
print(R_square_for_speed(df['speed'], f, weibull_params, f_em))
Speed Distribution Comparison
(0.98198880665470267, 0.90533873684142596, 0.97753624137698691)
Wall time: 59 s
In [70]:
%%time
y_ = [integrate.nquad(f, [[0, x_val],[0, 2*pi]]) for x_val in x]
y_cdf_gmm = array(list(zip(*y_))[0])

# 5.2. CDF Comparison
plot(x, y_ecdf,'o', alpha=0.8, label='Data')
plot(x, y_cdf_gmm,'-', color='black',label='GMM')
plot(x, y_cdf_weibull,'--', color='black',label='Weibull')
plt_configure(xlabel = "V", ylabel='P', legend=True, figsize=(4,3))

plt.figure()
plot(log(x), log(-log(1-y_ecdf)),'o', label = 'Empirical')
plot(log(x), log(-log(1-y_cdf_weibull)),'--', label = 'Weibull')
plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black', label = 'GMM')
plt_configure(xlabel='ln(V)',ylabel='ln(-ln(1-P))',legend={'loc':'best'}, figsize=(4,3))
align_figures()

cdf_diff, cdf_diff_weibull= np.abs(y_ecdf - y_cdf_gmm), np.abs(y_ecdf - y_cdf_weibull)
print(cdf_diff.max(), cdf_diff_weibull.max()) 
print(x[cdf_diff.argmax()], x[cdf_diff_weibull.argmax()])
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:11: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:12: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:13: RuntimeWarning: divide by zero encountered in log
0.0224296017124 0.057969842225
2.0 7.0
Wall time: 59.2 s
In [71]:
# Calculate Angle Distribution
x = linspace(0,2*pi, num=36+1)
y_ =[integrate.nquad(f, [[0, inf],[x_-pi/36, x_+pi/36]]) for x_ in x]
y = array(list(zip(*y_))[0])
density, _ = dir_hist(df['dir'], bins=arange(-5, 370, 10), density=True)

plt.bar(arange(0, 360, 10), density*10*len(df['dir']), width=10, alpha=0.5, label='Data')
plot(x/pi*180, y*len(df['dir']) ,'-', color='black', label='GMM')
plt_configure(xlabel='Direction'+dir_unit_text, ylabel='Frequency', 
              legend={'loc': 'best'} ,tight='xtight',figsize = (4,2))
plt.tight_layout()
dir_fig = plt.gcf()
print('Direction Distribution Comparison')
sector_r_square(density*10, y[:-1])
Direction Distribution Comparison
Out[71]:
0.91845309681583098
In [72]:
pd.DataFrame(gofs_mean_set).set_index('year_lim')
Out[72]:
k_s r_square r_square_dir
year_lim
(2001, 2006) 0.045466 0.991445 0.911881
(1997, 2002) 0.042175 0.992234 0.721662
(1991, 1996) 0.021687 0.996530 0.576464

6.4 Sectoral Comparison

In [73]:
%%time
incre = max(SECTOR_LENGTH, 10)
density_collection=Parallel(n_jobs=-1)(delayed(direction_compare)(gmm, df, angle, incre) 
                                        for angle in arange(0, 360, incre))  
# This R square is computed as in the paper:
# "Comparison of bivariate distribution construction approaches for analysing wind speed and direction data"
# http://onlinelibrary.wiley.com/doi/10.1002/we.400/full
print(true_R_square(density_collection))
0.9262141629
Wall time: 11.6 s
In [74]:
# Running this in parallel would be ~50% faster
# Calculate Speed Distribution
def model_data_comparison(df, original_incre = 10, incre = 10):
    start, end = -original_incre/2 + incre/2, 360
    curve_collection = []
    max_speed = df.speed.max()
    
    # Find a max count for plotting histogram
    max_count = max_count_for_angles(df, start, end, incre)
    plot_range = [0, max_speed, 0, max_count*1.05]
    
    for angle in arange(start, end, incre):
        angle_radian, incre_radian = np.radians([angle, incre])  
        start_angle, end_angle = angle-incre/2, angle+incre/2
        
        # 0. Select data from observation
        sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)
        data_size = len(sub_df.speed)
        # 1. Get Weibull and ECDF
        x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(sub_df.speed)
        # 2. Get GMM PDF, CDF
        _, y_gmm, y_cdf_gmm, direction_prob = gmm_integration_in_direction(f, angle_radian-incre_radian/2, angle_radian+incre_radian/2, x)
        
        # 3. R square for GMM, Weibull
        bins = arange(0, sub_df.speed.max()+1)
        density, _ = np.histogram(sub_df['speed'], bins=bins, normed=True)
        density_expected_gmm_ =[integrate.nquad(f, [[x_, x_+1],[angle_radian-incre_radian/2, angle_radian+incre_radian/2]]) 
                            for x_ in bins[:-1]]
        density_expected_gmm = array(list(zip(*density_expected_gmm_ ))[0])/direction_prob
        R_square_gmm = sector_r_square(density, density_expected_gmm)
        
        density_expected_weibull = sp.stats.weibull_min.cdf(bins[1:], *weibull_params) - sp.stats.weibull_min.cdf(bins[:-1], *weibull_params) 
        R_square_weibull = sector_r_square(density, density_expected_weibull)

        # 4. K-S for GMM, Weibull
        cdf_diff, cdf_diff_weibull= np.abs(y_ecdf - y_cdf_gmm), np.abs(y_ecdf - y_cdf_weibull)
                
        # 5. Make Plots
        fig = plt.figure(figsize=(10,1.9))
        # 5.1. Frequency Comparison
        ax1 = fig.add_subplot(1,3,1)        
        sub_df['speed'].hist(bins=arange(0, sub_max_speed), alpha=0.5, label='Data')                  
        plot(x, y_gmm*data_size,'-', color='black', label='GMM')
        plot(x, y_weibull*data_size, '--', color='black',label='Weibull')   
        plt_configure(xlabel = "V", ylabel='Frequency', legend=True)
        plt.axis(plot_range)
        
        # 5.2. CDF Comparison
        ax2 = fig.add_subplot(1,3,2)
        plot(x, y_ecdf,'o', alpha=0.8, label='Data')
        plot(x, y_cdf_gmm,'-', color='black',label='GMM')
        plot(x, y_cdf_weibull,'--', color='black',label='Weibull')
        plt.gca().set_xlim(right = max_speed)
        plt_configure(xlabel = "V", ylabel='P', legend=True)
        
        curves = {'direction': angle, 'datasize': data_size, 'weight': direction_prob, 'x': x, 
                  'gmm_pdf': y_gmm, 'gmm_cdf': y_cdf_gmm,
                  'weibull_pdf': y_weibull, 'weibull_cdf': y_cdf_weibull, 'ecdf': y_ecdf,
                  'max_cdf_diff_gmm': cdf_diff.max(), 'max_cdf_diff_weibull': cdf_diff_weibull.max(), 
                  'r_square_gmm': R_square_gmm, 'r_square_weibull': R_square_weibull}
        curve_collection.append(curves)
        
        plt.tight_layout()
        plt.show()
        print('%s (%s - %s) degree' % (angle, start_angle, end_angle))
        print('data size:', len(sub_df), 'weight', len(sub_df)/len(df))
        print('GMM', 'Weibull')
        print('R square', R_square_gmm,  R_square_weibull)
        print('max diff:', cdf_diff.max(), cdf_diff_weibull.max(), 
              'speed value:', x[cdf_diff.argmax()], x[cdf_diff_weibull.argmax()], 'y gmm', y_cdf_gmm[cdf_diff.argmax()])
        print(' ')
    return curve_collection
In [75]:
%%time
if len(effective_column) == 16:
    rebinned_angle = 22.5
else: 
    rebinned_angle = 20
    
curve_collection = model_data_comparison(df, SECTOR_LENGTH, rebinned_angle)
5.0 (-5.0 - 15.0) degree
data size: 2719 weight 0.031469907407407405
GMM Weibull
R square 0.903299974879 0.933308434914
max diff: 0.0800030875027 0.0572249265947 speed value: 7.03484373418 3.01493302893 y gmm 0.677260612387
 
25.0 (15.0 - 35.0) degree
data size: 5229 weight 0.060520833333333336
GMM Weibull
R square 0.947731552504 0.954545572495
max diff: 0.0677978899277 0.0346742756716 speed value: 9.66795359012 7.25096519259 y gmm 0.887266239517
 
45.0 (35.0 - 55.0) degree
data size: 4934 weight 0.05710648148148148
GMM Weibull
R square 0.95571847187 0.950316354164
max diff: 0.0642426854665 0.0368400185821 speed value: 9.34387330932 7.00790498199 y gmm 0.907371992317
 
65.0 (55.0 - 75.0) degree
data size: 4233 weight 0.048993055555555554
GMM Weibull
R square 0.970761840798 0.976884972092
max diff: 0.0575818537394 0.0248247016266 speed value: 7.44524510896 3.30899782621 y gmm 0.784017006114
 
85.0 (75.0 - 95.0) degree
data size: 2993 weight 0.0346412037037037
GMM Weibull
R square 0.934253686302 0.950088476599
max diff: 0.0461398328308 0.0454183418769 speed value: 6.52826634412 6.52826634412 y gmm 0.703275469541
 
105.0 (95.0 - 115.0) degree
data size: 2137 weight 0.024733796296296295
GMM Weibull
R square 0.909996883429 0.953427971267
max diff: 0.0684885336365 0.0382573228409 speed value: 4.95830873231 4.95830873231 y gmm 0.486963033982
 
125.0 (115.0 - 135.0) degree
data size: 2125 weight 0.02459490740740741
GMM Weibull
R square 0.927300967414 0.944020813311
max diff: 0.0671882082454 0.0546412384179 speed value: 2.97173800159 5.94347600318 y gmm 0.202247031775
 
145.0 (135.0 - 155.0) degree
data size: 2401 weight 0.027789351851851853
GMM Weibull
R square 0.980574073634 0.966091439639
max diff: 0.0299518078376 0.0390285204042 speed value: 6.38401833044 7.98002291306 y gmm 0.604367225898
 
165.0 (155.0 - 175.0) degree
data size: 3834 weight 0.044375
GMM Weibull
R square 0.963815177 0.95085417013
max diff: 0.0586732837589 0.0413224472374 speed value: 10.3029858824 6.86865725495 y gmm 0.915219971292
 
185.0 (175.0 - 195.0) degree
data size: 6256 weight 0.0724074074074074
GMM Weibull
R square 0.981776511479 0.984835564186
max diff: 0.0400305054579 0.0190498307782 speed value: 8.28459153303 8.28459153303 y gmm 0.59821464868
 
205.0 (195.0 - 215.0) degree
data size: 11551 weight 0.13369212962962962
GMM Weibull
R square 0.990681155407 0.990381849228
max diff: 0.0116941199229 0.0120742596667 speed value: 20.7527430646 2.07527430646 y gmm 0.967022662906
 
225.0 (215.0 - 235.0) degree
data size: 11054 weight 0.1279398148148148
GMM Weibull
R square 0.97344897415 0.954142760222
max diff: 0.0437277500601 0.0344046885782 speed value: 15.1233892655 9.45211829091 y gmm 0.926123262997
 
245.0 (235.0 - 255.0) degree
data size: 5418 weight 0.06270833333333334
GMM Weibull
R square 0.979964923459 0.978963716849
max diff: 0.0264043516621 0.0204319080259 speed value: 3.92683871781 9.16262367489 y gmm 0.222026803746
 
265.0 (255.0 - 275.0) degree
data size: 5340 weight 0.06180555555555556
GMM Weibull
R square 0.965527497337 0.961545983288
max diff: 0.0553858645427 0.0387522215433 speed value: 5.1371552508 2.05486210032 y gmm 0.54124334894
 
285.0 (275.0 - 295.0) degree
data size: 5210 weight 0.060300925925925924
GMM Weibull
R square 0.914589515392 0.89461030797
max diff: 0.10020128117 0.0627563096873 speed value: 4.06914951683 2.03457475841 y gmm 0.447783363744
 
305.0 (295.0 - 315.0) degree
data size: 4320 weight 0.05
GMM Weibull
R square 0.945554525419 0.879779604748
max diff: 0.0568036602628 0.0655799416031 speed value: 3.95135443117 1.97567721559 y gmm 0.480233376774
 
325.0 (315.0 - 335.0) degree
data size: 3266 weight 0.037800925925925925
GMM Weibull
R square 0.963453105519 0.86658893413
max diff: 0.0294178635777 0.0596800970511 speed value: 1.86635216441 4.66588041103 y gmm 0.0833064122611
 
345.0 (335.0 - 355.0) degree
data size: 2697 weight 0.03121527777777778
GMM Weibull
R square 0.873060793509 0.876845553338
max diff: 0.116180339816 0.0678088719625 speed value: 4.94338728813 4.94338728813 y gmm 0.591272385434
 
Wall time: 1min 21s
In [76]:
diff_df = pd.DataFrame(curve_collection) 

gmm_mean, weibull_mean = plot_sectoral_comparison(diff_df.r_square_gmm, diff_df.r_square_weibull, 
                                                  diff_df.direction, diff_df.datasize)
plt_configure(ylabel="$\ R^2$", xlabel='Direction'+dir_unit_text)
ylim = min(plt.gca().get_ylim()[0],0.75)
plt.gca().set_ylim(top=1, bottom=ylim)
plt.tight_layout()
print(gmm_mean, weibull_mean)
0.9585012192583655 0.9502059223140222
In [77]:
gmm_mean, weibull_mean = plot_sectoral_comparison(diff_df.max_cdf_diff_gmm, diff_df.max_cdf_diff_weibull, 
                                                  diff_df.direction, diff_df.datasize)
plt_configure(ylabel="K-S", xlabel='Direction'+dir_unit_text)
ylim = max(plt.gca().get_ylim()[1],0.25)
plt.gca().set_ylim(top=ylim, bottom=0)
plt.tight_layout()
print(gmm_mean, weibull_mean)
0.05064821620094836 0.03679963920141404
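As a cross-check on the sector-averaged scores returned by plot_sectoral_comparison, the data-size-weighted means can be recomputed directly from diff_df. A minimal sketch, using only the columns stored in curve_collection above (the numbers should match the printed gmm_mean/weibull_mean values if the helper weights by datasize):
In [ ]:
# Sketch: data-size-weighted sector averages, recomputed directly from diff_df.
for col in ['r_square_gmm', 'r_square_weibull', 'max_cdf_diff_gmm', 'max_cdf_diff_weibull']:
    print(col, np.average(diff_df[col], weights=diff_df.datasize))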
In [78]:
# Compare direction weight with previous figure
display(dir_fig)

6.5 Investigation of the Insufficiently Fitted Sector

(1) Data Variability, by Bootstrap Resampling

In [79]:
angle =  max_diff_angle = diff_df.ix[diff_df['max_cdf_diff_gmm'].idxmax()]['direction']
incre = rebinned_angle
In [80]:
FRACTION = 1

# Select data from observation
start_angle, end_angle = angle-incre/2, angle+incre/2
angle_radian, incre_radian = radians(angle), radians(incre)  
sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)
In [81]:
x = arange(0, sub_max_speed, 0.5)
_, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(sub_df.speed, x)
_, y_gmm, y_cdf_gmm, direction_prob = gmm_integration_in_direction(f, angle_radian-incre_radian/2, angle_radian+incre_radian/2, x)

fig = plt.figure(figsize=(10,1.9))
ax1 = fig.add_subplot(1,3,1)   
ax2 = fig.add_subplot(1,3,2)   
ax3 = fig.add_subplot(1,3,3)   

# 1. Data
bins=arange(0, sub_max_speed)
sub_df['speed'].hist(ax=ax1, bins=bins, alpha=0.5, label='Data', normed=True)  

# 2. GMM
ax1.plot(x, y_gmm,'-', color='black', label='GMM')
ax2.plot(x, y_cdf_gmm,'-', color = 'black', label='GMM')
ax3.plot(log(x), log(-log(1-y_cdf_gmm)),'-', color = 'black',label='GMM')

# 3. Weibull
ax1.plot(x, y_weibull,'--',color='black',label='Weibull')
ax2.plot(x, y_cdf_weibull,'--',label='Weibull')
ax3.plot(log(x), log(-log(1-y_cdf_weibull)),'--',label='Weibull')

# 4. Data Resampled
count_collection = []
for i in range(1,100):
    sub_df_resampled = sub_df.sample(frac=FRACTION, replace=True)    
    resampled_count, _ = np.histogram(sub_df_resampled['speed'], bins=bins, normed=True) 
    count_collection.append(resampled_count)
    
    ecdf = sm.distributions.ECDF(sub_df_resampled.speed)
    y_ecdf = ecdf(x) 
    ax2.plot(x, y_ecdf,':', label='Data Resampled')
    ax3.plot(log(x), log(-log(1-y_ecdf)),':', label='Data Resampled')
    if i == 1: 
#         plt_configure(ax=ax2, xlabel = "$V$", ylabel='$P$', legend={'loc':'best'})
#         plt_configure(ax=ax3, xlabel="ln($V$)", ylabel="ln(-ln(1-$P$)",legend={'loc':'best'})
        plt_configure(ax=ax2, xlabel = "V", ylabel='P', legend={'loc':'best'})
        plt_configure(ax=ax3, xlabel="ln(V)", ylabel="ln(-ln(1-P)",legend={'loc':'best'})

print('%s (%s - %s) Degree Speed Distribution' % (angle, start_angle, end_angle))
count_collection = np.array(count_collection)
mx, mn = np.max(count_collection,0), np.min(count_collection,0)
ax1.plot(bins[1:]-0.5, mx, ':', color='blue')
ax1.plot(bins[1:]-0.5, mn, ':', color='blue', label='Resample limit')
ax1.set_ylim(bottom = 0)
# plt_configure(ax=ax1, xlabel='$V$',ylabel='Frequency',legend={'loc':'best'})
plt_configure(ax=ax1, xlabel='V', ylabel='Frequency',legend={'loc':'best'})
ax1.locator_params(axis='y', nbins=5)
ax2.locator_params(axis='y', nbins=5)
ax3.locator_params(axis='y', nbins=5)
plt.tight_layout()
# Note: y_ecdf here is the ECDF of the last bootstrap resample (the loop above overwrites it),
# so this is that resample's maximum CDF difference against the GMM, not the original data's.
diff = abs(y_ecdf - y_cdf_gmm)
print(diff.max(), x[diff.argmax()], y_cdf_gmm[diff.argmax()])
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:17: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:22: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:34: RuntimeWarning: divide by zero encountered in log
345.0 (335.0 - 355.0) Degree Speed Distribution
0.119370631716 6.0 0.730462516226
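The resampling above visualises the spread of the histogram and ECDF; a complementary check is the spread of the K-S-type statistic itself. A minimal sketch, reusing sub_df, x, y_cdf_gmm and FRACTION from the cells above:
In [ ]:
# Sketch: bootstrap spread of max |ECDF - GMM CDF| for the worst-fitting sector.
# sm is statsmodels.api, as imported earlier in the notebook.
ks_resampled = []
for _ in range(100):
    resampled_speed = sub_df.sample(frac=FRACTION, replace=True).speed
    y_ecdf_resampled = sm.distributions.ECDF(resampled_speed)(x)
    ks_resampled.append(np.abs(y_ecdf_resampled - y_cdf_gmm).max())
ks_resampled = np.array(ks_resampled)
print('max CDF diff over resamples: mean %.4f, std %.4f, 95%% range (%.4f, %.4f)' %
      (ks_resampled.mean(), ks_resampled.std(),
       np.percentile(ks_resampled, 2.5), np.percentile(ks_resampled, 97.5)))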

(2) Time Variability

In [82]:
fig_time_variability_3d = plt.figure()
ax1 = fig_time_variability_3d.gca(projection='3d')

fig_time_variability_cdf,ax2 = plt.subplots(figsize=(3,1.8))
fig_time_variability_weibull, ax3 = plt.subplots(figsize=(3,1.8))

ax2.plot(x, y_cdf_gmm,'-', color='black', label = 'GMM')
ax2.plot(x, y_cdf_weibull,'--', label='Weibull')

ax3.plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black',label='GMM')
ax3.plot(log(x), log(-log(1-y_cdf_weibull)), '--', label='Weibull')

# 3. Data
prop_cycle=iter(mpl.rcParams['axes.color_cycle'])
for start_time in range(2001, 2015, 5):
    end_time = start_time + 4 
    df_other_years = df_all_years[str(start_time):str(end_time)]
    df_other_years_at_angle, sub_max_speed_other_year = select_df_by_angle(df_other_years, start_angle, end_angle)
    if len(df_other_years_at_angle) > 0 :
        
        ecdf = sm.distributions.ECDF(df_other_years_at_angle.speed)
        y_ecdf = ecdf(x)
        ax2.plot(x, y_ecdf,':', label = start_time)
        ax3.plot(log(x), log(-log(1-y_ecdf)),':', label = start_time)
        
        count, division = np.histogram(df_other_years_at_angle['speed'], normed=True,
                                       bins=arange(0, sub_max_speed_other_year))
        ax1.bar(left=division[:-1], height=count, zs=start_time, zdir='x', 
                color=next(prop_cycle), alpha=0.8)
        x_3d = start_time*np.ones_like(x)
        ax1.plot(x_3d, x, y_gmm, '-', color='black', label='GMM'  if start_time == 2011 else '')
        ax1.plot(x_3d, x, y_weibull, '--', color='blue', label='Weibull' if start_time == 2011 else '')
        
print('%s (%s - %s) Degree Speed Distribution' % (angle, start_angle, end_angle))
ax1.set_ylim(bottom = 0)
ax1.set_zlabel('Frequency')
plt_configure(ax=ax1, xlabel='Time',ylabel='V', legend=True)
plt_configure(ax=ax2, xlabel = "V", ylabel='P', legend={'loc':'best'})
plt_configure(ax=ax3, xlabel="ln(V)", ylabel="ln(-ln(1-P)", legend={'loc':'best'})

ax1.set_zlim(bottom = 0)
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:10: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:11: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:24: RuntimeWarning: divide by zero encountered in log
345.0 (335.0 - 355.0) Degree Speed Distribution
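To quantify the time variability shown above, each five-year slice can be tested against the full record with a two-sample Kolmogorov-Smirnov test. A minimal sketch, assuming scipy is available and reusing df_all_years, start_angle and end_angle from the cell above:
In [ ]:
# Sketch: two-sample K-S test of each period's sectoral speeds against the full record.
from scipy.stats import ks_2samp

full_speed = select_df_by_angle(df, start_angle, end_angle)[0].speed
for start_time in range(2001, 2015, 5):
    df_period = df_all_years[str(start_time):str(start_time + 4)]
    period_speed = select_df_by_angle(df_period, start_angle, end_angle)[0].speed
    if len(period_speed) > 0:
        statistic, p_value = ks_2samp(period_speed, full_speed)
        print('%s-%s: K-S statistic %.4f, p-value %.3g' % (start_time, start_time + 4, statistic, p_value))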

(3) Adjacent Sector Variability

In [83]:
incre = rebinned_angle
angle_group = [max_diff_angle-incre, max_diff_angle, max_diff_angle+incre]
In [84]:
fig_adjecent_variability_3d = plt.figure()
ax1 = fig_adjecent_variability_3d.gca(projection='3d')
fig_adjecent_variability_cdf, ax2 = plt.subplots(figsize=(3,1.8))
fig_adjecent_variability_weibull, ax3 = plt.subplots(figsize=(3,1.8))

legend_3d = False
prop_cycle=iter(mpl.rcParams['axes.color_cycle'])

curve_df = pd.DataFrame(curve_collection)

for angle in angle_group:
    curves = curve_df.query('direction == @angle%360').T.to_dict()
    curves = curves[list(curves)[0]]
    data_size, x =  curves['datasize'], curves['x']
    y_gmm, y_cdf_gmm =  curves['gmm_pdf'], curves['gmm_cdf'] 
    y_weibull, y_cdf_weibull, y_cdf = curves['weibull_pdf'],  curves['weibull_cdf'], curves['ecdf']

    linestyle = '-' if angle == max_diff_angle else ':'
    alpha = 0.7 if angle == max_diff_angle else 0.3

    ax2.plot(x, y_gmm*data_size, linestyle, label=angle)        
    ax3.plot(x, y_weibull*data_size, linestyle, label=angle)

    start_angle, end_angle = angle-incre/2, angle+incre/2
    sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)

    x_3d = angle*np.ones_like(x)
    ax1.plot(x_3d, x, y_gmm*data_size, color='black', label='GMM')
    ax1.plot(x_3d, x, y_weibull*data_size, color='blue', linestyle='--',label='Weibull')

    count, division = np.histogram(sub_df['speed'], bins=arange(0, sub_max_speed))
    ax1.bar(left=division[:-1], height=count, zs=angle, zdir='x', color=next(prop_cycle), alpha=0.8)

    if legend_3d == False:
        ax1.legend()
        legend_3d = True
        
plt_configure(ax=ax1, xlabel='Direction', ylabel='Speed')   
plt_configure(ax=ax2, xlabel='V',ylabel='Frequency',legend={'loc':'best'})
plt_configure(ax=ax3, xlabel='V',ylabel='Frequency',legend={'loc':'best'})
ax1.set_zlabel('Frequency')
ax1.set_zlim(bottom = 0)
ylim = max(ax1.get_ylim()[1],ax3.get_ylim()[1])
ax2.set_ylim(bottom = 0, top=ylim)
ax3.set_ylim(bottom = 0, top=ylim)

print(max_diff_angle) 
print('GMM, Weibull, Histogram')
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
345.0
GMM, Weibull, Histogram
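A two-sample Kolmogorov-Smirnov test between the worst-fitting sector and its neighbours gives a quantitative counterpart to the visual comparison above. A minimal sketch, assuming scipy is available and reusing select_df_by_angle, max_diff_angle and incre:
In [ ]:
# Sketch: two-sample K-S test of the worst-fitting sector against each adjacent sector.
from scipy.stats import ks_2samp

centre_speed = select_df_by_angle(df, max_diff_angle - incre/2, max_diff_angle + incre/2)[0].speed
for neighbour in [max_diff_angle - incre, max_diff_angle + incre]:
    neighbour_speed = select_df_by_angle(df, neighbour - incre/2, neighbour + incre/2)[0].speed
    statistic, p_value = ks_2samp(centre_speed, neighbour_speed)
    print('%s vs %s degree: K-S statistic %.4f, p-value %.3g'
          % (max_diff_angle, neighbour % 360, statistic, p_value))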

7. Result Variability & Cross-Validation

In [85]:
if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWDITH    
if 'FIT_METHOD' not in globals():
    FIT_METHOD = 'square_error'       
if 'KDE_KERNEL' not in globals():
    KDE_KERNEL = 'gaussian'
    
config = {'bandwidth': bandwidth, 
          'fitting_range': FITTING_RANGE,
          'fit_limit': fit_limit,
          'kde_kernel': KDE_KERNEL}

print(bandwidth, FIT_METHOD)
0.9 square_error

7.1 Variability of the Result

In [86]:
%%time
results = Parallel(n_jobs=-1)(delayed(resampled_fitting)(df, FIT_METHOD, NUMBER_OF_GAUSSIAN, config) for i in range(12))                        
for result in results:
    display(pretty_print_gmm(result['gmm']))
    fig,ax = plt.subplots(figsize=(3.5,3.5))
    plot_gmm_ellipses(result['gmm'],ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
    plt.show()
    
    display(gof_df(result['gmm_pdf_result'], result['kde_result']))
    display(gof_df(result['gmm_pdf_result'], kde_result))
    print('')
weight mean_x mean_y sig_x sig_y corr
1 0.375 -5.376 -4.662 6.389 3.353 0.369
2 0.266 2.857 3.640 3.527 2.595 -0.180
3 0.213 -3.911 -1.210 3.209 3.596 -0.302
4 0.146 1.577 -2.768 1.910 1.732 0.374
GMM Plot Result
0.375448053484 [[-5.37594574 -4.66235194]] [ 3.04371082  6.54217909] -75.9549086503
0.26602238796 [[ 2.85664815  3.64014117]] [ 2.50854996  3.58874796] -104.983081227
0.212585666936 [[-3.91060429 -1.2104057 ]] [ 2.80873011  3.91616331] -145.357625383
0.14594389162 [[ 1.5771826  -2.76750495]] [ 1.42940973  2.14591925] -52.3617713338
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.989 0.013 0.316 2.709750e-08 0.015 0.183
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.989 0.014 0.619 2.638573e-08 0.015 0.180

weight mean_x mean_y sig_x sig_y corr
1 0.639 -3.415 -1.343 6.586 5.380 0.525
2 0.179 1.363 -3.032 2.041 1.913 0.428
3 0.110 3.184 3.563 2.681 2.089 -0.368
4 0.072 -4.236 -3.709 2.062 2.192 0.363
GMM Plot Result
0.639490976765 [[-3.41496652 -1.34251593]] [ 4.0267695   7.48995103] -55.6099944253
0.178519766856 [[ 1.36298829 -3.03234838]] [ 1.49065553  2.36686602] -49.3238063512
0.11017247954 [[ 3.18358905  3.56316332]] [ 1.81085481  2.87632023] -117.798558473
0.0718167768389 [[-4.23616436 -3.7086695 ]] [ 1.69244608  2.48876111] 139.760769668
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.009 0.027 2.607479e-08 0.015 0.179
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.008 0.027 2.552474e-08 0.015 0.177

weight mean_x mean_y sig_x sig_y corr
1 0.604 -3.792 -1.417 6.638 5.462 0.547
2 0.194 1.327 -3.040 2.126 1.962 0.411
3 0.123 3.302 3.570 2.858 2.188 -0.340
4 0.079 -4.193 -3.559 1.948 2.483 0.357
GMM Plot Result
0.604257341238 [[-3.79216734 -1.41677057]] [ 3.98633886  7.61658738] -54.8694809847
0.193825099153 [[ 1.3267304  -3.04024178]] [ 1.56171999  2.43501587] -50.5581543227
0.123138711771 [[ 3.30244037  3.56961851]] [ 1.93904306  3.03287411] -115.76491804
0.0787788478381 [[-4.19277349 -3.55861435]] [ 1.69819776  2.65974022] 152.214778199
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.008 0.037 2.475309e-08 0.015 0.175
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.007 0.030 2.519418e-08 0.015 0.176

weight mean_x mean_y sig_x sig_y corr
1 0.371 -5.448 -4.669 6.139 3.319 0.372
2 0.278 2.769 3.612 3.663 2.587 -0.163
3 0.216 -3.667 -1.377 3.488 3.521 -0.428
4 0.134 1.644 -2.701 1.909 1.698 0.394
GMM Plot Result
0.371465073157 [[-5.44805037 -4.66867073]] [ 3.00180572  6.30018748] -75.2008959545
0.277928044219 [[ 2.76878989  3.61185856]] [ 2.5211027   3.70873648] -102.351119403
0.216160186693 [[-3.6673569 -1.3768932]] [ 2.64983906  4.1885491 ] -135.6220456
0.134446695931 [[ 1.64355755 -2.70094079]] [ 1.3894515   2.14405705] -53.2840162484
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.988 0.014 3.584 2.930807e-08 0.016 0.190
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.989 0.014 0.600 2.662018e-08 0.015 0.181

weight mean_x mean_y sig_x sig_y corr
1 0.592 -3.823 -1.480 6.777 5.488 0.562
2 0.198 1.301 -3.040 2.162 1.977 0.420
3 0.122 3.131 3.621 2.928 2.163 -0.345
4 0.088 -4.192 -3.476 2.027 2.675 0.247
GMM Plot Result
0.591624571809 [[-3.82349257 -1.48014061]] [ 3.95746222  7.77120246] -55.3507383999
0.197946035123 [[ 1.3009184  -3.04025902]] [ 1.56660784  2.47533174] -51.0269458039
0.122251122296 [[ 3.13081076  3.62062166]] [ 1.92313703  3.09067761] -114.139923281
0.0881782707725 [[-4.19198043 -3.47593024]] [ 1.89803468  2.76830624] 159.33820348
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.008 0.028 2.487472e-08 0.015 0.175
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.010 0.031 2.542096e-08 0.015 0.177

weight mean_x mean_y sig_x sig_y corr
1 0.599 -3.621 -1.519 6.786 5.463 0.539
2 0.194 1.246 -3.042 2.154 1.938 0.426
3 0.124 3.123 3.595 2.846 2.133 -0.322
4 0.083 -4.310 -3.348 1.933 2.728 0.304
GMM Plot Result
0.598621034071 [[-3.62119947 -1.51938389]] [ 4.04707289  7.71512781] -56.031172083
0.194303387935 [[ 1.2464642  -3.04180945]] [ 1.53840747  2.4555009 ] -51.9842083939
0.124126092224 [[ 3.12317389  3.59454432]] [ 1.91907203  2.99449207] -113.901452554
0.0829494857705 [[-4.30984572 -3.34811682]] [ 1.77201382  2.83483948] 159.58408704
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.991 0.008 0.028 2.301821e-08 0.014 0.169
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.008 0.027 2.553200e-08 0.015 0.177

weight mean_x mean_y sig_x sig_y corr
1 0.621 -3.632 -1.449 6.533 5.390 0.527
2 0.189 1.371 -3.049 2.108 1.979 0.432
3 0.115 3.310 3.653 2.862 2.119 -0.315
4 0.075 -4.178 -3.615 2.125 2.272 0.387
GMM Plot Result
0.62073431314 [[-3.63212762 -1.44856034]] [ 4.01423212  7.45838457] -55.0887903518
0.189237399945 [[ 1.37100064 -3.0493021 ]] [ 1.53630195  2.45005296] -49.1534485667
0.115014571059 [[ 3.31029824  3.65305488]] [ 1.91794567  2.99988278] -112.970302733
0.0750137158559 [[-4.17778315 -3.61478833]] [ 1.71559341  2.59470264] 139.901422135
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.009 0.028 2.522015e-08 0.015 0.176
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.010 0.026 2.555685e-08 0.015 0.178

weight mean_x mean_y sig_x sig_y corr
1 0.609 -3.687 -1.423 6.650 5.470 0.547
2 0.190 1.349 -3.022 2.131 2.001 0.447
3 0.118 3.164 3.627 2.882 2.096 -0.345
4 0.083 -4.191 -3.504 2.049 2.497 0.321
GMM Plot Result
0.609404454969 [[-3.68715492 -1.42300719]] [ 3.99229663  7.62877993] -54.8893931553
0.190112617127 [[ 1.34914084 -3.02235485]] [ 1.53219555  2.48956966] -49.0206982367
0.117638006667 [[ 3.16395455  3.62663548]] [ 1.86794125  3.03461598] -113.4215505
0.0828449212371 [[-4.19072016 -3.50398623]] [ 1.81257081  2.6733997 ] 150.925548939
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.989 0.008 0.028 2.656539e-08 0.016 0.181
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.010 0.029 2.527411e-08 0.015 0.177

weight mean_x mean_y sig_x sig_y corr
1 0.617 -3.498 -1.297 6.625 5.427 0.530
2 0.185 1.324 -3.057 2.071 1.929 0.411
3 0.115 3.284 3.469 2.657 2.146 -0.369
4 0.083 -4.264 -3.584 2.125 2.372 0.326
GMM Plot Result
0.617408827799 [[-3.498061  -1.2971891]] [ 4.03513284  7.55351301] -55.3660210335
0.185181969714 [[ 1.32429114 -3.05718152]] [ 1.5300652   2.38153737] -49.9059576334
0.114810304632 [[ 3.28398109  3.46873157]] [ 1.84279656  2.87582228] -119.885973691
0.0825988978546 [[-4.26363714 -3.58351267]] [ 1.82608789  2.60856446] 144.3250854
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.008 0.028 2.618556e-08 0.015 0.180
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.009 0.028 2.547773e-08 0.015 0.177

weight mean_x mean_y sig_x sig_y corr
1 0.615 -3.666 -1.321 6.596 5.455 0.551
2 0.195 1.319 -3.038 2.177 2.000 0.444
3 0.115 3.326 3.498 2.861 2.120 -0.373
4 0.075 -4.295 -3.722 2.044 2.404 0.378
GMM Plot Result
0.614724249195 [[-3.66570392 -1.32138525]] [ 3.95644635  7.59013157] -54.5619274221
0.195225601144 [[ 1.31926155 -3.03839469]] [ 1.55017405  2.51722431] -50.4301307075
0.114614162042 [[ 3.3258696   3.49782488]] [ 1.84878951  3.04308134] -115.404791838
0.0754359876188 [[-4.29485681 -3.72168705]] [ 1.71919831  2.64621573] 146.662552871
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.989 0.008 0.035 2.660832e-08 0.016 0.181
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.008 0.032 2.526040e-08 0.015 0.177

weight mean_x mean_y sig_x sig_y corr
1 0.391 -5.365 -4.515 6.286 3.358 0.310
2 0.261 2.897 3.594 3.362 2.620 -0.193
3 0.194 -4.007 -0.918 2.856 3.660 -0.255
4 0.155 1.469 -2.811 1.971 1.754 0.357
GMM Plot Result
0.39051310033 [[-5.36543371 -4.5147183 ]] [ 3.13516784  6.40045596] -77.5558371284
0.260998972069 [[ 2.89732959  3.59395166]] [ 2.50736308  3.446292  ] -108.741231626
0.193560662456 [[-4.00700874 -0.91805557]] [ 2.65351999  3.80922807] -157.23889151
0.154927265146 [[ 1.46886508 -2.81073827]] [ 1.47710329  2.18583166] -54.0604256321
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.989 0.015 0.298 2.811234e-08 0.016 0.186
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.989 0.017 0.896 2.667552e-08 0.015 0.181

weight mean_x mean_y sig_x sig_y corr
1 0.626 -3.511 -1.382 6.664 5.409 0.545
2 0.183 1.341 -3.052 2.075 1.922 0.422
3 0.116 3.205 3.506 2.774 2.163 -0.356
4 0.076 -4.239 -3.707 2.047 2.357 0.324
GMM Plot Result
0.625562295697 [[-3.51128692 -1.3820564 ]] [ 3.97300328  7.60833224] -55.5512561999
0.182919431445 [[ 1.34136957 -3.05232251]] [ 1.51311984  2.38921208] -50.1616693454
0.115654914064 [[ 3.20485691  3.5057561 ]] [ 1.889127    2.96726538] -117.386711879
0.0758633587938 [[-4.23924269 -3.7069459 ]] [ 1.77992596  2.56523654] 146.791002555
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.008 0.033 2.578717e-08 0.015 0.178
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.007 0.029 2.512363e-08 0.015 0.176
Wall time: 43.6 s
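The twelve resampled fits are displayed individually above; their goodness-of-fit against the full-data KDE can also be summarised in one table. A minimal sketch, reusing results, gof_df and kde_result from the cell above and assuming gof_df returns a one-row DataFrame as displayed:
In [ ]:
# Sketch: summarise the GoF of the 12 resampled fits against the full-data KDE.
gof_resampled = pd.concat([gof_df(result['gmm_pdf_result'], kde_result) for result in results],
                          ignore_index=True)
display(gof_resampled.describe().loc[['mean', 'std', 'min', 'max']])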

7.2 Cross-Validation to Select the Number of Gaussians

In [87]:
%%time
from sklearn.cross_validation import train_test_split, KFold

## 4-fold cross-validation
gaussian_number_range = arange(1,6)
CV_result_train_all,CV_result_test_all =[],[]
number_of_fold = 4
print('Number of train/test dataset', len(df)*(number_of_fold-1)/number_of_fold, len(df)/number_of_fold) 

for number_of_gaussian in gaussian_number_range:
    print( '  ')
    print('Number of gaussian', number_of_gaussian)
    
    kf = KFold(len(df), n_folds=number_of_fold, shuffle=True) 

    CV_result = Parallel(n_jobs=-1)(delayed(fit_per_fold)(df, train_index, test_index, FIT_METHOD, number_of_gaussian, config) for train_index, test_index in kf)                        

    CV_result_train, CV_result_test = list(zip(*CV_result))
    CV_result_train, CV_result_test = list(CV_result_train), list(CV_result_test)
        
    CV_result_train_all.append(CV_result_train)
    CV_result_test_all.append(CV_result_test)
    
    print('Train')
    pretty_pd_display(CV_result_train)
    print('Test')
    pretty_pd_display(CV_result_test)
D:\ProgramData\Anaconda3\lib\site-packages\sklearn\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
Number of train/test dataset 64800.0 21600.0
  
Number of gaussian 1
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.176884 0.074135 3.878059e-07 0.058818 0.691612 0.843048
1 0.188004 0.075327 3.903619e-07 0.058975 0.694287 0.841985
2 0.147722 0.073739 3.761285e-07 0.058262 0.681272 0.847264
3 0.194849 0.074582 3.886130e-07 0.059470 0.692341 0.843088
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.158188 0.072191 3.786549e-07 0.059010 0.683959 0.846734
1 0.161473 0.069775 3.711260e-07 0.058531 0.675957 0.849828
2 0.400425 0.078487 4.176790e-07 0.060905 0.717859 0.832612
3 0.175761 0.077330 3.857625e-07 0.057812 0.690320 0.842759
  
Number of gaussian 2
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.097518 0.027092 9.745246e-08 0.029703 0.346822 0.960542
1 0.090336 0.026964 9.927790e-08 0.029933 0.349997 0.959877
2 0.118123 0.028223 1.001892e-07 0.029457 0.351516 0.959630
3 0.092103 0.028640 1.025926e-07 0.030770 0.355836 0.958208
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.108317 0.028759 1.106624e-07 0.031202 0.369354 0.955269
1 0.151458 0.026761 1.037106e-07 0.030346 0.357744 0.957823
2 0.078305 0.025324 1.043670e-07 0.032452 0.359131 0.957217
3 0.162889 0.030428 9.734076e-08 0.028461 0.346455 0.961379
  
Number of gaussian 3
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.068710 0.025547 4.540130e-08 0.020376 0.236601 0.981606
1 0.072838 0.026597 4.598658e-08 0.020414 0.238236 0.981320
2 0.064041 0.026661 4.416640e-08 0.019915 0.233523 0.982091
3 0.057280 0.025609 4.485194e-08 0.019837 0.235238 0.981946
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.069924 0.024775 4.696082e-08 0.020029 0.240988 0.981047
1 0.078546 0.026479 4.620981e-08 0.020135 0.238708 0.981500
2 0.072029 0.024602 5.168073e-08 0.021585 0.252284 0.979201
3 0.113478 0.029428 4.909832e-08 0.021803 0.246181 0.979794
  
Number of gaussian 4
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.790776 0.014052 2.675520e-08 0.015484 0.181751 0.989118
1 0.270129 0.013748 2.702811e-08 0.015655 0.182638 0.989048
2 0.030087 0.008200 2.381264e-08 0.014633 0.171409 0.990395
3 0.030497 0.007856 2.444087e-08 0.014776 0.173599 0.990124
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.435547 0.011291 2.845708e-08 0.016066 0.187218 0.988649
1 10.859439 0.015400 2.763835e-08 0.015557 0.184621 0.988850
2 0.039884 0.011268 3.413733e-08 0.017505 0.205259 0.986049
3 0.033414 0.009308 3.005793e-08 0.016591 0.192792 0.987772
  
Number of gaussian 5
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.420206 0.011472 1.752818e-08 0.012426 0.147077 0.992930
1 0.200281 0.009313 1.760153e-08 0.012656 0.147374 0.992861
2 0.265383 0.014130 1.827162e-08 0.012857 0.150181 0.992604
3 0.137819 0.015933 1.803788e-08 0.012743 0.149148 0.992684
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.362061 0.013057 2.269030e-08 0.014730 0.167287 0.990721
1 0.134426 0.009454 2.468141e-08 0.014626 0.174511 0.990080
2 0.185746 0.013282 1.969058e-08 0.013175 0.155785 0.992032
3 0.216990 0.015787 2.149176e-08 0.013867 0.162980 0.991354
Wall time: 1min 8s
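The DeprecationWarning above comes from sklearn.cross_validation; on scikit-learn 0.18+ the same folds are produced by sklearn.model_selection, where KFold takes the number of splits at construction and yields indices via split(). A minimal sketch of the equivalent fold generation for the loop body above:
In [ ]:
# Sketch: equivalent fold generation with the non-deprecated sklearn.model_selection API.
from sklearn.model_selection import KFold

kf = KFold(n_splits=number_of_fold, shuffle=True)
CV_result = Parallel(n_jobs=-1)(
    delayed(fit_per_fold)(df, train_index, test_index, FIT_METHOD, number_of_gaussian, config)
    for train_index, test_index in kf.split(df))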
In [88]:
train_scores_mean, train_scores_std = generate_mean_std_gof(CV_result_train_all)
print('Train gof mean, std')
display(train_scores_mean)

test_scores_mean, test_scores_std = generate_mean_std_gof(CV_result_test_all)
print('Test gof mean, std')
display(test_scores_mean)
Train gof mean, std
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
1 0.176865 0.074446 3.857273e-07 0.058881 0.689878 0.843846
2 0.099520 0.027730 9.987805e-08 0.029966 0.351043 0.959564
3 0.065718 0.026104 4.510155e-08 0.020136 0.235899 0.981741
4 0.280372 0.010964 2.550920e-08 0.015137 0.177349 0.989671
5 0.255922 0.012712 1.785980e-08 0.012670 0.148445 0.992770
Test gof mean, std
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
1 0.223962 0.074446 3.883056e-07 0.059064 0.692024 0.842983
2 0.125242 0.027818 1.040202e-07 0.030615 0.358171 0.957922
3 0.083495 0.026321 4.848742e-08 0.020888 0.244540 0.980386
4 2.842071 0.011817 3.007267e-08 0.016430 0.192473 0.987830
5 0.224806 0.012895 2.213851e-08 0.014099 0.165141 0.991047
In [89]:
prop_cycle=mpl.rcParams['axes.color_cycle']
gaussian_number_range = train_scores_mean.index
for column, column_name in zip(['R_square','K_S','Chi_square'],["$\ R^2$", "K-S", "$\widetilde{\chi^2} $"]):
    plot(gaussian_number_range, train_scores_mean[column],
             '--', label = 'training', color=prop_cycle[0])
    plt.fill_between(gaussian_number_range, 
                     train_scores_mean[column] - train_scores_std[column],
                     train_scores_mean[column] + train_scores_std[column], 
                     alpha=0.2, color=prop_cycle[0])
    
    plot(gaussian_number_range, test_scores_mean[column],
             '-', label = 'test',color=prop_cycle[1])
    plt.fill_between(gaussian_number_range, 
                 test_scores_mean[column] - test_scores_std[column],
                 test_scores_mean[column] + test_scores_std[column], 
                 alpha=0.2,color=prop_cycle[1])
    plt.xticks(gaussian_number_range)
    print(column)
    plt.locator_params(axis='y', nbins=5)
    plt_configure(xlabel='Number of Gaussian Distributions', ylabel=column_name, 
                  figsize=(3,2), legend={'loc':'best'})
    if column == 'R_square':
        plt.gca().set_ylim(top=1)
    if column == 'K_S' or column == 'Chi_square':
        plt.gca().set_ylim(bottom=0)
    plt.show()
R_square
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
K_S
Chi_square
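An alternative, likelihood-based way to choose the number of components is the Bayesian information criterion of sklearn.mixture.GaussianMixture fitted directly to the wind-vector samples. A minimal sketch, assuming the wind-vector components are stored in df columns named 'x' and 'y' (as the ellipse-plot axis labels above suggest; adjust the names if they differ):
In [ ]:
# Sketch: BIC-based selection of the number of Gaussian components.
# Assumes df has wind-vector columns 'x' and 'y' (an assumption; rename as needed).
from sklearn.mixture import GaussianMixture

samples = df[['x', 'y']].dropna().values
for n in gaussian_number_range:
    gm = GaussianMixture(n_components=n, covariance_type='full', n_init=3).fit(samples)
    print('n_components = %d, BIC = %.1f' % (n, gm.bic(samples)))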
In [ ]:
# for fig in [fig_hist, fig_kde, fig_em, fig_gmm]:
#     display(fig)
# for fig in [fig_time_variability_3d, fig_time_variability_cdf, fig_time_variability_weibull, 
#             fig_adjecent_variability_3d, fig_adjecent_variability_cdf, fig_adjecent_variability_weibull,]:
#     display(fig)
In [ ]:
import time
save_notebook()
time.sleep(3)
location_name = get_location_name(file_path)
print(location_name)
current_file = 'GMM.ipynb'
output_file = './output_HTML/'+location_name+'.html' 

output_HTML(current_file, output_file)