Modeling the Joint Distribution of Wind Speed and Direction using Gaussian Mixture Models

OEN Method: Harris & Cook, "The parent wind speed distribution: Why Weibull?", http://www.sciencedirect.com/science/article/pii/S0167610514001056

Gaussian Mixture Models, http://scikit-learn.org/stable/modules/mixture.html

1. Set up

1.1 Environment

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

from import_file import *
from helpers.parallel_helper import *
load_libs()

plt.rcParams['axes.autolimit_mode'] = 'round_numbers'
plt.rcParams['axes.xmargin'] = 0.
plt.rcParams['axes.ymargin'] = 0.
mpl.rcParams['patch.force_edgecolor'] = True

1.2 Read Data

In [2]:
# file_path= './data/NCDC/jerez/dat.txt' # time shift
# file_path= './data/NCDC/almeria/dat.txt'

# Greece
# file_path= './data/NCDC/eleftherios_intl/dat.txt'
# file_path= './data/NCDC/elefsis/dat.txt' # bad dataset
# file_path= './data/NCDC/malaga/dat.txt'
# file_path= './data/NCDC/gibraltar/dat.txt' # bad fit

# Turkey
# file_path= './data/NCDC/turkey/konya/dat.txt' 
# file_path= './data/NCDC/turkey/sivas/dat.txt' # bad dataset
# file_path= './data/NCDC/turkey/balikesir/dat.txt' # bad dataset
# file_path= './data/NCDC/turkey/bartin/dat.txt' # bad dataset

# Iran
# file_path= './data/NCDC/iran/chahbahar/dat.txt'
# file_path= './data/NCDC/iran/zabol/dat.txt' # Problematic data
# file_path= './data/NCDC/iran/torbat_heydarieh/dat.txt' # Unusable

# UAE
# file_path= './data/NCDC/al_maktoum/dat.txt' 
# file_path= './data/NCDC/abu_dhabi_intl/dat.txt' # Time shift
# file_path= './data/NCDC/bateen/dat.txt' # Time shift
# file_path= './data/NCDC/buraimi/dat.txt' # not good dataset

# file_path= './data/NCDC/uk/marham/dat.txt' 
# file_path= './data/NCDC/uk/tiree/dat.txt'  # try 4
# file_path= './data/NCDC/uk/boscombe_down/dat.txt' # 4?, numpy bug
# file_path= './data/NCDC/uk/middle_wallop/dat.txt' 
# file_path= './data/NCDC/uk/southhamption/dat.txt' # high 0, trend
# file_path= './data/NCDC/uk/bournemouth/dat.txt' # 4?
# file_path= "./data/NCDC/uk/weybourne/dat.txt"
# file_path= "./data/NCDC/uk/skye_lusa/dat.txt" # 
# file_path= "./data/NCDC/uk/wattisham/dat.txt"
# file_path= "./data/NCDC/uk/south_uist_range/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/uk/holbeach/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/uk/cambridge/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/us/baltimore/dat.txt" # time too short
# file_path= "./data/NCDC/uk/bealach_na_ba/dat.txt" # time too short
# file_path= "./data/NCDC/uk/benbecula/dat.txt" # truncate (untruncate in m/s), 4?

# file_path, NUMBER_OF_GAUSSIAN = "./data/NCDC/europe/landsberg_lech/dat.txt", 4 # very good, can try 4
file_path= './data/NCDC/europe/pau_pyrenees/dat.txt' # unit shift, 2; force using knot 
# file_path= "./data/NCDC/europe/vatry/dat.txt"  # double peak, initial speed (should be good with m/s), mixed report type
# file_path= "./data/NCDC/europe/neuburg/dat.txt"
# file_path= "./data/NCDC/europe/valladolid/dat.txt"
# file_path= "./data/NCDC/europe/laupheim/dat.txt" # double peak, 4; very good, trend
# file_path= "./data/NCDC/europe/avord/dat.txt" # try 4, initial speed (should be good with m/s)
# file_path= './data/NCDC/europe/ciampino/dat.txt' # try 4, bandwidth?
# file_path= "./data/NCDC/europe/holzdorf/dat.txt" # 2008 year
# file_path= "./data/NCDC/europe/huspel_aws/dat.txt"  # integer, 4?
# file_path= "./data/NCDC/europe/barayas/dat.txt" # numpy problem
# file_path= './data/NCDC/europe/tenerife_sur/dat.txt'  # some directions are blocked
# file_path= './data/NCDC/europe/nantes/dat.txt' # some dir R square / K-S differ a lot, unit detection fails

# file_path= "./data/NCDC/cn/shanghai/hongqiao_intl/dat.txt" # care for the sampling time
# file_path= "./data/NCDC/cn/shanghai/pudong/dat.txt"
# file_path= "./data/NCDC/cn/hefei_luogang/dat.txt" # few 0, trend, try 2
# file_path= "./data/NCDC/cn/nanjing_lukou/dat.txt" 
# file_path= "./data/NCDC/cn/zhengzhou_xinzheng/dat.txt" 
# file_path= "./data/NCDC/cn/tianjin/binhai/dat.txt" # few 0, trend, stationary speed, unstationary direction
# file_path= "./data/NCDC/cn/tianjin/tianjing/dat.txt" # 16 sectors
# file_path= "./data/NCDC/cn/shijiazhuang_zhengding/dat.txt" 
# file_path= "./data/NCDC/cn/henan_gushi/dat.txt" # 16 sectors, fit not very good
# file_path= "./data/NCDC/cn/nanning_wuxu/dat.txt" # numpy priblem, unstationary speed
# file_path= './data/NCDC/cn/macau/dat.txt'  
# file_path= "./data/NCDC/cn/hk_intl/dat.txt" # few 0

# file_path= './data/NCDC/southeast_asia/malaysia/mersing/dat.txt' # 2 mode, paper comparison
# file_path= './data/NCDC/southeast_asia/malaysia/penang/dat.txt'
# file_path= './data/NCDC/southeast_asia/malaysia/butterworth/dat.txt' # 2 mode 
# file_path= "./data/NCDC/southeast_asia/malaysia/bsultan_mahmud/dat.txt" # stable
# file_path= "./data/NCDC/southeast_asia/malaysia/bsultan_ismail/dat.txt" # 
# file_path= "./data/NCDC/southeast_asia/singapore/changi/dat.txt" # trend, no 0, questionary data
# file_path= "./data/NCDC/southeast_asia/singapore/paya_lebar/dat.txt" # questionary data
# file_path= "./data/NCDC/southeast_asia/singapore/seletar/dat.txt"
# file_path= "./data/NCDC/east_asia/cheongju_intl/dat.txt" # 2005-2009  may have problem, fit is good; numpy problem
# file_path= "./data/NCDC/east_asia/daegu_ab/dat.txt" # recent 5 year may have problem, but fit is generally good; numpy problem

# file_path= "./data/NCDC/oceania/auckland_intl/dat.txt"  # Good data, Weird KDE shape, might be blocked?
# file_path= "./data/NCDC/oceania/brisbane_archerfield/dat.txt" # high 0, few data 
# file_path= "./data/NCDC/oceania/narrandera/dat.txt" # high 0, few data
# file_path= "./data/NCDC/oceania/canberra/dat.txt" # high 0, numpy problem

# file_path= './data/NCDC/us/boston_16nm/dat.txt' # Offshore

# file_path = './data/asos/denver/hr_avg.csv'
# file_path = './data/asos/bismarck_ND/hr_avg.csv' # try 4
# file_path = './data/asos/aberdeen_SD/hr_avg.csv' # only to 2012, good fit, try 2
# file_path = './data/asos/minneapolis/hr_avg.csv'
# file_path = './data/asos/lincoln_NE/hr_avg.csv' 
# file_path = './data/asos/des_moines_IA/hr_avg.csv'
# file_path = './data/asos/springfield_IL/hr_avg.csv' # good fit
# file_path = './data/asos/topeka/hr_avg.csv' # High 0

# file_path = './data/NDAWN/baker/hr_avg.csv' # 4 might be better
# file_path = './data/NDAWN/dickinson/hr_avg.csv'
# file_path = './data/NDAWN/rugby/hr_avg.csv'
# file_path = './data/NDAWN/bowman/hr_avg.csv'
# file_path = './data/NDAWN/grand_forks/hr_avg.csv'
# file_path = './data/NDAWN/williston/hr_avg.csv'
# file_path = './data/NDAWN/jamestown/hr_avg.csv'
In [3]:
if "cn_database" in file_path: 
    df = read_cn_database(file_path)
elif 'NCDC' in file_path:
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df.rename(columns={'Date':'date','Dir':'dir','Spd':'speed','Type':'type','I.1':'wind_type'}, inplace=True)
    df = df[['date','HrMn','type','dir','speed','wind_type' ]]
    df.dropna(subset=['dir','speed'], inplace=True)
    integer_data = True
elif 'NDAWN' in file_path:
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df['type']='default'
    df['wind_type']='default'
    df = df.dropna()
    integer_data = False
    knot_unit = False
else:
    # ASOS
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df['type']='default'
    df['wind_type']='default'
    df = df.dropna()
    integer_data = False
    knot_unit = True
In [4]:
df
Out[4]:
date HrMn type dir speed wind_type
0 19790101 0000 FM-12 260 6.2 N
1 19790101 0100 FM-15 260 5.1 N
2 19790101 0200 FM-15 260 7.2 N
3 19790101 0300 FM-12 260 10.3 N
4 19790101 0600 FM-12 280 10.3 N
5 19790101 0700 FM-15 280 7.2 N
6 19790101 0800 FM-15 280 6.1 N
7 19790101 0900 FM-12 360 5.1 N
8 19790101 1000 FM-15 360 3.6 N
9 19790101 1100 FM-15 60 1.0 N
10 19790101 1200 FM-12 999 0.0 C
11 19790101 1300 FM-15 20 1.0 N
12 19790101 1400 FM-15 999 0.0 C
13 19790101 1500 FM-12 180 1.0 N
14 19790101 1600 FM-15 999 0.0 C
15 19790101 1700 FM-15 140 2.0 N
16 19790101 1800 FM-12 140 2.6 N
17 19790101 1900 FM-15 120 2.0 N
18 19790101 2000 FM-15 140 1.5 N
19 19790101 2100 FM-12 80 1.0 N
20 19790102 0100 FM-15 60 3.6 N
21 19790102 0200 FM-15 40 4.1 N
22 19790102 0300 FM-12 40 4.1 N
23 19790102 0500 FM-15 60 2.5 N
24 19790102 0600 FM-12 100 2.6 N
25 19790102 0800 FM-15 80 2.5 N
26 19790102 0900 FM-12 60 3.1 N
27 19790102 1000 FM-15 80 5.1 N
28 19790102 1200 FM-12 80 3.1 N
29 19790102 1300 FM-15 80 4.1 N
... ... ... ... ... ... ...
366609 20170401 0900 FM-15 310 4.1 N
366610 20170401 0930 FM-15 999 1.5 V
366611 20170401 1000 FM-15 230 3.6 V
366612 20170401 1030 FM-15 260 6.7 V
366613 20170401 1100 FM-12 270 6.7 N
366614 20170401 1130 FM-15 260 6.2 N
366615 20170401 1200 FM-15 260 4.1 N
366616 20170401 1230 FM-15 250 5.7 V
366617 20170401 1300 FM-15 270 8.2 V
366618 20170401 1330 FM-15 280 9.3 N
366619 20170401 1400 FM-15 270 8.2 N
366620 20170401 1430 FM-15 250 7.7 N
366621 20170401 1500 FM-15 250 5.1 V
366622 20170401 1530 FM-15 230 4.1 V
366623 20170401 1600 FM-15 220 3.6 V
366624 20170401 1630 FM-15 230 4.1 V
366625 20170401 1700 FM-15 270 6.2 N
366626 20170401 1730 FM-15 250 6.2 N
366627 20170401 1800 FM-15 250 5.1 N
366628 20170401 1830 FM-15 270 5.1 N
366629 20170401 1900 FM-12 260 5.1 N
366630 20170401 1930 FM-15 250 5.1 N
366631 20170401 2000 FM-15 260 5.1 N
366632 20170401 2030 FM-15 260 4.6 N
366633 20170401 2100 FM-15 260 3.6 V
366634 20170401 2130 FM-15 260 4.6 N
366635 20170401 2200 FM-15 250 4.1 N
366636 20170401 2230 FM-15 240 3.1 N
366637 20170401 2300 FM-15 230 3.1 V
366638 20170401 2330 FM-15 250 3.6 V

366639 rows × 6 columns

In [5]:
df['time']=pd.to_datetime(df["date"].astype(str).map(str) + df["HrMn"], format='%Y%m%d%H%M')
df.set_index(['time'], inplace=True)
df['HrMn']=df['HrMn'].astype(int)
df = df.query("(dir <= 999) & (speed < 100) & \
              (date >= 19700000) & (date < 20170000) ")
In [6]:
plot_speed_and_angle_distribution(df.speed, df.dir)
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
In [7]:
# Dir [10,360]=> [0,350]
df['dir'] = df['dir'].apply(lambda x: x%360 if x < 999 else x) 
df['month'] = df['date']%10000//100
# Convert windrose coordinates to polar coordinates
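# Windrose/meteorological directions increase clockwise from north, while the polar
# angle used for (x, y) below increases counter-clockwise from east, hence (90 - dir) % 360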
df['dir_windrose'] = df['dir']
df['dir'] = df['dir'].apply(lambda x: (90 - x)%360 if x < 999 else x)
df.describe()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py: SettingWithCopyWarning (raised for each of the column assignments in this cell):
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
Out[7]:
date HrMn dir speed month dir_windrose
count 3.622460e+05 362246.00000 362246.000000 362246.000000 362246.000000 362246.000000
mean 2.001603e+07 1142.66531 287.071871 2.378766 6.545927 283.826955
std 1.113944e+05 678.73662 277.331580 1.792495 3.452641 276.351481
min 1.979010e+07 0.00000 0.000000 0.000000 1.000000 0.000000
25% 1.993013e+07 600.00000 140.000000 1.000000 4.000000 120.000000
50% 2.004122e+07 1130.00000 200.000000 2.100000 7.000000 230.000000
75% 2.011102e+07 1700.00000 320.000000 3.100000 10.000000 300.000000
max 2.016123e+07 2330.00000 999.000000 26.200000 12.000000 999.000000
In [8]:
df.plot(y='speed',legend=True,figsize=(20,5))
Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0xde94b38>

1.2.1 Unit Detection

In [9]:
df['decimal'] = df.speed % 1
df.decimal.hist(alpha=0.5, label='m/s', figsize=(4, 3))
if 'knot_unit' not in globals():
#     knot_unit = True if len(df.query('decimal >= 0.2')) / len(df) > 0.3 else False
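    # Assumed rationale: data originally logged in whole knots but stored in m/s leave
    # decimals well away from zero, so the original knot values can be recovered below
    # by multiplying by the 1.943845 knot-per-(m/s) factor and rounding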
    knot_unit=True
    if knot_unit:
        df['speed'] = df['speed'] * 1.943845
        df['decimal'] = df.speed % 1
        df.decimal.hist(alpha=0.5, label='knot')
        # needs more elaboration: some values are not near an integer
        df['speed'] = df['speed'].apply(lambda x: int(round(x)))
    plt_configure(xlabel='Decimal', ylabel='Frequency', legend={'loc': 'best'}, title='Decimal Distribution')
    
df.drop(['decimal'], 1,inplace=True)
print(knot_unit)
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py: SettingWithCopyWarning (raised for each of the column assignments in this cell):
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
True
In [10]:
dir_unit_text = ' (degree)'
if knot_unit == True:
    speed_unit_text = ' (knot)'
else: 
    speed_unit_text = ' (m/s)'

1.2.2 Sampling Type Selection

In [11]:
sample_type = df.query('date > 20000000')['type']
sample_type.value_counts().plot(
    kind = 'bar', title = 'Report Type Composition', figsize=(4,3))

report_type_most_used = sample_type.value_counts().argmax()
df = df.query("type==@report_type_most_used")

1.2.3 Sampling Time Selection

In [12]:
MID_YEAR = (min(df.date)//10000+max(df.date)//10000)//2

df['HrMn'].value_counts().sort_index().plot(kind='bar', alpha=0.5,label='Overall')
df.query('date > @MID_YEAR * 10000')['HrMn'].value_counts().sort_index().plot(
    kind='bar', alpha=0.5, label='> %s' %  MID_YEAR )

plt_configure(xlabel='Sampling Time', ylabel='Frequency', legend={'loc':'best'}, figsize=(8, 4), 
              title = 'Sampling Time Distribution, Overall and > %s ' %  MID_YEAR)
In [13]:
df['sample_time'] = df.HrMn % 100 
sample_time = df.query('date > 20000000')['sample_time']
sample_times = sample_time.value_counts()[sample_time.value_counts() > 2000]
sample_times = sample_times.index.tolist()
# df = df.query("sample_time in @sample_times")
df = df.query("sample_time == @sample_times[0]")
df.drop(['sample_time'], 1,inplace=True)
print(sample_times)

df['HrMn'].value_counts().sort_index().plot(kind='bar', alpha=0.5, figsize=(10, 4))
[0]
Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0xe91c5f8>

1.3 Data Wrangling

1.3.1 Artefacts

1.3.1.1 Wrong Direction Records

In [14]:
if integer_data:
    display(df.query("(dir % 10 >= 0.1) & (dir != 999)"))
    df = df.query('(dir % 10 <= 0.1) | (dir == 999)')
date HrMn type dir speed wind_type month dir_windrose
time

1.3.1.2 Sudden Increase in Speed

In [15]:
# sudden increase
df['incre'] = df.speed.diff(1)
df['incre'].fillna(0, inplace=True)
df['incre_reverse'] = df.speed.diff(-1)
df['incre_reverse'].fillna(0, inplace=True)

display(df.sort_values(by='speed',ascending=False).head(10))
df['incre'].plot(kind='hist', bins=arange(-15, 15), legend=True, figsize=(8, 3))
date HrMn type dir speed wind_type month dir_windrose incre incre_reverse
time
1999-12-27 18:00:00 19991227 1800 SY-MT 160 51 N 12 290 24.0 14.0
1984-11-28 00:00:00 19841128 0 SY-MT 350 40 N 11 100 36.0 38.0
1984-06-17 12:00:00 19840617 1200 SY-MT 70 40 N 6 20 36.0 36.0
2002-05-13 16:00:00 20020513 1600 SY-MT 170 37 N 5 280 27.0 20.0
1999-12-27 19:00:00 19991227 1900 SY-MT 180 37 N 12 270 -14.0 4.0
2007-02-14 11:00:00 20070214 1100 SY-MT 180 35 N 2 270 14.0 10.0
2009-01-24 08:00:00 20090124 800 SY-MT 180 35 N 1 270 2.0 2.0
2009-01-24 04:00:00 20090124 400 SY-MT 160 35 N 1 290 15.0 18.0
2009-01-24 10:00:00 20090124 1000 SY-MT 170 35 N 1 280 2.0 6.0
2012-12-21 14:00:00 20121221 1400 SY-MT 180 35 N 12 270 14.0 12.0
Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0xe8fcfd0>
In [16]:
incre_threshold = 20 if knot_unit else 10
print('sudden increase number', len(df.query('(incre > @incre_threshold )&(incre_reverse > @incre_threshold )')))
df = df.query('(incre < @incre_threshold )|(incre_reverse < @incre_threshold )')

# Check the max speed
display(df.sort_values(by='speed',ascending=False).head(10))
df.drop(['incre', 'incre_reverse'], 1, inplace=True)
sudden increase number 2
date HrMn type dir speed wind_type month dir_windrose incre incre_reverse
time
1999-12-27 18:00:00 19991227 1800 SY-MT 160 51 N 12 290 24.0 14.0
1999-12-27 19:00:00 19991227 1900 SY-MT 180 37 N 12 270 -14.0 4.0
2012-12-21 14:00:00 20121221 1400 SY-MT 180 35 N 12 270 14.0 12.0
2009-01-24 04:00:00 20090124 400 SY-MT 160 35 N 1 290 15.0 18.0
2009-01-24 08:00:00 20090124 800 SY-MT 180 35 N 1 270 2.0 2.0
2009-01-24 10:00:00 20090124 1000 SY-MT 170 35 N 1 280 2.0 6.0
2007-02-14 11:00:00 20070214 1100 SY-MT 180 35 N 2 270 14.0 10.0
2006-02-18 00:00:00 20060218 0 SY-MT 140 35 N 2 310 14.0 25.0
2009-01-24 07:00:00 20090124 700 SY-MT 190 33 N 1 260 2.0 -2.0
1996-02-07 18:00:00 19960207 1800 SY-MT 160 33 N 2 290 4.0 23.0

1.3.2 0 Speed

In [17]:
with_too_many_zero, null_wind_frequency = is_with_too_many_zero(df.query("(date >= 20050000)"))
delete_zero = with_too_many_zero
if delete_zero:
    df = df.query('(speed > 0)')
print(delete_zero, null_wind_frequency)
False 0.0572973129237

1.3.3 Direction Re-alignment and 999

For some datasets, the 16 sectors are not recorded properly,

e.g. the recorded sectors look like [0, 20, 30, 50], so the angles need to be redistributed into proper 22.5° sectors.
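
realign_direction and fill_direction_999 used in the next cell are project helpers that are not reproduced here; a minimal standalone sketch of the re-alignment idea, assuming each reported direction is simply snapped to the nearest multiple of the sector length:

import numpy as np

def realign_direction_sketch(directions_deg, sector_length=22.5):
    # snap each recorded direction to the nearest multiple of the sector length
    dirs = np.asarray(directions_deg, dtype=float)
    return (np.round(dirs / sector_length) * sector_length) % 360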

In [18]:
display(df['dir'].value_counts().sort_index())
effective_column = df.query('dir < 999')['dir'].value_counts()[df['dir'].value_counts() > 30].sort_index()
if integer_data:
    SECTOR_LENGTH = 360/len(effective_column) 
else: 
    SECTOR_LENGTH = 10
print(len(effective_column), SECTOR_LENGTH)
0      3789
10     3928
20     3016
30     3102
40     2448
50     2630
60     1922
70     2066
80     1730
90     2197
100    2083
110    2891
120    2725
130    3563
140    3421
150    4754
160    4626
170    6436
180    5347
190    6230
200    4789
210    5008
220    3309
230    2888
240    1958
250    1685
260    1284
270    1495
280    1389
290    2040
300    2566
310    4939
320    5425
330    7138
340    4891
350    4832
999    7612
Name: dir, dtype: int64
36 10.0
In [19]:
df.query('dir == 999')['speed'].value_counts()
Out[19]:
0    7611
2       1
Name: speed, dtype: int64
In [20]:
df=realign_direction(df, effective_column)
df=fill_direction_999(df, SECTOR_LENGTH)

1.4 Time Shift Comparison

In [21]:
DIR_REDISTRIBUTE = 'even'
if DIR_REDISTRIBUTE == 'even':
    DIR_BIN = arange(-5, 360, 10) 
elif DIR_REDISTRIBUTE == 'round_up':
    DIR_BIN = arange(0, 360+10, 10) 

# Comparison between mid_year, looking for: 
# 1. Odd Even Bias
# 2. Time Shift of Wind Speed Distribution
bins = arange(0, df.speed.max() + 1)
df.query('date < @MID_YEAR * 10000')['speed'].plot(
    kind='hist', alpha=0.5,bins=bins, label='< %s' % MID_YEAR)

df.query('date > @MID_YEAR * 10000')['speed'].plot(
    kind='hist', alpha=0.5,bins=bins, label='> %s' % MID_YEAR)

plt.suptitle('Speed Comparison between year < %s, > %s ' % (MID_YEAR, MID_YEAR), fontsize = 14)
plt_configure(xlabel='Speed', ylabel='Frequency', legend=True, figsize=(8, 3))
In [22]:
df.query('date < @MID_YEAR * 10000')['dir'].plot(
    kind='hist', alpha=0.5,bins=DIR_BIN, label='< %s' % MID_YEAR)

df.query('date > @MID_YEAR * 10000')['dir'].plot(
    kind='hist', alpha=0.5,bins=DIR_BIN, label='> %s' % MID_YEAR)

plt.suptitle('Dir Comparison between year < %s, and > %s ' % (MID_YEAR, MID_YEAR), fontsize = 14)
plt_configure(xlabel='Dir', ylabel='Frequency', legend={'loc':'best'}, figsize=(8, 3), tight='x')
In [23]:
# Inspect the time shift of speed and degree distribution, and odd-even bias
check_time_shift(df, speed_unit_text=speed_unit_text, dir_unit_text=dir_unit_text)
1979 - 1979
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
1980 - 1984
1985 - 1989
1990 - 1994
1995 - 1999
2000 - 2004
2005 - 2009
2010 - 2013
In [24]:
df.resample('A').mean().plot(y='speed')
plt.gca().set_ylim(bottom=0)
df.resample('M').mean().plot(y='speed', figsize=(20,4))
plt.gca().set_ylim(bottom=0)
Out[24]:
(0, 10.0)
In [25]:
display(df[df['dir'].isnull()])
df.dropna(subset=['dir'], inplace=True)
date HrMn type dir speed wind_type month dir_windrose
time
In [26]:
for column in ['speed', 'dir']:
    if column == 'speed':
        bins = arange(0, df[column].max()+1, 1)
    else:
        bins = arange(0, 361, 10)
    den, _ = np.histogram(df[column], bins=bins, density=True)
    y_top=max(den)*1.2
    for year in arange(1980, 2016):
        end_year = year
        sub_df = df[str(year):str(end_year)]
        if len(sub_df) > 5000:
            plt.figure()
            df[column].hist(bins=bins, alpha=0.3, normed=True)
            sub_df[column].hist(bins=bins, alpha=0.5, figsize=(3,1.5), normed=True)
            plt.gca().set_ylim(top=y_top)
            plt_configure(title=str(year))
    align_figures()
In [27]:
for column in ['speed', 'dir']:
    if column == 'speed':
        bins = arange(0, df[column].max()+1, 1)
    else:
        bins = arange(0, 361, 10)
    density_all, _ = np.histogram(df[column], bins=bins, density=True)
    df[column].hist(bins=bins, figsize=(5,3))

    R_squares = []
    years = []
    for year in arange(1980, 2016):
        start_year, end_year = year-1, year+1
        sub_df = df[str(start_year):str(end_year)]
        if len(sub_df) > 5000:
            density, _ = np.histogram(sub_df[column], bins=bins, density=True)
            y_mean = np.mean(density_all)
            SS_tot = np.sum(np.power(density_all - y_mean, 2))
            SS_res = np.sum(np.power(density_all - density, 2))

            R_square = 1 - SS_res / SS_tot
            R_squares.append(R_square)
            years.append(year)

    plt.figure()
    plot(years, R_squares)
    ylim = max(min(plt.gca().get_ylim()[0],0.85),0)
    plt.gca().set_ylim(bottom=ylim, top=1)
    plt_configure(figsize=(5,3))
    align_figures()

1.5 Re-distribute Direction and Speed (Optional)

e.g. Dir 50 -> 45 ~ 55, to make the KDE result better
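
randomize_angle and randomize_speed below are project helpers; a minimal sketch of the direction redistribution described above, assuming a uniform spread of each reported value across its sector:

import numpy as np

def randomize_angle_sketch(directions_deg, sector_length=10):
    # e.g. with 10-degree sectors a reported 50 is spread uniformly over [45, 55)
    dirs = np.asarray(directions_deg, dtype=float)
    jitter = np.random.uniform(-sector_length / 2, sector_length / 2, size=dirs.shape)
    return (dirs + jitter) % 360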

In [28]:
if integer_data:
    df = randomize_angle(df, DIR_REDISTRIBUTE, SECTOR_LENGTH)
In [29]:
if integer_data:
    if delete_zero:
        redistribute_method = 'down'
    else:
        redistribute_method = 'up'

    df, speed_redistribution_info = randomize_speed(df, redistribute_method)
Redistribute upward, e.g. 0 -> [0,1]

1.6 Generate (x,y) from (speed,dir)

In [30]:
# Cook orientation
# df['dir']= (df['dir'] + 180)%360
In [31]:
# There might be a small dot at the centre, due to the many zero/near-zero speed records collapsing onto the same point
# Scatter plots in matplotlib have performance issues; rendering is very slow
df['x'] = df['speed'] * cos(df['dir'] * pi / 180.0)
df['y'] = df['speed'] * sin(df['dir'] * pi / 180.0)

2. Re-select Data and Overview

2.1 Data Overview

In [32]:
## Summary of the data selection
print('Knot unit?', knot_unit)
print('Report type used:', report_type_most_used)
print('Sampling time used:', sample_times)
if 'speed_redistribution_info' in globals():
    print('Speed redistribution info:', speed_redistribution_info )

df_all_years = df # for later across-year comparison
df = df_all_years.query('(date >= 20100000) & (date < 20150000)')
# df = df.query('(HrMn == 0) and (speed >= 0.5) and (date%10000 > 900) and (date%10000 < 1000)' )
df.describe()
Knot unit? True
Report type used: SY-MT
Sampling time used: [0]
Speed redistribution info: Redistribute upward, e.g. 0 -> [0,1]
Out[32]:
date HrMn dir speed month dir_windrose x y
count 2.806600e+04 28066.000000 28066.000000 28066.000000 28066.000000 28066.000000 28066.000000 28066.000000
mean 2.011260e+07 1148.574788 190.745288 4.970712 6.138780 234.555013 -1.233491 -0.066619
std 9.704539e+03 696.268540 103.438455 3.369369 3.519564 214.959077 4.985859 3.110743
min 2.010010e+07 0.000000 -4.986196 0.000259 1.000000 0.000000 -35.059729 -21.181042
25% 2.010110e+07 500.000000 120.250914 2.801845 3.000000 110.000000 -4.190693 -2.186894
50% 2.011090e+07 1100.000000 189.209888 4.143457 6.000000 210.000000 -0.185727 -0.353921
75% 2.012070e+07 1800.000000 291.390825 6.319304 9.000000 280.000000 2.465441 1.904472
max 2.013043e+07 2300.000000 354.989938 35.164171 12.000000 999.000000 13.824255 19.257946
In [33]:
df.plot(y='speed',legend=True,figsize=(20,5))
Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0x15154860>
In [34]:
# Accumulation by month
df.resample('M').count().plot(y='date', kind='bar',figsize=(20,4))
Out[34]:
<matplotlib.axes._subplots.AxesSubplot at 0x14d92cc0>
In [35]:
# 90 degree is east
ax = WindroseAxes.from_ax()
viridis = plt.get_cmap('viridis')
ax.bar(df.dir_windrose, df.speed, normed=True, opening=0.8, edgecolor='white', nsector=36, cmap=viridis)
ax.set_legend()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)
In [36]:
if len(df) > 1000000:
    bins=arange(0,362)
    df['dir'].hist(bins=bins, normed=True,alpha=0.5,label='min')
    
    df = df_all_years.sample(n=500000, replace=True)    
    df['dir'].hist(bins=bins, normed=True,alpha=0.5,label='min resampled')
    plt_configure(legend=True, figsize=(20,4))
In [37]:
x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(df.speed)

# 1. Histogram comparison
fig = plt.figure()
df['speed'].hist(bins=arange(0, df.speed.max()), alpha=0.5, label='Data', normed=True)             
plot(x, y_weibull, '-', color='black',label='Weibull')   
plt_configure(figsize=(4,3),xlabel='V',ylabel='PDF', legend=True)

# 2. CDF comparison
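# On Weibull probability paper ln(-ln(1-P)) is plotted against ln(V):
# for F(v) = 1 - exp(-(v/c)^k) this gives k*ln(v) - k*ln(c), a straight line,
# so deviation from a line indicates departure from the Weibull model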
fig = plt.figure()
plot(log(x), log(-log(1-y_ecdf)),'o', label='ECDF')
plot(log(x), log(-log(1-y_cdf_weibull)),'-', label='Weibull')
plt_configure(xlabel="ln(V)", ylabel="ln(-ln(1-P)",legend={'loc':'best'}, figsize=(4,3))
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:11: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:12: RuntimeWarning: divide by zero encountered in log
In [38]:
df.plot(kind='scatter', x='x', y='y', alpha=0.05, s=2)
plt.gca().set_aspect('equal')
plt_configure(figsize=(3.2,3.2),xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)

2.2 Overview by Direction

In [39]:
if len(effective_column) == 16:
    rebinned_angle = 22.5
else: 
    rebinned_angle = 10
In [40]:
original_incre, incre = SECTOR_LENGTH, rebinned_angle
start, end = -original_incre/2 + incre/2, 360

max_speed = df.speed.max()
max_count = max_count_for_angles(df, start, end, incre)
plot_range = [0, max_speed, 0, max_count*1.05]

for angle in arange(start, end, incre):
    start_angle, end_angle = angle-incre/2, angle+incre/2
    sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)   
    
    fig = plt.figure()
    sub_df['speed'].hist(bins=arange(0, max_speed), alpha=0.5, label='Data')
    title ='%s (%s - %s), %s' % (angle, start_angle, end_angle, len(sub_df)) 
    plt.axis(plot_range)
    plt_configure(figsize=(3,1.5), title=title)
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)

2.3 Overview by Month

In [41]:
month_incre = 1
current_df = df.query('speed>=1')
for month in arange(1, 12+month_incre, month_incre): 
    end_month = month+month_incre
    sub_df = current_df.query('(month >= @month) and (month < @end_month)')
    if len(sub_df) > 0:
        if month_incre == 1:
            title = 'Month: %s' % (month)
        else:
            title = 'Month: %s - %s ' % (month, end_month-1)
        ax = WindroseAxes.from_ax()
        ax.bar(sub_df.dir_windrose, sub_df.speed, normed=True, opening=0.8, edgecolor='white', nsector=36, cmap=plt.get_cmap('viridis'))
        plt_configure(figsize=(3,3), title=title)
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

3. Create input data and configuration

In [42]:
SPEED_SET = array(list(zip(df.x, df.y)))
if 'NUMBER_OF_GAUSSIAN' not in globals():
    NUMBER_OF_GAUSSIAN = 3
FIT_METHOD = 'square_error'
DEFAULT_BANDWDITH = 1.5 if knot_unit else 0.7
fig_list = []
In [43]:
fit_limit = ceil(df['speed'].quantile(.95))
fitting_axis_range = arange(-fit_limit, fit_limit+1, 1)
print(fitting_axis_range)

FITTING_RANGE = []
for i in fitting_axis_range:
    for j in fitting_axis_range:
        FITTING_RANGE.append([i,j])
[-12 -11 -10  -9  -8  -7  -6  -5  -4  -3  -2  -1   0   1   2   3   4   5
   6   7   8   9  10  11  12]
In [44]:
plot_limit = ceil(df['speed'].quantile(.95))
PLOT_AXIS_RANGE = arange(-plot_limit, plot_limit+1, 1)

4. Kernel Density Estimation

In [45]:
sample = SPEED_SET
KDE_KERNEL = 'gaussian'
# KDE_KERNEL, bandwidth = 'tophat', 1
In [46]:
%%time
from sklearn.grid_search import GridSearchCV
# from sklearn.model_selection import GridSearchCV  ## too slow

# The selected bandwidth can sometimes be too extreme
if knot_unit:
    bandwidth_range = arange(0.7,2,0.2)
else:
    bandwidth_range = arange(0.4,1,0.1)

# Grid search cannot handle too much data (it would take a long time)
if len(sample) > 50000:    
    df_resample=df.sample(n=50000, replace=True)
    bandwidth_search_sample = array(list(zip(df_resample.x, df_resample.y)))
else:
    bandwidth_search_sample = sample

grid = GridSearchCV(neighbors.KernelDensity(kernel = KDE_KERNEL),
                    {'bandwidth': bandwidth_range}, n_jobs=-1, cv=4) 

grid.fit(bandwidth_search_sample)
bandwidth = grid.best_params_['bandwidth']
print(bandwidth)
D:\ProgramData\Anaconda3\lib\site-packages\sklearn\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
D:\ProgramData\Anaconda3\lib\site-packages\sklearn\grid_search.py:43: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. This module will be removed in 0.20.
  DeprecationWarning)
0.7
Wall time: 1min 21s
In [47]:
if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWDITH

kde = neighbors.KernelDensity(bandwidth=bandwidth, kernel = KDE_KERNEL).fit(sample)

points = FITTING_RANGE
# very slow if the dataset is too large, e.g. 100,000
# kde returns log prob, need to convert it
kde_result = exp(kde.score_samples(points))
print('bandwidth:', bandwidth, len(kde_result))
print(kde_result[:5])
bandwidth: 0.7 625
[  1.75323162e-06   3.07138842e-06   2.42519722e-05   6.02241347e-05
   1.15292813e-04]
In [48]:
# Plot jPDF
X = Y = PLOT_AXIS_RANGE
# Passing exp(kde.score_samples(...)) directly does not work; a callable (lambda) must be passed instead
# see http://stackoverflow.com/questions/21035437/passing-a-function-as-an-argument-in-python
kde_Z = generate_Z_from_X_Y(X,Y, lambda coords: exp(kde.score_samples(coords)))
colorbar_lim = 0, kde_Z.max()

plot_3d_prob_density(X,Y,kde_Z)

fig_kde,ax1 = plt.subplots(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, ax=ax1)

with sns.axes_style({'axes.grid' : False}):
    from matplotlib import ticker
    fig_hist,ax2 = plt.subplots(figsize=(3.5,2.5))
    _,_,_,image = ax2.hist2d(df.x, df.y, bins=PLOT_AXIS_RANGE, cmap='viridis',)
    ax2.set_aspect('equal')
    cb = plt.colorbar(image)
    tick_locator = ticker.MaxNLocator(nbins=6)
    cb.locator = tick_locator
    cb.update_ticks()
    plt_configure(ax=ax2, xlabel='x'+speed_unit_text,ylabel='y'+speed_unit_text)
align_figures()
In [49]:
kde_cdf = cdf_from_pdf(kde_result)
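
cdf_from_pdf is a project helper; a minimal sketch, assuming it simply accumulates the gridded PDF values and normalises so that the last value equals one (this discrete CDF is what a K-S style comparison can be based on):

import numpy as np

def cdf_from_pdf_sketch(pdf_values):
    # cumulative sum of the density evaluated on FITTING_RANGE, normalised to 1
    cdf = np.cumsum(np.asarray(pdf_values))
    return cdf / cdf[-1]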

5. GMM by Expectation-maximization

In [50]:
sample= SPEED_SET
clf = mixture.GaussianMixture(n_components=NUMBER_OF_GAUSSIAN, covariance_type='full')
clf.fit(sample)
print(clf.converged_)
True
In [51]:
gmm_em_result = read_gmm_em_result(clf)
pretty_print_gmm(gmm_em_result)
Out[51]:
weight mean_x mean_y sig_x sig_y corr
1 0.499 1.928 -0.748 2.232 2.162 -0.077
2 0.264 -3.994 -0.585 3.399 3.215 0.315
3 0.237 -4.814 1.945 6.259 3.761 0.037
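
read_gmm_em_result and pretty_print_gmm are project helpers; a minimal sketch, assuming the table above is built from the standard fitted attributes of sklearn's GaussianMixture (weights_, means_ and the full covariances_):

import numpy as np

def gmm_params_sketch(clf):
    # per-component weight, mean, marginal standard deviations and correlation
    rows = []
    for weight, mean, cov in zip(clf.weights_, clf.means_, clf.covariances_):
        sig_x, sig_y = np.sqrt(cov[0, 0]), np.sqrt(cov[1, 1])
        corr = cov[0, 1] / (sig_x * sig_y)
        rows.append([weight, mean[0], mean[1], sig_x, sig_y, corr])
    return rows
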
In [52]:
fig,ax = plt.subplots(figsize=(3.5,3.5))
plot_gmm_ellipses(gmm_em_result, ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
GMM Plot Result
0.49894287172 [[ 1.92804623 -0.74776635]] [ 2.10370223  2.28727599] -123.885295177
0.264029083066 [[-3.99359868 -0.58532644]] [ 2.72829546  3.80058901] -50.0121680074
0.237028045214 [[-4.8139851   1.94498738]] [ 3.75711339  6.26118233] -88.0266187571
In [53]:
X = Y = PLOT_AXIS_RANGE
pdf_Z = generate_Z_from_X_Y(X,Y, lambda coords: exp(clf.score_samples(coords)))

def residule_between_kde_and_gmm(points):
    kde_vals = exp(kde.score_samples(points))
    gmm_vals = exp(clf.score_samples(points))
    return kde_vals - gmm_vals 

residual_Z = generate_Z_from_X_Y(X,Y, residule_between_kde_and_gmm)

plot_3d_prob_density(X,Y,pdf_Z)
plot_3d_prob_density(X,Y,residual_Z)
align_figures()

fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar_lim=colorbar_lim)
fig_em = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,pdf_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar_lim=colorbar_lim)
fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,residual_Z,
                     xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
align_figures()

Goodness-of-fit Statistics

In [54]:
points = FITTING_RANGE
gmm_pdf_result = exp(clf.score_samples(points))
gof_df(gmm_pdf_result, kde_result)
Out[54]:
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.948 0.029 0.051 4.413006e-07 0.035 0.429
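
gof_df is a project helper whose implementation is not shown; a minimal sketch (an assumption, not the exact code) of how statistics of this kind can be formed, treating the KDE values on FITTING_RANGE as the reference; the K_S column is presumably the largest absolute difference between discrete CDFs built as in the cdf_from_pdf sketch above:

import numpy as np

def gof_sketch(model_pdf, kde_pdf):
    error = kde_pdf - model_pdf
    ss_res = np.sum(error ** 2)
    ss_tot = np.sum((kde_pdf - kde_pdf.mean()) ** 2)
    mse = np.mean(error ** 2)
    rmse = np.sqrt(mse)
    return {'R_square': 1 - ss_res / ss_tot,
            'MSE': mse,
            'RMSE / Max': rmse / kde_pdf.max(),
            'RMSE / Mean': rmse / kde_pdf.mean()}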

6. GMM by Optimization

In [55]:
sample = SPEED_SET
points = FITTING_RANGE
max_speed = df.speed.max()
print(FIT_METHOD)
square_error
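
GMM_fit_score is a project helper; with FIT_METHOD = 'square_error' a log-mean-squared-error objective is consistent with the numbers in this notebook (the optimum value of about -15.76 reported below equals ln of the MSE of 1.43e-07 reported in section 6.2). A minimal sketch of such a score under that assumption:

import numpy as np

def square_error_score_sketch(candidate_pdf_on_grid, kde_on_grid):
    # assumed form: log of the mean squared deviation from the KDE reference;
    # the optimiser would evaluate this after rebuilding the candidate mixture
    # density on FITTING_RANGE from the flat parameter array
    return np.log(np.mean((candidate_pdf_on_grid - kde_on_grid) ** 2))
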
In [56]:
# initial guess from the GMM-EM result
# GMM format: weight, meanx, meany, sigx, sigy, rho
x0 = gmm_em_result

cons = [
        # sum of every 6th element, which is the fraction of each gaussian
        {'type': 'eq', 'fun': lambda x: sum(x[::6]) - 1},
        # # limit the width/height ratio of the ellipse, optional
#         {'type': 'ineq', 'fun': lambda x: width_height_ratios_set(x) - 1/3},
#         {'type': 'ineq', 'fun': lambda x: 3 - width_height_ratios_set(x)},
]

bounds = [(0., 0.99),(-fit_limit, fit_limit),(-fit_limit, fit_limit),
          (0., fit_limit),(0., fit_limit),(-0.99, 0.99)]*(len(x0)//6)

result = sp.optimize.minimize(
    lambda params: GMM_fit_score(params, kde_result, points, FIT_METHOD),
    x0,
    bounds = bounds,
    constraints=cons,
    tol = 0.000000000001,
    options = {"maxiter": 500})
result
Out[56]:
     fun: -15.758814713123712
     jac: array([  6.87428713e-01,  -2.38418579e-07,   3.57627869e-07,
         2.38418579e-07,  -1.19209290e-07,  -1.19209290e-07,
         6.87425017e-01,   4.76837158e-07,   3.57627869e-07,
         0.00000000e+00,   3.57627869e-07,   0.00000000e+00,
         6.87428713e-01,   0.00000000e+00,   1.19209290e-07,
         1.19209290e-07,   1.19209290e-07,   0.00000000e+00,
         0.00000000e+00])
 message: 'Optimization terminated successfully.'
    nfev: 1687
     nit: 83
    njev: 83
  status: 0
 success: True
       x: array([ 0.11555615,  0.9283853 , -0.07450164,  2.05073704,  1.61659019,
        0.44224838,  0.175232  ,  2.73774064, -1.89303041,  1.3175924 ,
        1.35885988, -0.05841041,  0.70921185, -1.83446419,  0.30115834,
        4.94554499,  3.54161939,  0.10208762])

6.1 GMM Result

In [57]:
gmm = group_gmm_param_from_gmm_param_array(result.x, sort_group = True)
mixed_model_pdf = generate_gmm_pdf_from_grouped_gmm_param(gmm)
gmm_pdf_result = mixed_model_pdf(points)
pretty_print_gmm(gmm)
Out[57]:
weight mean_x mean_y sig_x sig_y corr
1 0.709 -1.834 0.301 4.946 3.542 0.102
2 0.175 2.738 -1.893 1.318 1.359 -0.058
3 0.116 0.928 -0.075 2.051 1.617 0.442
In [58]:
fig_gmm, ax = plt.subplots(figsize=(3.5,3.5))
plot_gmm_ellipses(gmm, ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
GMM Plot Result
0.709211852928 [[-1.83446419  0.30115834]] [ 3.5043575   4.97201786] -81.6469067896
0.175231995805 [[ 2.73774064 -1.89303041]] [ 1.29344877  1.38186114] -148.918595509
0.115556151267 [[ 0.9283853  -0.07450164]] [ 1.31951349  2.25339087] -59.2504359635

6.2 Goodness-of-fit statistics

In [59]:
gof_df(gmm_pdf_result, kde_result)
Out[59]:
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.983 0.019 0.048 1.432299e-07 0.020 0.244
In [60]:
X = Y = PLOT_AXIS_RANGE
pdf_Z = generate_Z_from_X_Y(X,Y, mixed_model_pdf)# passing a function as an argument

def residule_between_kde_and_gmm(points):
    kde_vals = exp(kde.score_samples(points))
    gmm_vals = mixed_model_pdf(points)
    return kde_vals - gmm_vals 

residual_Z = generate_Z_from_X_Y(X,Y, residule_between_kde_and_gmm)

plot_3d_prob_density(X,Y,pdf_Z)
plot_3d_prob_density(X,Y,residual_Z)
align_figures()

fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
fig_gmm = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,pdf_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,residual_Z,  xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
align_figures()
In [61]:
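# Joint pdf expressed in polar form: f(V, theta) = f_xy(V*cos(theta), V*sin(theta)) * V,
# where the factor V is the Jacobian of the (x, y) -> (V, theta) change of variables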
def f(V,theta):
    return (mixed_model_pdf([[V*cos(theta),V*sin(theta)]]))*V
In [62]:
x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(df.speed)

# 3. GMM distribution
y_ = [integrate.nquad(f, [[0, x_val],[0, 2*pi]]) for x_val in x]
y_cdf_gmm = array(list(zip(*y_))[0])

plot(log(x), log(-log(1-y_ecdf)),'o', label = 'Empirical')
plot(log(x), log(-log(1-y_cdf_weibull)),'--', label = 'Weibull')
plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black', label = 'GMM')
plt_configure(xlabel='ln(V)',ylabel='ln(-ln(1-P))',legend={'loc':'best'})
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:7: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:8: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:9: RuntimeWarning: divide by zero encountered in log
In [63]:
# Calculate Speed Distribution
# 1. GMM Model
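# Integrate the joint pdf over a thin ring (V in [x-0.01, x+0.01], all directions),
# divide by the 0.02 band width to recover a density, and scale by the sample size
# so the curve is comparable with the frequency histogram below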
x = arange(0, max_speed, 0.5)
y_ =[integrate.nquad(f, [[x_-0.01, x_+0.01],[0, 2*pi]]) for x_ in x]
y_gmm = array(list(zip(*y_))[0])*len(df.speed)/0.02

# 2. Weibull
y_weibul = sp.stats.weibull_min.pdf(x, *weibull_params)

df['speed'].hist(bins=arange(0, df.speed.max()), alpha=0.5, label='Data')
plot(x, y_gmm,'-', color='black', label='GMM')
plot(x, y_weibul*len(df.speed), '--', color='black', label='Weibull') 
print('Speed Distribution Comparison')
plt_configure(xlabel='Speed'+speed_unit_text,
              ylabel='Frequency',legend=True, figsize=(4, 2))
plt.gca().set_ylim(bottom = 0)
plt.tight_layout()
plt.locator_params(axis='y', nbins=5)
Speed Distribution Comparison
In [64]:
# Calculate Angle Distribution
x = linspace(0,2*pi, num=36+1)
y_ =[integrate.nquad(f, [[0, inf],[x_-pi/36, x_+pi/36]]) for x_ in x]
y = array(list(zip(*y_))[0])*len(df['dir']) 

df['dir'].hist(bins=DIR_BIN, alpha=0.5, label='Data')
plot(x/pi*180, y,'-', color='black', label='GMM')
title='Direction Distribution Comparison'
plt_configure(xlabel='Direction'+dir_unit_text, ylabel='Frequency', 
              legend={'loc': 'best'} ,tight='xtight',figsize = (4,2))
plt.tight_layout()
dir_fig = plt.gcf()
print(title)
Direction Distribution Comparison
In [65]:
# %%time
incre = max(SECTOR_LENGTH, 10)
density_collection=Parallel(n_jobs=-1)(delayed(direction_compare)(gmm, df, angle, incre) 
                                        for angle in arange(0, 360, incre))  
# This R square is computed as in the paper:
# "Comparison of bivariate distribution construction approaches for analysing wind speed and direction data"
# http://onlinelibrary.wiley.com/doi/10.1002/we.400/full
print(true_R_square(density_collection))
0.891591426227

6.3 Sectoral Comparison

In [66]:
# Sectoral comparison of the fitted models against the data
def model_data_comparison(df, original_incre = 10, incre = 10):
    start, end = -original_incre/2 + incre/2, 360
    max_diff_array = []
    curve_collection = []
    max_speed = df.speed.max()
    
    # Find a max count for plotting histogram
    max_count = max_count_for_angles(df, start, end, incre)
    plot_range = [0, max_speed, 0, max_count*1.05]
    
    for angle in arange(start, end, incre):
        angle_radian, incre_radian = radians(angle), radians(incre)  
        start_angle, end_angle = angle-incre/2, angle+incre/2
        
        # Select data from observation
        sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)
        data_size = len(sub_df.speed)
        # 1. Get Weibull and ECDF
        x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(sub_df.speed)
        # 2. Get GMM PDF, CDF
        _, y_gmm, y_cdf_gmm, direction_prob = gmm_integration_in_direction(f, angle_radian-incre_radian/2, angle_radian+incre_radian/2, x)
        
        # 3. Make Plots
        fig = plt.figure(figsize=(10,1.9))
#         fig = plt.figure(figsize=(10,1.7))
        # 3.1. Frequency Comparison
        ax1 = fig.add_subplot(1,3,1)        
        sub_df['speed'].hist(bins=arange(0, sub_max_speed), alpha=0.5, label='Data')                  
        plot(x, y_gmm*data_size,'-', color='black', label='GMM')
        plot(x, y_weibull*data_size, '--', color='black',label='Weibull')   
#         plt_configure(xlabel = "$V$", ylabel='Frequency', legend=True)
        plt_configure(xlabel = "V", ylabel='Frequency', legend=True)
        plt.axis(plot_range)
        
        # 3.2. CDF Comparison
        ax2 = fig.add_subplot(1,3,2)
        plot(x, y_ecdf,'o', alpha=0.8, label='Data')
        plot(x, y_cdf_gmm,'-', color='black',label='GMM')
        plot(x, y_cdf_weibull,'--', color='black',label='Weibull')
        plt.gca().set_xlim(right = max_speed)
#         plt_configure(xlabel = "$V$", ylabel='$P$', legend=True)
        plt_configure(xlabel = "V", ylabel='P', legend=True)
        
        # 3.3. Weibull Comparison
#         ax3 = fig.add_subplot(1,3,3)
#         plot(log(x), log(-log(1-y_ecdf)),'o', alpha=0.8, label='Data')
#         plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black', label='GMM')
#         plot(log(x), log(-log(1-y_cdf_weibull)),'--',color='black',label='Weibull')
#         plt.gca().set_xlim(right = log(max_speed+1))
# #         plt_configure(xlabel="ln($V$)", ylabel="ln(-ln(1-$P$)",legend={'loc':'best'})
#         plt_configure(xlabel="ln(V)", ylabel="ln(-ln(1-P)",legend={'loc':'best'})
        
        bins = arange(0, sub_df.speed.max()+1)
        density, _ = np.histogram(sub_df['speed'], bins=bins, normed=True)
        density_expected_ =[integrate.nquad(f, [[x_, x_+1],[angle_radian-incre_radian/2, angle_radian+incre_radian/2]]) 
                            for x_ in bins[:-1]]
        density_expected_gmm = array(list(zip(*density_expected_ ))[0])/direction_prob
        R_square_gmm = sector_r_square(density, density_expected_gmm)
        
        density_expected_weibull = sp.stats.weibull_min.cdf(bins[1:], *weibull_params) - sp.stats.weibull_min.cdf(bins[:-1], *weibull_params) 
        R_square_weibull = sector_r_square(density, density_expected_weibull)

        diff, diff_weibull= np.abs(y_ecdf - y_cdf_gmm), np.abs(y_ecdf - y_cdf_weibull)
        max_diff_array.append([len(sub_df), angle, diff.max(), x[diff.argmax()], 
                               diff_weibull.max(), x[diff_weibull.argmax()], R_square_gmm, R_square_weibull])
        curves = {'angle': angle, 'data_size': data_size, 'weight': direction_prob, 
                  'x': x, 'gmm_pdf': y_gmm, 'gmm_cdf': y_cdf_gmm,
                  'weibull_pdf': y_weibull, 'weibull_cdf': y_cdf_weibull, 'ecdf': y_ecdf}
        curve_collection.append(curves)
        
        plt.tight_layout()
        plt.show()
        print('%s (%s - %s) degree' % (angle, start_angle, end_angle))
        print('data size:', len(sub_df), 'weight', len(sub_df)/len(df))
        print('GMM', 'Weibull')
        print('R square', R_square_gmm,  R_square_weibull)
        print('max diff:', diff.max(), diff_weibull.max(), 'speed value:', x[diff.argmax()], x[diff_weibull.argmax()], 'y gmm', y_cdf_gmm[diff.argmax()])
        print(' ')
    return max_diff_array, curve_collection
In [67]:
%%time
if len(effective_column) == 16:
    rebinned_angle = 22.5
else: 
    rebinned_angle = 20
    
max_diff_array, curve_collection = model_data_comparison(df, SECTOR_LENGTH, rebinned_angle)
5.0 (-5.0 - 15.0) degree
data size: 1190 weight 0.04240005700848001
GMM Weibull
R square 0.941013596534 0.901249735501
max diff: 0.0431872267562 0.0423372188494 speed value: 8.75519572132 2.18879893033 y gmm 0.922358991731
 
25.0 (15.0 - 35.0) degree
data size: 1248 weight 0.044466614408893324
GMM Weibull
R square 0.920220103213 0.93613647517
max diff: 0.0505266198494 0.0345875030929 speed value: 7.26012650975 2.17803795293 y gmm 0.834890046817
 
45.0 (35.0 - 55.0) degree
data size: 1129 weight 0.04022660870804532
GMM Weibull
R square 0.913527922499 0.965828067128
max diff: 0.0715573736226 0.0575893346446 speed value: 6.59846424683 4.12404015427 y gmm 0.795581687493
 
65.0 (55.0 - 75.0) degree
data size: 846 weight 0.030143233806028645
GMM Weibull
R square 0.816321204173 0.946425739321
max diff: 0.139287104627 0.0641914829661 speed value: 5.09885335072 3.82414001304 y gmm 0.655039136508
 
85.0 (75.0 - 95.0) degree
data size: 815 weight 0.02903869450580774
GMM Weibull
R square 0.758140866016 0.916818777993
max diff: 0.162880560666 0.103834738458 speed value: 4.74364424627 4.06598078252 y gmm 0.605217598844
 
105.0 (95.0 - 115.0) degree
data size: 1033 weight 0.03680609990736122
GMM Weibull
R square 0.875978898275 0.896339637303
max diff: 0.0550435561007 0.0551538838729 speed value: 5.97420457682 5.97420457682 y gmm 0.736824788527
 
125.0 (115.0 - 135.0) degree
data size: 1318 weight 0.04696073540939215
GMM Weibull
R square 0.917451099982 0.924860844398
max diff: 0.0697196973489 0.0493741250357 speed value: 4.7384018363 5.92300229538 y gmm 0.509021670035
 
145.0 (135.0 - 155.0) degree
data size: 1732 weight 0.061711679612342335
GMM Weibull
R square 0.875958436948 0.899972284745
max diff: 0.107254375725 0.0369739168742 speed value: 9.10859521109 6.50613943649 y gmm 0.870533821452
 
165.0 (155.0 - 175.0) degree
data size: 2474 weight 0.08814936221762987
GMM Weibull
R square 0.837399457279 0.924895299138
max diff: 0.0985288001349 0.0318231684276 speed value: 9.02285813041 6.01523875361 y gmm 0.783249899569
 
185.0 (175.0 - 195.0) degree
data size: 2531 weight 0.09018028931803605
GMM Weibull
R square 0.878083875687 0.932952441763
max diff: 0.0753235181465 0.0445611915504 speed value: 9.25372932591 5.55223759555 y gmm 0.761613522097
 
205.0 (195.0 - 215.0) degree
data size: 2333 weight 0.08312548991662509
GMM Weibull
R square 0.913308809511 0.92236783857
max diff: 0.0279952295248 0.036898438529 speed value: 12.0563772664 6.0281886332 y gmm 0.941411431839
 
225.0 (215.0 - 235.0) degree
data size: 1537 weight 0.05476377111095276
GMM Weibull
R square 0.86173592319 0.886024495202
max diff: 0.0558787024516 0.0576929314187 speed value: 1.06748076242 1.06748076242 y gmm 0.0430152468002
 
245.0 (235.0 - 255.0) degree
data size: 962 weight 0.03427634860685527
GMM Weibull
R square 0.845325588138 0.871607808436
max diff: 0.0796094399807 0.0438449080986 speed value: 6.37239484823 6.37239484823 y gmm 0.796689936319
 
265.0 (255.0 - 275.0) degree
data size: 743 weight 0.02647331290529466
GMM Weibull
R square 0.789926707478 0.821187066166
max diff: 0.151520353617 0.0834755889584 speed value: 3.93259621136 1.96629810568 y gmm 0.602180857689
 
285.0 (275.0 - 295.0) degree
data size: 880 weight 0.03135466400627093
GMM Weibull
R square 0.902181755341 0.909764746177
max diff: 0.118163087528 0.0669110313499 speed value: 3.82318511089 3.82318511089 y gmm 0.621609639745
 
305.0 (295.0 - 315.0) degree
data size: 1802 weight 0.06420580061284116
GMM Weibull
R square 0.951972400081 0.937556152654
max diff: 0.0612783191275 0.134287486349 speed value: 3.85128547643 3.85128547643 y gmm 0.561918129263
 
325.0 (315.0 - 335.0) degree
data size: 3022 weight 0.10767476662153495
GMM Weibull
R square 0.9622180807 0.922702889326
max diff: 0.0386709409176 0.0861840862144 speed value: 4.95907033239 4.95907033239 y gmm 0.76940318218
 
345.0 (335.0 - 355.0) degree
data size: 2033 weight 0.07243639991448728
GMM Weibull
R square 0.924250753951 0.895145752095
max diff: 0.0909625359314 0.051278041528 speed value: 3.74630857337 3.74630857337 y gmm 0.515530331752
 
Wall time: 53.8 s
In [68]:
diff_df = pd.DataFrame(max_diff_array,columns=['datasize','direction', 'gmm', 'speed_gmm',
                                               'weibull', 'speed_weibull', 'r_square_gmm', 'r_square_weibull'])  

gmm_mean, weibull_mean = plot_sectoral_comparison(diff_df.r_square_gmm, diff_df.r_square_weibull, diff_df.direction, diff_df.datasize)
plt_configure(ylabel="$\ R^2$", xlabel='Direction'+dir_unit_text)
ylim = min(plt.gca().get_ylim()[0],0.75)
plt.gca().set_ylim(top=1, bottom=ylim)
plt.tight_layout()
print(gmm_mean, weibull_mean)
0.8934799879528382 0.9154792615039528
In [69]:
gmm_mean, weibull_mean = plot_sectoral_comparison(diff_df.gmm, diff_df.weibull, diff_df.direction, diff_df.datasize)
plt_configure(ylabel="K-S", xlabel='Direction'+dir_unit_text)
ylim = max(plt.gca().get_ylim()[1],0.25)
plt.gca().set_ylim(top=ylim, bottom=0)
plt.tight_layout()
print(gmm_mean, weibull_mean)
0.07497111119903163 0.058622503211435084
In [70]:
# Compare direction weight with previous figure
display(dir_fig)

6.4 Insufficient-fit Sector Investigation

6.4.1 Data Variability, by Bootstrap (Resampling)

In [71]:
max_diff_element = max(max_diff_array, key=lambda x: x[2])
angle =  max_diff_angle = max_diff_element[1]
incre = rebinned_angle
In [72]:
FRACTION = 1

# Select data from observation
start_angle, end_angle = angle-incre/2, angle+incre/2
angle_radian, incre_radian = radians(angle), radians(incre)  
sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)
In [73]:
x = arange(0, sub_max_speed, 0.5)
_, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(sub_df.speed, x)
_, y_gmm, y_cdf_gmm, direction_prob = gmm_integration_in_direction(f, angle_radian-incre_radian/2, angle_radian+incre_radian/2, x)

fig = plt.figure(figsize=(10,1.9))
ax1 = fig.add_subplot(1,3,1)   
ax2 = fig.add_subplot(1,3,2)   
ax3 = fig.add_subplot(1,3,3)   

# 1. Data
bins=arange(0, sub_max_speed)
sub_df['speed'].hist(ax=ax1, bins=bins, alpha=0.5, label='Data', normed=True)  

# 2. GMM
ax1.plot(x, y_gmm,'-', color='black', label='GMM')
ax2.plot(x, y_cdf_gmm,'-', color = 'black', label='GMM')
ax3.plot(log(x), log(-log(1-y_cdf_gmm)),'-', color = 'black',label='GMM')

# 3. Weibull
ax1.plot(x, y_weibull,'--',color='black',label='Weibull')
ax2.plot(x, y_cdf_weibull,'--',label='Weibull')
ax3.plot(log(x), log(-log(1-y_cdf_weibull)),'--',label='Weibull')

# 4. Data Resampled
count_collection = []
for i in range(1,100):
    sub_df_resampled = sub_df.sample(frac=FRACTION, replace=True)    
    resampled_count, _ = np.histogram(sub_df_resampled['speed'], bins=bins, normed=True) 
    count_collection.append(resampled_count)
    
    ecdf = sm.distributions.ECDF(sub_df_resampled.speed)
    y_ecdf = ecdf(x) 
    ax2.plot(x, y_ecdf,':', label='Data Resampled')
    ax3.plot(log(x), log(-log(1-y_ecdf)),':', label='Data Resampled')
    if i == 1: 
#         plt_configure(ax=ax2, xlabel = "$V$", ylabel='$P$', legend={'loc':'best'})
#         plt_configure(ax=ax3, xlabel="ln($V$)", ylabel="ln(-ln(1-$P$)",legend={'loc':'best'})
        plt_configure(ax=ax2, xlabel = "V", ylabel='P', legend={'loc':'best'})
        plt_configure(ax=ax3, xlabel="ln(V)", ylabel="ln(-ln(1-P)",legend={'loc':'best'})

print('%s (%s - %s) Degree Speed Distribution' % (angle, start_angle, end_angle))
count_collection = np.array(count_collection)
mx, mn = np.max(count_collection,0), np.min(count_collection,0)
ax1.plot(bins[1:]-0.5, mx, ':', color='blue')
ax1.plot(bins[1:]-0.5, mn, ':', color='blue', label='Resample limit')
ax1.set_ylim(bottom = 0)
# plt_configure(ax=ax1, xlabel='$V$',ylabel='Frequency',legend={'loc':'best'})
plt_configure(ax=ax1, xlabel='V', ylabel='Frequency',legend={'loc':'best'})
ax1.locator_params(axis='y', nbins=5)
ax2.locator_params(axis='y', nbins=5)
ax3.locator_params(axis='y', nbins=5)
plt.tight_layout()
diff = abs(y_ecdf - y_cdf_gmm)
print(diff.max(), x[diff.argmax()], y_cdf_gmm[diff.argmax()])
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:17: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:22: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:34: RuntimeWarning: divide by zero encountered in log
85.0 (75.0 - 95.0) Degree Speed Distribution
0.182819388482 5.0 0.640493494954

6.4.2 Time Variability

In [74]:
fig_time_variability_3d = plt.figure()
ax1 = fig_time_variability_3d.gca(projection='3d')

fig_time_variability_cdf,ax2 = plt.subplots(figsize=(3,1.8))
fig_time_variability_weibull, ax3 = plt.subplots(figsize=(3,1.8))

ax2.plot(x, y_cdf_gmm,'-', color='black', label = 'GMM')
ax2.plot(x, y_cdf_weibull,'--', label='Weibull')

ax3.plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black',label='GMM')
ax3.plot(log(x), log(-log(1-y_cdf_weibull)), '--', label='Weibull')

# 3. Data
prop_cycle=iter(mpl.rcParams['axes.color_cycle'])
for start_time in range(20000000, 20150000, 50000):
    end_time = start_time + 50000 
    time_label = start_time//10000
    df_other_years = df_all_years.query('(date >= @start_time) & (date < @end_time)')
    df_other_years_at_angle, sub_max_speed_other_year = select_df_by_angle(df_other_years, start_angle, end_angle)
    if len(df_other_years_at_angle) > 0 :
        
        ecdf = sm.distributions.ECDF(df_other_years_at_angle.speed)
        y_ecdf = ecdf(x)
        ax2.plot(x, y_ecdf,':', label = time_label)
        ax3.plot(log(x), log(-log(1-y_ecdf)),':', label = time_label)
        
        title = '%s - %s' %(time_label, time_label+4)
        count, division = np.histogram(df_other_years_at_angle['speed'], density=True,
                                       bins=arange(0, sub_max_speed_other_year))
        ax1.bar(left=division[:-1], height=count, zs=time_label, zdir='x', 
                color=next(prop_cycle), alpha=0.8)
        x_3d = time_label*np.ones_like(x)
        ax1.plot(x_3d, x, y_gmm, '-', color='black', label='GMM'  if time_label == 2010 else '')
        ax1.plot(x_3d, x, y_weibull, '--', color='blue', label='Weibull' if time_label == 2010 else '')
        
print('%s (%s - %s) Degree Speed Distribution' % (angle, start_angle, end_angle))
ax1.set_ylim(bottom = 0)
ax1.set_zlabel('Frequency')
plt_configure(ax=ax1, xlabel='Time',ylabel='V', legend=True)
# plt_configure(ax=ax2, xlabel = "$V$", ylabel='$P$', legend={'loc':'best'})
# plt_configure(ax=ax3, xlabel="ln($V$)", ylabel="ln(-ln(1-$P$))", legend={'loc':'best'})
plt_configure(ax=ax2, xlabel = "V", ylabel='P', legend={'loc':'best'})
plt_configure(ax=ax3, xlabel="ln(V)", ylabel="ln(-ln(1-P))", legend={'loc':'best'})

ax1.set_zlim(bottom = 0)
align_figures()
85.0 (75.0 - 95.0) Degree Speed Distribution
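
The five-year windows above rely on date being stored as an integer in YYYYMMDD form, so a step of 50000 moves the window boundary by five years, and @start_time / @end_time in the query string refer to the local Python variables. A small self-contained illustration (the toy DataFrame is not notebook data):

import pandas as pd

df_demo = pd.DataFrame({'date': [20010315, 20070620, 20121101],
                        'speed': [3.1, 5.4, 2.2]})
start_time, end_time = 20050000, 20100000        # roughly 2005-01-01 up to 2010-01-01
print(df_demo.query('(date >= @start_time) & (date < @end_time)'))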

6.4.3 Adjacent Sector Variability

In [75]:
incre = rebinned_angle
angle_group = [max_diff_angle-incre, max_diff_angle, max_diff_angle+incre]  # the sector of interest and its two adjacent sectors
In [76]:
fig_adjecent_variability_3d = plt.figure()
ax1 = fig_adjecent_variability_3d.add_subplot(projection='3d')  # Figure.gca(projection=...) is no longer supported
fig_adjecent_variability_cdf, ax2 = plt.subplots(figsize=(3,1.8))
fig_adjecent_variability_weibull, ax3 = plt.subplots(figsize=(3,1.8))

legend_3d = False
prop_cycle = iter([c['color'] for c in mpl.rcParams['axes.prop_cycle']])  # 'axes.color_cycle' is deprecated

curve_df = pd.DataFrame(curve_collection)

for angle in angle_group:
    curves = curve_df.query('angle == @angle%360').T.to_dict()  # %360 wraps sectors that fall outside [0, 360)
    curves = curves[list(curves)[0]]  # the single matching row, as a plain dict
    data_size, x =  curves['data_size'], curves['x']
    y_gmm, y_cdf_gmm =  curves['gmm_pdf'], curves['gmm_cdf'] 
    y_weibull, y_cdf_weibull, y_cdf = curves['weibull_pdf'],  curves['weibull_cdf'], curves['ecdf']

    linestyle = '-' if angle == max_diff_angle else ':'
    alpha = 0.7 if angle == max_diff_angle else 0.3

    ax2.plot(x, y_gmm*data_size, linestyle, label=angle)        
    ax3.plot(x, y_weibull*data_size, linestyle, label=angle)

    start_angle, end_angle = angle-incre/2, angle+incre/2
    sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)

    x_3d = angle*np.ones_like(x)
    ax1.plot(x_3d, x, y_gmm*data_size, color='black', label='GMM')
    ax1.plot(x_3d, x, y_weibull*data_size, color='blue', linestyle='--',label='Weibull')

    count, division = np.histogram(sub_df['speed'], bins=arange(0, sub_max_speed))
    ax1.bar(left=division[:-1], height=count, zs=angle, zdir='x', color=next(prop_cycle), alpha=0.8)

    if legend_3d == False:
        ax1.legend()
        legend_3d = True
        
plt_configure(ax=ax1, xlabel='Direction', ylabel='Speed')   
plt_configure(ax=ax2, xlabel='V',ylabel='Frequency',legend={'loc':'best'})
plt_configure(ax=ax3, xlabel='V',ylabel='Frequency',legend={'loc':'best'})
ax1.set_zlabel('Frequency')
ax1.set_zlim(bottom = 0)
ylim = max(ax2.get_ylim()[1], ax3.get_ylim()[1])  # give both 2D panels the same frequency scale
ax2.set_ylim(bottom = 0, top=ylim)
ax3.set_ylim(bottom = 0, top=ylim)

print(max_diff_angle) 
print('GMM, Weibull, Histogram')
align_figures()
85.0
GMM, Weibull, Histogram

7. Result Variability & Cross-Validation

In [77]:
if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWDITH    
if 'FIT_METHOD' not in globals():
    FIT_METHOD = 'square_error'       
if 'KDE_KERNEL' not in globals():
    KDE_KERNEL = 'gaussian'
    
config = {'bandwidth': bandwidth, 
          'fitting_range': FITTING_RANGE,
          'fit_limit': fit_limit,
          'kde_kernel': KDE_KERNEL}

print(bandwidth, FIT_METHOD)
0.7 square_error

7.1 Variability of the Result

In [78]:
%%time
results = Parallel(n_jobs=-1)(delayed(resampled_fitting)(df, FIT_METHOD, NUMBER_OF_GAUSSIAN, config) for i in range(10))                        
for result in results:
    display(pretty_print_gmm(result['gmm']))
    fig,ax = plt.subplots(figsize=(3.5,3.5))
    plot_gmm_ellipses(result['gmm'],ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
    plt.show()
    
    display(gof_df(result['gmm_pdf_result'], result['kde_result']))
    display(gof_df(result['gmm_pdf_result'], kde_result))
    print('')
weight mean_x mean_y sig_x sig_y corr
1 0.691 -1.913 0.336 4.954 3.581 0.083
2 0.159 2.780 -1.967 1.278 1.329 -0.071
3 0.150 1.206 -0.168 2.115 1.755 0.387
GMM Plot Result
0.69137052063 [[-1.91298226  0.33648134]] [ 3.55492455  4.97233077] -82.9388753409
0.158931991777 [[ 2.77996832 -1.96708623]] [ 1.25006013  1.35608387] -149.381619661
0.149697487593 [[ 1.20584246 -0.16764222]] [ 1.47656805  2.31831812] -57.9145175723
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.983 0.020 0.044 1.452490e-07 0.020 0.246
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.983 0.018 0.042 1.473480e-07 0.020 0.248

weight mean_x mean_y sig_x sig_y corr
1 0.771 -1.425 0.210 4.788 3.361 0.118
2 0.215 2.539 -1.583 1.428 1.571 -0.175
3 0.015 -0.007 -0.153 0.466 1.519 0.917
GMM Plot Result
0.770774833266 [[-1.42521621  0.21038266]] [ 3.31601787  4.81934537] -80.9451158036
0.21452921717 [[ 2.5387039  -1.58274106]] [ 1.34388437  1.64329598] -149.240743183
0.0146959495642 [[-0.00682602 -0.15263387]] [ 0.17851161  1.5785485 ] 164.079346947
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.987 0.025 0.077 1.079191e-07 0.017 0.212
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.987 0.024 0.083 1.057431e-07 0.017 0.210

weight mean_x mean_y sig_x sig_y corr
1 0.754 -1.547 0.219 4.833 3.394 0.137
2 0.223 2.535 -1.571 1.384 1.566 -0.128
3 0.022 -0.192 -0.196 0.608 0.622 0.007
GMM Plot Result
0.754418238477 [[-1.54681106  0.21860424]] [ 3.33261867  4.87530749] -79.6168978146
0.223198275012 [[ 2.53494386 -1.57089651]] [ 1.34089495  1.60279271] -157.058557609
0.0223834865117 [[-0.19229805 -0.1963364 ]] [ 0.60739584  0.62228173] 171.360495256
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.023 0.078 8.287897e-08 0.015 0.186
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.025 0.079 8.712190e-08 0.015 0.190

weight mean_x mean_y sig_x sig_y corr
1 0.552 -2.275 0.945 5.139 3.376 0.055
2 0.250 2.450 -1.592 1.583 1.630 -0.275
3 0.198 -0.759 -0.789 3.786 2.761 0.633
GMM Plot Result
0.552207363838 [[-2.2747951   0.94453528]] [ 3.36691567  5.14472759] -86.385709621
0.249940015747 [[ 2.44953365 -1.59177156]] [ 1.36630286  1.81481627] -138.021552808
0.197852620415 [[-0.75851572 -0.78874869]] [ 1.88765505  4.28922625] -58.444674436
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.981 0.017 0.037 1.600852e-07 0.020 0.258
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.982 0.017 0.033 1.526316e-07 0.020 0.252

weight mean_x mean_y sig_x sig_y corr
1 0.697 -1.897 0.435 4.976 3.535 0.088
2 0.199 2.722 -1.797 1.362 1.416 -0.086
3 0.105 0.493 -0.130 2.260 1.817 0.581
GMM Plot Result
0.696951364224 [[-1.89733324  0.43484936]] [ 3.50784616  4.9950252 ] -82.882703965
0.19854348956 [[ 2.72204852 -1.79661253]] [ 1.32235551  1.45352989] -147.275889541
0.104505146216 [[ 0.49252249 -0.12962736]] [ 1.286218    2.59914759] -55.3611295199
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.985 0.017 0.039 1.262270e-07 0.018 0.229
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.982 0.018 0.042 1.480819e-07 0.020 0.248

weight mean_x mean_y sig_x sig_y corr
1 0.553 -2.374 0.907 5.099 3.539 0.037
2 0.231 2.516 -1.688 1.516 1.582 -0.224
3 0.216 -0.220 -0.339 3.713 2.571 0.590
GMM Plot Result
0.553006348382 [[-2.37362169  0.90734554]] [ 3.53429223  5.10194466] -87.1924376865
0.230688214994 [[ 2.51568246 -1.68771328]] [ 1.36144236  1.7161455 ] -140.394521611
0.216305436624 [[-0.21987811 -0.33851051]] [ 1.87554186  4.10844323] -61.2472474233
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.981 0.016 0.034 1.567372e-07 0.020 0.255
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.982 0.017 0.030 1.482759e-07 0.020 0.248

weight mean_x mean_y sig_x sig_y corr
1 0.576 -2.172 0.925 5.062 3.338 0.064
2 0.239 2.511 -1.637 1.544 1.585 -0.219
3 0.185 -0.669 -0.815 3.652 2.823 0.668
GMM Plot Result
0.575665825148 [[-2.17181824  0.92498337]] [ 3.32640383  5.06976599] -85.7774057802
0.238866167676 [[ 2.51108965 -1.63741566]] [ 1.38164832  1.72856976] -138.411338562
0.185468007176 [[-0.66918412 -0.81458175]] [ 1.8059524   4.24772827] -55.6508093814
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.982 0.018 0.039 1.558605e-07 0.021 0.254
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.982 0.018 0.037 1.480144e-07 0.020 0.248

weight mean_x mean_y sig_x sig_y corr
1 0.618 -2.461 0.458 4.907 3.703 0.094
2 0.208 2.623 -1.751 1.442 1.465 -0.216
3 0.174 0.927 0.044 2.798 2.177 0.482
GMM Plot Result
0.617598564956 [[-2.4607273   0.45842767]] [ 3.66628441  4.93438568] -80.8872104063
0.207958209428 [[ 2.62321841 -1.75068357]] [ 1.2865313   1.60325147] -137.099752566
0.174443225617 [[ 0.92679388  0.04443883]] [ 1.72210418  3.09842956] -58.8827118042
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.983 0.019 0.036 1.423218e-07 0.019 0.244
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.982 0.017 0.034 1.503005e-07 0.020 0.250

weight mean_x mean_y sig_x sig_y corr
1 0.413 0.721 0.098 3.458 2.693 0.230
2 0.382 -4.646 0.587 4.486 4.078 0.144
3 0.205 2.494 -1.688 1.563 1.520 -0.300
GMM Plot Result
0.413012853431 [[ 0.72105451  0.09790008]] [ 2.53474412  3.57561092] -68.8457523351
0.382351752001 [[-4.64586335  0.58676693]] [ 3.90036964  4.64110331] -61.7754692803
0.204635394568 [[ 2.49428554 -1.68838647]] [ 1.28829582  1.75884791] -132.298384307
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.983 0.016 0.030 1.464372e-07 0.020 0.247
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.982 0.017 0.026 1.504285e-07 0.020 0.250

weight mean_x mean_y sig_x sig_y corr
1 0.404 -4.353 0.763 4.500 4.002 0.152
2 0.384 0.575 0.025 3.465 2.638 0.278
3 0.212 2.571 -1.719 1.498 1.510 -0.256
GMM Plot Result
0.404051618787 [[-4.35251887  0.76286715]] [ 3.83038741  4.64693699] -63.8535012465
0.383515344405 [[ 0.57521336  0.02454561]] [ 2.42848756  3.61498428] -67.3914856066
0.212433036808 [[ 2.5706995 -1.7186576]] [ 1.29730268  1.68560058] -135.910225126
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.983 0.016 0.026 1.486341e-07 0.020 0.248
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.983 0.018 0.029 1.463449e-07 0.020 0.247
Wall time: 18.3 s
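
resampled_fitting is one of the project's helpers and is not reproduced here. As a rough standalone illustration of the idea (bootstrap-resample the wind vectors, refit the mixture, and look at how much the fitted parameters move), the sketch below uses scikit-learn's EM fit on synthetic data rather than the notebook's square_error fitting method:

import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.RandomState(0)
xy = rng.multivariate_normal([0, 0], [[4, 1], [1, 3]], size=2000)    # stand-in for the (x, y) wind vectors

boot_weights = []
for _ in range(10):
    idx = rng.randint(0, len(xy), size=len(xy))                      # bootstrap resample with replacement
    gmm = GaussianMixture(n_components=3, covariance_type='full').fit(xy[idx])
    boot_weights.append(np.sort(gmm.weights_))                       # e.g. track the component weights
print(np.std(boot_weights, axis=0))                                  # spread across bootstrap refits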

7.2 Cross-Validation to Select the Number of Gaussian Components

In [79]:
# df = df_all_years.query('(date >= 20100000) & (date < 20150000)')
# df = df.query('(HrMn%400 == 0)')
In [80]:
%%time
from sklearn.model_selection import train_test_split, KFold  # sklearn.cross_validation was replaced by model_selection

## 4-fold cross-validation over candidate numbers of Gaussian components
gaussian_number_range = arange(1,6)
CV_result_train_all,CV_result_test_all =[],[]
number_of_fold = 4
print('Number of train/test dataset', len(df)*(number_of_fold-1)/number_of_fold, len(df)/number_of_fold) 

for number_of_gaussian in gaussian_number_range:
    print( '  ')
    print('Number of gaussian', number_of_gaussian)
    
    kf = KFold(n_splits=number_of_fold, shuffle=True)

    CV_result = Parallel(n_jobs=-1)(delayed(fit_per_fold)(df, train_index, test_index, FIT_METHOD, number_of_gaussian, config) for train_index, test_index in kf.split(df))

    CV_result_train, CV_result_test = list(zip(*CV_result))
    CV_result_train, CV_result_test = list(CV_result_train), list(CV_result_test)
        
    CV_result_train_all.append(CV_result_train)
    CV_result_test_all.append(CV_result_test)
    
    print('Train')
    pretty_pd_display(CV_result_train)
    print('Test')
    pretty_pd_display(CV_result_test)
Number of train/test dataset 21049.5 7016.5
  
Number of gaussian 1
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.265381 0.108183 0.000002 0.076498 0.930303 0.752522
1 0.271511 0.109214 0.000002 0.075540 0.949822 0.745532
2 0.259815 0.107343 0.000002 0.074970 0.920760 0.756905
3 0.266679 0.107844 0.000002 0.075247 0.933354 0.754030
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.281642 0.111782 0.000002 0.074315 0.961829 0.741995
1 0.264095 0.103723 0.000002 0.075341 0.890157 0.770350
2 0.302522 0.108734 0.000002 0.077602 0.976649 0.736717
3 0.283964 0.108973 0.000002 0.075091 0.929209 0.750392
  
Number of gaussian 2
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.093485 0.025495 1.904029e-07 0.022566 0.281410 0.977583
1 0.093327 0.025289 1.858053e-07 0.022539 0.278121 0.977985
2 0.093168 0.025208 1.882939e-07 0.022680 0.279999 0.977750
3 0.103914 0.023478 1.851965e-07 0.022378 0.277511 0.977994
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.109078 0.024248 1.983231e-07 0.023292 0.287468 0.976241
1 0.104267 0.022728 2.182793e-07 0.023501 0.301163 0.974358
2 0.140586 0.027707 2.061258e-07 0.023632 0.292590 0.975581
3 0.095697 0.026695 2.197475e-07 0.024149 0.302678 0.974404
  
Number of gaussian 3
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.049021 0.018897 1.372903e-07 0.019246 0.239079 0.983827
1 0.029695 0.017334 1.512606e-07 0.020295 0.250984 0.982238
2 0.028738 0.015156 1.420586e-07 0.019655 0.243099 0.983143
3 0.029703 0.014688 1.470808e-07 0.020164 0.247250 0.982452
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.061905 0.021988 1.953150e-07 0.023431 0.284849 0.976660
1 0.046087 0.015687 1.597586e-07 0.020906 0.257510 0.980726
2 0.042487 0.015647 1.835019e-07 0.022303 0.276430 0.978542
3 0.052393 0.018637 1.664804e-07 0.020488 0.263645 0.980837
  
Number of gaussian 4
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.018732 0.010832 1.254059e-07 0.018202 0.228522 0.985225
1 0.019183 0.010825 1.318249e-07 0.018870 0.234120 0.984437
2 0.021785 0.013834 1.211338e-07 0.017861 0.224571 0.985692
3 0.023578 0.013100 1.314486e-07 0.019476 0.233808 0.984334
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.028185 0.020733 1.688495e-07 0.022492 0.264762 0.979836
1 0.028705 0.016085 1.411449e-07 0.019312 0.242618 0.983238
2 0.040397 0.014818 1.818124e-07 0.021988 0.274825 0.978450
3 0.028356 0.023615 1.650184e-07 0.019565 0.262262 0.980978
  
Number of gaussian 5
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.013925 0.007528 2.707624e-08 0.008522 0.106156 0.996806
1 0.025893 0.012991 9.183516e-08 0.015546 0.195437 0.989269
2 0.019773 0.009528 4.734432e-08 0.011470 0.140337 0.994341
3 0.074982 0.012568 5.969539e-08 0.012861 0.157648 0.992908
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.025186 0.015270 6.595483e-08 0.013747 0.165607 0.992156
1 0.040379 0.014970 1.551327e-07 0.021166 0.254245 0.981035
2 0.030375 0.011237 7.875900e-08 0.014246 0.181113 0.990985
3 0.071792 0.028037 1.140744e-07 0.016487 0.217696 0.986737
Wall time: 1min 2s
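
fit_per_fold is likewise a project helper. A minimal standalone sketch of the same selection idea, 4-fold cross-validation over the number of components, but scoring each held-out fold by the mean log-likelihood of scikit-learn's EM fit instead of the notebook's square-error goodness-of-fit measures:

import numpy as np
from sklearn.model_selection import KFold
from sklearn.mixture import GaussianMixture

rng = np.random.RandomState(0)
xy = rng.multivariate_normal([0, 0], [[4, 1], [1, 3]], size=2000)    # stand-in for the (x, y) wind vectors

for n_components in range(1, 6):
    scores = []
    for train_idx, test_idx in KFold(n_splits=4, shuffle=True, random_state=0).split(xy):
        gmm = GaussianMixture(n_components=n_components).fit(xy[train_idx])
        scores.append(gmm.score(xy[test_idx]))                       # mean held-out log-likelihood
    print(n_components, round(np.mean(scores), 3))
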
In [81]:
train_scores_mean, train_scores_std = generate_mean_std_gof(CV_result_train_all)
print('Train gof mean, std')
display(train_scores_mean)

test_scores_mean, test_scores_std = generate_mean_std_gof(CV_result_test_all)
print('Test gof mean, std')
display(test_scores_mean)
Train gof mean, std
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
1 0.265846 0.108146 2.094744e-06 0.075563 0.933560 0.752247
2 0.095974 0.024868 1.874246e-07 0.022541 0.279260 0.977828
3 0.034289 0.016519 1.444226e-07 0.019840 0.245103 0.982915
4 0.020819 0.012148 1.274533e-07 0.018602 0.230255 0.984922
5 0.033643 0.010654 5.648778e-08 0.012100 0.149895 0.993331
Test gof mean, std
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
1 0.283056 0.108303 2.123634e-06 0.075588 0.939461 0.749864
2 0.112407 0.025345 2.106189e-07 0.023643 0.295975 0.975146
3 0.050718 0.017990 1.762640e-07 0.021782 0.270608 0.979191
4 0.031411 0.018813 1.642063e-07 0.020839 0.261117 0.980626
5 0.041933 0.017379 1.034802e-07 0.016411 0.204665 0.987728
In [82]:
prop_cycle = [c['color'] for c in mpl.rcParams['axes.prop_cycle']]  # 'axes.color_cycle' is deprecated
gaussian_number_range = train_scores_mean.index
for column, column_name in zip(['R_square','K_S','Chi_square'], [r"$\ R^2$", "K-S", r"$\widetilde{\chi^2} $"]):
    plot(gaussian_number_range, train_scores_mean[column],
             '--', label = 'training', color=prop_cycle[0])
    plt.fill_between(gaussian_number_range, 
                     train_scores_mean[column] - train_scores_std[column],
                     train_scores_mean[column] + train_scores_std[column], 
                     alpha=0.2, color=prop_cycle[0])
    
    plot(gaussian_number_range, test_scores_mean[column],
             '-', label = 'test',color=prop_cycle[1])
    plt.fill_between(gaussian_number_range, 
                 test_scores_mean[column] - test_scores_std[column],
                 test_scores_mean[column] + test_scores_std[column], 
                 alpha=0.2,color=prop_cycle[1])
    plt.xticks(gaussian_number_range)
    print(column)
    plt.locator_params(axis='y', nbins=5)
    plt_configure(xlabel='Number of Gaussian Distributions', ylabel=column_name, 
                  figsize=(3,2), legend={'loc':'best'})
    if column == 'R_square':
        plt.gca().set_ylim(top=1)
    if column == 'K_S' or column == 'Chi_square':
        plt.gca().set_ylim(bottom=0)
    plt.show()
R_square
K_S
Chi_square
In [83]:
# fig = plt.figure(figsize=(4.3,2.4))
fig = plt.figure(figsize=(5,2.5))
ax1 = fig.add_subplot(1,2,1) 
plot_2d_prob_density(X, Y, kde_Z, ax=ax1,
                     xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar=False)
ax1.grid(False)
ax2 = fig.add_subplot(1,2,2) 
plot_2d_prob_density(X, Y, pdf_Z, ax=ax2,
                     xlabel='x'+speed_unit_text, ylabel='', colorbar=False)
ax2.grid(False)
ax2.get_yaxis().set_visible(False)
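
plot_2d_prob_density and the kde_Z / pdf_Z grids are produced in earlier cells not shown in this section. Assuming they hold a kernel density estimate and the fitted GMM density evaluated on the same meshgrid, a standalone sketch of how such grids can be built (scipy's gaussian_kde with its default bandwidth stands in for the notebook's configured KDE kernel and bandwidth):

import numpy as np
from scipy.stats import gaussian_kde
from sklearn.mixture import GaussianMixture

rng = np.random.RandomState(0)
xy = rng.multivariate_normal([0, 0], [[4, 1], [1, 3]], size=2000)    # stand-in for the (x, y) wind vectors

X, Y = np.meshgrid(np.linspace(-10, 10, 100), np.linspace(-10, 10, 100))
grid = np.column_stack([X.ravel(), Y.ravel()])

kde_Z = gaussian_kde(xy.T)(grid.T).reshape(X.shape)                  # KDE density on the grid
gmm = GaussianMixture(n_components=3).fit(xy)
pdf_Z = np.exp(gmm.score_samples(grid)).reshape(X.shape)             # GMM density on the grid
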
In [ ]:
for fig in [fig_hist, fig_kde, fig_em, fig_gmm]:
    display(fig)
for fig in [fig_time_variability_3d, fig_time_variability_cdf, fig_time_variability_weibull, 
            fig_adjecent_variability_3d, fig_adjecent_variability_cdf, fig_adjecent_variability_weibull,]:
    display(fig)
In [ ]:
import time
save_notebook()
time.sleep(3)
location_name = get_location_name(file_path)
print(location_name)
current_file = 'GMM.ipynb'
output_file = './output_HTML/'+location_name+'.html' 

output_HTML(current_file, output_file)