Modeling the Joint Distribution of Wind Speed and Direction using Gaussian Mixture Models

OEN Method: Harris & Cook, "The parent wind speed distribution: Why Weibull?", http://www.sciencedirect.com/science/article/pii/S0167610514001056

Gaussian Mixture Models, http://scikit-learn.org/stable/modules/mixture.html

1. Set up

1.1 Environment

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

from import_file import *
from helpers.parallel_helper import *
load_libs()

plt.rcParams['axes.autolimit_mode'] = 'round_numbers'
plt.rcParams['axes.xmargin'] = 0.
plt.rcParams['axes.ymargin'] = 0.
mpl.rcParams['patch.force_edgecolor'] = True

1.2 Read Data

In [2]:
# file_path= './data/NCDC/jerez/dat.txt' # time shift
# file_path= './data/NCDC/almeria/dat.txt'

# Greece
# file_path= './data/NCDC/eleftherios_intl/dat.txt'
# file_path= './data/NCDC/elefsis/dat.txt' # bad dataset
# file_path= './data/NCDC/malaga/dat.txt'
# file_path= './data/NCDC/gibraltar/dat.txt' # bad fit

# Turkey
# file_path= './data/NCDC/turkey/konya/dat.txt' 
# file_path= './data/NCDC/turkey/sivas/dat.txt' # bad dataset
# file_path= './data/NCDC/turkey/balikesir/dat.txt' # bad dataset
# file_path= './data/NCDC/turkey/bartin/dat.txt' # bad dataset

# Iran
# file_path= './data/NCDC/iran/chahbahar/dat.txt'
# file_path= './data/NCDC/iran/zabol/dat.txt' # Problematic data
# file_path= './data/NCDC/iran/torbat_heydarieh/dat.txt' # Unusable

# UAE
# file_path= './data/NCDC/al_maktoum/dat.txt' 
# file_path= './data/NCDC/abu_dhabi_intl/dat.txt' # Time shift
# file_path= './data/NCDC/bateen/dat.txt' # Time shift
# file_path= './data/NCDC/buraimi/dat.txt' # not good dataset

# file_path= './data/NCDC/uk/marham/dat.txt' 
# file_path= './data/NCDC/uk/tiree/dat.txt'  # try 4
# file_path= './data/NCDC/uk/boscombe_down/dat.txt' # 4?, numpy bug
# file_path= './data/NCDC/uk/middle_wallop/dat.txt' 
# file_path= './data/NCDC/uk/southhamption/dat.txt' # high 0, trend
# file_path= './data/NCDC/uk/bournemouth/dat.txt' # 4?
# file_path= "./data/NCDC/uk/weybourne/dat.txt"
# file_path= "./data/NCDC/uk/skye_lusa/dat.txt" # 
# file_path= "./data/NCDC/uk/wattisham/dat.txt"
# file_path= "./data/NCDC/uk/south_uist_range/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/uk/holbeach/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/uk/cambridge/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/us/baltimore/dat.txt" # time too short
# file_path= "./data/NCDC/uk/bealach_na_ba/dat.txt" # time too short
# file_path= "./data/NCDC/uk/benbecula/dat.txt" # truncate (untruncate in m/s), 4?

# file_path, NUMBER_OF_GAUSSIAN = "./data/NCDC/europe/landsberg_lech/dat.txt", 4 # very good, can try 4
file_path= './data/NCDC/europe/pau_pyrenees/dat.txt' # unit shift, 2; force using knot 
# file_path= "./data/NCDC/europe/vatry/dat.txt"  # double peak, initial speed (should be good with m/s), mixed report type
# file_path= "./data/NCDC/europe/neuburg/dat.txt"
# file_path= "./data/NCDC/europe/valladolid/dat.txt"
# file_path= "./data/NCDC/europe/laupheim/dat.txt" # double peak, 4; very good, trend
# file_path= "./data/NCDC/europe/avord/dat.txt" # try 4, initial speed (should be good with m/s)
# file_path= './data/NCDC/europe/ciampino/dat.txt' # try 4, bandwidth?
# file_path= "./data/NCDC/europe/holzdorf/dat.txt" # 2008 year
# file_path= "./data/NCDC/europe/huspel_aws/dat.txt"  # integer, 4?
# file_path= "./data/NCDC/europe/barayas/dat.txt" # numpy problem
# file_path= './data/NCDC/europe/tenerife_sur/dat.txt'  # some directions are blocked
# file_path= './data/NCDC/europe/nantes/dat.txt' # some dir R square / K-S differ a lot, unit detection fails

# file_path= "./data/NCDC/cn/shanghai/hongqiao_intl/dat.txt" # care for the sampling time
# file_path= "./data/NCDC/cn/shanghai/pudong/dat.txt"
# file_path= "./data/NCDC/cn/hefei_luogang/dat.txt" # few 0, trend, try 2
# file_path= "./data/NCDC/cn/nanjing_lukou/dat.txt" 
# file_path= "./data/NCDC/cn/zhengzhou_xinzheng/dat.txt" 
# file_path= "./data/NCDC/cn/tianjin/binhai/dat.txt" # few 0, trend, stationary speed, unstationary direction
# file_path= "./data/NCDC/cn/tianjin/tianjing/dat.txt" # 16 sectors
# file_path= "./data/NCDC/cn/shijiazhuang_zhengding/dat.txt" 
# file_path= "./data/NCDC/cn/henan_gushi/dat.txt" # 16 sectors, fit not very good
# file_path= "./data/NCDC/cn/nanning_wuxu/dat.txt" # numpy priblem, unstationary speed
# file_path= './data/NCDC/cn/macau/dat.txt'  
# file_path= "./data/NCDC/cn/hk_intl/dat.txt" # few 0

# file_path= './data/NCDC/southeast_asia/malaysia/mersing/dat.txt' # 2 mode, paper comparison
# file_path= './data/NCDC/southeast_asia/malaysia/penang/dat.txt'
# file_path= './data/NCDC/southeast_asia/malaysia/butterworth/dat.txt' # 2 mode 
# file_path= "./data/NCDC/southeast_asia/malaysia/bsultan_mahmud/dat.txt" # stable
# file_path= "./data/NCDC/southeast_asia/malaysia/bsultan_ismail/dat.txt" # 
# file_path= "./data/NCDC/southeast_asia/singapore/changi/dat.txt" # trend, no 0, questionary data
# file_path= "./data/NCDC/southeast_asia/singapore/paya_lebar/dat.txt" # questionary data
# file_path= "./data/NCDC/southeast_asia/singapore/seletar/dat.txt"
# file_path= "./data/NCDC/east_asia/cheongju_intl/dat.txt" # 2005-2009  may have problem, fit is good; numpy problem
# file_path= "./data/NCDC/east_asia/daegu_ab/dat.txt" # recent 5 year may have problem, but fit is generally good; numpy problem

# file_path= "./data/NCDC/oceania/auckland_intl/dat.txt"  # Good data, Weird KDE shape, might be blocked?
# file_path= "./data/NCDC/oceania/brisbane_archerfield/dat.txt" # high 0, few data 
# file_path= "./data/NCDC/oceania/narrandera/dat.txt" # high 0, few data
# file_path= "./data/NCDC/oceania/canberra/dat.txt" # high 0, numpy problem

# file_path= './data/NCDC/us/boston_16nm/dat.txt' # Offshore

# file_path = './data/asos/denver/hr_avg.csv'
# file_path = './data/asos/bismarck_ND/hr_avg.csv' # try 4
# file_path = './data/asos/aberdeen_SD/hr_avg.csv' # only to 2012, good fit, try 2
# file_path = './data/asos/minneapolis/hr_avg.csv'
# file_path = './data/asos/lincoln_NE/hr_avg.csv' 
# file_path = './data/asos/des_moines_IA/hr_avg.csv'
# file_path = './data/asos/springfield_IL/hr_avg.csv' # good fit
# file_path = './data/asos/topeka/hr_avg.csv' # High 0

# file_path = './data/NDAWN/baker/hr_avg.csv' # 4 might be better
# file_path = './data/NDAWN/dickinson/hr_avg.csv'
# file_path = './data/NDAWN/rugby/hr_avg.csv'
# file_path = './data/NDAWN/bowman/hr_avg.csv'
# file_path = './data/NDAWN/grand_forks/hr_avg.csv'
# file_path = './data/NDAWN/williston/hr_avg.csv'
# file_path = './data/NDAWN/jamestown/hr_avg.csv'
In [3]:
if "cn_database" in file_path: 
    df = read_cn_database(file_path)
elif 'NCDC' in file_path:
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df.rename(columns={'Date':'date','Dir':'dir','Spd':'speed','Type':'type','I.1':'wind_type'}, inplace=True)
    df = df[['date','HrMn','type','dir','speed','wind_type' ]]
    df.dropna(subset=['dir','speed'], inplace=True)
    integer_data = True
elif 'NDAWN' in file_path:
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df['type']='default'
    df['wind_type']='default'
    df = df.dropna()
    integer_data = False
    knot_unit = False
else:
    # ASOS
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df['type']='default'
    df['wind_type']='default'
    df = df.dropna()
    integer_data = False
    knot_unit = True
In [4]:
df
Out[4]:
date HrMn type dir speed wind_type
0 19790101 0000 FM-12 260 6.2 N
1 19790101 0100 FM-15 260 5.1 N
2 19790101 0200 FM-15 260 7.2 N
3 19790101 0300 FM-12 260 10.3 N
4 19790101 0600 FM-12 280 10.3 N
5 19790101 0700 FM-15 280 7.2 N
6 19790101 0800 FM-15 280 6.1 N
7 19790101 0900 FM-12 360 5.1 N
8 19790101 1000 FM-15 360 3.6 N
9 19790101 1100 FM-15 60 1.0 N
10 19790101 1200 FM-12 999 0.0 C
11 19790101 1300 FM-15 20 1.0 N
12 19790101 1400 FM-15 999 0.0 C
13 19790101 1500 FM-12 180 1.0 N
14 19790101 1600 FM-15 999 0.0 C
15 19790101 1700 FM-15 140 2.0 N
16 19790101 1800 FM-12 140 2.6 N
17 19790101 1900 FM-15 120 2.0 N
18 19790101 2000 FM-15 140 1.5 N
19 19790101 2100 FM-12 80 1.0 N
20 19790102 0100 FM-15 60 3.6 N
21 19790102 0200 FM-15 40 4.1 N
22 19790102 0300 FM-12 40 4.1 N
23 19790102 0500 FM-15 60 2.5 N
24 19790102 0600 FM-12 100 2.6 N
25 19790102 0800 FM-15 80 2.5 N
26 19790102 0900 FM-12 60 3.1 N
27 19790102 1000 FM-15 80 5.1 N
28 19790102 1200 FM-12 80 3.1 N
29 19790102 1300 FM-15 80 4.1 N
... ... ... ... ... ... ...
366609 20170401 0900 FM-15 310 4.1 N
366610 20170401 0930 FM-15 999 1.5 V
366611 20170401 1000 FM-15 230 3.6 V
366612 20170401 1030 FM-15 260 6.7 V
366613 20170401 1100 FM-12 270 6.7 N
366614 20170401 1130 FM-15 260 6.2 N
366615 20170401 1200 FM-15 260 4.1 N
366616 20170401 1230 FM-15 250 5.7 V
366617 20170401 1300 FM-15 270 8.2 V
366618 20170401 1330 FM-15 280 9.3 N
366619 20170401 1400 FM-15 270 8.2 N
366620 20170401 1430 FM-15 250 7.7 N
366621 20170401 1500 FM-15 250 5.1 V
366622 20170401 1530 FM-15 230 4.1 V
366623 20170401 1600 FM-15 220 3.6 V
366624 20170401 1630 FM-15 230 4.1 V
366625 20170401 1700 FM-15 270 6.2 N
366626 20170401 1730 FM-15 250 6.2 N
366627 20170401 1800 FM-15 250 5.1 N
366628 20170401 1830 FM-15 270 5.1 N
366629 20170401 1900 FM-12 260 5.1 N
366630 20170401 1930 FM-15 250 5.1 N
366631 20170401 2000 FM-15 260 5.1 N
366632 20170401 2030 FM-15 260 4.6 N
366633 20170401 2100 FM-15 260 3.6 V
366634 20170401 2130 FM-15 260 4.6 N
366635 20170401 2200 FM-15 250 4.1 N
366636 20170401 2230 FM-15 240 3.1 N
366637 20170401 2300 FM-15 230 3.1 V
366638 20170401 2330 FM-15 250 3.6 V

366639 rows × 6 columns

In [5]:
df['time']=pd.to_datetime(df["date"].astype(str).map(str) + df["HrMn"], format='%Y%m%d%H%M')
df.set_index(['time'], inplace=True)
df['HrMn']=df['HrMn'].astype(int)
df = df.query("(dir <= 999) & (speed < 100) & \
              (date >= 19700000) & (date < 20170000) ")
In [6]:
plot_speed_and_angle_distribution(df.speed, df.dir)
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
In [7]:
# Dir [10,360]=> [0,350]
df['dir'] = df['dir'].apply(lambda x: x%360 if x < 999 else x) 
df['month'] = df['date']%10000//100
# Convert windrose coordinates to polar coordinates
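# Windrose/meteorological directions increase clockwise from north, while the polar
# angle used for (x, y) below increases counter-clockwise from east, hence (90 - dir) % 360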
df['dir_windrose'] = df['dir']
df['dir'] = df['dir'].apply(lambda x: (90 - x)%360 if x < 999 else x)
df.describe()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py: SettingWithCopyWarning (raised for each of the column assignments in this cell):
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
Out[7]:
date HrMn dir speed month dir_windrose
count 3.622460e+05 362246.00000 362246.000000 362246.000000 362246.000000 362246.000000
mean 2.001603e+07 1142.66531 287.071871 2.378766 6.545927 283.826955
std 1.113944e+05 678.73662 277.331580 1.792495 3.452641 276.351481
min 1.979010e+07 0.00000 0.000000 0.000000 1.000000 0.000000
25% 1.993013e+07 600.00000 140.000000 1.000000 4.000000 120.000000
50% 2.004122e+07 1130.00000 200.000000 2.100000 7.000000 230.000000
75% 2.011102e+07 1700.00000 320.000000 3.100000 10.000000 300.000000
max 2.016123e+07 2330.00000 999.000000 26.200000 12.000000 999.000000
In [8]:
df.plot(y='speed',legend=True,figsize=(20,5))
Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0xde94b38>

1.2.1 Unit Detection

In [9]:
df['decimal'] = df.speed % 1
df.decimal.hist(alpha=0.5, label='m/s', figsize=(4, 3))
if 'knot_unit' not in globals():
#     knot_unit = True if len(df.query('decimal >= 0.2')) / len(df) > 0.3 else False
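    # Assumed rationale: data originally logged in whole knots but stored in m/s leave
    # decimals well away from zero, so the original knot values can be recovered below
    # by multiplying by the 1.943845 knot-per-(m/s) factor and rounding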
    knot_unit=True
    if knot_unit:
        df['speed'] = df['speed'] * 1.943845
        df['decimal'] = df.speed % 1
        df.decimal.hist(alpha=0.5, label='knot')
        # needs more elaboration: some values are not near an integer
        df['speed'] = df['speed'].apply(lambda x: int(round(x)))
    plt_configure(xlabel='Decimal', ylabel='Frequency', legend={'loc': 'best'}, title='Decimal Distribution')
    
df.drop(['decimal'], 1,inplace=True)
print(knot_unit)
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py: SettingWithCopyWarning (raised for each of the column assignments in this cell):
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
True
In [10]:
dir_unit_text = ' (degree)'
if knot_unit == True:
    speed_unit_text = ' (knot)'
else: 
    speed_unit_text = ' (m/s)'

1.2.2 Sampling Type Selection

In [11]:
sample_type = df.query('date > 20000000')['type']
sample_type.value_counts().plot(
    kind = 'bar', title = 'Report Type Composition', figsize=(4,3))

report_type_most_used = sample_type.value_counts().argmax()
df = df.query("type==@report_type_most_used")

1.2.3 Sampling Time Selection

In [12]:
MID_YEAR = (min(df.date)//10000+max(df.date)//10000)//2

df['HrMn'].value_counts().sort_index().plot(kind='bar', alpha=0.5,label='Overall')
df.query('date > @MID_YEAR * 10000')['HrMn'].value_counts().sort_index().plot(
    kind='bar', alpha=0.5, label='> %s' %  MID_YEAR )

plt_configure(xlabel='Sampling Time', ylabel='Frequency', legend={'loc':'best'}, figsize=(8, 4), 
              title = 'Sampling Time Distribution, Overall and > %s ' %  MID_YEAR)
In [13]:
df['sample_time'] = df.HrMn % 100 
sample_time = df.query('date > 20000000')['sample_time']
sample_times = sample_time.value_counts()[sample_time.value_counts() > 2000]
sample_times = sample_times.index.tolist()
# df = df.query("sample_time in @sample_times")
df = df.query("sample_time == @sample_times[0]")
df.drop(['sample_time'], 1,inplace=True)
print(sample_times)

df['HrMn'].value_counts().sort_index().plot(kind='bar', alpha=0.5, figsize=(10, 4))
[0]
Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0xe91c5f8>

1.3 Data Wrangling

1.3.1 Artefacts

1.3.1.1 Wrong Direction Records

In [14]:
if integer_data:
    display(df.query("(dir % 10 >= 0.1) & (dir != 999)"))
    df = df.query('(dir % 10 <= 0.1) | (dir == 999)')
date HrMn type dir speed wind_type month dir_windrose
time

1.3.1.2 Sudden Increase in Speed

In [15]:
# sudden increase
df['incre'] = df.speed.diff(1)
df['incre'].fillna(0, inplace=True)
df['incre_reverse'] = df.speed.diff(-1)
df['incre_reverse'].fillna(0, inplace=True)

display(df.sort_values(by='speed',ascending=False).head(10))
df['incre'].plot(kind='hist', bins=arange(-15, 15), legend=True, figsize=(8, 3))
date HrMn type dir speed wind_type month dir_windrose incre incre_reverse
time
1999-12-27 18:00:00 19991227 1800 SY-MT 160 51 N 12 290 24.0 14.0
1984-11-28 00:00:00 19841128 0 SY-MT 350 40 N 11 100 36.0 38.0
1984-06-17 12:00:00 19840617 1200 SY-MT 70 40 N 6 20 36.0 36.0
2002-05-13 16:00:00 20020513 1600 SY-MT 170 37 N 5 280 27.0 20.0
1999-12-27 19:00:00 19991227 1900 SY-MT 180 37 N 12 270 -14.0 4.0
2007-02-14 11:00:00 20070214 1100 SY-MT 180 35 N 2 270 14.0 10.0
2009-01-24 08:00:00 20090124 800 SY-MT 180 35 N 1 270 2.0 2.0
2009-01-24 04:00:00 20090124 400 SY-MT 160 35 N 1 290 15.0 18.0
2009-01-24 10:00:00 20090124 1000 SY-MT 170 35 N 1 280 2.0 6.0
2012-12-21 14:00:00 20121221 1400 SY-MT 180 35 N 12 270 14.0 12.0
Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0xe8fcfd0>
In [16]:
incre_threshold = 20 if knot_unit else 10
print('sudden increase number', len(df.query('(incre > @incre_threshold )&(incre_reverse > @incre_threshold )')))
df = df.query('(incre < @incre_threshold )|(incre_reverse < @incre_threshold )')

# Check the max speed
display(df.sort_values(by='speed',ascending=False).head(10))
df.drop(['incre', 'incre_reverse'], 1, inplace=True)
sudden increase number 2
date HrMn type dir speed wind_type month dir_windrose incre incre_reverse
time
1999-12-27 18:00:00 19991227 1800 SY-MT 160 51 N 12 290 24.0 14.0
1999-12-27 19:00:00 19991227 1900 SY-MT 180 37 N 12 270 -14.0 4.0
2012-12-21 14:00:00 20121221 1400 SY-MT 180 35 N 12 270 14.0 12.0
2009-01-24 04:00:00 20090124 400 SY-MT 160 35 N 1 290 15.0 18.0
2009-01-24 08:00:00 20090124 800 SY-MT 180 35 N 1 270 2.0 2.0
2009-01-24 10:00:00 20090124 1000 SY-MT 170 35 N 1 280 2.0 6.0
2007-02-14 11:00:00 20070214 1100 SY-MT 180 35 N 2 270 14.0 10.0
2006-02-18 00:00:00 20060218 0 SY-MT 140 35 N 2 310 14.0 25.0
2009-01-24 07:00:00 20090124 700 SY-MT 190 33 N 1 260 2.0 -2.0
1996-02-07 18:00:00 19960207 1800 SY-MT 160 33 N 2 290 4.0 23.0

1.3.2 0 Speed

In [17]:
with_too_many_zero, null_wind_frequency = is_with_too_many_zero(df.query("(date >= 20050000)"))
delete_zero = with_too_many_zero
if delete_zero:
    df = df.query('(speed > 0)')
print(delete_zero, null_wind_frequency)
False 0.0572973129237

1.3.3 Direction Re-alignment and 999

For some datasets, the 16 sectors are not recorded properly,

e.g. the recorded sectors look like [0, 20, 30, 50], so the angles need to be redistributed into proper 22.5° sectors.
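
realign_direction and fill_direction_999 used in the next cell are project helpers that are not reproduced here; a minimal standalone sketch of the re-alignment idea, assuming each reported direction is simply snapped to the nearest multiple of the sector length:

import numpy as np

def realign_direction_sketch(directions_deg, sector_length=22.5):
    # snap each recorded direction to the nearest multiple of the sector length
    dirs = np.asarray(directions_deg, dtype=float)
    return (np.round(dirs / sector_length) * sector_length) % 360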

In [18]:
display(df['dir'].value_counts().sort_index())
effective_column = df.query('dir < 999')['dir'].value_counts()[df['dir'].value_counts() > 30].sort_index()
if integer_data:
    SECTOR_LENGTH = 360/len(effective_column) 
else: 
    SECTOR_LENGTH = 10
print(len(effective_column), SECTOR_LENGTH)
0      3789
10     3928
20     3016
30     3102
40     2448
50     2630
60     1922
70     2066
80     1730
90     2197
100    2083
110    2891
120    2725
130    3563
140    3421
150    4754
160    4626
170    6436
180    5347
190    6230
200    4789
210    5008
220    3309
230    2888
240    1958
250    1685
260    1284
270    1495
280    1389
290    2040
300    2566
310    4939
320    5425
330    7138
340    4891
350    4832
999    7612
Name: dir, dtype: int64
36 10.0
In [19]:
df.query('dir == 999')['speed'].value_counts()
Out[19]:
0    7611
2       1
Name: speed, dtype: int64
In [20]:
df=realign_direction(df, effective_column)
df=fill_direction_999(df, SECTOR_LENGTH)

1.4 Time Shift Comparison

In [21]:
DIR_REDISTRIBUTE = 'even'
if DIR_REDISTRIBUTE == 'even':
    DIR_BIN = arange(-5, 360, 10) 
elif DIR_REDISTRIBUTE == 'round_up':
    DIR_BIN = arange(0, 360+10, 10) 

# Comparison between mid_year, looking for: 
# 1. Odd Even Bias
# 2. Time Shift of Wind Speed Distribution
bins = arange(0, df.speed.max() + 1)
df.query('date < @MID_YEAR * 10000')['speed'].plot(
    kind='hist', alpha=0.5,bins=bins, label='< %s' % MID_YEAR)

df.query('date > @MID_YEAR * 10000')['speed'].plot(
    kind='hist', alpha=0.5,bins=bins, label='> %s' % MID_YEAR)

plt.suptitle('Speed Comparison between year < %s, > %s ' % (MID_YEAR, MID_YEAR), fontsize = 14)
plt_configure(xlabel='Speed', ylabel='Frequency', legend=True, figsize=(8, 3))
In [22]:
df.query('date < @MID_YEAR * 10000')['dir'].plot(
    kind='hist', alpha=0.5,bins=DIR_BIN, label='< %s' % MID_YEAR)

df.query('date > @MID_YEAR * 10000')['dir'].plot(
    kind='hist', alpha=0.5,bins=DIR_BIN, label='> %s' % MID_YEAR)

plt.suptitle('Dir Comparison between year < %s, and > %s ' % (MID_YEAR, MID_YEAR), fontsize = 14)
plt_configure(xlabel='Dir', ylabel='Frequency', legend={'loc':'best'}, figsize=(8, 3), tight='x')
In [23]:
# Inspect the time shift of speed and degree distribution, and odd-even bias
check_time_shift(df, speed_unit_text=speed_unit_text, dir_unit_text=dir_unit_text)
1979 - 1979
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
1980 - 1984
1985 - 1989
1990 - 1994
1995 - 1999
2000 - 2004
2005 - 2009
2010 - 2013
In [24]:
df.resample('A').mean().plot(y='speed')
plt.gca().set_ylim(bottom=0)
df.resample('M').mean().plot(y='speed', figsize=(20,4))
plt.gca().set_ylim(bottom=0)
Out[24]:
(0, 10.0)
In [25]:
display(df[df['dir'].isnull()])
df.dropna(subset=['dir'], inplace=True)
date HrMn type dir speed wind_type month dir_windrose
time
In [26]:
for column in ['speed', 'dir']:
    if column == 'speed':
        bins = arange(0, df[column].max()+1, 1)
    else:
        bins = arange(0, 361, 10)
    den, _ = np.histogram(df[column], bins=bins, density=True)
    y_top=max(den)*1.2
    for year in arange(1980, 2016):
        end_year = year
        sub_df = df[str(year):str(end_year)]
        if len(sub_df) > 5000:
            plt.figure()
            df[column].hist(bins=bins, alpha=0.3, normed=True)
            sub_df[column].hist(bins=bins, alpha=0.5, figsize=(3,1.5), normed=True)
            plt.gca().set_ylim(top=y_top)
            plt_configure(title=str(year))
    align_figures()
In [27]:
for column in ['speed', 'dir']:
    if column == 'speed':
        bins = arange(0, df[column].max()+1, 1)
    else:
        bins = arange(0, 361, 10)
    density_all, _ = np.histogram(df[column], bins=bins, density=True)
    df[column].hist(bins=bins, figsize=(5,3))

    R_squares = []
    years = []
    for year in arange(1980, 2016):
        start_year, end_year = year-1, year+1
        sub_df = df[str(start_year):str(end_year)]
        if len(sub_df) > 5000:
            density, _ = np.histogram(sub_df[column], bins=bins, density=True)
            y_mean = np.mean(density_all)
            SS_tot = np.sum(np.power(density_all - y_mean, 2))
            SS_res = np.sum(np.power(density_all - density, 2))

            R_square = 1 - SS_res / SS_tot
            R_squares.append(R_square)
            years.append(year)

    plt.figure()
    plot(years, R_squares)
    ylim = max(min(plt.gca().get_ylim()[0],0.85),0)
    plt.gca().set_ylim(bottom=ylim, top=1)
    plt_configure(figsize=(5,3))
    align_figures()

1.5 Re-distribute Direction and Speed (Optional)

e.g. Dir 50 -> 45 ~ 55, to make the KDE result better
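
randomize_angle and randomize_speed below are project helpers; a minimal sketch of the direction redistribution described above, assuming a uniform spread of each reported value across its sector:

import numpy as np

def randomize_angle_sketch(directions_deg, sector_length=10):
    # e.g. with 10-degree sectors a reported 50 is spread uniformly over [45, 55)
    dirs = np.asarray(directions_deg, dtype=float)
    jitter = np.random.uniform(-sector_length / 2, sector_length / 2, size=dirs.shape)
    return (dirs + jitter) % 360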

In [28]:
if integer_data:
    df = randomize_angle(df, DIR_REDISTRIBUTE, SECTOR_LENGTH)
In [29]:
if integer_data:
    if delete_zero:
        redistribute_method = 'down'
    else:
        redistribute_method = 'up'

    df, speed_redistribution_info = randomize_speed(df, redistribute_method)
Redistribute upward, e.g. 0 -> [0,1]

1.6 Generate (x,y) from (speed,dir)

In [30]:
# Cook orientation
# df['dir']= (df['dir'] + 180)%360
In [31]:
# There might be a small dot at the centre, due to the many zero/near-zero speed records collapsing onto the same point
# Scatter plots in matplotlib have performance issues; rendering is very slow
df['x'] = df['speed'] * cos(df['dir'] * pi / 180.0)
df['y'] = df['speed'] * sin(df['dir'] * pi / 180.0)

2. Re-select Data and Overview

2.1 Data Overview

In [32]:
## Summary of the data selection
print('Knot unit?', knot_unit)
print('Report type used:', report_type_most_used)
print('Sampling time used:', sample_times)
if 'speed_redistribution_info' in globals():
    print('Speed redistribution info:', speed_redistribution_info )

df_all_years = df # for later across-year comparison
df = df_all_years.query('(date >= 20100000) & (date < 20150000)')
# df = df.query('(HrMn == 0) and (speed >= 0.5) and (date%10000 > 900) and (date%10000 < 1000)' )
df.describe()
Knot unit? True
Report type used: SY-MT
Sampling time used: [0]
Speed redistribution info: Redistribute upward, e.g. 0 -> [0,1]
Out[32]:
date HrMn dir speed month dir_windrose x y
count 2.806600e+04 28066.000000 28066.000000 28066.000000 28066.000000 28066.000000 28066.000000 28066.000000
mean 2.011260e+07 1148.574788 190.745288 4.970712 6.138780 234.555013 -1.233491 -0.066619
std 9.704539e+03 696.268540 103.438455 3.369369 3.519564 214.959077 4.985859 3.110743
min 2.010010e+07 0.000000 -4.986196 0.000259 1.000000 0.000000 -35.059729 -21.181042
25% 2.010110e+07 500.000000 120.250914 2.801845 3.000000 110.000000 -4.190693 -2.186894
50% 2.011090e+07 1100.000000 189.209888 4.143457 6.000000 210.000000 -0.185727 -0.353921
75% 2.012070e+07 1800.000000 291.390825 6.319304 9.000000 280.000000 2.465441 1.904472
max 2.013043e+07 2300.000000 354.989938 35.164171 12.000000 999.000000 13.824255 19.257946
In [33]:
df.plot(y='speed',legend=True,figsize=(20,5))
Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0x15154860>
In [34]:
# Accumulation by month
df.resample('M').count().plot(y='date', kind='bar',figsize=(20,4))
Out[34]:
<matplotlib.axes._subplots.AxesSubplot at 0x14d92cc0>
In [35]:
# 90 degree is east
ax = WindroseAxes.from_ax()
viridis = plt.get_cmap('viridis')
ax.bar(df.dir_windrose, df.speed, normed=True, opening=0.8, edgecolor='white', nsector=36, cmap=viridis)
ax.set_legend()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)
In [36]:
if len(df) > 1000000:
    bins=arange(0,362)
    df['dir'].hist(bins=bins, normed=True,alpha=0.5,label='min')
    
    df = df_all_years.sample(n=500000, replace=True)    
    df['dir'].hist(bins=bins, normed=True,alpha=0.5,label='min resampled')
    plt_configure(legend=True, figsize=(20,4))
In [37]:
x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(df.speed)

# 1. Histogram comparison
fig = plt.figure()
df['speed'].hist(bins=arange(0, df.speed.max()), alpha=0.5, label='Data', normed=True)             
plot(x, y_weibull, '-', color='black',label='Weibull')   
plt_configure(figsize=(4,3),xlabel='V',ylabel='PDF', legend=True)

# 2. CDF comparison
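# On Weibull probability paper ln(-ln(1-P)) is plotted against ln(V):
# for F(v) = 1 - exp(-(v/c)^k) this gives k*ln(v) - k*ln(c), a straight line,
# so deviation from a line indicates departure from the Weibull model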
fig = plt.figure()
plot(log(x), log(-log(1-y_ecdf)),'o', label='ECDF')
plot(log(x), log(-log(1-y_cdf_weibull)),'-', label='Weibull')
plt_configure(xlabel="ln(V)", ylabel="ln(-ln(1-P)",legend={'loc':'best'}, figsize=(4,3))
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:11: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:12: RuntimeWarning: divide by zero encountered in log
In [38]:
df.plot(kind='scatter', x='x', y='y', alpha=0.05, s=2)
plt.gca().set_aspect('equal')
plt_configure(figsize=(3.2,3.2),xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)

2.2 Overview by Direction

In [39]:
if len(effective_column) == 16:
    rebinned_angle = 22.5
else: 
    rebinned_angle = 10
In [40]:
original_incre, incre = SECTOR_LENGTH, rebinned_angle
start, end = -original_incre/2 + incre/2, 360

max_speed = df.speed.max()
max_count = max_count_for_angles(df, start, end, incre)
plot_range = [0, max_speed, 0, max_count*1.05]

for angle in arange(start, end, incre):
    start_angle, end_angle = angle-incre/2, angle+incre/2
    sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)   
    
    fig = plt.figure()
    sub_df['speed'].hist(bins=arange(0, max_speed), alpha=0.5, label='Data')
    title ='%s (%s - %s), %s' % (angle, start_angle, end_angle, len(sub_df)) 
    plt.axis(plot_range)
    plt_configure(figsize=(3,1.5), title=title)
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)

2.3 Overview by Month

In [41]:
month_incre = 1
current_df = df.query('speed>=1')
for month in arange(1, 12+month_incre, month_incre): 
    end_month = month+month_incre
    sub_df = current_df.query('(month >= @month) and (month < @end_month)')
    if len(sub_df) > 0:
        if month_incre == 1:
            title = 'Month: %s' % (month)
        else:
            title = 'Month: %s - %s ' % (month, end_month-1)
        ax = WindroseAxes.from_ax()
        ax.bar(sub_df.dir_windrose, sub_df.speed, normed=True, opening=0.8, edgecolor='white', nsector=36, cmap=plt.get_cmap('viridis'))
        plt_configure(figsize=(3,3), title=title)
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

3. Create input data and configuration

In [42]:
SPEED_SET = array(list(zip(df.x, df.y)))
if 'NUMBER_OF_GAUSSIAN' not in globals():
    NUMBER_OF_GAUSSIAN = 3
FIT_METHOD = 'square_error'
DEFAULT_BANDWDITH = 1.5 if knot_unit else 0.7
fig_list = []
In [43]:
fit_limit = ceil(df['speed'].quantile(.95))
fitting_axis_range = arange(-fit_limit, fit_limit+1, 1)
print(fitting_axis_range)

FITTING_RANGE = []
for i in fitting_axis_range:
    for j in fitting_axis_range:
        FITTING_RANGE.append([i,j])
[-12 -11 -10  -9  -8  -7  -6  -5  -4  -3  -2  -1   0   1   2   3   4   5
   6   7   8   9  10  11  12]
In [44]:
plot_limit = ceil(df['speed'].quantile(.95))
PLOT_AXIS_RANGE = arange(-plot_limit, plot_limit+1, 1)

4. Kernel Density Estimation

In [45]:
sample = SPEED_SET
KDE_KERNEL = 'gaussian'
# KDE_KERNEL, bandwidth = 'tophat', 1
In [46]:
%%time
from sklearn.grid_search import GridSearchCV
# from sklearn.model_selection import GridSearchCV  ## too slow

# The selected bandwidth can sometimes be too extreme
if knot_unit:
    bandwidth_range = arange(0.7,2,0.2)
else:
    bandwidth_range = arange(0.4,1,0.1)

# Grid search cannot handle too much data (it would take a long time)
if len(sample) > 50000:    
    df_resample=df.sample(n=50000, replace=True)
    bandwidth_search_sample = array(list(zip(df_resample.x, df_resample.y)))
else:
    bandwidth_search_sample = sample

grid = GridSearchCV(neighbors.KernelDensity(kernel = KDE_KERNEL),
                    {'bandwidth': bandwidth_range}, n_jobs=-1, cv=4) 

grid.fit(bandwidth_search_sample)
bandwidth = grid.best_params_['bandwidth']
print(bandwidth)
D:\ProgramData\Anaconda3\lib\site-packages\sklearn\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
D:\ProgramData\Anaconda3\lib\site-packages\sklearn\grid_search.py:43: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. This module will be removed in 0.20.
  DeprecationWarning)
0.7
Wall time: 1min 21s
In [47]:
if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWDITH

kde = neighbors.KernelDensity(bandwidth=bandwidth, kernel = KDE_KERNEL).fit(sample)

points = FITTING_RANGE
# very slow if the dataset is too large, e.g. 100,000
# kde returns log prob, need to convert it
kde_result = exp(kde.score_samples(points))
print('bandwidth:', bandwidth, len(kde_result))
print(kde_result[:5])
bandwidth: 0.7 625
[  1.75323162e-06   3.07138842e-06   2.42519722e-05   6.02241347e-05
   1.15292813e-04]
In [48]:
# Plot jPDF
X = Y = PLOT_AXIS_RANGE
# Passing exp(kde.score_samples(...)) directly does not work; a callable (lambda) must be passed instead
# see http://stackoverflow.com/questions/21035437/passing-a-function-as-an-argument-in-python
kde_Z = generate_Z_from_X_Y(X,Y, lambda coords: exp(kde.score_samples(coords)))
colorbar_lim = 0, kde_Z.max()

plot_3d_prob_density(X,Y,kde_Z)

fig_kde,ax1 = plt.subplots(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, ax=ax1)

with sns.axes_style({'axes.grid' : False}):
    from matplotlib import ticker
    fig_hist,ax2 = plt.subplots(figsize=(3.5,2.5))
    _,_,_,image = ax2.hist2d(df.x, df.y, bins=PLOT_AXIS_RANGE, cmap='viridis',)
    ax2.set_aspect('equal')
    cb = plt.colorbar(image)
    tick_locator = ticker.MaxNLocator(nbins=6)
    cb.locator = tick_locator
    cb.update_ticks()
    plt_configure(ax=ax2, xlabel='x'+speed_unit_text,ylabel='y'+speed_unit_text)
align_figures()
In [49]:
kde_cdf = cdf_from_pdf(kde_result)
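
cdf_from_pdf is a project helper; a minimal sketch, assuming it simply accumulates the gridded PDF values and normalises so that the last value equals one (this discrete CDF is what a K-S style comparison can be based on):

import numpy as np

def cdf_from_pdf_sketch(pdf_values):
    # cumulative sum of the density evaluated on FITTING_RANGE, normalised to 1
    cdf = np.cumsum(np.asarray(pdf_values))
    return cdf / cdf[-1]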

5. GMM by Expectation-maximization

In [50]:
sample= SPEED_SET
clf = mixture.GaussianMixture(n_components=NUMBER_OF_GAUSSIAN, covariance_type='full')
clf.fit(sample)
print(clf.converged_)
True
In [51]:
gmm_em_result = read_gmm_em_result(clf)
pretty_print_gmm(gmm_em_result)
Out[51]:
weight mean_x mean_y sig_x sig_y corr
1 0.499 1.928 -0.748 2.232 2.162 -0.077
2 0.264 -3.994 -0.585 3.399 3.215 0.315
3 0.237 -4.814 1.945 6.259 3.761 0.037
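
read_gmm_em_result and pretty_print_gmm are project helpers; a minimal sketch, assuming the table above is built from the standard fitted attributes of sklearn's GaussianMixture (weights_, means_ and the full covariances_):

import numpy as np

def gmm_params_sketch(clf):
    # per-component weight, mean, marginal standard deviations and correlation
    rows = []
    for weight, mean, cov in zip(clf.weights_, clf.means_, clf.covariances_):
        sig_x, sig_y = np.sqrt(cov[0, 0]), np.sqrt(cov[1, 1])
        corr = cov[0, 1] / (sig_x * sig_y)
        rows.append([weight, mean[0], mean[1], sig_x, sig_y, corr])
    return rows
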
In [52]:
fig,ax = plt.subplots(figsize=(3.5,3.5))
plot_gmm_ellipses(gmm_em_result, ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
GMM Plot Result
0.49894287172 [[ 1.92804623 -0.74776635]] [ 2.10370223  2.28727599] -123.885295177
0.264029083066 [[-3.99359868 -0.58532644]] [ 2.72829546  3.80058901] -50.0121680074
0.237028045214 [[-4.8139851   1.94498738]] [ 3.75711339  6.26118233] -88.0266187571
In [53]:
X = Y = PLOT_AXIS_RANGE
pdf_Z = generate_Z_from_X_Y(X,Y, lambda coords: exp(clf.score_samples(coords)))

def residule_between_kde_and_gmm(points):
    kde_vals = exp(kde.score_samples(points))
    gmm_vals = exp(clf.score_samples(points))
    return kde_vals - gmm_vals 

residual_Z = generate_Z_from_X_Y(X,Y, residule_between_kde_and_gmm)

plot_3d_prob_density(X,Y,pdf_Z)
plot_3d_prob_density(X,Y,residual_Z)
align_figures()

fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar_lim=colorbar_lim)
fig_em = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,pdf_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar_lim=colorbar_lim)
fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,residual_Z,
                     xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
align_figures()

Goodness-of-fit Statistics

In [54]:
points = FITTING_RANGE
gmm_pdf_result = exp(clf.score_samples(points))
gof_df(gmm_pdf_result, kde_result)
Out[54]:
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.948 0.029 0.051 4.413006e-07 0.035 0.429
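
gof_df is a project helper whose implementation is not shown; a minimal sketch (an assumption, not the exact code) of how statistics of this kind can be formed, treating the KDE values on FITTING_RANGE as the reference; the K_S column is presumably the largest absolute difference between discrete CDFs built as in the cdf_from_pdf sketch above:

import numpy as np

def gof_sketch(model_pdf, kde_pdf):
    error = kde_pdf - model_pdf
    ss_res = np.sum(error ** 2)
    ss_tot = np.sum((kde_pdf - kde_pdf.mean()) ** 2)
    mse = np.mean(error ** 2)
    rmse = np.sqrt(mse)
    return {'R_square': 1 - ss_res / ss_tot,
            'MSE': mse,
            'RMSE / Max': rmse / kde_pdf.max(),
            'RMSE / Mean': rmse / kde_pdf.mean()}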

6. GMM by Optimization

In [55]:
sample = SPEED_SET
points = FITTING_RANGE
max_speed = df.speed.max()
print(FIT_METHOD)
square_error
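
GMM_fit_score is a project helper; with FIT_METHOD = 'square_error' a log-mean-squared-error objective is consistent with the numbers in this notebook (the optimum value of about -15.76 reported below equals ln of the MSE of 1.43e-07 reported in section 6.2). A minimal sketch of such a score under that assumption:

import numpy as np

def square_error_score_sketch(candidate_pdf_on_grid, kde_on_grid):
    # assumed form: log of the mean squared deviation from the KDE reference;
    # the optimiser would evaluate this after rebuilding the candidate mixture
    # density on FITTING_RANGE from the flat parameter array
    return np.log(np.mean((candidate_pdf_on_grid - kde_on_grid) ** 2))
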
In [56]:
# initial guess from the GMM-EM result
# GMM format: weight, meanx, meany, sigx, sigy, rho
x0 = gmm_em_result

cons = [
        # sum of every 6th element, which is the fraction of each gaussian
        {'type': 'eq', 'fun': lambda x: sum(x[::6]) - 1},
        # # limit the width/height ratio of the ellipse, optional
#         {'type': 'ineq', 'fun': lambda x: width_height_ratios_set(x) - 1/3},
#         {'type': 'ineq', 'fun': lambda x: 3 - width_height_ratios_set(x)},
]

bounds = [(0., 0.99),(-fit_limit, fit_limit),(-fit_limit, fit_limit),
          (0., fit_limit),(0., fit_limit),(-0.99, 0.99)]*(len(x0)//6)

result = sp.optimize.minimize(
    lambda params: GMM_fit_score(params, kde_result, points, FIT_METHOD),
    x0,
    bounds = bounds,
    constraints=cons,
    tol = 0.000000000001,
    options = {"maxiter": 500})
result
Out[56]:
     fun: -15.758814713123712
     jac: array([  6.87428713e-01,  -2.38418579e-07,   3.57627869e-07,
         2.38418579e-07,  -1.19209290e-07,  -1.19209290e-07,
         6.87425017e-01,   4.76837158e-07,   3.57627869e-07,
         0.00000000e+00,   3.57627869e-07,   0.00000000e+00,
         6.87428713e-01,   0.00000000e+00,   1.19209290e-07,
         1.19209290e-07,   1.19209290e-07,   0.00000000e+00,
         0.00000000e+00])
 message: 'Optimization terminated successfully.'
    nfev: 1687
     nit: 83
    njev: 83
  status: 0
 success: True
       x: array([ 0.11555615,  0.9283853 , -0.07450164,  2.05073704,  1.61659019,
        0.44224838,  0.175232  ,  2.73774064, -1.89303041,  1.3175924 ,
        1.35885988, -0.05841041,  0.70921185, -1.83446419,  0.30115834,
        4.94554499,  3.54161939,  0.10208762])

6.1 GMM Result

In [57]:
gmm = group_gmm_param_from_gmm_param_array(result.x, sort_group = True)
mixed_model_pdf = generate_gmm_pdf_from_grouped_gmm_param(gmm)
gmm_pdf_result = mixed_model_pdf(points)
pretty_print_gmm(gmm)
Out[57]:
weight mean_x mean_y sig_x sig_y corr
1 0.709 -1.834 0.301 4.946 3.542 0.102
2 0.175 2.738 -1.893 1.318 1.359 -0.058
3 0.116 0.928 -0.075 2.051 1.617 0.442
In [58]:
fig_gmm, ax = plt.subplots(figsize=(3.5,3.5))
plot_gmm_ellipses(gmm, ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
GMM Plot Result
0.709211852928 [[-1.83446419  0.30115834]] [ 3.5043575   4.97201786] -81.6469067896
0.175231995805 [[ 2.73774064 -1.89303041]] [ 1.29344877  1.38186114] -148.918595509
0.115556151267 [[ 0.9283853  -0.07450164]] [ 1.31951349  2.25339087] -59.2504359635

6.2 Goodness-of-fit statistics

In [59]:
gof_df(gmm_pdf_result, kde_result)
Out[59]:
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.983 0.019 0.048 1.432299e-07 0.020 0.244
In [60]:
X = Y = PLOT_AXIS_RANGE
pdf_Z = generate_Z_from_X_Y(X,Y, mixed_model_pdf)# passing a function as an argument

def residule_between_kde_and_gmm(points):
    kde_vals = exp(kde.score_samples(points))
    gmm_vals = mixed_model_pdf(points)
    return kde_vals - gmm_vals 

residual_Z = generate_Z_from_X_Y(X,Y, residule_between_kde_and_gmm)

plot_3d_prob_density(X,Y,pdf_Z)
plot_3d_prob_density(X,Y,residual_Z)
align_figures()

fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
fig_gmm = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,pdf_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,residual_Z,  xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
align_figures()
In [61]:
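# Joint pdf expressed in polar form: f(V, theta) = f_xy(V*cos(theta), V*sin(theta)) * V,
# where the factor V is the Jacobian of the (x, y) -> (V, theta) change of variables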
def f(V,theta):
    return (mixed_model_pdf([[V*cos(theta),V*sin(theta)]]))*V
In [62]:
x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(df.speed)

# 3. GMM distribution
y_ = [integrate.nquad(f, [[0, x_val],[0, 2*pi]]) for x_val in x]
y_cdf_gmm = array(list(zip(*y_))[0])

plot(log(x), log(-log(1-y_ecdf)),'o', label = 'Empirical')
plot(log(x), log(-log(1-y_cdf_weibull)),'--', label = 'Weibull')
plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black', label = 'GMM')
plt_configure(xlabel='ln(V)',ylabel='ln(-ln(1-P))',legend={'loc':'best'})
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:7: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:8: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:9: RuntimeWarning: divide by zero encountered in log
In [63]:
# Calculate Speed Distribution
# 1. GMM Model
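# Integrate the joint pdf over a thin ring (V in [x-0.01, x+0.01], all directions),
# divide by the 0.02 band width to recover a density, and scale by the sample size
# so the curve is comparable with the frequency histogram below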
x = arange(0, max_speed, 0.5)
y_ =[integrate.nquad(f, [[x_-0.01, x_+0.01],[0, 2*pi]]) for x_ in x]
y_gmm = array(list(zip(*y_))[0])*len(df.speed)/0.02

# 2. Weibull
y_weibul = sp.stats.weibull_min.pdf(x, *weibull_params)

df['speed'].hist(bins=arange(0, df.speed.max()), alpha=0.5, label='Data')
plot(x, y_gmm,'-', color='black', label='GMM')
plot(x, y_weibul*len(df.speed), '--', color='black', label='Weibull') 
print('Speed Distribution Comparison')
plt_configure(xlabel='Speed'+speed_unit_text,
              ylabel='Frequency',legend=True, figsize=(4, 2))
plt.gca().set_ylim(bottom = 0)
plt.tight_layout()
plt.locator_params(axis='y', nbins=5)
Speed Distribution Comparison
In [64]:
# Calculate Angle Distribution
x = linspace(0,2*pi, num=36+1)
y_ =[integrate.nquad(f, [[0, inf],[x_-pi/36, x_+pi/36]]) for x_ in x]
y = array(list(zip(*y_))[0])*len(df['dir']) 

df['dir'].hist(bins=DIR_BIN, alpha=0.5, label='Data')
plot(x/pi*180, y,'-', color='black', label='GMM')
title='Direction Distribution Comparison'
plt_configure(xlabel='Direction'+dir_unit_text, ylabel='Frequency', 
              legend={'loc': 'best'} ,tight='xtight',figsize = (4,2))
plt.tight_layout()
dir_fig = plt.gcf()
print(title)
Direction Distribution Comparison
In [65]:
# %%time
incre = max(SECTOR_LENGTH, 10)
density_collection=Parallel(n_jobs=-1)(delayed(direction_compare)(gmm, df, angle, incre) 
                                        for angle in arange(0, 360, incre))  
# This R square is computed as in the paper:
# "Comparison of bivariate distribution construction approaches for analysing wind speed and direction data"
# http://onlinelibrary.wiley.com/doi/10.1002/we.400/full
print(true_R_square(density_collection))
0.891591426227

6.3 Sectoral Comparison

In [66]:
# Sectoral comparison of the fitted models against the data
def model_data_comparison(df, original_incre = 10, incre = 10):
    start, end = -original_incre/2 + incre/2, 360
    max_diff_array = []
    curve_collection = []
    max_speed = df.speed.max()
    
    # Find a max count for plotting histogram
    max_count = max_count_for_angles(df, start, end, incre)
    plot_range = [0, max_speed, 0, max_count*1.05]
    
    for angle in arange(start, end, incre):
        angle_radian, incre_radian = radians(angle), radians(incre)  
        start_angle, end_angle = angle-incre/2, angle+incre/2
        
        # Select data from observation
        sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)
        data_size = len(sub_df.speed)
        # 1. Get Weibull and ECDF
        x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(sub_df.speed)
        # 2. Get GMM PDF, CDF
        _, y_gmm, y_cdf_gmm, direction_prob = gmm_integration_in_direction(f, angle_radian-incre_radian/2, angle_radian+incre_radian/2, x)
        
        # 3. Make Plots
        fig = plt.figure(figsize=(10,1.9))
#         fig = plt.figure(figsize=(10,1.7))
        # 3.1. Frequency Comparison
        ax1 = fig.add_subplot(1,3,1)        
        sub_df['speed'].hist(bins=arange(0, sub_max_speed), alpha=0.5, label='Data')                  
        plot(x, y_gmm*data_size,'-', color='black', label='GMM')
        plot(x, y_weibull*data_size, '--', color='black',label='Weibull')   
#         plt_configure(xlabel = "$V$", ylabel='Frequency', legend=True)
        plt_configure(xlabel = "V", ylabel='Frequency', legend=True)
        plt.axis(plot_range)
        
        # 3.2. CDF Comparison
        ax2 = fig.add_subplot(1,3,2)
        plot(x, y_ecdf,'o', alpha=0.8, label='Data')
        plot(x, y_cdf_gmm,'-', color='black',label='GMM')
        plot(x, y_cdf_weibull,'--', color='black',label='Weibull')
        plt.gca().set_xlim(right = max_speed)
#         plt_configure(xlabel = "$V$", ylabel='$P$', legend=True)
        plt_configure(xlabel = "V", ylabel='P', legend=True)
        
        # 3.3. Weibull Comparison
#         ax3 = fig.add_subplot(1,3,3)
#         plot(log(x), log(-log(1-y_ecdf)),'o', alpha=0.8, label='Data')
#         plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black', label='GMM')
#         plot(log(x), log(-log(1-y_cdf_weibull)),'--',color='black',label='Weibull')
#         plt.gca().set_xlim(right = log(max_speed+1))
# #         plt_configure(xlabel="ln($V$)", ylabel="ln(-ln(1-$P$)",legend={'loc':'best'})
#         plt_configure(xlabel="ln(V)", ylabel="ln(-ln(1-P)",legend={'loc':'best'})
        
        bins = arange(0, sub_df.speed.max()+1)
        density, _ = np.histogram(sub_df['speed'], bins=bins, normed=True)
        density_expected_ =[integrate.nquad(f, [[x_, x_+1],[angle_radian-incre_radian/2, angle_radian+incre_radian/2]]) 
                            for x_ in bins[:-1]]
        density_expected_gmm = array(list(zip(*density_expected_ ))[0])/direction_prob
        R_square_gmm = sector_r_square(density, density_expected_gmm)
        
        density_expected_weibull = sp.stats.weibull_min.cdf(bins[1:], *weibull_params) - sp.stats.weibull_min.cdf(bins[:-1], *weibull_params) 
        R_square_weibull = sector_r_square(density, density_expected_weibull)

        diff, diff_weibull= np.abs(y_ecdf - y_cdf_gmm), np.abs(y_ecdf - y_cdf_weibull)
        max_diff_array.append([len(sub_df), angle, diff.max(), x[diff.argmax()], 
                               diff_weibull.max(), x[diff_weibull.argmax()], R_square_gmm, R_square_weibull])
        curves = {'angle': angle, 'data_size': data_size, 'weight': direction_prob, 
                  'x': x, 'gmm_pdf': y_gmm, 'gmm_cdf': y_cdf_gmm,
                  'weibull_pdf': y_weibull, 'weibull_cdf': y_cdf_weibull, 'ecdf': y_ecdf}
        curve_collection.append(curves)
        
        plt.tight_layout()
        plt.show()
        print('%s (%s - %s) degree' % (angle, start_angle, end_angle))
        print('data size:', len(sub_df), 'weight', len(sub_df)/len(df))
        print('GMM', 'Weibull')
        print('R square', R_square_gmm,  R_square_weibull)
        print('max diff:', diff.max(), diff_weibull.max(), 'speed value:', x[diff.argmax()], x[diff_weibull.argmax()], 'y gmm', y_cdf_gmm[diff.argmax()])
        print(' ')
    return max_diff_array, curve_collection
In [67]:
%%time
if len(effective_column) == 16:
    rebinned_angle = 22.5
else: 
    rebinned_angle = 20
    
max_diff_array, curve_collection = model_data_comparison(df, SECTOR_LENGTH, rebinned_angle)
5.0 (-5.0 - 15.0) degree
data size: 1190 weight 0.04240005700848001
GMM Weibull
R square 0.941013596534 0.901249735501
max diff: 0.0431872267562 0.0423372188494 speed value: 8.75519572132 2.18879893033 y gmm 0.922358991731
 
25.0 (15.0 - 35.0) degree
data size: 1248 weight 0.044466614408893324
GMM Weibull
R square 0.920220103213 0.93613647517
max diff: 0.0505266198494 0.0345875030929 speed value: 7.26012650975 2.17803795293 y gmm 0.834890046817
 
45.0 (35.0 - 55.0) degree
data size: 1129 weight 0.04022660870804532
GMM Weibull
R square 0.913527922499 0.965828067128
max diff: 0.0715573736226 0.0575893346446 speed value: 6.59846424683 4.12404015427 y gmm 0.795581687493
 
65.0 (55.0 - 75.0) degree
data size: 846 weight 0.030143233806028645
GMM Weibull
R square 0.816321204173 0.946425739321
max diff: 0.139287104627 0.0641914829661 speed value: 5.09885335072 3.82414001304 y gmm 0.655039136508
 
85.0 (75.0 - 95.0) degree
data size: 815 weight 0.02903869450580774
GMM Weibull
R square 0.758140866016 0.916818777993
max diff: 0.162880560666 0.103834738458 speed value: 4.74364424627 4.06598078252 y gmm 0.605217598844
 
105.0 (95.0 - 115.0) degree
data size: 1033 weight 0.03680609990736122
GMM Weibull
R square 0.875978898275 0.896339637303
max diff: 0.0550435561007 0.0551538838729 speed value: 5.97420457682 5.97420457682 y gmm 0.736824788527
 
125.0 (115.0 - 135.0) degree
data size: 1318 weight 0.04696073540939215
GMM Weibull
R square 0.917451099982 0.924860844398
max diff: 0.0697196973489 0.0493741250357 speed value: 4.7384018363 5.92300229538 y gmm 0.509021670035
 
145.0 (135.0 - 155.0) degree
data size: 1732 weight 0.061711679612342335
GMM Weibull
R square 0.875958436948 0.899972284745
max diff: 0.107254375725 0.0369739168742 speed value: 9.10859521109 6.50613943649 y gmm 0.870533821452
 
165.0 (155.0 - 175.0) degree
data size: 2474 weight 0.08814936221762987
GMM Weibull
R square 0.837399457279 0.924895299138
max diff: 0.0985288001349 0.0318231684276 speed value: 9.02285813041 6.01523875361 y gmm 0.783249899569
 
185.0 (175.0 - 195.0) degree
data size: 2531 weight 0.09018028931803605
GMM Weibull
R square 0.878083875687 0.932952441763
max diff: 0.0753235181465 0.0445611915504 speed value: 9.25372932591 5.55223759555 y gmm 0.761613522097
 
205.0 (195.0 - 215.0) degree
data size: 2333 weight 0.08312548991662509
GMM Weibull
R square 0.913308809511 0.92236783857
max diff: 0.0279952295248 0.036898438529 speed value: 12.0563772664 6.0281886332 y gmm 0.941411431839
 
225.0 (215.0 - 235.0) degree
data size: 1537 weight 0.05476377111095276
GMM Weibull
R square 0.86173592319 0.886024495202
max diff: 0.0558787024516 0.0576929314187 speed value: 1.06748076242 1.06748076242 y gmm 0.0430152468002
 
245.0 (235.0 - 255.0) degree
data size: 962 weight 0.03427634860685527
GMM Weibull
R square 0.845325588138 0.871607808436
max diff: 0.0796094399807 0.0438449080986 speed value: 6.37239484823 6.37239484823 y gmm 0.796689936319
 
265.0 (255.0 - 275.0) degree
data size: 743 weight 0.02647331290529466
GMM Weibull
R square 0.789926707478 0.821187066166
max diff: 0.151520353617 0.0834755889584 speed value: 3.93259621136 1.96629810568 y gmm 0.602180857689
 
285.0 (275.0 - 295.0) degree
data size: 880 weight 0.03135466400627093
GMM Weibull
R square 0.902181755341 0.909764746177
max diff: 0.118163087528 0.0669110313499 speed value: 3.82318511089 3.82318511089 y gmm 0.621609639745
 
305.0 (295.0 - 315.0) degree
data size: 1802 weight 0.06420580061284116
GMM Weibull
R square 0.951972400081 0.937556152654
max diff: 0.0612783191275 0.134287486349 speed value: 3.85128547643 3.85128547643 y gmm 0.561918129263
 
325.0 (315.0 - 335.0) degree
data size: 3022 weight 0.10767476662153495
GMM Weibull
R square 0.9622180807 0.922702889326
max diff: 0.0386709409176 0.0861840862144 speed value: 4.95907033239 4.95907033239 y gmm 0.76940318218
 
345.0 (335.0 - 355.0) degree
data size: 2033 weight 0.07243639991448728
GMM Weibull
R square 0.924250753951 0.895145752095
max diff: 0.0909625359314 0.051278041528 speed value: 3.74630857337 3.74630857337 y gmm 0.515530331752
 
Wall time: 53.8 s
In [68]:
diff_df = pd.DataFrame(max_diff_array,columns=['datasize','direction', 'gmm', 'speed_gmm',
                                               'weibull', 'speed_weibull', 'r_square_gmm', 'r_square_weibull'])  

gmm_mean, weibull_mean = plot_sectoral_comparison(diff_df.r_square_gmm, diff_df.r_square_weibull, diff_df.direction, diff_df.datasize)
plt_configure(ylabel="$\ R^2$", xlabel='Direction'+dir_unit_text)
ylim = min(plt.gca().get_ylim()[0],0.75)
plt.gca().set_ylim(top=1, bottom=ylim)
plt.tight_layout()
print(gmm_mean, weibull_mean)
0.8934799879528382 0.9154792615039528
In [69]:
gmm_mean, weibull_mean = plot_sectoral_comparison(diff_df.gmm, diff_df.weibull, diff_df.direction, diff_df.datasize)
plt_configure(ylabel="K-S", xlabel='Direction'+dir_unit_text)
ylim = max(plt.gca().get_ylim()[1],0.25)
plt.gca().set_ylim(top=ylim, bottom=0)
plt.tight_layout()
print(gmm_mean, weibull_mean)
0.07497111119903163 0.058622503211435084
In [70]:
# Compare direction weight with previous figure
display(dir_fig)

6.4 Insufficient-fit Sector Investigation

6.4.1 Data Variability, by Bootstrap (Resampling)

In [71]:
max_diff_element = max(max_diff_array, key=lambda x: x[2])
angle =  max_diff_angle = max_diff_element[1]
incre = rebinned_angle
In [72]:
FRACTION = 1

# Select data from observation
start_angle, end_angle = angle-incre/2, angle+incre/2
angle_radian, incre_radian = radians(angle), radians(incre)  
sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)
In [73]:
x = arange(0, sub_max_speed, 0.5)
_, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(sub_df.speed, x)
_, y_gmm, y_cdf_gmm, direction_prob = gmm_integration_in_direction(f, angle_radian-incre_radian/2, angle_radian+incre_radian/2, x)

fig = plt.figure(figsize=(10,1.9))
ax1 = fig.add_subplot(1,3,1)   
ax2 = fig.add_subplot(1,3,2)   
ax3 = fig.add_subplot(1,3,3)   

# 1. Data
bins=arange(0, sub_max_speed)
sub_df['speed'].hist(ax=ax1, bins=bins, alpha=0.5, label='Data', normed=True)  

# 2. GMM
ax1.plot(x, y_gmm,'-', color='black', label='GMM')
ax2.plot(x, y_cdf_gmm,'-', color = 'black', label='GMM')
ax3.plot(log(x), log(-log(1-y_cdf_gmm)),'-', color = 'black',label='GMM')

# 3. Weibull
ax1.plot(x, y_weibull,'--',color='black',label='Weibull')
ax2.plot(x, y_cdf_weibull,'--',label='Weibull')
ax3.plot(log(x), log(-log(1-y_cdf_weibull)),'--',label='Weibull')

# 4. Data Resampled
count_collection = []
for i in range(1,100):
    sub_df_resampled = sub_df.sample(frac=FRACTION, replace=True)    
    resampled_count, _ = np.histogram(sub_df_resampled['speed'], bins=bins, normed=True) 
    count_collection.append(resampled_count)
    
    ecdf = sm.distributions.ECDF(sub_df_resampled.speed)
    y_ecdf = ecdf(x) 
    ax2.plot(x, y_ecdf,':', label='Data Resampled')
    ax3.plot(log(x), log(-log(1-y_ecdf)),':', label='Data Resampled')
    if i == 1: 
#         plt_configure(ax=ax2, xlabel = "$V$", ylabel='$P$', legend={'loc':'best'})
#         plt_configure(ax=ax3, xlabel="ln($V$)", ylabel="ln(-ln(1-$P$)",legend={'loc':'best'})
        plt_configure(ax=ax2, xlabel = "V", ylabel='P', legend={'loc':'best'})
        plt_configure(ax=ax3, xlabel="ln(V)", ylabel="ln(-ln(1-P)",legend={'loc':'best'})

print('%s (%s - %s) Degree Speed Distribution' % (angle, start_angle, end_angle))
count_collection = np.array(count_collection)
mx, mn = np.max(count_collection,0), np.min(count_collection,0)
ax1.plot(bins[1:]-0.5, mx, ':', color='blue')
ax1.plot(bins[1:]-0.5, mn, ':', color='blue', label='Resample limit')
ax1.set_ylim(bottom = 0)
# plt_configure(ax=ax1, xlabel='$V$',ylabel='Frequency',legend={'loc':'best'})
plt_configure(ax=ax1, xlabel='V', ylabel='Frequency',legend={'loc':'best'})
ax1.locator_params(axis='y', nbins=5)
ax2.locator_params(axis='y', nbins=5)
ax3.locator_params(axis='y', nbins=5)
plt.tight_layout()
diff = abs(y_ecdf - y_cdf_gmm)
print(diff.max(), x[diff.argmax()], y_cdf_gmm[diff.argmax()])
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:17: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:22: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:34: RuntimeWarning: divide by zero encountered in log
85.0 (75.0 - 95.0) Degree Speed Distribution
0.182819388482 5.0 0.640493494954

6.4.2 Time Variability

In [74]:
fig_time_variability_3d = plt.figure()
ax1 = fig_time_variability_3d.gca(projection='3d')

fig_time_variability_cdf,ax2 = plt.subplots(figsize=(3,1.8))
fig_time_variability_weibull, ax3 = plt.subplots(figsize=(3,1.8))

ax2.plot(x, y_cdf_gmm,'-', color='black', label = 'GMM')
ax2.plot(x, y_cdf_weibull,'--', label='Weibull')

ax3.plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black',label='GMM')
ax3.plot(log(x), log(-log(1-y_cdf_weibull)), '--', label='Weibull')

# 3. Data
prop_cycle=iter(mpl.rcParams['axes.color_cycle'])
for start_time in range(20000000, 20150000, 50000):
    end_time = start_time + 50000 
    time_label = start_time//10000
    df_other_years = df_all_years.query('(date >= @start_time) & (date < @end_time)')
    df_other_years_at_angle, sub_max_speed_other_year = select_df_by_angle(df_other_years, start_angle, end_angle)
    if len(df_other_years_at_angle) > 0 :
        
        ecdf = sm.distributions.ECDF(df_other_years_at_angle.speed)
        y_ecdf = ecdf(x)
        ax2.plot(x, y_ecdf,':', label = time_label)
        ax3.plot(log(x), log(-log(1-y_ecdf)),':', label = time_label)
        
        title = '%s - %s' %(time_label, time_label+4)
        count, division = np.histogram(df_other_years_at_angle['speed'], density=True,
                                       bins=arange(0, sub_max_speed_other_year))
        ax1.bar(left=division[:-1], height=count, zs=time_label, zdir='x', 
                color=next(prop_cycle), alpha=0.8)
        x_3d = time_label*np.ones_like(x)
        ax1.plot(x_3d, x, y_gmm, '-', color='black', label='GMM'  if time_label == 2010 else '')
        ax1.plot(x_3d, x, y_weibull, '--', color='blue', label='Weibull' if time_label == 2010 else '')
        
print('%s (%s - %s) Degree Speed Distribution' % (angle, start_angle, end_angle))
ax1.set_ylim(bottom = 0)
ax1.set_zlabel('Frequency')
plt_configure(ax=ax1, xlabel='Time',ylabel='V', legend=True)
# plt_configure(ax=ax2, xlabel = "$V$", ylabel='$P$', legend={'loc':'best'})
# plt_configure(ax=ax3, xlabel="ln($V$)", ylabel="ln(-ln(1-$P$))", legend={'loc':'best'})
plt_configure(ax=ax2, xlabel = "V", ylabel='P', legend={'loc':'best'})
plt_configure(ax=ax3, xlabel="ln(V)", ylabel="ln(-ln(1-P))", legend={'loc':'best'})

ax1.set_zlim(bottom = 0)
align_figures()
85.0 (75.0 - 95.0) Degree Speed Distribution
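
The five-year windows above rely on date being stored as an integer in YYYYMMDD form, so a step of 50000 moves the window boundary by five years, and @start_time / @end_time in the query string refer to the local Python variables. A small self-contained illustration (the toy DataFrame is not notebook data):

import pandas as pd

df_demo = pd.DataFrame({'date': [20010315, 20070620, 20121101],
                        'speed': [3.1, 5.4, 2.2]})
start_time, end_time = 20050000, 20100000        # roughly 2005-01-01 up to 2010-01-01
print(df_demo.query('(date >= @start_time) & (date < @end_time)'))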

6.4.3 Adjacent Sector Variability

In [75]:
incre = rebinned_angle
angle_group = [max_diff_angle-incre, max_diff_angle, max_diff_angle+incre]  # the sector of interest and its two adjacent sectors
In [76]:
fig_adjecent_variability_3d = plt.figure()
ax1 = fig_adjecent_variability_3d.add_subplot(projection='3d')  # Figure.gca(projection=...) is no longer supported
fig_adjecent_variability_cdf, ax2 = plt.subplots(figsize=(3,1.8))
fig_adjecent_variability_weibull, ax3 = plt.subplots(figsize=(3,1.8))

legend_3d = False
prop_cycle = iter([c['color'] for c in mpl.rcParams['axes.prop_cycle']])  # 'axes.color_cycle' is deprecated

curve_df = pd.DataFrame(curve_collection)

for angle in angle_group:
    curves = curve_df.query('angle == @angle%360').T.to_dict()  # %360 wraps sectors that fall outside [0, 360)
    curves = curves[list(curves)[0]]  # the single matching row, as a plain dict
    data_size, x =  curves['data_size'], curves['x']
    y_gmm, y_cdf_gmm =  curves['gmm_pdf'], curves['gmm_cdf'] 
    y_weibull, y_cdf_weibull, y_cdf = curves['weibull_pdf'],  curves['weibull_cdf'], curves['ecdf']

    linestyle = '-' if angle == max_diff_angle else ':'
    alpha = 0.7 if angle == max_diff_angle else 0.3

    ax2.plot(x, y_gmm*data_size, linestyle, label=angle)        
    ax3.plot(x, y_weibull*data_size, linestyle, label=angle)

    start_angle, end_angle = angle-incre/2, angle+incre/2
    sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)

    x_3d = angle*np.ones_like(x)
    ax1.plot(x_3d, x, y_gmm*data_size, color='black', label='GMM')
    ax1.plot(x_3d, x, y_weibull*data_size, color='blue', linestyle='--',label='Weibull')

    count, division = np.histogram(sub_df['speed'], bins=arange(0, sub_max_speed))
    ax1.bar(left=division[:-1], height=count, zs=angle, zdir='x', color=next(prop_cycle), alpha=0.8)

    if legend_3d == False:
        ax1.legend()
        legend_3d = True
        
plt_configure(ax=ax1, xlabel='Direction', ylabel='Speed')   
plt_configure(ax=ax2, xlabel='V',ylabel='Frequency',legend={'loc':'best'})
plt_configure(ax=ax3, xlabel='V',ylabel='Frequency',legend={'loc':'best'})
ax1.set_zlabel('Frequency')
ax1.set_zlim(bottom = 0)
ylim = max(ax2.get_ylim()[1], ax3.get_ylim()[1])  # give both 2D panels the same frequency scale
ax2.set_ylim(bottom = 0, top=ylim)
ax3.set_ylim(bottom = 0, top=ylim)

print(max_diff_angle) 
print('GMM, Weibull, Histogram')
align_figures()
85.0
GMM, Weibull, Histogram

7. Result Variability & Cross-Validation

In [77]:
if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWDITH    
if 'FIT_METHOD' not in globals():
    FIT_METHOD = 'square_error'       
if 'KDE_KERNEL' not in globals():
    KDE_KERNEL = 'gaussian'
    
config = {'bandwidth': bandwidth, 
          'fitting_range': FITTING_RANGE,
          'fit_limit': fit_limit,
          'kde_kernel': KDE_KERNEL}

print(bandwidth, FIT_METHOD)
0.7 square_error

7.1 Variability of the Result

In [78]:
%%time
results = Parallel(n_jobs=-1)(delayed(resampled_fitting)(df, FIT_METHOD, NUMBER_OF_GAUSSIAN, config) for i in range(10))                        
for result in results:
    display(pretty_print_gmm(result['gmm']))
    fig,ax = plt.subplots(figsize=(3.5,3.5))
    plot_gmm_ellipses(result['gmm'],ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
    plt.show()
    
    display(gof_df(result['gmm_pdf_result'], result['kde_result']))
    display(gof_df(result['gmm_pdf_result'], kde_result))
    print('')
weight mean_x mean_y sig_x sig_y corr
1 0.691 -1.913 0.336 4.954 3.581 0.083
2 0.159 2.780 -1.967 1.278 1.329 -0.071
3 0.150 1.206 -0.168 2.115 1.755 0.387
GMM Plot Result
0.69137052063 [[-1.91298226  0.33648134]] [ 3.55492455  4.97233077] -82.9388753409
0.158931991777 [[ 2.77996832 -1.96708623]] [ 1.25006013  1.35608387] -149.381619661
0.149697487593 [[ 1.20584246 -0.16764222]] [ 1.47656805  2.31831812] -57.9145175723
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.983 0.020 0.044 1.452490e-07 0.020 0.246
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.983 0.018 0.042 1.473480e-07 0.020 0.248

weight mean_x mean_y sig_x sig_y corr
1 0.771 -1.425 0.210 4.788 3.361 0.118
2 0.215 2.539 -1.583 1.428 1.571 -0.175
3 0.015 -0.007 -0.153 0.466 1.519 0.917
GMM Plot Result
0.770774833266 [[-1.42521621  0.21038266]] [ 3.31601787  4.81934537] -80.9451158036
0.21452921717 [[ 2.5387039  -1.58274106]] [ 1.34388437  1.64329598] -149.240743183
0.0146959495642 [[-0.00682602 -0.15263387]] [ 0.17851161  1.5785485 ] 164.079346947
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.987 0.025 0.077 1.079191e-07 0.017 0.212
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.987 0.024 0.083 1.057431e-07 0.017 0.210

weight mean_x mean_y sig_x sig_y corr
1 0.754 -1.547 0.219 4.833 3.394 0.137
2 0.223 2.535 -1.571 1.384 1.566 -0.128
3 0.022 -0.192 -0.196 0.608 0.622 0.007
GMM Plot Result
0.754418238477 [[-1.54681106  0.21860424]] [ 3.33261867  4.87530749] -79.6168978146
0.223198275012 [[ 2.53494386 -1.57089651]] [ 1.34089495  1.60279271] -157.058557609
0.0223834865117 [[-0.19229805 -0.1963364 ]] [ 0.60739584  0.62228173] 171.360495256
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.023 0.078 8.287897e-08 0.015 0.186
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.025 0.079 8.712190e-08 0.015 0.190

weight mean_x mean_y sig_x sig_y corr
1 0.552 -2.275 0.945 5.139 3.376 0.055
2 0.250 2.450 -1.592 1.583 1.630 -0.275
3 0.198 -0.759 -0.789 3.786 2.761 0.633
GMM Plot Result
0.552207363838 [[-2.2747951   0.94453528]] [ 3.36691567  5.14472759] -86.385709621
0.249940015747 [[ 2.44953365 -1.59177156]] [ 1.36630286  1.81481627] -138.021552808
0.197852620415 [[-0.75851572 -0.78874869]] [ 1.88765505  4.28922625] -58.444674436
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.981 0.017 0.037 1.600852e-07 0.020 0.258
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.982 0.017 0.033 1.526316e-07 0.020 0.252

weight mean_x mean_y sig_x sig_y corr
1 0.697 -1.897 0.435 4.976 3.535 0.088
2 0.199 2.722 -1.797 1.362 1.416 -0.086
3 0.105 0.493 -0.130 2.260 1.817 0.581
GMM Plot Result
0.696951364224 [[-1.89733324  0.43484936]] [ 3.50784616  4.9950252 ] -82.882703965
0.19854348956 [[ 2.72204852 -1.79661253]] [ 1.32235551  1.45352989] -147.275889541
0.104505146216 [[ 0.49252249 -0.12962736]] [ 1.286218    2.59914759] -55.3611295199
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.985 0.017 0.039 1.262270e-07 0.018 0.229
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.982 0.018 0.042 1.480819e-07 0.020 0.248

weight mean_x mean_y sig_x sig_y corr
1 0.553 -2.374 0.907 5.099 3.539 0.037
2 0.231 2.516 -1.688 1.516 1.582 -0.224
3 0.216 -0.220 -0.339 3.713 2.571 0.590
GMM Plot Result
0.553006348382 [[-2.37362169  0.90734554]] [ 3.53429223  5.10194466] -87.1924376865
0.230688214994 [[ 2.51568246 -1.68771328]] [ 1.36144236  1.7161455 ] -140.394521611
0.216305436624 [[-0.21987811 -0.33851051]] [ 1.87554186  4.10844323] -61.2472474233
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.981 0.016 0.034 1.567372e-07 0.020 0.255
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.982 0.017 0.030 1.482759e-07 0.020 0.248

weight mean_x mean_y sig_x sig_y corr
1 0.576 -2.172 0.925 5.062 3.338 0.064
2 0.239 2.511 -1.637 1.544 1.585 -0.219
3 0.185 -0.669 -0.815 3.652 2.823 0.668
GMM Plot Result
0.575665825148 [[-2.17181824  0.92498337]] [ 3.32640383  5.06976599] -85.7774057802
0.238866167676 [[ 2.51108965 -1.63741566]] [ 1.38164832  1.72856976] -138.411338562
0.185468007176 [[-0.66918412 -0.81458175]] [ 1.8059524   4.24772827] -55.6508093814
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.982 0.018 0.039 1.558605e-07 0.021 0.254
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.982 0.018 0.037 1.480144e-07 0.020 0.248

weight mean_x mean_y sig_x sig_y corr
1 0.618 -2.461 0.458 4.907 3.703 0.094
2 0.208 2.623 -1.751 1.442 1.465 -0.216
3 0.174 0.927 0.044 2.798 2.177 0.482
GMM Plot Result
0.617598564956 [[-2.4607273   0.45842767]] [ 3.66628441  4.93438568] -80.8872104063
0.207958209428 [[ 2.62321841 -1.75068357]] [ 1.2865313   1.60325147] -137.099752566
0.174443225617 [[ 0.92679388  0.04443883]] [ 1.72210418  3.09842956] -58.8827118042
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.983 0.019 0.036 1.423218e-07 0.019 0.244
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.982 0.017 0.034 1.503005e-07 0.020 0.250

weight mean_x mean_y sig_x sig_y corr
1 0.413 0.721 0.098 3.458 2.693 0.230
2 0.382 -4.646 0.587 4.486 4.078 0.144
3 0.205 2.494 -1.688 1.563 1.520 -0.300
GMM Plot Result
0.413012853431 [[ 0.72105451  0.09790008]] [ 2.53474412  3.57561092] -68.8457523351
0.382351752001 [[-4.64586335  0.58676693]] [ 3.90036964  4.64110331] -61.7754692803
0.204635394568 [[ 2.49428554 -1.68838647]] [ 1.28829582  1.75884791] -132.298384307
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.983 0.016 0.030 1.464372e-07 0.020 0.247
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.982 0.017 0.026 1.504285e-07 0.020 0.250

weight mean_x mean_y sig_x sig_y corr
1 0.404 -4.353 0.763 4.500 4.002 0.152
2 0.384 0.575 0.025 3.465 2.638 0.278
3 0.212 2.571 -1.719 1.498 1.510 -0.256
GMM Plot Result
0.404051618787 [[-4.35251887  0.76286715]] [ 3.83038741  4.64693699] -63.8535012465
0.383515344405 [[ 0.57521336  0.02454561]] [ 2.42848756  3.61498428] -67.3914856066
0.212433036808 [[ 2.5706995 -1.7186576]] [ 1.29730268  1.68560058] -135.910225126
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.983 0.016 0.026 1.486341e-07 0.020 0.248
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.983 0.018 0.029 1.463449e-07 0.020 0.247
Wall time: 18.3 s
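
resampled_fitting is one of the project's helpers and is not reproduced here. As a rough standalone illustration of the idea (bootstrap-resample the wind vectors, refit the mixture, and look at how much the fitted parameters move), the sketch below uses scikit-learn's EM fit on synthetic data rather than the notebook's square_error fitting method:

import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.RandomState(0)
xy = rng.multivariate_normal([0, 0], [[4, 1], [1, 3]], size=2000)    # stand-in for the (x, y) wind vectors

boot_weights = []
for _ in range(10):
    idx = rng.randint(0, len(xy), size=len(xy))                      # bootstrap resample with replacement
    gmm = GaussianMixture(n_components=3, covariance_type='full').fit(xy[idx])
    boot_weights.append(np.sort(gmm.weights_))                       # e.g. track the component weights
print(np.std(boot_weights, axis=0))                                  # spread across bootstrap refits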

7.2 Cross-Validation to Select the Number of Gaussian Components

In [79]:
# df = df_all_years.query('(date >= 20100000) & (date < 20150000)')
# df = df.query('(HrMn%400 == 0)')
In [80]:
%%time
from sklearn.model_selection import train_test_split, KFold  # sklearn.cross_validation was replaced by model_selection

## 4-fold cross-validation over candidate numbers of Gaussian components
gaussian_number_range = arange(1,6)
CV_result_train_all,CV_result_test_all =[],[]
number_of_fold = 4
print('Number of train/test dataset', len(df)*(number_of_fold-1)/number_of_fold, len(df)/number_of_fold) 

for number_of_gaussian in gaussian_number_range:
    print( '  ')
    print('Number of gaussian', number_of_gaussian)
    
    kf = KFold(n_splits=number_of_fold, shuffle=True)

    CV_result = Parallel(n_jobs=-1)(delayed(fit_per_fold)(df, train_index, test_index, FIT_METHOD, number_of_gaussian, config) for train_index, test_index in kf.split(df))

    CV_result_train, CV_result_test = list(zip(*CV_result))
    CV_result_train, CV_result_test = list(CV_result_train), list(CV_result_test)
        
    CV_result_train_all.append(CV_result_train)
    CV_result_test_all.append(CV_result_test)
    
    print('Train')
    pretty_pd_display(CV_result_train)
    print('Test')
    pretty_pd_display(CV_result_test)
Number of train/test dataset 21049.5 7016.5
  
Number of gaussian 1
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.265381 0.108183 0.000002 0.076498 0.930303 0.752522
1 0.271511 0.109214 0.000002 0.075540 0.949822 0.745532
2 0.259815 0.107343 0.000002 0.074970 0.920760 0.756905
3 0.266679 0.107844 0.000002 0.075247 0.933354 0.754030
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.281642 0.111782 0.000002 0.074315 0.961829 0.741995
1 0.264095 0.103723 0.000002 0.075341 0.890157 0.770350
2 0.302522 0.108734 0.000002 0.077602 0.976649 0.736717
3 0.283964 0.108973 0.000002 0.075091 0.929209 0.750392
  
Number of gaussian 2
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.093485 0.025495 1.904029e-07 0.022566 0.281410 0.977583
1 0.093327 0.025289 1.858053e-07 0.022539 0.278121 0.977985
2 0.093168 0.025208 1.882939e-07 0.022680 0.279999 0.977750
3 0.103914 0.023478 1.851965e-07 0.022378 0.277511 0.977994
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.109078 0.024248 1.983231e-07 0.023292 0.287468 0.976241
1 0.104267 0.022728 2.182793e-07 0.023501 0.301163 0.974358
2 0.140586 0.027707 2.061258e-07 0.023632 0.292590 0.975581
3 0.095697 0.026695 2.197475e-07 0.024149 0.302678 0.974404
  
Number of gaussian 3
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.049021 0.018897 1.372903e-07 0.019246 0.239079 0.983827
1 0.029695 0.017334 1.512606e-07 0.020295 0.250984 0.982238
2 0.028738 0.015156 1.420586e-07 0.019655 0.243099 0.983143
3 0.029703 0.014688 1.470808e-07 0.020164 0.247250 0.982452
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.061905 0.021988 1.953150e-07 0.023431 0.284849 0.976660
1 0.046087 0.015687 1.597586e-07 0.020906 0.257510 0.980726
2 0.042487 0.015647 1.835019e-07 0.022303 0.276430 0.978542
3 0.052393 0.018637 1.664804e-07 0.020488 0.263645 0.980837
  
Number of gaussian 4
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.018732 0.010832 1.254059e-07 0.018202 0.228522 0.985225
1 0.019183 0.010825 1.318249e-07 0.018870 0.234120 0.984437
2 0.021785 0.013834 1.211338e-07 0.017861 0.224571 0.985692
3 0.023578 0.013100 1.314486e-07 0.019476 0.233808 0.984334
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.028185 0.020733 1.688495e-07 0.022492 0.264762 0.979836
1 0.028705 0.016085 1.411449e-07 0.019312 0.242618 0.983238
2 0.040397 0.014818 1.818124e-07 0.021988 0.274825 0.978450
3 0.028356 0.023615 1.650184e-07 0.019565 0.262262 0.980978
  
Number of gaussian 5
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.013925 0.007528 2.707624e-08 0.008522 0.106156 0.996806
1 0.025893 0.012991 9.183516e-08 0.015546 0.195437 0.989269
2 0.019773 0.009528 4.734432e-08 0.011470 0.140337 0.994341
3 0.074982 0.012568 5.969539e-08 0.012861 0.157648 0.992908
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.025186 0.015270 6.595483e-08 0.013747 0.165607 0.992156
1 0.040379 0.014970 1.551327e-07 0.021166 0.254245 0.981035
2 0.030375 0.011237 7.875900e-08 0.014246 0.181113 0.990985
3 0.071792 0.028037 1.140744e-07 0.016487 0.217696 0.986737
Wall time: 1min 2s
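
fit_per_fold is likewise a project helper. A minimal standalone sketch of the same selection idea, 4-fold cross-validation over the number of components, but scoring each held-out fold by the mean log-likelihood of scikit-learn's EM fit instead of the notebook's square-error goodness-of-fit measures:

import numpy as np
from sklearn.model_selection import KFold
from sklearn.mixture import GaussianMixture

rng = np.random.RandomState(0)
xy = rng.multivariate_normal([0, 0], [[4, 1], [1, 3]], size=2000)    # stand-in for the (x, y) wind vectors

for n_components in range(1, 6):
    scores = []
    for train_idx, test_idx in KFold(n_splits=4, shuffle=True, random_state=0).split(xy):
        gmm = GaussianMixture(n_components=n_components).fit(xy[train_idx])
        scores.append(gmm.score(xy[test_idx]))                       # mean held-out log-likelihood
    print(n_components, round(np.mean(scores), 3))
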
In [81]:
train_scores_mean, train_scores_std = generate_mean_std_gof(CV_result_train_all)
print('Train gof mean, std')
display(train_scores_mean)

test_scores_mean, test_scores_std = generate_mean_std_gof(CV_result_test_all)
print('Test gof mean, std')
display(test_scores_mean)
Train gof mean, std
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
1 0.265846 0.108146 2.094744e-06 0.075563 0.933560 0.752247
2 0.095974 0.024868 1.874246e-07 0.022541 0.279260 0.977828
3 0.034289 0.016519 1.444226e-07 0.019840 0.245103 0.982915
4 0.020819 0.012148 1.274533e-07 0.018602 0.230255 0.984922
5 0.033643 0.010654 5.648778e-08 0.012100 0.149895 0.993331
Test gof mean, std
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
1 0.283056 0.108303 2.123634e-06 0.075588 0.939461 0.749864
2 0.112407 0.025345 2.106189e-07 0.023643 0.295975 0.975146
3 0.050718 0.017990 1.762640e-07 0.021782 0.270608 0.979191
4 0.031411 0.018813 1.642063e-07 0.020839 0.261117 0.980626
5 0.041933 0.017379 1.034802e-07 0.016411 0.204665 0.987728
In [82]:
prop_cycle = [c['color'] for c in mpl.rcParams['axes.prop_cycle']]  # 'axes.color_cycle' is deprecated
gaussian_number_range = train_scores_mean.index
for column, column_name in zip(['R_square','K_S','Chi_square'], [r"$\ R^2$", "K-S", r"$\widetilde{\chi^2} $"]):
    plot(gaussian_number_range, train_scores_mean[column],
             '--', label = 'training', color=prop_cycle[0])
    plt.fill_between(gaussian_number_range, 
                     train_scores_mean[column] - train_scores_std[column],
                     train_scores_mean[column] + train_scores_std[column], 
                     alpha=0.2, color=prop_cycle[0])
    
    plot(gaussian_number_range, test_scores_mean[column],
             '-', label = 'test',color=prop_cycle[1])
    plt.fill_between(gaussian_number_range, 
                 test_scores_mean[column] - test_scores_std[column],
                 test_scores_mean[column] + test_scores_std[column], 
                 alpha=0.2,color=prop_cycle[1])
    plt.xticks(gaussian_number_range)
    print(column)
    plt.locator_params(axis='y', nbins=5)
    plt_configure(xlabel='Number of Gaussian Distributions', ylabel=column_name, 
                  figsize=(3,2), legend={'loc':'best'})
    if column == 'R_square':
        plt.gca().set_ylim(top=1)
    if column == 'K_S' or column == 'Chi_square':
        plt.gca().set_ylim(bottom=0)
    plt.show()
R_square
K_S
Chi_square
In [83]:
# fig = plt.figure(figsize=(4.3,2.4))
fig = plt.figure(figsize=(5,2.5))
ax1 = fig.add_subplot(1,2,1) 
plot_2d_prob_density(X, Y, kde_Z, ax=ax1,
                     xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar=False)
ax1.grid(False)
ax2 = fig.add_subplot(1,2,2) 
plot_2d_prob_density(X, Y, pdf_Z, ax=ax2,
                     xlabel='x'+speed_unit_text, ylabel='', colorbar=False)
ax2.grid(False)
ax2.get_yaxis().set_visible(False)
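
plot_2d_prob_density and the kde_Z / pdf_Z grids are produced in earlier cells not shown in this section. Assuming they hold a kernel density estimate and the fitted GMM density evaluated on the same meshgrid, a standalone sketch of how such grids can be built (scipy's gaussian_kde with its default bandwidth stands in for the notebook's configured KDE kernel and bandwidth):

import numpy as np
from scipy.stats import gaussian_kde
from sklearn.mixture import GaussianMixture

rng = np.random.RandomState(0)
xy = rng.multivariate_normal([0, 0], [[4, 1], [1, 3]], size=2000)    # stand-in for the (x, y) wind vectors

X, Y = np.meshgrid(np.linspace(-10, 10, 100), np.linspace(-10, 10, 100))
grid = np.column_stack([X.ravel(), Y.ravel()])

kde_Z = gaussian_kde(xy.T)(grid.T).reshape(X.shape)                  # KDE density on the grid
gmm = GaussianMixture(n_components=3).fit(xy)
pdf_Z = np.exp(gmm.score_samples(grid)).reshape(X.shape)             # GMM density on the grid
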
In [ ]:
for fig in [fig_hist, fig_kde, fig_em, fig_gmm]:
    display(fig)
for fig in [fig_time_variability_3d, fig_time_variability_cdf, fig_time_variability_weibull, 
            fig_adjecent_variability_3d, fig_adjecent_variability_cdf, fig_adjecent_variability_weibull,]:
    display(fig)
In [ ]:
import time
save_notebook()
time.sleep(3)
location_name = get_location_name(file_path)
print(location_name)
current_file = 'GMM.ipynb'
output_file = './output_HTML/'+location_name+'.html' 

output_HTML(current_file, output_file)