Modeling the Joint Distribution of Wind Speed and Direction using Gaussian Mixture Models

OEN Method: Harris & Cook, "The parent wind speed distribution: Why Weibull?" http://www.sciencedirect.com/science/article/pii/S0167610514001056

Gaussian Mixture Models, http://scikit-learn.org/stable/modules/mixture.html

1. Set up

1.1 Environment

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

# Star imports supply the shared namespace (pd, np, plt, mpl, arange, plot,
# sns, folium, WindroseAxes, helper functions, ...) — presumably defined in
# import_file / parallel_helper; verify against those modules.
from import_file import *
from helpers.parallel_helper import *
load_libs()

# Global plot style: snap axis limits to round numbers and remove the
# default data margins so histograms sit flush against the axes.
plt.rcParams['axes.autolimit_mode'] = 'round_numbers'
plt.rcParams['axes.xmargin'] = 0.
plt.rcParams['axes.ymargin'] = 0.
mpl.rcParams['patch.force_edgecolor'] = True

1.2 Read Data

In [2]:
# file_path, bandwidth= './data/NCDC/europe/uk/marham/dat.txt', 1.7
# file_path, bandwidth= './data/NCDC/europe/uk/tiree/dat.txt', 1.9
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/NCDC/europe/uk/boscombe_down/dat.txt', 1.5, 4
# file_path, bandwidth= './data/NCDC/europe/uk/middle_wallop/dat.txt', 1.3
# file_path, bandwidth= './data/NCDC/europe/uk/bournemouth/dat.txt',1.3 # 4?
# file_path= "./data/NCDC/europe/uk/weybourne/dat.txt"
# file_path= "./data/NCDC/europe/uk/skye_lusa/dat.txt" # 
# file_path= "./data/NCDC/europe/uk/wattisham/dat.txt"
# file_path= "./data/NCDC/europe/uk/south_uist_range/dat.txt" # improper direction R square measure
# file_path= "./data/NCDC/europe/uk/holbeach/dat.txt" # improper direction R square measure
# file_path= "./data/NCDC/europe/uk/cambridge/dat.txt" # improper direction R square measure
# file_path= "./data/NCDC/europe/us/baltimore/dat.txt" # time too short
# file_path= "./data/NCDC/europe/uk/bealach_na_ba/dat.txt" # time too short
# file_path= "./data/NCDC/europe/uk/benbecula/dat.txt" # truncate (untruncate in m/s), 4?
# file_path= './data/NCDC/europe/uk/southhamption/dat.txt' # high 0, trend

# file_path, bandwidth, NUMBER_OF_GAUSSIAN = "./data/NCDC/europe/germany/landsberg_lech/dat.txt", 0.9, 4 
# file_path, bandwidth= "./data/NCDC/europe/germany/neuburg/dat.txt", 0.7
# file_path, bandwidth= "./data/NCDC/europe/germany/laupheim/dat.txt", 0.7 # double peak, 4?, trend
# file_path, bandwidth= './data/NCDC/europe/germany/niederstetten/dat.txt', 0.9 # get the peak
# file_path, bandwidth= "./data/NCDC/europe/germany/holzdorf/dat.txt", 0.9 # 2008 year
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= './data/NCDC/europe/france/nantes/dat.txt', 0.9, 4 # unit shift, one direction deviate big
# file_path, convert_to_knot= './data/NCDC/europe/france/pau_pyrenees/dat.txt', True # unit shift, 2; force using knot 
# file_path= "./data/NCDC/europe/france/avord/dat.txt" # try 4, initial speed (should be good with m/s), incomplete dataset
# file_path= "./data/NCDC/europe/france/vatry/dat.txt"  # double peak, initial speed, incomplete dataset
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= "./data/NCDC/europe/spain/valladolid/dat.txt", 1.1, 4
# file_path= './data/NCDC/europe/spain/jerez/dat.txt' # high 0
# file_path, bandwidth= "./data/NCDC/europe/spain/barayas/dat.txt", 0.7 # not good fit
# file_path, bandwidth= './data/NCDC/europe/spain/malaga/dat.txt', 0.7 # directions blocked?
# file_path, bandwidth= './data/NCDC/europe/spain/tenerife_sur/dat.txt', 0.7 # directions blocked?
# file_path, bandwidth= './data/NCDC/europe/spain/almeria/dat.txt', 0.7 # negative dimensions?
# file_path, bandwidth= './data/NCDC/europe/greece/eleftherios_intl/dat.txt',0.7 # some direction might be blocked
# file_path= './data/NCDC/europe/ciampino/dat.txt' # try 4, bandwidth?
# file_path= "./data/NCDC/europe/huspel_aws/dat.txt"  # integer, 4?
# file_path= './data/NCDC/gibraltar/dat.txt' # bad fit

# MidEast
# file_path, bandwidth= './data/NCDC/mideast/uae/al_maktoum/dat.txt', 1.1
# file_path= './data/NCDC/mideast/uae/sharjah_intl/dat.txt' 
# file_path= './data/NCDC/mideast/uae/dubai_intl/dat.txt' 
# file_path= './data/NCDC/mideast/uae/abu_dhabi_intl/dat.txt' # Time shift
# file_path= './data/NCDC/mideast/uae/bateen/dat.txt' # Time shift
# file_path= './data/NCDC/mideast/buraimi/dat.txt' # not good dataset
# file_path= './data/NCDC/mideast/turkey/konya/dat.txt' 
# file_path= './data/NCDC/mideast/turkey/sivas/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/turkey/balikesir/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/turkey/bartin/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/iran/chahbahar/dat.txt'
# file_path= './data/NCDC/mideast/iran/zabol/dat.txt' # Problematic data
# file_path= './data/NCDC/mideast/iran/torbat_heydarieh/dat.txt' # Unusable

# file_path, bandwidth = "./data/NCDC/cn/shanghai/hongqiao_intl/dat.txt", 0.6
# file_path, bandwidth= "./data/NCDC/cn/shanghai/pudong/dat.txt", 0.8
# Active dataset: Hefei Luogang (NCDC, China); 0.6 is the KDE bandwidth.
file_path, bandwidth= "./data/NCDC/cn/hefei_luogang/dat.txt", 0.6 # few 0, trend, try 2
# file_path, bandwidth= "./data/NCDC/cn/nanjing_lukou/dat.txt", 0.5
# file_path= "./data/NCDC/cn/zhengzhou_xinzheng/dat.txt" 
# file_path= "./data/NCDC/cn/tianjin/binhai/dat.txt" # few 0, trend, stationary speed, unstationary direction
# file_path= "./data/NCDC/cn/tianjin/tianjing/dat.txt" # 16 sectors
# file_path= "./data/NCDC/cn/shijiazhuang_zhengding/dat.txt" 
# file_path= "./data/NCDC/cn/henan_gushi/dat.txt" # 16 sectors, fit not very good
# file_path= "./data/NCDC/cn/nanning_wuxu/dat.txt" # numpy problem, unstationary speed
# file_path= './data/NCDC/cn/macau/dat.txt'  
# file_path= "./data/NCDC/cn/hk_intl/dat.txt" # few 0
# file_path= './data/NCDC/cn/gaoqi/dat.txt' 

# file_path= './data/NCDC/southeast_asia/malaysia/mersing/dat.txt' # 2 mode, paper comparison
# file_path= './data/NCDC/southeast_asia/malaysia/penang/dat.txt'
# file_path= './data/NCDC/southeast_asia/malaysia/butterworth/dat.txt' # 2 mode 
# file_path= "./data/NCDC/southeast_asia/malaysia/bsultan_mahmud/dat.txt" # stable
# file_path= "./data/NCDC/southeast_asia/malaysia/bsultan_ismail/dat.txt" # 
# file_path= "./data/NCDC/southeast_asia/singapore/changi/dat.txt" # trend, no 0, questionary data
# file_path= "./data/NCDC/southeast_asia/singapore/paya_lebar/dat.txt" # questionary data
# file_path= "./data/NCDC/southeast_asia/singapore/seletar/dat.txt"
# file_path= "./data/NCDC/east_asia/cheongju_intl/dat.txt" # 2005-2009  may have problem, fit is good; numpy problem
# file_path= "./data/NCDC/east_asia/daegu_ab/dat.txt" # recent 5 year may have problem, but fit is generally good; numpy problem

# file_path, bandwidth= "./data/NCDC/oceania/auckland_intl/dat.txt", 0.9  # Good data, double mode
# file_path= "./data/NCDC/oceania/brisbane_archerfield/dat.txt" # high 0, few data 
# file_path= "./data/NCDC/oceania/narrandera/dat.txt" # high 0, few data
# file_path, bandwidth= "./data/NCDC/oceania/canberra/dat.txt", 0.7 # high 0, bad fit
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= './data/NCDC/oceania/horsham/dat.txt', 0.9, 4 # get the peak

# file_path, bandwidth= './data/NCDC/us/boston_16nm/dat.txt', 0.9 # Offshore, mixed type

# file_path, bandwidth= './data/asos/olympia/hr_avg.csv', 0.5 # might block
# file_path, bandwidth, NUMBER_OF_GAUSSIAN  = './data/asos/bismarck_ND/hr_avg.csv', 1.1, 4
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/asos/aberdeen_SD/hr_avg.csv', 1.7, 2 # only to 2012
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/asos/minneapolis/hr_avg.csv', 1.1, 4
# file_path, bandwidth = './data/asos/lincoln_NE/hr_avg.csv', 0.9
# file_path, bandwidth = './data/asos/des_moines_IA/hr_avg.csv', 1.3
# file_path, bandwidth = './data/asos/springfield_IL/hr_avg.csv', 1.1 
# file_path, bandwidth = './data/asos/topeka/hr_avg.csv', 0.7 # High 0
# file_path, bandwidth = './data/asos/denver/hr_avg.csv', 1.3

# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/NDAWN/baker/hr_avg.csv', 0.7, 4 
# file_path, bandwidth = './data/NDAWN/dickinson/hr_avg.csv', 0.6
# file_path = './data/NDAWN/rugby/hr_avg.csv'
# file_path = './data/NDAWN/bowman/hr_avg.csv'
# file_path = './data/NDAWN/grand_forks/hr_avg.csv'
# file_path = './data/NDAWN/williston/hr_avg.csv'
# file_path = './data/NDAWN/jamestown/hr_avg.csv'

# file_path, bandwidth, NUMBER_OF_GAUSSIAN = 'data/ECMWF/usa/47N123W/dat.csv', 0.7, 4 #good 
# file_path, bandwidth = 'data/ECMWF/venezuela/8N67W/dat.csv', 0.7 # good, but the data might be problematic.
# file_path, bandwidth = 'data/ECMWF/chile/52S75W/dat.csv', 1.9 # good
# file_path, bandwidth= 'data/ECMWF/iceland/65N17W/dat.csv', 1.9 # good
# file_path, bandwidth, NUMBER_OF_GAUSSIAN  = 'data/ECMWF/germany/49N9E/dat.csv', 1.1, 4 # miss peak
# file_path, bandwidth = 'data/ECMWF/sudan/18N32E/dat.csv', 1.1 # good
# file_path, bandwidth = 'data/ECMWF/china/24N121E/dat.csv', 0.9 # good
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = 'data/ECMWF/australia/37S142E/dat.csv', 0.7, 4 # miss the peak, force bandwidth 0.7
In [3]:
# Load the raw file and normalize it to a common schema:
# columns date, HrMn, type, dir, speed, wind_type, plus flags describing
# the source (integer_data, convert_to_knot, knot_unit, cartesian).
if "cn_database" in file_path: 
    df = read_cn_database(file_path)
elif 'NCDC' in file_path:
    # NCDC text export: keep HrMn as string so leading zeros survive.
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df.rename(columns={'Date':'date','Dir':'dir','Spd':'speed','Type':'type','I.1':'wind_type'}, inplace=True)
    df = df[['date','HrMn','type','dir','speed','wind_type' ]]
    df.dropna(subset=['dir','speed'], inplace=True)
    # NCDC speeds are recorded as integers (after unit rounding).
    integer_data = True
elif 'NDAWN' in file_path:
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df['type']='default'
    df['wind_type']='default'
    df = df.dropna()
    convert_to_knot = False
    integer_data = False
elif 'asos' in file_path:
    # ASOS
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df['type']='default'
    df['wind_type']='default'
    df = df.dropna()
    convert_to_knot = False
    integer_data = False
    knot_unit = True
else:
    # ECMWF reanalysis: U/V components relative to 1979-01-01 hourly steps.
    df = pd.read_csv(file_path, header=0, skipinitialspace=True)
    df.rename(columns={'U':'x','V':'y'}, inplace=True)
    # Negate so the vector points in the direction the wind blows toward
    # (meteorological -> mathematical convention) — TODO confirm intent.
    df.x=-df.x
    df.y=-df.y
    df['speed']=np.sqrt(df.x**2+df.y**2)
    df['dir']=np.degrees(np.arctan2(df.y, df.x))%360
    df['time']=pd.to_datetime('1979-01-01T00:00:00Z')+pd.to_timedelta(df['time'], unit='h')
    df['date']=df['time'].dt.strftime('%Y%m%d')
    df['date']=df['date'].astype(int)
    df['HrMn']=df['time'].dt.strftime('%H00')
    df['type']='default'
    df['wind_type']='default'
    convert_to_knot = True
    integer_data = False
    cartesian = True
In [4]:
df
Out[4]:
date HrMn type dir speed wind_type
0 19560820 0000 FM-12 340 2.1 N
1 19560820 0600 FM-12 360 2.1 N
2 19560820 0900 FM-12 360 2.1 N
3 19560820 1800 FM-12 999 0.0 C
4 19560820 2100 FM-12 999 0.0 C
5 19560821 0000 FM-12 140 4.1 N
6 19560821 0300 FM-12 160 2.1 N
7 19560821 0600 FM-12 180 2.1 N
8 19560821 0900 FM-12 160 6.2 N
9 19560821 1200 FM-12 200 5.1 N
10 19560821 1800 FM-12 180 5.1 N
11 19560821 2100 FM-12 140 5.1 N
12 19560822 0000 FM-12 160 7.2 N
13 19560822 0300 FM-12 180 7.2 N
14 19560822 0600 FM-12 200 7.2 N
15 19560822 1200 FM-12 290 7.2 N
16 19560822 1800 FM-12 340 4.1 N
17 19560823 0000 FM-12 340 5.1 N
18 19560823 0300 FM-12 340 4.1 N
19 19560823 0600 FM-12 360 4.1 N
20 19560823 0900 FM-12 320 2.1 N
21 19560823 1800 FM-12 320 2.1 N
22 19560823 2100 FM-12 250 2.1 N
23 19560824 0000 FM-12 320 2.1 N
24 19560824 0300 FM-12 999 0.0 C
25 19560824 0600 FM-12 140 3.1 N
26 19560824 0900 FM-12 180 2.1 N
27 19560824 1200 FM-12 999 0.0 C
28 19560824 1800 FM-12 999 0.0 C
29 19560824 2100 FM-12 110 1.0 N
... ... ... ... ... ... ...
222884 20160131 1800 FM-15 350 3.0 N
222885 20160131 1900 FM-15 340 2.0 N
222886 20160131 2000 FM-15 320 2.0 N
222887 20160131 2100 FM-12 70 3.0 N
222888 20160131 2200 FM-15 350 1.0 N
222889 20160131 2300 FM-15 330 2.0 N
222890 20160201 0000 FM-15 350 1.0 N
222891 20160201 0100 FM-15 999 0.0 C
222892 20160201 0200 FM-15 330 2.0 V
222893 20160201 0300 FM-15 320 3.0 N
222894 20160201 0400 FM-15 330 2.0 V
222895 20160201 0500 FM-15 340 4.0 V
222896 20160201 0600 FM-15 330 3.0 V
222897 20160201 0700 FM-15 330 2.0 V
222898 20160201 0800 FM-15 330 2.0 V
222899 20160201 0900 FM-15 310 2.0 N
222900 20160201 1000 FM-15 340 3.0 N
222901 20160201 1100 FM-15 330 2.0 N
222902 20160201 1200 FM-15 340 2.0 N
222903 20160201 1300 FM-15 350 3.0 N
222904 20160201 1400 FM-15 320 2.0 N
222905 20160201 1500 FM-15 300 2.0 N
222906 20160201 1600 FM-15 330 1.0 N
222907 20160201 1700 FM-15 290 2.0 N
222908 20160201 1800 FM-15 310 2.0 N
222909 20160201 1900 FM-15 280 2.0 N
222910 20160201 2000 FM-15 270 2.0 N
222911 20160201 2100 FM-15 230 2.0 N
222912 20160201 2200 FM-15 250 2.0 N
222913 20160201 2300 FM-15 260 2.0 N

222914 rows × 6 columns

In [5]:
# NCDC files embed the station coordinates; show the site on an OSM map.
if 'NCDC' in file_path:
    lat, long = get_lat_long(file_path)
    print(lat,long)
    map_osm = folium.Map(location=[lat, long], zoom_start=4)
    folium.Marker([lat, long]).add_to(map_osm)
    display(map_osm)
31.78 117.298
In [6]:
# Build a datetime index from date + HrMn, then apply sanity filters.
# dir == 999 (calm/variable marker) is deliberately kept — it is
# redistributed later by fill_direction_999.
df['time']=pd.to_datetime(df["date"].astype(str).map(str) + df["HrMn"], format='%Y%m%d%H%M')
df.set_index(['time'], inplace=True)
df['HrMn']=df['HrMn'].astype(int)
df = df.query("(dir <= 999) & (speed < 100) ")['1970':'2016']
In [7]:
# Quick-look marginal distributions of speed and direction.
plot_speed_and_angle_distribution(df.speed, df.dir)
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
In [8]:
# Map Dir [10,360] => [0,350] (360 and 0 are the same sector).
df['dir'] = df['dir'].apply(lambda x: x%360 if x < 999 else x) 
# Convert windrose coordinates (0 = North, clockwise) to polar
# coordinates (0 = East, counter-clockwise); 999 stays untouched.
if 'cartesian' in globals():
    df['dir_windrose'] = df['dir'].apply(lambda x: (90 - x)%360 if x < 999 else x)
else:
    df['dir_windrose'] = df['dir']
    df['dir'] = df['dir'].apply(lambda x: (90 - x)%360 if x < 999 else x)
display(df.describe())
df.plot(y='speed',legend=True,figsize=(20,5))
date HrMn dir speed dir_windrose
count 2.010630e+05 201063.000000 201063.000000 201063.000000 201063.000000
mean 2.000563e+07 1088.860506 232.890288 2.646362 221.360320
std 1.186954e+05 703.079666 259.049020 1.672618 255.875427
min 1.973010e+07 0.000000 0.000000 0.000000 0.000000
25% 1.992112e+07 500.000000 60.000000 2.000000 70.000000
50% 2.004071e+07 1000.000000 160.000000 2.000000 140.000000
75% 2.010042e+07 1800.000000 300.000000 4.000000 280.000000
max 2.016020e+07 2300.000000 999.000000 24.000000 999.000000
Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0xc7b45c0>

1.3 General Data Info

1.3.1 Unit Detection

In [9]:
# --- Unit detection -------------------------------------------------------
# Speeds originally measured in knots but stored converted to m/s retain
# many non-integer values; a large share of fractional parts therefore
# signals that the data should be converted back to knots.
df['decimal'] = df.speed % 1
df.decimal.hist(alpha=0.5, label='m/s', figsize=(4, 3))

if 'convert_to_knot' not in globals():
    # Heuristic: >30% of records with fractional part >= 0.2.
    convert_to_knot = len(df.query('decimal >= 0.2')) / len(df) > 0.3

if convert_to_knot:
    knot_unit = True
    df['speed'] = df['speed'] * 1.943845  # m/s -> knot
    df['decimal'] = df.speed % 1
    df.decimal.hist(alpha=0.5, label='knot')
    # need more elaboration, some is not near an integer
    if integer_data:
        df['speed'] = df['speed'].apply(lambda x: int(round(x)))
    plt_configure(xlabel='Decimal', ylabel='Frequency', legend={'loc': 'best'}, title='Decimal Distribution')
else:
    if 'knot_unit' not in globals():
        knot_unit = False

# Keyword form of drop: positional axis argument is deprecated in pandas.
df.drop(columns=['decimal'], inplace=True)
print(knot_unit)
False
In [10]:
# Axis-label suffixes used by all later plots.
dir_unit_text = ' (degree)'
if knot_unit:  # truthiness test instead of '== True'
    speed_unit_text = ' (knot)'
else: 
    speed_unit_text = ' (m/s)'

1.3.2 Sampling Type Selection

In [11]:
# Keep only the report type (e.g. FM-15) that dominates recent records so
# the sampling characteristics are homogeneous.
sample_type = df.query('date > 20000000')['type']
sample_type.value_counts().plot(
    kind = 'bar', title = 'Report Type Composition', figsize=(4,3))

# idxmax() returns the *label* of the most frequent type; Series.argmax()
# is positional in modern pandas and would return an integer here.
report_type_most_used = sample_type.value_counts().idxmax()
df = df.query("type==@report_type_most_used")

1.3.3 Sampling Time Selection

In [12]:
# Midpoint year of the record, used throughout for before/after splits.
MID_YEAR = int(np.average(df.index.year))

# Compare sampling-hour frequency overall vs. the later half of the record
# to spot changes in the observation schedule.
df['HrMn'].value_counts().sort_index().plot(kind='bar', alpha=0.5,label='Overall')
df[str(MID_YEAR):]['HrMn'].value_counts().sort_index().plot(
    kind='bar', alpha=0.5, label='>= %s' %  MID_YEAR )

plt_configure(xlabel='Sampling Time', ylabel='Frequency', legend={'loc':'best'}, figsize=(8, 4), 
              title = 'Sampling Time Distribution, Overall and > %s ' %  MID_YEAR)
In [13]:
# Keep only records taken at the most common minute-of-hour (sample_time),
# so the series has a uniform sampling offset.
df['sample_time'] = df.HrMn % 100 
sample_time = df['2000':]['sample_time']
# Minutes with more than 2000 recent records are considered regular.
sample_times = sample_time.value_counts()[sample_time.value_counts() > 2000]
sample_times = sample_times.index.tolist()
# df = df.query("sample_time in @sample_times")
df = df.query("sample_time == @sample_times[0]")
# Keyword form of drop: positional axis argument is deprecated in pandas.
df.drop(columns=['sample_time'], inplace=True)
print(sample_times)

df['HrMn'].value_counts().sort_index().plot(kind='bar', alpha=0.5, figsize=(10, 4))
[0]
Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0xce2d160>

1.4 Error Data handling and Adjustment

1.4.1 Artefacts

wrong direction record

In [14]:
# Integer datasets should report directions on a 10-degree grid; anything
# off-grid (other than the 999 calm marker) is an artefact and is dropped.
if integer_data:
    display(df.query("(dir % 10 >= 0.1) & (dir != 999)"))
    df = df.query('(dir % 10 <= 0.1) | (dir == 999)')
date HrMn type dir speed wind_type dir_windrose
time

sudden increase in speed

In [15]:
# Sudden increase detection: first differences of speed, forward and
# backward, to find isolated spikes (fixed later in the next cell).
df['incre'] = df.speed.diff(1)
df['incre'].fillna(0, inplace=True)
df['incre_reverse'] = df.speed.diff(-1)
df['incre_reverse'].fillna(0, inplace=True)

# Inspect the largest speeds and the distribution of increments.
display(df.sort_values(by='speed',ascending=False).head(10))
df['incre'].plot(kind='hist', bins=arange(-15, 15), legend=True, figsize=(8, 3))
date HrMn type dir speed wind_type dir_windrose incre incre_reverse
time
2002-03-20 10:00:00 20020320 1000 FM-15 120 14.0 N 330 10.0 4.0
2012-04-02 10:00:00 20120402 1000 FM-15 140 13.0 N 310 10.0 4.0
2005-12-21 04:00:00 20051221 400 FM-15 140 13.0 N 310 2.0 2.0
2010-03-19 23:00:00 20100319 2300 FM-15 120 12.0 N 330 10.0 7.0
2004-03-16 23:00:00 20040316 2300 FM-15 50 12.0 N 40 1.0 7.0
2001-11-13 05:00:00 20011113 500 FM-15 140 12.0 N 310 0.0 1.0
2015-05-10 22:00:00 20150510 2200 FM-15 150 12.0 N 300 2.0 2.0
2002-04-15 17:00:00 20020415 1700 FM-15 140 12.0 N 310 9.0 5.0
2001-11-13 04:00:00 20011113 400 FM-15 140 12.0 N 310 6.0 0.0
2009-06-05 13:00:00 20090605 1300 FM-15 50 12.0 N 40 9.0 7.0
Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0xd46ff60>
In [16]:
# Drop isolated spikes: records whose speed jumps sharply both from the
# previous and to the next sample (threshold depends on the unit).
incre_threshold = 20 if knot_unit else 10
print('sudden increase number', len(df.query('(incre > @incre_threshold )&(incre_reverse > @incre_threshold )')))
df = df.query('(incre < @incre_threshold )|(incre_reverse < @incre_threshold )')

# Check the max speed
display(df.sort_values(by='speed',ascending=False).head(10))
# Keyword form of drop: positional axis argument is deprecated in pandas.
df.drop(columns=['incre', 'incre_reverse'], inplace=True)
sudden increase number 0
date HrMn type dir speed wind_type dir_windrose incre incre_reverse
time
2002-03-20 10:00:00 20020320 1000 FM-15 120 14.0 N 330 10.0 4.0
2005-12-21 04:00:00 20051221 400 FM-15 140 13.0 N 310 2.0 2.0
2012-04-02 10:00:00 20120402 1000 FM-15 140 13.0 N 310 10.0 4.0
2009-06-05 13:00:00 20090605 1300 FM-15 50 12.0 N 40 9.0 7.0
2010-03-19 23:00:00 20100319 2300 FM-15 120 12.0 N 330 10.0 7.0
2004-03-16 23:00:00 20040316 2300 FM-15 50 12.0 N 40 1.0 7.0
2002-04-15 17:00:00 20020415 1700 FM-15 140 12.0 N 310 9.0 5.0
2015-05-10 22:00:00 20150510 2200 FM-15 150 12.0 N 300 2.0 2.0
2001-11-13 04:00:00 20011113 400 FM-15 140 12.0 N 310 6.0 0.0
2001-11-13 05:00:00 20011113 500 FM-15 140 12.0 N 310 0.0 1.0

1.4.2 Direction re-aligment

For some datasets, the 16 sectors are not recorded properly,

e.g. the sectors are [0,20,50 ...], need to redistribute the angle into 22.5, e.g. [0, 22.5, 45...]

In [17]:
# Estimate the direction sector width from the number of well-populated
# reported directions (e.g. 36 sectors -> 10 degrees, 16 -> 22.5).
display(df['dir'].value_counts().sort_index())
# NOTE(review): the >30 mask comes from the unfiltered value_counts while
# the indexed counts exclude 999 — this relies on pandas index alignment;
# verify it keeps exactly the well-populated non-999 sectors.
effective_column = df.query('dir < 999')['dir'].value_counts()[df['dir'].value_counts() > 30].sort_index()
if integer_data:
    SECTOR_LENGTH = 360/len(effective_column) 
else: 
    SECTOR_LENGTH = 10
print(len(effective_column), SECTOR_LENGTH)
0      4049
10     4013
20     3812
30     4258
40     3997
50     3787
60     3138
70     2750
80     2232
90     2104
100    1627
110    1383
120    1603
130    2040
140    2204
150    2560
160    2479
170    2118
180    1603
190     996
200     772
210     788
220     797
230     915
240     956
250    1226
260    1622
270    1728
280    2201
290    3376
300    4231
310    3698
320    2899
330    2766
340    2958
350    3468
999    6527
Name: dir, dtype: int64
36 10.0
In [18]:
# Snap irregular sector centers (e.g. [0,20,50,...]) onto a regular grid.
df=realign_direction(df, effective_column)

1.4.3 0 Speed

In [19]:
# If calm (zero-speed) records are over-represented in recent data, drop
# them entirely; otherwise they are redistributed later.
with_too_many_zero, null_wind_frequency = is_with_too_many_zero(df['2005':])
delete_zero = with_too_many_zero
if delete_zero:
    df = df.query('(speed > 0)')
print(delete_zero, null_wind_frequency)
False 0.0486580061806
In [20]:
# Show how many records carry the 999 calm/variable marker per speed,
# then redistribute those records across direction sectors.
print(df.query('dir == 999')['speed'].value_counts())
df=fill_direction_999(df, SECTOR_LENGTH)
0.0    4826
1.0    1251
2.0     368
3.0      77
4.0       5
Name: speed, dtype: int64

1.5 Time Shift Comparison

In [21]:
# Direction histogram binning scheme: 'even' centers bins on the reported
# values (-5..355), 'round_up' starts bins at 0.
DIR_REDISTRIBUTE = 'even'
if DIR_REDISTRIBUTE == 'even':
    DIR_BIN = arange(-5, 360, 10) 
elif DIR_REDISTRIBUTE == 'round_up':
    DIR_BIN = arange(0, 360+10, 10) 

# Comparison between mid_year, looking for: 
# 1. Odd Even Bias
# 2. Time Shift of Wind Speed Distribution
bins = arange(0, df.speed.max() + 1)
df[:str(MID_YEAR)]['speed'].plot(
    kind='hist', alpha=0.5,bins=bins, label='< %s' % MID_YEAR)

df[str(MID_YEAR+1):]['speed'].plot(
    kind='hist', alpha=0.5,bins=bins, label='> %s' % MID_YEAR)

plt.suptitle('Speed Comparison between year < %s, > %s ' % (MID_YEAR, MID_YEAR), fontsize = 14)
plt_configure(xlabel='Speed', ylabel='Frequency', legend=True, figsize=(8, 3))
In [22]:
# Same before/after-midyear comparison, for the direction distribution.
df[:str(MID_YEAR)]['dir'].plot(
    kind='hist', alpha=0.5,bins=DIR_BIN, label='< %s' % MID_YEAR)

df[str(MID_YEAR+1):]['dir'].plot(
    kind='hist', alpha=0.5,bins=DIR_BIN, label='> %s' % MID_YEAR)

plt.suptitle('Dir Comparison between year < %s, and > %s ' % (MID_YEAR, MID_YEAR), fontsize = 14)
plt_configure(xlabel='Dir', ylabel='Frequency', legend={'loc':'best'}, figsize=(8, 3), tight='x')
In [23]:
# Show and drop any rows whose direction is still missing at this point.
display(df[df['dir'].isnull()])
df.dropna(subset=['dir'], inplace=True)
date HrMn type dir speed wind_type dir_windrose
time
In [24]:
# Inspect the time shift of speed and degree distribution, and odd-even bias
check_time_shift(df, speed_unit_text=speed_unit_text, dir_unit_text=dir_unit_text)
2001 - 2005
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
2006 - 2010
2011 - 2015
In [25]:
# Annual and monthly mean speed, to eyeball long-term trends/seasonality.
df.resample('A').mean().plot(y='speed')
plt.gca().set_ylim(bottom=0)
df.resample('M').mean().plot(y='speed', figsize=(20,4))
plt.gca().set_ylim(bottom=0)
Out[25]:
(0, 4.5)
In [26]:
%%time
# For each year with enough data, overlay its speed/direction histogram on
# the all-years histogram to spot distribution drift.
for column in ['speed', 'dir']:
    if column == 'speed':
        bins = arange(0, df[column].max()+1, 1)
    else:
        bins = arange(0, 361, 10)
    den, _ = np.histogram(df[column], bins=bins, density=True)
    # Common y-limit so the small per-year panels are comparable.
    y_top=max(den)*1.2
    for year in arange(1980, 2016):
        end_year = year
        sub_df = df[str(year):str(end_year)]
        if len(sub_df) > 1000:
            plt.figure()
            df[column].hist(bins=bins, alpha=0.3, normed=True)
            sub_df[column].hist(bins=bins, alpha=0.5, figsize=(3,1.5), normed=True)
            plt.gca().set_ylim(top=y_top)
            plt_configure(title=str(year))
    align_figures()
Wall time: 7.82 s
In [27]:
# Quantify year-to-year distribution drift: R-square between the all-years
# density and a 3-year rolling density, for both speed and direction.
for column in ['speed', 'dir']:
    if column == 'speed':
        bins = arange(0, df[column].max()+1, 1)
    else:
        bins = arange(0, 361, 10)
    density_all, _ = np.histogram(df[column], bins=bins, density=True)
    df[column].hist(bins=bins, figsize=(5,3))

    R_squares = []
    years = []
    for year in arange(1980, 2016):
        # 3-year window centered on `year`, only if it has enough data.
        start_year, end_year = year-1, year+1
        sub_df = df[str(start_year):str(end_year)]
        if len(sub_df) > 1000:
            density, _ = np.histogram(sub_df[column], bins=bins, density=True)
            y_mean = np.mean(density_all)
            SS_tot = np.sum(np.power(density_all - y_mean, 2))
            SS_res = np.sum(np.power(density_all - density, 2))

            R_square = 1 - SS_res / SS_tot
            R_squares.append(R_square)
            years.append(year)

    plt.figure()
    plot(years, R_squares)
    # Clamp the lower y-limit into [0, 0.85] so the trend stays readable.
    ylim = max(min(plt.gca().get_ylim()[0],0.85),0)
    plt.gca().set_ylim(bottom=ylim, top=1)
    plt_configure(figsize=(5,3))
    align_figures()

1.6 Re-distribute Direction and Speed (Optional)

e.g. Dir 50 -> 45 ~ 55, to make the KDE result better

In [28]:
# Spread integer direction sectors uniformly over their width so the KDE
# is not dominated by discrete spikes.
if integer_data:
    df = randomize_angle(df, DIR_REDISTRIBUTE, SECTOR_LENGTH)
In [29]:
# Likewise redistribute integer speeds over their rounding interval;
# direction of the shift depends on whether zero speeds were deleted.
if integer_data:
    if delete_zero:
        redistribute_method = 'down'
    else:
        redistribute_method = 'up'

    df, speed_redistribution_info = randomize_speed(df, redistribute_method)
Redistribute upward, e.g. 0 -> [0,1]

1.7 Generate (x,y) from (speed,dir)

In [30]:
# Cook orientation
# df['dir']= (df['dir'] + 180)%360
In [31]:
# Project (speed, dir) onto Cartesian wind components.
# A small dot can appear at the centre when many near-zero speeds exist;
# also note matplotlib scatter plots of this size render slowly.
dir_in_radians = df['dir'] * pi / 180.0
df['x'] = df['speed'] * cos(dir_in_radians)
df['y'] = df['speed'] * sin(dir_in_radians)

2. Re-select Data and Overview

2.1 Data Overview

In [32]:
## Summary of the data selection
print('Knot unit?', knot_unit)
print('Report type used:', report_type_most_used)
print('Sampling time used:', sample_times)
if 'speed_redistribution_info' in globals():
    print('Speed redistribution info:', speed_redistribution_info )

# Keep the full record for later cross-year comparison, but fit the model
# on the most recent five complete years only.
df_all_years = df # for later across-year comparison
df = df_all_years['2011':'2015']
# df = df.query('(HrMn == 0) and (speed >= 0.5) and (date%10000 > 900) and (date%10000 < 1000)' )
df.describe()
Knot unit? False
Report type used: FM-15
Sampling time used: [0]
Speed redistribution info: Redistribute upward, e.g. 0 -> [0,1]
Out[32]:
date HrMn dir speed dir_windrose x y
count 3.555300e+04 35553.000000 35553.000000 35553.000000 35553.000000 35553.000000 35553.000000
mean 2.013290e+07 1163.080471 164.685973 3.384006 215.826400 0.898205 0.184083
std 1.384736e+04 682.019610 119.491613 1.599733 252.099519 2.612942 2.518494
min 2.011010e+07 0.000000 -4.995724 0.001138 0.000000 -10.835943 -9.859464
25% 2.012072e+07 600.000000 47.863324 2.278931 70.000000 -0.803215 -1.526956
50% 2.013111e+07 1200.000000 148.654352 3.219337 130.000000 1.326992 0.292626
75% 2.014121e+07 1700.000000 289.450501 4.317136 270.000000 2.691972 1.900851
max 2.015123e+07 2300.000000 355.000000 13.115997 999.000000 11.685250 10.331285
In [33]:
# Time series of the selected five-year speed record.
df.plot(y='speed',legend=True,figsize=(20,5))
Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0xdc41208>
In [34]:
# Accumulation by month
df.resample('M').count().plot(y='date', kind='bar',figsize=(20,4))
Out[34]:
<matplotlib.axes._subplots.AxesSubplot at 0x185ce780>
In [35]:
# Windrose of the selected data; note windrose coordinates (90 degree is east).
ax = WindroseAxes.from_ax()
viridis = plt.get_cmap('viridis')
ax.bar(df.dir_windrose, df.speed, normed=True, opening=0.8, edgecolor='white', nsector=36, cmap=viridis)
ax.set_legend()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)
In [36]:
# For very large datasets, downsample (with replacement) to keep the KDE
# fit tractable; overlay the direction histograms before/after resampling
# to confirm the distribution is preserved.
if len(df) > 1000000:
    bins=arange(0,362)
    df['dir'].hist(bins=bins, normed=True,alpha=0.5,label='min')
    
    df = df_all_years.sample(n=500000, replace=True)    
    # Label typo fixed: 'resmapled' -> 'resampled'.
    df['dir'].hist(bins=bins, normed=True,alpha=0.5,label='min resampled')
    plt_configure(legend=True, figsize=(20,4))
In [37]:
# Fit a Weibull distribution and the empirical CDF to the speed data.
x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(df.speed)
# Work on arrays so we can mask invalid points below.
x, y_ecdf, y_cdf_weibull = np.asarray(x), np.asarray(y_ecdf), np.asarray(y_cdf_weibull)

# 1. Histogram comparison
fig = plt.figure()
df['speed'].hist(bins=arange(0, df.speed.max()), alpha=0.5, label='Data', normed=True)             
plot(x, y_weibull, '-', color='black',label='Weibull')   
plt_configure(figsize=(4,3),xlabel='V',ylabel='PDF', legend=True)

# 2. CDF comparison on Weibull axes: ln(-ln(1-P)) vs ln(V) is linear iff
# the data is Weibull. Mask x == 0 and P == 1, which previously produced
# "divide by zero encountered in log" warnings.
fig = plt.figure()
valid = (x > 0) & (y_ecdf < 1) & (y_cdf_weibull < 1)
plot(log(x[valid]), log(-log(1-y_ecdf[valid])),'o', label='ECDF')
plot(log(x[valid]), log(-log(1-y_cdf_weibull[valid])),'-', label='Weibull')
plt_configure(xlabel="ln(V)", ylabel="ln(-ln(1-P))",legend={'loc':'best'}, figsize=(4,3))
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:11: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:12: RuntimeWarning: divide by zero encountered in log
In [38]:
# Scatter of the Cartesian components; equal aspect so directions are true.
df.plot(kind='scatter', x='x', y='y', alpha=0.05, s=2)
plt.gca().set_aspect('equal')
plt_configure(figsize=(3.2,3.2),xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)

2.2 Overview by Direction

In [39]:
# Per-direction histogram bin width: 22.5 degrees for 16-sector data,
# otherwise 10 degrees.
rebinned_angle = 22.5 if len(effective_column) == 16 else 10
In [40]:
%%time
# One speed histogram per direction sector, all drawn on identical axes
# (same speed range and same max count) so sectors are comparable.
original_incre, incre = SECTOR_LENGTH, rebinned_angle
start, end = -original_incre/2 + incre/2, 360

max_speed = df.speed.max()
max_count = max_count_for_angles(df, start, end, incre)
plot_range = [0, max_speed, 0, max_count*1.05]

for angle in arange(start, end, incre):
    start_angle, end_angle = angle-incre/2, angle+incre/2
    sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)   
    
    fig = plt.figure()
    sub_df['speed'].hist(bins=arange(0, max_speed), alpha=0.5, label='Data')
    title ='%s (%s - %s), %s' % (angle, start_angle, end_angle, len(sub_df)) 
    plt.axis(plot_range)
    plt_configure(figsize=(3,1.5), title=title)
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
Wall time: 6.11 s

2.3 Overview by Month

In [41]:
%%time
# Monthly windroses (speeds >= 1 only) to reveal seasonal direction shifts.
current_df = df.query('speed>=1')
for month in arange(1, 13): 
    sub_df = current_df[current_df.index.month == month]
    ax = WindroseAxes.from_ax()
    ax.bar(sub_df.dir_windrose, sub_df.speed, normed=True, opening=0.8, edgecolor='white', nsector=36, cmap=plt.get_cmap('viridis'))
    plt_configure(figsize=(3,3), title='Month: %s'%(month))
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)
Wall time: 18.8 s
In [42]:
# Summary statistics of the final selected data.
df.describe()
Out[42]:
date HrMn dir speed dir_windrose x y
count 3.555300e+04 35553.000000 35553.000000 35553.000000 35553.000000 35553.000000 35553.000000
mean 2.013290e+07 1163.080471 164.685973 3.384006 215.826400 0.898205 0.184083
std 1.384736e+04 682.019610 119.491613 1.599733 252.099519 2.612942 2.518494
min 2.011010e+07 0.000000 -4.995724 0.001138 0.000000 -10.835943 -9.859464
25% 2.012072e+07 600.000000 47.863324 2.278931 70.000000 -0.803215 -1.526956
50% 2.013111e+07 1200.000000 148.654352 3.219337 130.000000 1.326992 0.292626
75% 2.014121e+07 1700.000000 289.450501 4.317136 270.000000 2.691972 1.900851
max 2.015123e+07 2300.000000 355.000000 13.115997 999.000000 11.685250 10.331285

3. Create input data and configuration

In [43]:
# Model input: N x 2 array of Cartesian wind vectors, plus fit settings.
SPEED_SET = array(list(zip(df.x, df.y)))
if 'NUMBER_OF_GAUSSIAN' not in globals():
    NUMBER_OF_GAUSSIAN = 3
FIT_METHOD = 'square_error'
# NOTE(review): constant name is misspelled ('BANDWDITH') but referenced
# with the same spelling in later cells — renaming must be done in all
# cells at once.
DEFAULT_BANDWDITH = 1.5 if knot_unit else 0.7
fig_list = []
In [44]:
# Square lattice of (x, y) points on which the joint PDF is evaluated;
# limited to the 95th speed percentile to avoid the sparse tails.
fit_limit = ceil(df['speed'].quantile(.95))
fitting_axis_range = arange(-fit_limit, fit_limit+1, 1)
print(fitting_axis_range)

FITTING_RANGE = [[i, j] for i in fitting_axis_range for j in fitting_axis_range]
[-7 -6 -5 -4 -3 -2 -1  0  1  2  3  4  5  6  7]
In [45]:
# Axis range used for all later 2D/3D density plots (95th percentile).
plot_limit = ceil(df['speed'].quantile(.95))
PLOT_AXIS_RANGE = arange(-plot_limit, plot_limit+1, 1)

4. Kernel Density Estimation

In [46]:
# KDE input sample and kernel choice.
sample = SPEED_SET
KDE_KERNEL = 'gaussian'
# KDE_KERNEL, bandwidth = 'tophat', 1
In [47]:
%%time
# Cross-validated bandwidth search, run only when no bandwidth was given
# with the dataset selection above.
if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWDITH
    # NOTE(review): sklearn.grid_search is the legacy module (removed in
    # sklearn >= 0.20); migrate to model_selection when feasible.
    from sklearn.grid_search import GridSearchCV
    # from sklearn.model_selection import GridSearchCV  ## too slow

    # The bandwidth value sometimes would be too radical
    if knot_unit:
        bandwidth_range = arange(0.7,2,0.2)
    else:
        bandwidth_range = arange(0.4,1.1,0.1)

    # Grid search is unable to deal with too many data (a long time is needed)
    if len(sample) > 50000:    
        df_resample=df.sample(n=50000, replace=True)
        bandwidth_search_sample = array(list(zip(df_resample.x, df_resample.y)))
    else:
        bandwidth_search_sample = sample

    grid = GridSearchCV(neighbors.KernelDensity(kernel = KDE_KERNEL),
                    {'bandwidth': bandwidth_range}, n_jobs=-1, cv=4) 

    grid.fit(bandwidth_search_sample)
    bandwidth = grid.best_params_['bandwidth']
    
print(bandwidth)
0.6
Wall time: 1e+03 µs
In [48]:
# Guard for standalone execution: fall back to the default bandwidth
if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWDITH

# Fit the KDE reference on the full (x, y) sample
kde = neighbors.KernelDensity(bandwidth=bandwidth, kernel = KDE_KERNEL).fit(sample)

points = FITTING_RANGE
# very slow if the dataset is too large, e.g. 100,000
# kde returns log prob, need to convert it
kde_result = exp(kde.score_samples(points))
print('bandwidth:', bandwidth, len(kde_result))
print(kde_result[:5])
bandwidth: 0.6 225
[  4.61420681e-06   8.14755472e-06   2.05762334e-05   2.45678886e-05
   2.96657063e-05]
In [49]:
# Plot jPDF
X = Y = PLOT_AXIS_RANGE
# Can't work if pass as generate_Z_from_X_Y(X,Y, exp(kde.score_samples())), need to use lambda
# see http://stackoverflow.com/questions/21035437/passing-a-function-as-an-argument-in-python
kde_Z = generate_Z_from_X_Y(X,Y, lambda coords: exp(kde.score_samples(coords)))
# Shared colour scale so the later GMM density plots are visually comparable with the KDE
colorbar_lim = 0, kde_Z.max()

plot_3d_prob_density(X,Y,kde_Z)

fig_kde,ax1 = plt.subplots(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, ax=ax1)

# Raw 2-D histogram of the observations, a model-free visual check of the KDE surface
with sns.axes_style({'axes.grid' : False}):
    from matplotlib import ticker
    fig_hist,ax2 = plt.subplots(figsize=(3.5,2.5))
    _,_,_,image = ax2.hist2d(df.x, df.y, bins=PLOT_AXIS_RANGE, cmap='viridis',)
    ax2.set_aspect('equal')
    cb = plt.colorbar(image)
    tick_locator = ticker.MaxNLocator(nbins=6)
    cb.locator = tick_locator
    cb.update_ticks()
    plt_configure(ax=ax2, xlabel='x'+speed_unit_text,ylabel='y'+speed_unit_text)
align_figures()

4.1 Bootstrap GOF limit

In [50]:
# CDF of the KDE reference, plus the configuration handed to the bootstrap workers
kde_cdf = cdf_from_pdf(kde_result)
config = dict(bandwidth=bandwidth,
              fitting_range=FITTING_RANGE,
              fit_limit=fit_limit,
              kde_kernel=KDE_KERNEL)
In [51]:
%%time
gof_kde=Parallel(n_jobs=-1)(delayed(resampled_kde)(df, kde_result, config) 
                                       for i in arange(20)) 
Wall time: 8.36 s
In [52]:
# Spread of each GOF statistic across the bootstrap resamples
for gof_name in [ 'R_square', 'K_S','Chi_square']:
    plt.figure(figsize=(4,3))
    pd.DataFrame(gof_kde)[gof_name].hist()
    plt_configure(title=gof_name)
align_figures()

4.2 Bivariate Empirical Limit

In [53]:
%%time
# Bivariate empirical limit: how well earlier `year_length`-year windows of the record
# reproduce the KDE fitted on the most recent `year_length` years.
gofs_mean_set_bivar = []
fig1, ax1 = plt.subplots(figsize=(4,3))
fig2, ax2 = plt.subplots(figsize=(4,3))

for year_length in [5, 10]:
    start_year, end_year = df_all_years.index.year[0], 2015-year_length+1
    df_standard = df_all_years[str(2015-year_length+1):'2015']
    speed_ = array(list(zip(df_standard.x, df_standard.y)))
    kde_current = neighbors.KernelDensity(bandwidth=bandwidth, kernel=KDE_KERNEL).fit(speed_)
    kde_result_standard = exp(kde_current.score_samples(points))
    # NOTE(review): the generator expression's `start_year` shadows the outer variable of
    # the same name; the outer value is only consumed as the arange() start below.
    gofs_bivar=Parallel(n_jobs=-1)(delayed(kde_gofs)(df_all_years, start_year, start_year+year_length-1, kde_result_standard, config) 
                                   for start_year in arange(start_year, end_year+1)) 
    gofs_bivar=pd.DataFrame(gofs_bivar, index=arange(start_year, end_year+1))

    gofs_bivar.plot(y='R_square', ax=ax1, label=year_length)
    gofs_bivar.plot(y='K_S', ax=ax2, label=year_length)
    # Average the GOFs over a 10-year span well before the reference window
    year_lim = end_year-year_length-10, end_year-year_length
    gofs_mean = gofs_bivar.query('index >= @year_lim[0] & index <= @year_lim[1]').mean().to_dict()
    gofs_mean['year_lim']=year_lim
    gofs_mean_set_bivar.append(gofs_mean)
    
align_figures()
display(pd.DataFrame(gofs_mean_set_bivar).set_index('year_lim'))
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
year_lim
(1996, 2006) 7.310979 0.110309 0.000009 0.10308 0.661053 0.75971
(1986, 1996) NaN NaN NaN NaN NaN NaN
Wall time: 15.1 s

4.3 Univariate GOF Limit

In [54]:
def yearly_gof(df_all_years, start_year, end_year, density, y_ecdf, x, density_dir):
    """Goodness-of-fit of one historical window against the reference distributions.

    density / y_ecdf / density_dir are the reference speed PDF, speed ECDF and
    direction PDF; x is the shared speed bin grid. Returns a dict with the window's
    start year, speed R^2, speed K-S distance and direction R^2.
    """
    window = df_all_years[str(start_year):str(end_year)]

    # 1. Speed: R^2 of the binned PDF, K-S distance of the ECDF
    pdf_window, _ = np.histogram(window['speed'], bins=x, density=True)
    r_square = sector_r_square(density, pdf_window)

    ecdf_window = sm.distributions.ECDF(window['speed'])(x)
    k_s = np.abs(y_ecdf - ecdf_window).max()

    # 2. Direction: R^2 of the 10-degree sector histogram
    dir_pdf_window, _ = dir_hist(window['dir'], bins=arange(-5,370,10), density=True)
    r_square_dir = sector_r_square(density_dir, dir_pdf_window)
    return {'year': start_year, 'r_square': r_square, 'k_s': k_s, 'r_square_dir': r_square_dir}
In [55]:
%%time
# Univariate empirical limit: same windowed comparison as above but for the
# marginal speed and direction distributions separately.
x = arange(0, df.speed.max() + 1)
fig = plt.figure(figsize=(9,2.5))
ax1 = fig.add_subplot(1,3,1)
ax2 = fig.add_subplot(1,3,2)
ax3 = fig.add_subplot(1,3,3)
gofs_mean_set = []

for year_length in [5, 7, 10]:
    start_year, end_year =df_all_years.index.year[0], 2015-year_length+1
    df_standard = df_all_years[str(2015-year_length+1):str(2015)]
    # Reference distributions built from the most recent `year_length` years
    density, _ = np.histogram(df_standard['speed'], bins=x, density=True)
    density_dir, _ = dir_hist(df_standard['dir'], bins=arange(-5,370,10), density=True)
    y_ecdf = sm.distributions.ECDF(df_standard.speed)(x)
    # NOTE(review): the comprehension's `start_year` shadows the outer variable;
    # the outer value only seeds arange().
    gofs = [yearly_gof(df_all_years, start_year, start_year+year_length-1, density, y_ecdf, x, density_dir) 
            for start_year in arange(start_year, end_year+1)]

    gofs = pd.DataFrame(gofs)
    gofs.set_index(['year'], inplace=True)    
    if len(gofs)>0:
        ax1.plot(gofs.r_square, label=year_length)
        ax2.plot(gofs.k_s, label=year_length)
        ax3.plot(gofs.r_square_dir, label=year_length)
    # Average over a 10-year span well before the reference window
    year_lim = end_year-year_length-10, end_year-year_length
    gofs_mean = gofs.query('index >= @year_lim[0] & index <= @year_lim[1]').mean().to_dict()
    gofs_mean['year_lim']=year_lim
    gofs_mean_set.append(gofs_mean)
plt.tight_layout()
plt.legend()
for ax in [ax1, ax2, ax3]:
    plt_configure(ax=ax, tight='xtight')
    
display(pd.DataFrame(gofs_mean_set).set_index('year_lim'))
k_s r_square r_square_dir
year_lim
(1996, 2006) 0.173635 0.815007 0.762900
(1992, 2002) 0.150627 0.862079 0.835821
(1986, 1996) NaN NaN NaN
Wall time: 542 ms

5. GMM by Expectation-maximization

In [56]:
sample=SPEED_SET
# Fix the RNG seed so the EM initialisation — and hence the fitted components and every
# downstream table/figure — is reproducible under Restart & Run All.
clf = mixture.GaussianMixture(n_components=NUMBER_OF_GAUSSIAN, covariance_type='full',
                              random_state=0)
clf.fit(sample)
print(clf.converged_)
True
In [57]:
# Flatten the sklearn estimator into the project's GMM parameter array and display it
gmm_em_result = read_gmm_em_result(clf)
pretty_print_gmm(gmm_em_result)
Out[57]:
weight mean_x mean_y sig_x sig_y corr
1 0.423 2.452 1.081 1.695 1.886 -0.102
2 0.292 -1.846 1.365 2.149 2.122 -0.151
3 0.285 1.405 -2.360 1.843 1.808 0.166
In [58]:
# Component ellipses of the EM fit (weight, centre, axes, orientation printed below)
fig,ax = plt.subplots(figsize=(3.5,3.5))
plot_gmm_ellipses(gmm_em_result, ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
GMM Plot Result
0.42313200873 [[ 2.45209846  1.0806189 ]] [ 1.65609284  1.92027299] -158.207730875
0.292151652862 [[-1.84625579  1.36538077]] [ 1.96750975  2.29186983] -132.603710636
0.284716338409 [[ 1.40501428 -2.36045433]] [ 1.6654679   1.97209802] -48.2920089045
In [59]:
X = Y = PLOT_AXIS_RANGE
pdf_Z = generate_Z_from_X_Y(X,Y, lambda coords: exp(clf.score_samples(coords)))

# Renamed from the misspelled `residule_between_kde_and_gmm` (cell-local helper)
def residual_between_kde_and_gmm(points):
    """Pointwise difference between the KDE density and the EM-fitted GMM density."""
    kde_vals = exp(kde.score_samples(points))
    gmm_vals = exp(clf.score_samples(points))
    return kde_vals - gmm_vals 

residual_Z = generate_Z_from_X_Y(X,Y, residual_between_kde_and_gmm)

plot_3d_prob_density(X,Y,pdf_Z)
plot_3d_prob_density(X,Y,residual_Z)
align_figures()

# 2-D maps share colorbar_lim with the KDE plot so densities are directly comparable
fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar_lim=colorbar_lim)
fig_em = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,pdf_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar_lim=colorbar_lim)
fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,residual_Z,
                     xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
align_figures()

Goodness-of-fit Statistics

In [60]:
# GOF of the EM-fitted GMM against the KDE reference on the fitting grid
points = FITTING_RANGE
gmm_pdf_result = exp(clf.score_samples(points))
gof_df(gmm_pdf_result, kde_result)
Out[60]:
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.983 0.015 0.028 6.049927e-07 0.028 0.177
In [61]:
# Group the flat EM parameter array per component and build a callable joint PDF
gmm_em = group_gmm_param_from_gmm_param_array(gmm_em_result, sort_group = True)
mixed_model_pdf_em = generate_gmm_pdf_from_grouped_gmm_param(gmm_em)

6. GMM by Optimization

In [62]:
# Inputs for the optimisation-based GMM fit
sample = SPEED_SET
points = FITTING_RANGE
max_speed = df.speed.max()
print(FIT_METHOD)
square_error
In [63]:
# Seed the optimiser with the EM solution.
# GMM format per component: weight, mean_x, mean_y, sig_x, sig_y, rho
x0 = gmm_em_result

cons = [
        # Mixture weights (every 6th element) must sum to 1
        {'type': 'eq', 'fun': lambda x: sum(x[::6]) - 1},
        # Optional: limit the width/height ratio of each ellipse
#         {'type': 'ineq', 'fun': lambda x: width_height_ratios_set(x) - 1/3},
#         {'type': 'ineq', 'fun': lambda x: 3 - width_height_ratios_set(x)},
]

# Box bounds for each component's six parameters, repeated per component
param_bounds = [(0., 0.99), (-fit_limit, fit_limit), (-fit_limit, fit_limit),
                (0., fit_limit), (0., fit_limit), (-0.99, 0.99)] * (len(x0)//6)

result = sp.optimize.minimize(
    lambda params: GMM_fit_score(params, kde_result, points, FIT_METHOD),
    x0,
    bounds=param_bounds,
    constraints=cons,
    tol=1e-12,
    options={"maxiter": 500})
result
Out[63]:
     fun: -15.123214499401938
     jac: array([  5.78891158e+00,  -9.53674316e-07,  -8.34465027e-07,
         2.38418579e-07,   8.34465027e-07,  -4.76837158e-07,
         5.78891313e+00,   5.96046448e-07,   1.19209290e-07,
        -2.38418579e-07,   3.57627869e-07,  -2.38418579e-07,
         5.78891802e+00,   5.96046448e-07,  -2.38418579e-07,
         0.00000000e+00,  -3.57627869e-07,  -8.34465027e-07,
         0.00000000e+00])
 message: 'Optimization terminated successfully.'
    nfev: 746
     nit: 36
    njev: 36
  status: 0
 success: True
       x: array([ 0.46620869,  2.48144624,  0.86962119,  1.62332904,  1.94926289,
       -0.27896065,  0.41449775, -1.36809478,  0.25300579,  2.18691773,
        2.77651251, -0.41166618,  0.11929356,  1.78580662, -2.7938226 ,
        1.16414179,  1.44787784,  0.17880762])

6.1 GMM Result

In [64]:
# Group the optimiser's solution vector per component and evaluate its joint PDF
gmm = group_gmm_param_from_gmm_param_array(result.x, sort_group = True)
mixed_model_pdf = generate_gmm_pdf_from_grouped_gmm_param(gmm)
gmm_pdf_result = mixed_model_pdf(points)
pretty_print_gmm(gmm)
Out[64]:
weight mean_x mean_y sig_x sig_y corr
1 0.466 2.481 0.870 1.623 1.949 -0.279
2 0.414 -1.368 0.253 2.187 2.777 -0.412
3 0.119 1.786 -2.794 1.164 1.448 0.179
In [65]:
# Component ellipses of the optimised GMM (compare with the EM ellipses above)
fig_gmm, ax = plt.subplots(figsize=(3.5,3.5))
plot_gmm_ellipses(gmm, ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
GMM Plot Result
0.466208693878 [[ 2.48144624  0.86962119]] [ 1.46968772  2.06756886] -151.703885878
0.414497746718 [[-1.36809478  0.25300579]] [ 1.83013922  3.02361064] -150.171650378
0.119293559403 [[ 1.78580662 -2.7938226 ]] [ 1.11720078  1.48439845] 160.438881075

6.2 Bivariate Goodness-of-fit statistics

In [66]:
# GOF of the optimised GMM against the KDE reference (cf. the EM fit earlier)
gof_df(gmm_pdf_result, kde_result)
Out[66]:
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.992 0.016 0.085 2.704403e-07 0.018 0.118
In [67]:
# Bivariate empirical limits, shown again for context when judging the GOF above
pd.DataFrame(gofs_mean_set_bivar).set_index('year_lim')
Out[67]:
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
year_lim
(1996, 2006) 7.310979 0.110309 0.000009 0.10308 0.661053 0.75971
(1986, 1996) NaN NaN NaN NaN NaN NaN
In [68]:
X = Y = PLOT_AXIS_RANGE
pdf_Z = generate_Z_from_X_Y(X,Y, mixed_model_pdf)# passing a function as an argument

# Renamed from the misspelled `residule_between_kde_and_gmm` (cell-local helper)
def residual_between_kde_and_gmm(points):
    """Pointwise difference between the KDE density and the optimised GMM density."""
    kde_vals = exp(kde.score_samples(points))
    gmm_vals = mixed_model_pdf(points)
    return kde_vals - gmm_vals 

residual_Z = generate_Z_from_X_Y(X,Y, residual_between_kde_and_gmm)

plot_3d_prob_density(X,Y,pdf_Z)
plot_3d_prob_density(X,Y,residual_Z)
align_figures()

fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
fig_gmm = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,pdf_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,residual_Z,  xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
align_figures()

6.3 Univariate Goodness-of-fit

In [69]:
def f(V,theta):
    """Polar integrand of the optimised GMM joint PDF: p(V cos θ, V sin θ) · V (Jacobian)."""
    point = [[V*cos(theta), V*sin(theta)]]
    return mixed_model_pdf(point) * V

def f_em(V,theta):
    """Same polar integrand, but for the EM-fitted GMM."""
    point = [[V*cos(theta), V*sin(theta)]]
    return mixed_model_pdf_em(point) * V
In [70]:
%%time
x = arange(0, max_speed, 0.5)
# 1. Fit Weibull
_, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(df.speed, x=x)

# 2. GMM Model
# Marginal speed PDF: integrate the joint polar PDF over direction, on thin speed
# bands of width 0.02 centred on each grid point
y_ =[integrate.nquad(f, [[x_-0.01, x_+0.01],[0, 2*pi]]) for x_ in x]
y_gmm = array(list(zip(*y_))[0])/0.02

# 3. Plot Comparison
df['speed'].hist(bins=arange(0, df.speed.max()), alpha=0.5, label='Data')
plot(x, y_gmm*len(df.speed),'-', color='black', label='GMM')
plot(x, y_weibull*len(df.speed), '--', color='black', label='Weibull') 
print('Speed Distribution Comparison')
plt_configure(xlabel='Speed'+speed_unit_text,
              ylabel='Frequency',legend=True, figsize=(4, 2))
plt.gca().set_ylim(bottom = 0)
plt.tight_layout()
plt.locator_params(axis='y', nbins=5)

# 4. R square for GMM, Weibull
print(R_square_for_speed(df['speed'], f, weibull_params, f_em))
Speed Distribution Comparison
(0.978278202850417, 0.96402298952794396, 0.96897422040957226)
Wall time: 16.5 s
In [71]:
%%time
# Marginal speed CDF of the GMM: integrate the joint polar PDF from 0 to each V
y_ = [integrate.nquad(f, [[0, x_val],[0, 2*pi]]) for x_val in x]
y_cdf_gmm = array(list(zip(*y_))[0])

# 5.2. CDF Comparison
plot(x, y_ecdf,'o', alpha=0.8, label='Data')
plot(x, y_cdf_gmm,'-', color='black',label='GMM')
plot(x, y_cdf_weibull,'--', color='black',label='Weibull')
plt_configure(xlabel = "V", ylabel='P', legend=True, figsize=(4,3))

# Weibull plot (ln-ln axes); log(0) at the first grid point triggers the
# RuntimeWarnings shown below — only that point is affected
plt.figure()
plot(log(x), log(-log(1-y_ecdf)),'o', label = 'Empirical')
plot(log(x), log(-log(1-y_cdf_weibull)),'--', label = 'Weibull')
plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black', label = 'GMM')
plt_configure(xlabel='ln(V)',ylabel='ln(-ln(1-P))',legend={'loc':'best'}, figsize=(4,3))
align_figures()

# Largest CDF discrepancy (K-S style) and the speed at which it occurs
cdf_diff, cdf_diff_weibull= np.abs(y_ecdf - y_cdf_gmm), np.abs(y_ecdf - y_cdf_weibull)
print(cdf_diff.max(), cdf_diff_weibull.max()) 
print(x[cdf_diff.argmax()], x[cdf_diff_weibull.argmax()])
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:11: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:12: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:13: RuntimeWarning: divide by zero encountered in log
0.0256550331947 0.0682047156818
4.0 4.0
Wall time: 6.58 s
In [72]:
# Calculate Angle Distribution
x = linspace(0,2*pi, num=36+1)
# Integrate the joint polar PDF over all speeds within each 10-degree sector
y_ =[integrate.nquad(f, [[0, inf],[x_-pi/36, x_+pi/36]]) for x_ in x]
y = array(list(zip(*y_))[0])
density, _ = dir_hist(df['dir'], bins=arange(-5, 370, 10), density=True)

plt.bar(arange(0, 360, 10), density*10*len(df['dir']), width=10, alpha=0.5, label='Data')
plot(x/pi*180, y*len(df['dir']) ,'-', color='black', label='GMM')
plt_configure(xlabel='Direction'+dir_unit_text, ylabel='Frequency', 
              legend={'loc': 'best'} ,tight='xtight',figsize = (4,2))
plt.tight_layout()
dir_fig = plt.gcf()
print('Direction Distribution Comparison')
# Sector-wise R^2 between the observed and GMM direction distributions
sector_r_square(density*10, y[:-1])
Direction Distribution Comparison
Out[72]:
0.95026091426712855
In [73]:
# Univariate empirical limits, shown again for context
pd.DataFrame(gofs_mean_set).set_index('year_lim')
Out[73]:
k_s r_square r_square_dir
year_lim
(1996, 2006) 0.173635 0.815007 0.762900
(1992, 2002) 0.150627 0.862079 0.835821
(1986, 1996) NaN NaN NaN

6.4 Sectoral Comparison

In [74]:
%%time
# Sector width for the per-direction comparison (never narrower than 10 degrees)
incre = max(SECTOR_LENGTH, 10)
density_collection=Parallel(n_jobs=-1)(delayed(direction_compare)(gmm, df, angle, incre) 
                                        for angle in arange(0, 360, incre))  
# This R square is computed as in the paper
# "Comparison of bivariate distribution construction approaches for analysing
#  wind speed and direction data"
# http://onlinelibrary.wiley.com/doi/10.1002/we.400/full
print(true_R_square(density_collection))
0.921588641443
Wall time: 8.67 s
In [75]:
# %%time
# curve_collection=Parallel(n_jobs=-1)(delayed(direction_compare2)
#                                      (gmm, df, angle, incre, complex=True) for angle in arange(start, end, incre))  
In [76]:
# Calculate Speed Distribution
def model_data_comparison(df, original_incre = 10, incre = 10):
    """Sector-by-sector comparison of the GMM and per-sector Weibull fits against data.

    df: observations with 'speed' and the direction columns used by select_df_by_angle.
    original_incre: sector width (degrees) of the raw data binning.
    incre: rebinned sector width (degrees) used for the comparison.

    Plots PDF/CDF comparisons per sector and returns a list of dicts, one per
    sector, holding the fitted curves and GOF statistics (R^2, max CDF diff).
    """
    start, end = -original_incre/2 + incre/2, 360
    curve_collection = []
    max_speed = df.speed.max()
    
    # Find a max count for plotting histogram
    max_count = max_count_for_angles(df, start, end, incre)
    plot_range = [0, max_speed, 0, max_count*1.05]
    
    for angle in arange(start, end, incre):
        angle_radian, incre_radian = np.radians([angle, incre])  
        start_angle, end_angle = angle-incre/2, angle+incre/2
        
        # 0. Select data from observation
        sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)
        data_size = len(sub_df.speed)
        # 1. Get Weibull and ECDF
        x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(sub_df.speed)
        # 2. Get GMM PDF, CDF
        _, y_gmm, y_cdf_gmm, direction_prob = gmm_integration_in_direction(f, angle_radian-incre_radian/2, angle_radian+incre_radian/2, x)
        
        # 3. R square for GMM, Weibull
        bins = arange(0, sub_df.speed.max()+1)
        # `normed` was removed from np.histogram in NumPy 1.24; `density=True` is
        # equivalent here (unit-width bins)
        density, _ = np.histogram(sub_df['speed'], bins=bins, density=True)
        density_expected_gmm_ =[integrate.nquad(f, [[x_, x_+1],[angle_radian-incre_radian/2, angle_radian+incre_radian/2]]) 
                            for x_ in bins[:-1]]
        density_expected_gmm = array(list(zip(*density_expected_gmm_ ))[0])/direction_prob
        R_square_gmm = sector_r_square(density, density_expected_gmm)
        
        density_expected_weibull = sp.stats.weibull_min.cdf(bins[1:], *weibull_params) - sp.stats.weibull_min.cdf(bins[:-1], *weibull_params) 
        R_square_weibull = sector_r_square(density, density_expected_weibull)

        # 4. K-S for GMM, Weibull
        cdf_diff, cdf_diff_weibull= np.abs(y_ecdf - y_cdf_gmm), np.abs(y_ecdf - y_cdf_weibull)
                
        # 5. Make Plots
        fig = plt.figure(figsize=(10,1.9))
        # 5.1. Frequency Comparison
        ax1 = fig.add_subplot(1,3,1)        
        sub_df['speed'].hist(bins=arange(0, sub_max_speed), alpha=0.5, label='Data')                  
        plot(x, y_gmm*data_size,'-', color='black', label='GMM')
        plot(x, y_weibull*data_size, '--', color='black',label='Weibull')   
        plt_configure(xlabel = "V", ylabel='Frequency', legend=True)
        plt.axis(plot_range)
        
        # 5.2. CDF Comparison
        ax2 = fig.add_subplot(1,3,2)
        plot(x, y_ecdf,'o', alpha=0.8, label='Data')
        plot(x, y_cdf_gmm,'-', color='black',label='GMM')
        plot(x, y_cdf_weibull,'--', color='black',label='Weibull')
        plt.gca().set_xlim(right = max_speed)
        plt_configure(xlabel = "V", ylabel='P', legend=True)
        
        curves = {'direction': angle, 'datasize': data_size, 'weight': direction_prob, 'x': x, 
                  'gmm_pdf': y_gmm, 'gmm_cdf': y_cdf_gmm,
                  'weibull_pdf': y_weibull, 'weibull_cdf': y_cdf_weibull, 'ecdf': y_ecdf,
                  'max_cdf_diff_gmm': cdf_diff.max(), 'max_cdf_diff_weibull': cdf_diff_weibull.max(), 
                  'r_square_gmm': R_square_gmm, 'r_square_weibull': R_square_weibull}
        curve_collection.append(curves)
        
        plt.tight_layout()
        plt.show()
        print('%s (%s - %s) degree' % (angle, start_angle, end_angle))
        print('data size:', len(sub_df), 'weight', len(sub_df)/len(df))
        print('GMM', 'Weibull')
        print('R square', R_square_gmm,  R_square_weibull)
        print('max diff:', cdf_diff.max(), cdf_diff_weibull.max(), 
              'speed value:', x[cdf_diff.argmax()], x[cdf_diff_weibull.argmax()], 'y gmm', y_cdf_gmm[cdf_diff.argmax()])
        print(' ')
    return curve_collection
In [77]:
%%time
# Rebin so 16-sector data uses 22.5-degree sectors and finer data uses 20-degree sectors
if len(effective_column) == 16:
    rebinned_angle = 22.5
else: 
    rebinned_angle = 20
    
curve_collection = model_data_comparison(df, SECTOR_LENGTH, rebinned_angle)
5.0 (-5.0 - 15.0) degree
data size: 2655 weight 0.07467724242679942
GMM Weibull
R square 0.943766547674 0.931043743565
max diff: 0.0354669490879 0.0421477934635 speed value: 2.09634037724 2.09634037724 y gmm 0.183112900124
 
25.0 (15.0 - 35.0) degree
data size: 3278 weight 0.09220037690208983
GMM Weibull
R square 0.937971816078 0.920178630499
max diff: 0.03921695954 0.0468905617444 speed value: 5.35315620913 4.16356594043 y gmm 0.934274921712
 
45.0 (35.0 - 55.0) degree
data size: 3015 weight 0.0848029702134841
GMM Weibull
R square 0.938512934145 0.914940704629
max diff: 0.0427853445741 0.0554589503832 speed value: 5.72361147668 4.00652803368 y gmm 0.962188329649
 
65.0 (55.0 - 75.0) degree
data size: 2040 weight 0.05737912412454645
GMM Weibull
R square 0.960462021837 0.938287777572
max diff: 0.0363615197913 0.0559018394957 speed value: 4.56353571914 3.99309375425 y gmm 0.828518382536
 
85.0 (75.0 - 95.0) degree
data size: 1499 weight 0.04216240542288977
GMM Weibull
R square 0.986927030974 0.976052203404
max diff: 0.023170028665 0.0663995181557 speed value: 2.60699401864 3.6497916261 y gmm 0.394083971293
 
105.0 (95.0 - 115.0) degree
data size: 1400 weight 0.039377830281551486
GMM Weibull
R square 0.937187902778 0.954344848393
max diff: 0.0542683469331 0.0703966660705 speed value: 4.99410536065 3.88430416939 y gmm 0.80787451021
 
125.0 (115.0 - 135.0) degree
data size: 1707 weight 0.048012825921863136
GMM Weibull
R square 0.936610177186 0.979319189207
max diff: 0.0787868051389 0.0519049452585 speed value: 4.31723308731 4.31723308731 y gmm 0.579678338388
 
145.0 (135.0 - 155.0) degree
data size: 1887 weight 0.05307568981520547
GMM Weibull
R square 0.970103411676 0.979172184503
max diff: 0.0424660272503 0.0394685824037 speed value: 3.45157816839 3.45157816839 y gmm 0.432891683402
 
165.0 (155.0 - 175.0) degree
data size: 1751 weight 0.049250414873569036
GMM Weibull
R square 0.948819107067 0.939656077521
max diff: 0.0324805948402 0.0677221158971 speed value: 6.11582620367 3.89188940234 y gmm 0.931395500608
 
185.0 (175.0 - 195.0) degree
data size: 1136 weight 0.03195229657131606
GMM Weibull
R square 0.914964974699 0.919256638268
max diff: 0.0819200637547 0.042153874009 speed value: 2.73584864269 3.83018809976 y gmm 0.461213739062
 
205.0 (195.0 - 215.0) degree
data size: 806 weight 0.022670379433521785
GMM Weibull
R square 0.918468592091 0.918199679136
max diff: 0.106446452881 0.0682303132807 speed value: 2.85518219962 3.99725507947 y gmm 0.551121785332
 
225.0 (215.0 - 235.0) degree
data size: 840 weight 0.02362669816893089
GMM Weibull
R square 0.974844341309 0.959580627399
max diff: 0.0503556677311 0.0483539867183 speed value: 2.38659927709 2.98324909636 y gmm 0.430596713221
 
245.0 (235.0 - 255.0) degree
data size: 909 weight 0.025567462661378788
GMM Weibull
R square 0.984086667906 0.985430517252
max diff: 0.038838128671 0.0560020679981 speed value: 4.12420302755 2.94585930539 y gmm 0.79571381624
 
265.0 (255.0 - 275.0) degree
data size: 1430 weight 0.04022164093044187
GMM Weibull
R square 0.957788300432 0.992375594809
max diff: 0.0602806319854 0.111949610555 speed value: 2.794161676 4.191242514 y gmm 0.400140771846
 
285.0 (275.0 - 295.0) degree
data size: 2118 weight 0.059573031811661466
GMM Weibull
R square 0.974405911869 0.978699877427
max diff: 0.0346199979396 0.1521104374 speed value: 4.08639134247 4.08639134247 y gmm 0.622603798094
 
305.0 (295.0 - 315.0) degree
data size: 2788 weight 0.07841813630354681
GMM Weibull
R square 0.965806675517 0.959217301143
max diff: 0.029906079263 0.0918952026108 speed value: 3.74981788812 3.74981788812 y gmm 0.582719458757
 
325.0 (315.0 - 335.0) degree
data size: 2505 weight 0.07045818918234747
GMM Weibull
R square 0.976832799434 0.978283120561
max diff: 0.0325498463524 0.0555824932523 speed value: 3.96965589837 3.96965589837 y gmm 0.676831391173
 
345.0 (335.0 - 355.0) degree
data size: 2905 weight 0.08170899783421934
GMM Weibull
R square 0.969660553769 0.963250478278
max diff: 0.0353540652097 0.0406806544465 speed value: 2.02136541026 4.04273082052 y gmm 0.169261122008
 
Wall time: 42.8 s
In [78]:
diff_df = pd.DataFrame(curve_collection) 

# Per-sector R^2: GMM vs per-sector Weibull, with means weighted by sector sample size
gmm_mean, weibull_mean = plot_sectoral_comparison(diff_df.r_square_gmm, diff_df.r_square_weibull, 
                                                  diff_df.direction, diff_df.datasize)
plt_configure(ylabel="$\ R^2$", xlabel='Direction'+dir_unit_text)
ylim = min(plt.gca().get_ylim()[0],0.75)
plt.gca().set_ylim(top=1, bottom=ylim)
plt.tight_layout()
print(gmm_mean, weibull_mean)
In [79]:
# Per-sector max CDF difference (K-S style), lower is better
gmm_mean, weibull_mean = plot_sectoral_comparison(diff_df.max_cdf_diff_gmm, diff_df.max_cdf_diff_weibull, 
                                                  diff_df.direction, diff_df.datasize)
plt_configure(ylabel="K-S", xlabel='Direction'+dir_unit_text)
ylim = max(plt.gca().get_ylim()[1],0.25)
plt.gca().set_ylim(top=ylim, bottom=0)
plt.tight_layout()
print(gmm_mean, weibull_mean)
In [80]:
# Compare direction weight with previous figure (re-renders the direction plot saved earlier)
display(dir_fig)

6.5 Insufficient-fit Sector Investigation

(1) Data Variability, by Bootstrap (Resampling)

In [81]:
# DataFrame.ix was removed in pandas 1.0; idxmax() returns a label, so use .loc
angle = max_diff_angle = diff_df.loc[diff_df['max_cdf_diff_gmm'].idxmax()]['direction']
incre = rebinned_angle
In [82]:
# Bootstrap fraction: 1 = resampled datasets the same size as the sector sample
FRACTION = 1

# Select data from observation
start_angle, end_angle = angle-incre/2, angle+incre/2
angle_radian, incre_radian = radians(angle), radians(incre)  
sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)
In [83]:
x = arange(0, sub_max_speed, 0.5)
_, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(sub_df.speed, x)
_, y_gmm, y_cdf_gmm, direction_prob = gmm_integration_in_direction(f, angle_radian-incre_radian/2, angle_radian+incre_radian/2, x)

fig = plt.figure(figsize=(10,1.9))
ax1 = fig.add_subplot(1,3,1)   
ax2 = fig.add_subplot(1,3,2)   
ax3 = fig.add_subplot(1,3,3)   

# 1. Data
bins=arange(0, sub_max_speed)
# `normed` was removed from the hist APIs; `density=True` is the replacement
sub_df['speed'].hist(ax=ax1, bins=bins, alpha=0.5, label='Data', density=True)  

# 2. GMM
ax1.plot(x, y_gmm,'-', color='black', label='GMM')
ax2.plot(x, y_cdf_gmm,'-', color = 'black', label='GMM')
# log(x) diverges at x=0 (first grid point); the RuntimeWarnings below are expected
ax3.plot(log(x), log(-log(1-y_cdf_gmm)),'-', color = 'black',label='GMM')

# 3. Weibull
ax1.plot(x, y_weibull,'--',color='black',label='Weibull')
ax2.plot(x, y_cdf_weibull,'--',label='Weibull')
ax3.plot(log(x), log(-log(1-y_cdf_weibull)),'--',label='Weibull')

# 4. Data Resampled (bootstrap the sector sample to gauge sampling variability)
count_collection = []
for i in range(1,100):
    sub_df_resampled = sub_df.sample(frac=FRACTION, replace=True)    
    # `density=True` replaces the removed `normed=True`
    resampled_count, _ = np.histogram(sub_df_resampled['speed'], bins=bins, density=True) 
    count_collection.append(resampled_count)
    
    ecdf = sm.distributions.ECDF(sub_df_resampled.speed)
    y_ecdf = ecdf(x) 
    ax2.plot(x, y_ecdf,':', label='Data Resampled')
    ax3.plot(log(x), log(-log(1-y_ecdf)),':', label='Data Resampled')
    if i == 1: 
#         plt_configure(ax=ax2, xlabel = "$V$", ylabel='$P$', legend={'loc':'best'})
#         plt_configure(ax=ax3, xlabel="ln($V$)", ylabel="ln(-ln(1-$P$)",legend={'loc':'best'})
        plt_configure(ax=ax2, xlabel = "V", ylabel='P', legend={'loc':'best'})
        plt_configure(ax=ax3, xlabel="ln(V)", ylabel="ln(-ln(1-P)",legend={'loc':'best'})

print('%s (%s - %s) Degree Speed Distribution' % (angle, start_angle, end_angle))
# Envelope of the resampled histograms (per-bin min/max)
count_collection = np.array(count_collection)
mx, mn = np.max(count_collection,0), np.min(count_collection,0)
ax1.plot(bins[1:]-0.5, mx, ':', color='blue')
ax1.plot(bins[1:]-0.5, mn, ':', color='blue', label='Resample limit')
ax1.set_ylim(bottom = 0)
# plt_configure(ax=ax1, xlabel='$V$',ylabel='Frequency',legend={'loc':'best'})
plt_configure(ax=ax1, xlabel='V', ylabel='Frequency',legend={'loc':'best'})
ax1.locator_params(axis='y', nbins=5)
ax2.locator_params(axis='y', nbins=5)
ax3.locator_params(axis='y', nbins=5)
plt.tight_layout()
diff = abs(y_ecdf - y_cdf_gmm)
print(diff.max(), x[diff.argmax()], y_cdf_gmm[diff.argmax()])
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:17: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:22: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:34: RuntimeWarning: divide by zero encountered in log
205.0 (195.0 - 215.0) Degree Speed Distribution
0.129749528195 3.0 0.589853449473

(2) Time Variability

In [84]:
fig_time_variability_3d = plt.figure()
# Figure.gca(projection=...) was removed in Matplotlib 3.6; request the 3-D axes explicitly
ax1 = fig_time_variability_3d.add_subplot(projection='3d')

fig_time_variability_cdf,ax2 = plt.subplots(figsize=(3,1.8))
fig_time_variability_weibull, ax3 = plt.subplots(figsize=(3,1.8))

ax2.plot(x, y_cdf_gmm,'-', color='black', label = 'GMM')
ax2.plot(x, y_cdf_weibull,'--', label='Weibull')

# log(x) diverges at x=0; the RuntimeWarnings below are expected
ax3.plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black',label='GMM')
ax3.plot(log(x), log(-log(1-y_cdf_weibull)), '--', label='Weibull')

# 3. Data
# 'axes.color_cycle' was deprecated and removed; 'axes.prop_cycle' is the replacement
prop_cycle=iter(mpl.rcParams['axes.prop_cycle'].by_key()['color'])
for start_time in range(2001, 2015, 5):
    end_time = start_time + 4 
    df_other_years = df_all_years[str(start_time):str(end_time)]
    df_other_years_at_angle, sub_max_speed_other_year = select_df_by_angle(df_other_years, start_angle, end_angle)
    if len(df_other_years_at_angle) > 0 :
        
        ecdf = sm.distributions.ECDF(df_other_years_at_angle.speed)
        y_ecdf = ecdf(x)
        ax2.plot(x, y_ecdf,':', label = start_time)
        ax3.plot(log(x), log(-log(1-y_ecdf)),':', label = start_time)
        
        # `density=True` replaces the removed `normed=True`
        count, division = np.histogram(df_other_years_at_angle['speed'], density=True,
                                       bins=arange(0, sub_max_speed_other_year))
        ax1.bar(left=division[:-1], height=count, zs=start_time, zdir='x', 
                color=next(prop_cycle), alpha=0.8)
        x_3d = start_time*np.ones_like(x)
        ax1.plot(x_3d, x, y_gmm, '-', color='black', label='GMM'  if start_time == 2011 else '')
        ax1.plot(x_3d, x, y_weibull, '--', color='blue', label='Weibull' if start_time == 2011 else '')
        
print('%s (%s - %s) Degree Speed Distribution' % (angle, start_angle, end_angle))
ax1.set_ylim(bottom = 0)
ax1.set_zlabel('Frequency')
plt_configure(ax=ax1, xlabel='Time',ylabel='V', legend=True)
plt_configure(ax=ax2, xlabel = "V", ylabel='P', legend={'loc':'best'})
plt_configure(ax=ax3, xlabel="ln(V)", ylabel="ln(-ln(1-P)", legend={'loc':'best'})

ax1.set_zlim(bottom = 0)
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:10: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:11: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:24: RuntimeWarning: divide by zero encountered in log
205.0 (195.0 - 215.0) Degree Speed Distribution

(3) Adjacent Sector Variability

In [85]:
# The worst-fitting sector and its two immediate neighbours
incre = rebinned_angle
angle_group = [max_diff_angle-incre, max_diff_angle, max_diff_angle+incre]
In [86]:
fig_adjecent_variability_3d = plt.figure()
# Figure.gca(projection=...) was removed in Matplotlib 3.6; request the 3-D axes explicitly
ax1 = fig_adjecent_variability_3d.add_subplot(projection='3d')
fig_adjecent_variability_cdf, ax2 = plt.subplots(figsize=(3,1.8))
fig_adjecent_variability_weibull, ax3 = plt.subplots(figsize=(3,1.8))

legend_3d = False
# 'axes.color_cycle' was deprecated and removed; 'axes.prop_cycle' is the replacement
prop_cycle=iter(mpl.rcParams['axes.prop_cycle'].by_key()['color'])

curve_df = pd.DataFrame(curve_collection)

for angle in angle_group:
    # Look up the precomputed sector curves (angles wrap modulo 360)
    curves = curve_df.query('direction == @angle%360').T.to_dict()
    curves = curves[list(curves)[0]]
    data_size, x =  curves['datasize'], curves['x']
    y_gmm, y_cdf_gmm =  curves['gmm_pdf'], curves['gmm_cdf'] 
    y_weibull, y_cdf_weibull, y_cdf = curves['weibull_pdf'],  curves['weibull_cdf'], curves['ecdf']

    # Highlight the worst sector; its neighbours are drawn faint/dotted
    linestyle = '-' if angle == max_diff_angle else ':'
    alpha = 0.7 if angle == max_diff_angle else 0.3

    ax2.plot(x, y_gmm*data_size, linestyle, label=angle)        
    ax3.plot(x, y_weibull*data_size, linestyle, label=angle)

    start_angle, end_angle = angle-incre/2, angle+incre/2
    sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)

    x_3d = angle*np.ones_like(x)
    ax1.plot(x_3d, x, y_gmm*data_size, color='black', label='GMM')
    ax1.plot(x_3d, x, y_weibull*data_size, color='blue', linestyle='--',label='Weibull')

    count, division = np.histogram(sub_df['speed'], bins=arange(0, sub_max_speed))
    ax1.bar(left=division[:-1], height=count, zs=angle, zdir='x', color=next(prop_cycle), alpha=0.8)

    if legend_3d == False:
        ax1.legend()
        legend_3d = True
        
plt_configure(ax=ax1, xlabel='Direction', ylabel='Speed')   
plt_configure(ax=ax2, xlabel='V',ylabel='Frequency',legend={'loc':'best'})
plt_configure(ax=ax3, xlabel='V',ylabel='Frequency',legend={'loc':'best'})
ax1.set_zlabel('Frequency')
ax1.set_zlim(bottom = 0)
ylim = max(ax1.get_ylim()[1],ax3.get_ylim()[1])
ax2.set_ylim(bottom = 0, top=ylim)
ax3.set_ylim(bottom = 0, top=ylim)

print(max_diff_angle) 
print('GMM, Weibull, Histogram')
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
205.0
GMM, Weibull, Histogram

7. Result Variability & Cross-Validation

In [87]:
# Fall back to defaults when the location-specific setup cell did not set these.
if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWDITH    # NOTE(review): constant name has a typo ("BANDWDITH") — defined elsewhere, confirm
if 'FIT_METHOD' not in globals():
    FIT_METHOD = 'square_error'       
if 'KDE_KERNEL' not in globals():
    KDE_KERNEL = 'gaussian'
    
# Shared configuration dict passed to the fitting helpers below.
config = {'bandwidth': bandwidth, 
          'fitting_range': FITTING_RANGE,
          'fit_limit': fit_limit,
          'kde_kernel': KDE_KERNEL}

print(bandwidth, FIT_METHOD)
0.6 square_error

7.1 Variability of the Result

In [88]:
%%time
# Variability check: refit the GMM on 12 bootstrap resamples of df in parallel,
# then show each fit's parameters, its ellipse plot, and goodness-of-fit tables.
results = Parallel(n_jobs=-1)(delayed(resampled_fitting)(df, FIT_METHOD, NUMBER_OF_GAUSSIAN, config) for i in range(12))                        
for result in results:
    display(pretty_print_gmm(result['gmm']))
    fig,ax = plt.subplots(figsize=(3.5,3.5))
    plot_gmm_ellipses(result['gmm'],ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
    plt.show()
    
    # GoF against the resample's own KDE, then against the full-data KDE.
    display(gof_df(result['gmm_pdf_result'], result['kde_result']))
    display(gof_df(result['gmm_pdf_result'], kde_result))
    print('')
weight mean_x mean_y sig_x sig_y corr
1 0.448 -1.190 0.255 2.289 2.732 -0.389
2 0.430 2.546 0.919 1.597 1.898 -0.290
3 0.121 1.814 -2.738 1.158 1.498 0.219
GMM Plot Result
0.44825820657 [[-1.18984597  0.25514594]] [ 1.91685543  3.00451124] -147.291012705
0.430455635121 [[ 2.54572719  0.9185946 ]] [ 1.43251325  2.02498676] -150.473725111
0.121286158308 [[ 1.81357151 -2.73751651]] [ 1.09691226  1.54363396] 159.951214866
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.991 0.017 0.140 3.064773e-07 0.019 0.126
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.992 0.018 0.109 2.765145e-07 0.019 0.119

weight mean_x mean_y sig_x sig_y corr
1 0.448 2.517 0.907 1.617 1.931 -0.271
2 0.422 -1.298 0.286 2.211 2.766 -0.392
3 0.130 1.807 -2.723 1.183 1.547 0.243
GMM Plot Result
0.448105193015 [[ 2.51692995  0.90686013]] [ 1.46934309  2.04613192] -151.649843218
0.421704896062 [[-1.29828029  0.28596981]] [ 1.8714683   3.00652455] -149.951332912
0.130189910922 [[ 1.8074443 -2.7225971]] [ 1.10902352  1.60117279] 159.05369578
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.992 0.016 0.069 2.842804e-07 0.019 0.121
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.992 0.015 0.077 2.778264e-07 0.019 0.120

weight mean_x mean_y sig_x sig_y corr
1 0.456 -1.073 0.015 2.344 2.894 -0.456
2 0.455 2.498 0.850 1.614 1.975 -0.284
3 0.089 1.807 -2.866 1.090 1.293 0.243
GMM Plot Result
0.456246098853 [[-1.07349696  0.01479623]] [ 1.87765638  3.21661499] -147.499903702
0.455214333398 [[ 2.49833737  0.84957827]] [ 1.46247276  2.08986689] -152.805298048
0.0885395677494 [[ 1.80651992 -2.86592826]] [ 1.00557242  1.35943544] 152.626755554
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.992 0.014 0.112 2.805370e-07 0.019 0.120
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.992 0.015 0.085 2.888519e-07 0.019 0.122

weight mean_x mean_y sig_x sig_y corr
1 0.465 -1.095 0.104 2.344 2.818 -0.437
2 0.429 2.551 0.946 1.603 1.924 -0.309
3 0.106 1.844 -2.640 1.059 1.407 0.257
GMM Plot Result
0.464637845606 [[-1.0949381   0.10429566]] [ 1.89250129  3.13886168] -146.488833045
0.429161530834 [[ 2.5510872   0.94578048]] [ 1.42394966  2.06021856] -150.390688195
0.10620062356 [[ 1.84413452 -2.64004299]] [ 0.98752914  1.45765449] 159.091242074
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.992 0.015 0.150 2.714893e-07 0.019 0.118
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.992 0.016 0.112 2.889639e-07 0.019 0.122

weight mean_x mean_y sig_x sig_y corr
1 0.471 2.494 0.864 1.603 1.946 -0.288
2 0.415 -1.380 0.219 2.157 2.806 -0.423
3 0.114 1.788 -2.803 1.116 1.380 0.190
GMM Plot Result
0.470911930985 [[ 2.49425748  0.86367345]] [ 1.44673849  2.06419907] -152.07566216
0.414999970511 [[-1.380303    0.21851082]] [ 1.79899564  3.0478754 ] -151.091913536
0.114088098504 [[ 1.78816705 -2.80267364]] [ 1.06487514  1.41915109] 159.21874307
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.993 0.016 0.111 2.580750e-07 0.018 0.115
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.992 0.015 0.101 2.816636e-07 0.019 0.121

weight mean_x mean_y sig_x sig_y corr
1 0.474 2.426 0.939 1.687 1.932 -0.288
2 0.411 -1.377 0.047 2.216 2.834 -0.479
3 0.115 1.897 -2.655 1.095 1.483 0.206
GMM Plot Result
0.473881837082 [[ 2.42594709  0.93868437]] [ 1.50042581  2.0807041 ] -147.651396722
0.410826276594 [[-1.37725738  0.04678377]] [ 1.7558996   3.14057426] -148.710271209
0.115291886324 [[ 1.89741105 -2.65520083]] [ 1.04728564  1.51644857] 163.105796743
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.992 0.015 0.085 2.833812e-07 0.019 0.121
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.992 0.015 0.078 2.830567e-07 0.019 0.121

weight mean_x mean_y sig_x sig_y corr
1 0.461 2.473 0.888 1.639 1.930 -0.301
2 0.425 -1.299 0.209 2.285 2.839 -0.431
3 0.115 1.783 -2.715 1.147 1.449 0.164
GMM Plot Result
0.460598356009 [[ 2.473405   0.8879395]] [ 1.4565401   2.07092893] -149.273833759
0.424748448233 [[-1.29889955  0.20932646]] [ 1.87277569  3.12621413] -148.465023943
0.114653195759 [[ 1.78255665 -2.71523423]] [ 1.10938856  1.47818746] 162.537934833
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.992 0.016 0.130 2.820021e-07 0.018 0.121
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.992 0.016 0.104 2.824102e-07 0.019 0.121

weight mean_x mean_y sig_x sig_y corr
1 0.462 2.523 0.811 1.596 1.946 -0.257
2 0.438 -1.277 0.241 2.224 2.838 -0.414
3 0.100 1.770 -2.828 1.099 1.331 0.188
GMM Plot Result
0.461805071151 [[ 2.52347475  0.81055364]] [ 1.46862185  2.04363751] -153.963250423
0.438258940925 [[-1.27686429  0.24082097]] [ 1.85930379  3.08897869] -150.364069712
0.0999359879248 [[ 1.76958957 -2.82762473]] [ 1.04637007  1.3723461 ] 157.847058751
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.992 0.017 0.070 2.732426e-07 0.018 0.119
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.992 0.017 0.079 2.792555e-07 0.019 0.120

weight mean_x mean_y sig_x sig_y corr
1 0.467 2.484 0.857 1.619 1.945 -0.279
2 0.439 -1.170 -0.021 2.312 2.886 -0.457
3 0.094 1.893 -2.752 1.061 1.280 0.156
GMM Plot Result
0.466984028667 [[ 2.48411996  0.8570201 ]] [ 1.46576019  2.06276078] -151.73555024
0.439274293441 [[-1.17046547 -0.02082043]] [ 1.85483912  3.19941631] -148.023157994
0.093741677892 [[ 1.89329409 -2.75170038]] [ 1.02485588  1.30903172] 160.162576331
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.992 0.014 0.105 2.928048e-07 0.019 0.123
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.992 0.014 0.087 2.902274e-07 0.019 0.122

weight mean_x mean_y sig_x sig_y corr
1 0.466 2.496 0.877 1.625 1.908 -0.269
2 0.415 -1.349 0.192 2.151 2.836 -0.436
3 0.119 1.811 -2.787 1.088 1.458 0.112
GMM Plot Result
0.466222499293 [[ 2.4961095   0.87740135]] [ 1.47263511  2.02805014] -150.430706942
0.415263781549 [[-1.34902187  0.19216093]] [ 1.78210312  3.08115112] -151.37280023
0.118513719158 [[ 1.81119837 -2.78695426]] [ 1.07248948  1.46954242] 169.662037398
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.992 0.015 0.082 2.816547e-07 0.019 0.121
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.992 0.014 0.088 2.903091e-07 0.019 0.122

weight mean_x mean_y sig_x sig_y corr
1 0.463 2.506 0.795 1.638 1.947 -0.269
2 0.406 -1.416 0.335 2.177 2.701 -0.381
3 0.131 1.760 -2.790 1.172 1.543 0.186
GMM Plot Result
0.46291037636 [[ 2.50607178  0.795007  ]] [ 1.48857959  2.06356907] -151.374803609
0.405618562819 [[-1.41631446  0.33490898]] [ 1.85371218  2.93222171] -149.83504005
0.13147106082 [[ 1.76033248 -2.78973568]] [ 1.12773433  1.57602293] 163.123076495
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.992 0.015 0.098 2.854443e-07 0.019 0.121
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.991 0.017 0.080 3.015252e-07 0.019 0.125

weight mean_x mean_y sig_x sig_y corr
1 0.463 2.472 0.883 1.649 1.986 -0.290
2 0.394 -1.479 0.361 2.157 2.642 -0.406
3 0.143 1.734 -2.737 1.272 1.604 0.260
GMM Plot Result
0.462914557539 [[ 2.47223253  0.88322772]] [ 1.48333654  2.11207932] -151.393486315
0.393687110011 [[-1.47926418  0.36148877]] [ 1.79599583  2.8998609 ] -148.346501296
0.14339833245 [[ 1.73367091 -2.7369283 ]] [ 1.17594437  1.67589442] 155.971454139
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.992 0.016 0.068 2.969751e-07 0.020 0.124
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.992 0.016 0.080 2.909941e-07 0.019 0.122
Wall time: 13.9 s

7.2 Cross-validation, to select the number of Gaussian

In [89]:
%%time
from sklearn.cross_validation import train_test_split, KFold

## 5-fold cross validation
gaussian_number_range = arange(1,6)
CV_result_train_all,CV_result_test_all =[],[]
number_of_fold = 4
print('Number of train/test dataset', len(df)*(number_of_fold-1)/number_of_fold, len(df)/number_of_fold) 

for number_of_gaussian in gaussian_number_range:
    print( '  ')
    print('Number of gaussian', number_of_gaussian)
    
    kf = KFold(len(df), n_folds=number_of_fold, shuffle=True) 

    CV_result = Parallel(n_jobs=-1)(delayed(fit_per_fold)(df, train_index, test_index, FIT_METHOD, number_of_gaussian, config) for train_index, test_index in kf)                        

    CV_result_train, CV_result_test = list(zip(*CV_result))
    CV_result_train, CV_result_test = list(CV_result_train), list(CV_result_test)
        
    CV_result_train_all.append(CV_result_train)
    CV_result_test_all.append(CV_result_test)
    
    print('Train')
    pretty_pd_display(CV_result_train)
    print('Test')
    pretty_pd_display(CV_result_test)
D:\ProgramData\Anaconda3\lib\site-packages\sklearn\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
Number of train/test dataset 26664.75 8888.25
  
Number of gaussian 1
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.101079 0.058453 0.000004 0.070928 0.452627 0.888089
1 0.098859 0.057883 0.000004 0.071095 0.455296 0.886396
2 0.102106 0.059839 0.000004 0.071414 0.461868 0.883832
3 0.099026 0.059518 0.000004 0.072011 0.460794 0.882799
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.103870 0.056434 0.000004 0.072439 0.471387 0.877671
1 0.108891 0.060489 0.000004 0.071788 0.462310 0.883242
2 0.102206 0.055943 0.000004 0.072467 0.452872 0.885975
3 0.105057 0.063762 0.000004 0.071257 0.459973 0.886952
  
Number of gaussian 2
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.077730 0.016403 8.404524e-07 0.032460 0.208147 0.976151
1 0.035102 0.019094 9.520038e-07 0.034593 0.221527 0.973080
2 0.045255 0.017818 9.218632e-07 0.034296 0.218029 0.973892
3 0.073925 0.016925 8.610643e-07 0.032527 0.210727 0.975905
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.076335 0.013181 9.762876e-07 0.034985 0.224415 0.972873
1 0.044720 0.021708 9.448836e-07 0.034279 0.220787 0.973477
2 0.050567 0.021313 1.219009e-06 0.038092 0.250648 0.965961
3 0.086967 0.017085 9.145096e-07 0.034918 0.217066 0.973488
  
Number of gaussian 3
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.081170 0.015046 2.683114e-07 0.018355 0.117598 0.992524
1 0.081152 0.014699 2.636746e-07 0.018215 0.116574 0.992547
2 0.097652 0.016301 2.840183e-07 0.018919 0.121023 0.991914
3 0.081396 0.015878 2.824095e-07 0.018718 0.120698 0.991991
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.174536 0.024516 4.532202e-07 0.023781 0.152939 0.986705
1 0.094137 0.021009 4.176028e-07 0.022755 0.146819 0.988265
2 0.064858 0.018376 3.656892e-07 0.021244 0.137270 0.989941
3 0.079213 0.013721 3.651570e-07 0.021738 0.137107 0.989842
  
Number of gaussian 4
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.025669 0.007021 1.041670e-07 0.011325 0.073282 0.997063
1 0.029303 0.007272 1.035962e-07 0.011493 0.073077 0.997057
2 0.023162 0.006753 9.985800e-08 0.011155 0.071736 0.997169
3 0.017094 0.006635 9.266448e-08 0.010818 0.069147 0.997406
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.020712 0.011232 1.978685e-07 0.016191 0.101016 0.994397
1 0.027861 0.005382 1.854541e-07 0.014871 0.097813 0.994866
2 0.031309 0.014414 2.261072e-07 0.016992 0.108049 0.993703
3 0.036466 0.012865 2.860023e-07 0.018728 0.121295 0.991734
  
Number of gaussian 5
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.017456 0.003248 7.287353e-08 0.009439 0.061292 0.997940
1 0.007024 0.003147 6.236400e-08 0.008844 0.056716 0.998253
2 0.014145 0.002876 6.450583e-08 0.009050 0.057661 0.998175
3 0.015943 0.003189 6.304104e-08 0.008943 0.057013 0.998211
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.019837 0.011348 1.658036e-07 0.014982 0.092478 0.995345
1 0.020176 0.009790 2.012096e-07 0.015870 0.101793 0.994190
2 0.016021 0.010482 1.549167e-07 0.013674 0.089415 0.995657
3 0.018548 0.010329 1.610459e-07 0.013961 0.091114 0.995523
Wall time: 42.4 s
In [90]:
# Aggregate the per-fold goodness-of-fit tables into mean/std per gaussian count.
# train_scores_mean/std and test_scores_mean/std are consumed by the plotting cell below.
train_scores_mean, train_scores_std = generate_mean_std_gof(CV_result_train_all)
print('Train gof mean, std')
display(train_scores_mean)

test_scores_mean, test_scores_std = generate_mean_std_gof(CV_result_test_all)
print('Test gof mean, std')
display(test_scores_mean)
Train gof mean, std
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
1 0.100268 0.058923 4.062478e-06 0.071362 0.457646 0.885279
2 0.058003 0.017560 8.938459e-07 0.033469 0.214607 0.974757
3 0.085343 0.015481 2.746034e-07 0.018552 0.118973 0.992244
4 0.023807 0.006921 1.000714e-07 0.011198 0.071811 0.997174
5 0.013642 0.003115 6.569610e-08 0.009069 0.058171 0.998145
Test gof mean, std
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
1 0.105006 0.059157 4.134304e-06 0.071988 0.461635 0.883460
2 0.064647 0.018322 1.013672e-06 0.035568 0.228229 0.971450
3 0.103186 0.019405 4.004173e-07 0.022379 0.143534 0.988688
4 0.029087 0.010973 2.238580e-07 0.016695 0.107043 0.993675
5 0.018646 0.010487 1.707439e-07 0.014622 0.093700 0.995179
In [91]:
# Plot train vs. test GoF metrics against the number of mixture components,
# with a ±1 std band from the cross-validation folds.
# 'axes.color_cycle' is deprecated; take the colour list from 'axes.prop_cycle'.
prop_cycle = mpl.rcParams['axes.prop_cycle'].by_key()['color']
gaussian_number_range = train_scores_mean.index
# Raw strings so the LaTeX escapes (\w, \c, ...) are not treated as string escapes.
for column, column_name in zip(['R_square','K_S','Chi_square'],[r"$\ R^2$", "K-S", r"$\widetilde{\chi^2} $"]):
    plot(gaussian_number_range, train_scores_mean[column],
             '--', label = 'training', color=prop_cycle[0])
    plt.fill_between(gaussian_number_range, 
                     train_scores_mean[column] - train_scores_std[column],
                     train_scores_mean[column] + train_scores_std[column], 
                     alpha=0.2, color=prop_cycle[0])
    
    plot(gaussian_number_range, test_scores_mean[column],
             '-', label = 'test',color=prop_cycle[1])
    plt.fill_between(gaussian_number_range, 
                 test_scores_mean[column] - test_scores_std[column],
                 test_scores_mean[column] + test_scores_std[column], 
                 alpha=0.2,color=prop_cycle[1])
    plt.xticks(gaussian_number_range)
    print(column)
    plt.locator_params(axis='y', nbins=5)
    plt_configure(xlabel='Number of Gaussian Distributions', ylabel=column_name, 
                  figsize=(3,2), legend={'loc':'best'})
    # R^2 is bounded above by 1; K-S and chi-square are non-negative.
    if column == 'R_square':
        plt.gca().set_ylim(top=1)
    if column in ('K_S', 'Chi_square'):
        plt.gca().set_ylim(bottom=0)
    plt.show()
R_square
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
K_S
Chi_square
In [92]:
# Side-by-side densities: KDE estimate (left) vs. fitted GMM pdf (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(4.2, 2.4))
plot_2d_prob_density(X, Y, kde_Z, ax=ax1,
                     xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar=False)
ax1.grid(False)
plot_2d_prob_density(X, Y, pdf_Z, ax=ax2,
                     xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar=False)
ax2.grid(False)
# The panels share the y scale; hide the right panel's duplicate axis.
ax2.get_yaxis().set_visible(False)
In [ ]:
# Re-display the key figures from earlier sections so they appear together here.
for fig in (fig_hist, fig_kde, fig_em, fig_gmm,
            fig_time_variability_3d, fig_time_variability_cdf, fig_time_variability_weibull,
            fig_adjecent_variability_3d, fig_adjecent_variability_cdf, fig_adjecent_variability_weibull):
    display(fig)
In [ ]:
import time

# Save the executed notebook, then export it to an HTML report named after the station.
save_notebook()
time.sleep(3)  # give the save a moment to finish before exporting

location_name = get_location_name(file_path)
print(location_name)

current_file = 'GMM.ipynb'
output_file = './output_HTML/{}.html'.format(location_name)

output_HTML(current_file, output_file)