Modeling the Joint Distribution of Wind Speed and Direction using Gaussain Mixture Models

OEN Method: Harris, Cook The parent wind speed distribution: Why Weibull? http://www.sciencedirect.com/science/article/pii/S0167610514001056

Gaussian Mixture Models, http://scikit-learn.org/stable/modules/mixture.html

1. Set up

1.1 Environment

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

from import_file import *
from helpers.parallel_helper import *
load_libs()

plt.rcParams['axes.autolimit_mode'] = 'round_numbers'
plt.rcParams['axes.xmargin'] = 0.
plt.rcParams['axes.ymargin'] = 0.
mpl.rcParams['patch.force_edgecolor'] = True

1.2 Read Data

In [2]:
# file_path, bandwidth= './data/NCDC/europe/uk/marham/dat.txt', 1.7
file_path, bandwidth= './data/NCDC/europe/uk/tiree/dat.txt', 1.9
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/NCDC/europe/uk/boscombe_down/dat.txt', 1.5, 4
# file_path, bandwidth= './data/NCDC/europe/uk/middle_wallop/dat.txt', 1.3
# file_path, bandwidth= './data/NCDC/europe/uk/bournemouth/dat.txt',1.3 # 4?
# file_path= "./data/NCDC/europe/uk/weybourne/dat.txt"
# file_path= "./data/NCDC/europe/uk/skye_lusa/dat.txt" # 
# file_path= "./data/NCDC/europe/uk/wattisham/dat.txt"
# file_path= "./data/NCDC/europe/uk/south_uist_range/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/europe/uk/holbeach/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/europe/uk/cambridge/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/europe/us/baltimore/dat.txt" # time too short
# file_path= "./data/NCDC/europe/uk/bealach_na_ba/dat.txt" # time too short
# file_path= "./data/NCDC/europe/uk/benbecula/dat.txt" # truncate (untruncate in m/s), 4?
# file_path= './data/NCDC/europe/uk/southhamption/dat.txt' # high 0, trend

# file_path, bandwidth, NUMBER_OF_GAUSSIAN = "./data/NCDC/europe/germany/landsberg_lech/dat.txt", 0.9, 4 
# file_path, bandwidth= "./data/NCDC/europe/germany/neuburg/dat.txt", 0.7
# file_path, bandwidth= "./data/NCDC/europe/germany/laupheim/dat.txt", 0.7 # double peak, 4?, trend
# file_path, bandwidth= './data/NCDC/europe/germany/niederstetten/dat.txt', 0.9 # get the peak
# file_path, bandwidth= "./data/NCDC/europe/germany/holzdorf/dat.txt", 0.9 # 2008 year
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= './data/NCDC/europe/france/nantes/dat.txt', 0.9, 4 # unit shift, one direction deviate big
# file_path, convert_to_knot= './data/NCDC/europe/france/pau_pyrenees/dat.txt', True # unit shift, 2; force using knot 
# file_path= "./data/NCDC/europe/france/avord/dat.txt" # try 4, initial speed (should be good with m/s), incompete dataset
# file_path= "./data/NCDC/europe/france/vatry/dat.txt"  # double peak, initial speed, incompete dataset
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= "./data/NCDC/europe/spain/valladolid/dat.txt", 1.1, 4
# file_path= './data/NCDC/europe/spain/jerez/dat.txt' # high 0
# file_path, bandwidth= "./data/NCDC/europe/spain/barayas/dat.txt", 0.7 # not good fit
# file_path, bandwidth= './data/NCDC/europe/spain/malaga/dat.txt', 0.7 # directions blocked?
# file_path, bandwidth= './data/NCDC/europe/spain/tenerife_sur/dat.txt', 0.7 # directions blocked?
# file_path, bandwidth= './data/NCDC/europe/spain/almeria/dat.txt', 0.7 # negative dimensions?
# file_path, bandwidth= './data/NCDC/europe/greece/eleftherios_intl/dat.txt',0.7 # some direction might be blocked
# file_path= './data/NCDC/europe/ciampino/dat.txt' # try 4, bandwidth?
# file_path= "./data/NCDC/europe/huspel_aws/dat.txt"  # integer, 4?
# file_path= './data/NCDC/gibraltar/dat.txt' # bad fit

# MidEast
# file_path, bandwidth= './data/NCDC/mideast/uae/al_maktoum/dat.txt', 1.1
# file_path= './data/NCDC/mideast/uae/sharjah_intl/dat.txt' 
# file_path= './data/NCDC/mideast/uae/dubai_intl/dat.txt' 
# file_path= './data/NCDC/mideast/uae/abu_dhabi_intl/dat.txt' # Time shift
# file_path= './data/NCDC/mideast/uae/bateen/dat.txt' # Time shift
# file_path= './data/NCDC/mideast/buraimi/dat.txt' # not good dataset
# file_path= './data/NCDC/mideast/turkey/konya/dat.txt' 
# file_path= './data/NCDC/mideast/turkey/sivas/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/turkey/balikesir/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/turkey/bartin/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/iran/chahbahar/dat.txt'
# file_path= './data/NCDC/mideast/iran/zabol/dat.txt' # Problematic data
# file_path= './data/NCDC/mideast/iran/torbat_heydarieh/dat.txt' # Unusable

# file_path, bandwidth = "./data/NCDC/cn/shanghai/hongqiao_intl/dat.txt", 0.6
# file_path, bandwidth= "./data/NCDC/cn/shanghai/pudong/dat.txt", 0.8
# file_path, bandwidth= "./data/NCDC/cn/hefei_luogang/dat.txt", 0.6 # few 0, trend
# file_path, bandwidth= "./data/NCDC/cn/nanjing_lukou/dat.txt", 0.5
# file_path= "./data/NCDC/cn/zhengzhou_xinzheng/dat.txt" 
# file_path= "./data/NCDC/cn/tianjin/binhai/dat.txt" # few 0, trend, stationary speed, unstationary direction
# file_path= "./data/NCDC/cn/tianjin/tianjing/dat.txt" # 16 sectors
# file_path= "./data/NCDC/cn/shijiazhuang_zhengding/dat.txt" 
# file_path= "./data/NCDC/cn/henan_gushi/dat.txt" # 16 sectors, fit not very good
# file_path= "./data/NCDC/cn/nanning_wuxu/dat.txt" # numpy priblem, unstationary speed
# file_path= './data/NCDC/cn/macau/dat.txt'  
# file_path= "./data/NCDC/cn/hk_intl/dat.txt" # few 0
# file_path= './data/NCDC/cn/gaoqi/dat.txt' 

# file_path= './data/NCDC/southeast_asia/malaysia/mersing/dat.txt' # 2 mode, paper comparison
# file_path= './data/NCDC/southeast_asia/malaysia/penang/dat.txt'
# file_path= './data/NCDC/southeast_asia/malaysia/butterworth/dat.txt' # 2 mode 
# file_path= "./data/NCDC/southeast_asia/malaysia/bsultan_mahmud/dat.txt" # stable
# file_path= "./data/NCDC/southeast_asia/malaysia/bsultan_ismail/dat.txt" # 
# file_path= "./data/NCDC/southeast_asia/singapore/changi/dat.txt" # trend, no 0, questionary data
# file_path= "./data/NCDC/southeast_asia/singapore/paya_lebar/dat.txt" # questionary data
# file_path= "./data/NCDC/southeast_asia/singapore/seletar/dat.txt"
# file_path= "./data/NCDC/east_asia/cheongju_intl/dat.txt" # 2005-2009  may have problem, fit is good; numpy problem
# file_path= "./data/NCDC/east_asia/daegu_ab/dat.txt" # recent 5 year may have problem, but fit is generally good; numpy problem

# file_path, bandwidth= "./data/NCDC/oceania/auckland_intl/dat.txt", 0.9  # Good data, double mode
# file_path= "./data/NCDC/oceania/brisbane_archerfield/dat.txt" # high 0, few data 
# file_path= "./data/NCDC/oceania/narrandera/dat.txt" # high 0, few data
# file_path, bandwidth= "./data/NCDC/oceania/canberra/dat.txt", 0.7 # high 0, bad fit
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= './data/NCDC/oceania/horsham/dat.txt', 0.9, 4 # get the peak

# file_path, bandwidth= './data/NCDC/us/boston_16nm/dat.txt', 0.9 # Offshore, mixed type

# file_path, bandwidth, NUMBER_OF_GAUSSIAN  = './data/asos/bismarck_ND/hr_avg.csv', 1.1, 4
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/asos/aberdeen_SD/hr_avg.csv', 1.7, 2 # only to 2012
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/asos/minneapolis/hr_avg.csv', 1.1, 4
# file_path, bandwidth = './data/asos/lincoln_NE/hr_avg.csv', 0.9
# file_path, bandwidth = './data/asos/des_moines_IA/hr_avg.csv', 1.3
# file_path, bandwidth = './data/asos/springfield_IL/hr_avg.csv', 1.1 
# file_path, bandwidth = './data/asos/topeka/hr_avg.csv', 0.7 # High 0
# file_path, bandwidth = './data/asos/denver/hr_avg.csv', 1.3
# file_path, bandwidth= './data/asos/olympia/hr_avg.csv', 0.5 # might block

# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/NDAWN/baker/hr_avg.csv', 0.7, 4 
# file_path, bandwidth = './data/NDAWN/dickinson/hr_avg.csv', 0.6
# file_path = './data/NDAWN/rugby/hr_avg.csv'
# file_path = './data/NDAWN/bowman/hr_avg.csv'
# file_path = './data/NDAWN/grand_forks/hr_avg.csv'
# file_path = './data/NDAWN/williston/hr_avg.csv'
# file_path = './data/NDAWN/jamestown/hr_avg.csv'

# file_path, bandwidth, NUMBER_OF_GAUSSIAN = 'data/ECMWF/usa/47N123W/dat.csv', 0.7, 4 #good 
# file_path, bandwidth = 'data/ECMWF/venezuela/8N67W/dat.csv', 0.7 # good, but the data might be problematic.
# file_path, bandwidth = 'data/ECMWF/chile/52S75W/dat.csv', 1.9 # good
# file_path, bandwidth= 'data/ECMWF/iceland/65N17W/dat.csv', 1.9 # good
# file_path, bandwidth, NUMBER_OF_GAUSSIAN  = 'data/ECMWF/germany/49N9E/dat.csv', 0.9, 4 # good
# file_path, bandwdith = 'data/ECMWF/sudan/18N32E/dat.csv', 1.1 # good
# file_path, bandwidth = 'data/ECMWF/china/24N121E/dat.csv', 0.9 # good
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = 'data/ECMWF/australia/37S142E/dat.csv', 0.7, 4 # miss the peak, force bandwidth 0.7, default 0.9
In [3]:
if "cn_database" in file_path: 
    df = read_cn_database(file_path)
elif 'NCDC' in file_path:
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df.rename(columns={'Date':'date','Dir':'dir','Spd':'speed','Type':'type','I.1':'wind_type'}, inplace=True)
    df = df[['date','HrMn','type','dir','speed','wind_type' ]]
    df.dropna(subset=['dir','speed'], inplace=True)
    integer_data = True
elif 'NDAWN' in file_path:
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df['type']='default'
    df['wind_type']='default'
    df = df.dropna()
    convert_to_knot = False
    integer_data = False
elif 'asos' in file_path:
    # ASOS
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df['type']='default'
    df['wind_type']='default'
    df = df.dropna()
    convert_to_knot = False
    integer_data = False
    knot_unit = True
else:
    df = pd.read_csv(file_path, header=0, skipinitialspace=True)
    df.rename(columns={'U':'x','V':'y'}, inplace=True)
    df.x=-df.x
    df.y=-df.y
    df['speed']=np.sqrt(df.x**2+df.y**2)
    df['dir']=np.degrees(np.arctan2(df.y, df.x))%360
    df['time']=pd.to_datetime('1979-01-01T00:00:00Z')+pd.to_timedelta(df['time'], unit='h')
    df['date']=df['time'].dt.strftime('%Y%m%d')
    df['date']=df['date'].astype(int)
    df['HrMn']=df['time'].dt.strftime('%H00')
    df['type']='default'
    df['wind_type']='default'
    convert_to_knot = True
    integer_data = False
    cartesian = True
In [4]:
df
Out[4]:
date HrMn type dir speed wind_type
0 19790101 0000 FM-12 230 3.6 N
1 19790101 0100 FM-12 250 5.1 N
2 19790101 0200 FM-12 250 7.7 N
3 19790101 0300 FM-12 270 9.3 N
4 19790101 0400 FM-12 300 11.3 N
5 19790101 0500 FM-12 330 10.3 N
6 19790101 0600 FM-12 330 9.3 N
7 19790101 0700 FM-12 360 6.7 N
8 19790101 0800 FM-12 310 6.7 N
9 19790101 0900 FM-12 290 4.1 N
10 19790101 1000 FM-12 280 8.2 N
11 19790101 1100 FM-12 290 6.2 N
12 19790101 1200 FM-12 20 3.1 N
13 19790101 1300 FM-12 280 8.2 N
14 19790101 1400 FM-12 280 7.7 N
15 19790101 1500 FM-12 280 10.3 N
16 19790101 1600 FM-12 290 3.6 N
17 19790101 1700 FM-12 300 9.3 N
18 19790101 1800 FM-12 290 4.6 N
19 19790101 1900 FM-12 340 5.7 N
20 19790101 2000 FM-12 60 3.6 N
21 19790101 2100 FM-12 110 3.6 N
22 19790101 2200 FM-12 90 3.6 N
23 19790101 2300 FM-12 320 3.6 N
24 19790102 0000 FM-12 280 14.4 N
25 19790102 0100 FM-12 270 13.4 N
26 19790102 0200 FM-12 280 13.4 N
27 19790102 0300 FM-12 280 11.8 N
28 19790102 0400 FM-12 290 8.2 N
29 19790102 0500 FM-12 290 6.2 N
... ... ... ... ... ... ...
486093 20160801 1350 FM-15 170 3.1 V
486094 20160801 1400 FM-12 170 3.1 N
486095 20160801 1420 FM-15 150 2.1 V
486096 20160801 1450 FM-15 30 2.6 V
486097 20160801 1500 FM-12 30 2.6 N
486098 20160801 1520 FM-15 999 1.5 V
486099 20160801 1550 FM-15 130 2.1 N
486100 20160801 1600 FM-12 130 2.1 N
486101 20160801 1620 FM-15 100 2.1 V
486102 20160801 1650 FM-15 110 3.1 V
486103 20160801 1700 FM-12 110 3.1 N
486104 20160801 1720 FM-15 70 3.1 V
486105 20160801 1750 FM-15 50 4.6 V
486106 20160801 1800 FM-12 50 4.6 N
486107 20160801 1850 FM-15 60 5.1 N
486108 20160801 1900 FM-12 60 5.1 N
486109 20160801 1920 FM-15 60 5.1 N
486110 20160801 1950 FM-15 60 4.1 N
486111 20160801 2000 FM-12 60 4.1 N
486112 20160801 2020 FM-15 50 4.1 N
486113 20160801 2050 FM-15 50 3.6 N
486114 20160801 2100 FM-12 50 3.6 N
486115 20160801 2120 FM-15 50 3.6 N
486116 20160801 2150 FM-15 60 3.6 N
486117 20160801 2200 FM-12 60 3.6 N
486118 20160801 2220 FM-15 60 3.6 N
486119 20160801 2250 FM-15 60 3.1 N
486120 20160801 2300 FM-12 60 3.1 N
486121 20160801 2320 FM-15 70 4.1 N
486122 20160801 2350 FM-15 70 4.1 N

486123 rows × 6 columns

In [5]:
if 'NCDC' in file_path:
    lat, long = get_lat_long(file_path)
    print(lat,long)
    map_osm = folium.Map(location=[lat, long], zoom_start=4)
    folium.Marker([lat, long]).add_to(map_osm)
    display(map_osm)
56.499 -6.869
In [6]:
df['time']=pd.to_datetime(df["date"].astype(str).map(str) + df["HrMn"], format='%Y%m%d%H%M')
df.set_index(['time'], inplace=True)
df['HrMn']=df['HrMn'].astype(int)
df = df.query("(dir <= 999) & (speed < 100) ")['1970':'2016']
In [7]:
plot_speed_and_angle_distribution(df.speed, df.dir)
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
In [8]:
# Dir [10,360]=> [0,350]
df['dir'] = df['dir'].apply(lambda x: x%360 if x < 999 else x) 
# Convert Windrose coordianates to Polar Cooridinates 
if 'cartesian' in globals():
    df['dir_windrose'] = df['dir'].apply(lambda x: (90 - x)%360 if x < 999 else x)
else:
    df['dir_windrose'] = df['dir']
    df['dir'] = df['dir'].apply(lambda x: (90 - x)%360 if x < 999 else x)
display(df.describe())
df.plot(y='speed',legend=True,figsize=(20,5))
date HrMn dir speed dir_windrose
count 4.853830e+05 485383.000000 485383.000000 485383.000000 485383.000000
mean 2.000788e+07 1153.725957 205.073847 7.268022 204.830806
std 1.142918e+05 670.111036 129.567515 3.840033 134.306571
min 1.979010e+07 0.000000 0.000000 0.000000 0.000000
25% 1.991013e+07 600.000000 130.000000 4.600000 140.000000
50% 2.003032e+07 1120.000000 200.000000 6.700000 200.000000
75% 2.011102e+07 1700.000000 270.000000 9.800000 270.000000
max 2.016080e+07 2350.000000 999.000000 38.600000 999.000000
Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0xd177fd0>

1.3 General Data Info

1.3.1 Unit Detection

In [9]:
df['decimal'] = df.speed % 1
df.decimal.hist(alpha=0.5, label='m/s', figsize=(4, 3))

if 'convert_to_knot' not in globals():
    convert_to_knot = True if len(df.query('decimal >= 0.2')) / len(df) > 0.3 else False
    
if convert_to_knot:
    knot_unit = True
    df['speed'] = df['speed'] * 1.943845
    df['decimal'] = df.speed % 1
    df.decimal.hist(alpha=0.5, label='knot')
    # need more elaboration, some is not near an integer
    if integer_data:
        df['speed'] = df['speed'].apply(lambda x: int(round(x)))
    plt_configure(xlabel='Decimal', ylabel='Frequency', legend={'loc': 'best'}, title='Decimal Distribution')
else:
    if 'knot_unit' not in globals():
        knot_unit = False
    
df.drop(['decimal'], 1,inplace=True)
print(knot_unit)
True
In [10]:
dir_unit_text = ' (degree)'
if knot_unit == True:
    speed_unit_text = ' (knot)'
else: 
    speed_unit_text = ' (m/s)'

1.3.2 Sampling Type Selection

In [11]:
sample_type = df.query('date > 20000000')['type']
sample_type.value_counts().plot(
    kind = 'bar', title = 'Report Types Comprisement', figsize=(4,3))

report_type_most_used = sample_type.value_counts().argmax()
df = df.query("type==@report_type_most_used")

1.3.3 Sampling Time Selection

In [12]:
MID_YEAR = int(np.average(df.index.year))

df['HrMn'].value_counts().sort_index().plot(kind='bar', alpha=0.5,label='Overall')
df[str(MID_YEAR):]['HrMn'].value_counts().sort_index().plot(
    kind='bar', alpha=0.5, label='>= %s' %  MID_YEAR )

plt_configure(xlabel='Sampling Time', ylabel='Frequency', legend={'loc':'best'}, figsize=(8, 4), 
              title = 'Sampling Time Distribution, Overall and > %s ' %  MID_YEAR)
In [13]:
df['sample_time'] = df.HrMn % 100 
sample_time = df['2000':]['sample_time']
sample_times = sample_time.value_counts()[sample_time.value_counts() > 2000]
sample_times = sample_times.index.tolist()
# df = df.query("sample_time in @sample_times")
df = df.query("sample_time == @sample_times[0]")
df.drop(['sample_time'], 1,inplace=True)
print(sample_times)

df['HrMn'].value_counts().sort_index().plot(kind='bar', alpha=0.5, figsize=(10, 4))
[0]
Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0xd1312b0>

1.4 Error Data handling and Adjustment

1.4.1 Artefacts

wrong direction record

In [14]:
if integer_data:
    display(df.query("(dir % 10 >= 0.1) & (dir != 999)"))
    df = df.query('(dir % 10 <= 0.1) | (dir == 999)')
date HrMn type dir speed wind_type dir_windrose
time

sudden increase in speed

In [15]:
# sudden increse
df['incre'] = df.speed.diff(1)
df['incre'].fillna(0, inplace=True)
df['incre_reverse'] = df.speed.diff(-1)
df['incre_reverse'].fillna(0, inplace=True)

display(df.sort_values(by='speed',ascending=False).head(10))
df['incre'].plot(kind='hist', bins=arange(-15, 15), legend=True, figsize=(8, 3))
date HrMn type dir speed wind_type dir_windrose incre incre_reverse
time
1991-01-03 06:00:00 19910103 600 FM-12 200 70 N 250 44.0 46.0
2004-04-21 11:00:00 20040421 1100 FM-12 290 66 N 160 33.0 32.0
1983-10-18 14:00:00 19831018 1400 FM-12 210 61 N 240 31.0 24.0
1979-05-09 01:00:00 19790509 100 FM-12 140 61 N 310 45.0 48.0
1989-02-13 15:00:00 19890213 1500 FM-12 160 56 N 290 6.0 6.0
1993-01-21 21:00:00 19930121 2100 FM-12 180 56 N 270 8.0 6.0
1993-01-17 05:00:00 19930117 500 FM-12 180 55 N 270 12.0 3.0
2011-12-08 14:00:00 20111208 1400 FM-12 180 55 N 270 3.0 6.0
1981-02-27 04:00:00 19810227 400 FM-12 320 55 N 130 19.0 22.0
1979-06-06 04:00:00 19790606 400 FM-12 250 55 N 200 49.0 48.0
Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0xd21f2b0>
In [16]:
incre_threshold = 20 if knot_unit else 10
print('sudden increase number', len(df.query('(incre > @incre_threshold )&(incre_reverse > @incre_threshold )')))
df = df.query('(incre < @incre_threshold )|(incre_reverse < @incre_threshold )')

# Check the max speed
display(df.sort_values(by='speed',ascending=False).head(10))
df.drop(['incre', 'incre_reverse'], 1, inplace=True)
sudden increase number 22
date HrMn type dir speed wind_type dir_windrose incre incre_reverse
time
1993-01-21 21:00:00 19930121 2100 FM-12 180 56 N 270 8.0 6.0
1989-02-13 15:00:00 19890213 1500 FM-12 160 56 N 290 6.0 6.0
1993-01-17 05:00:00 19930117 500 FM-12 180 55 N 270 12.0 3.0
1981-02-27 04:00:00 19810227 400 FM-12 320 55 N 130 19.0 22.0
2011-12-08 14:00:00 20111208 1400 FM-12 180 55 N 270 3.0 6.0
1979-12-17 01:00:00 19791217 100 FM-12 210 54 N 240 1.0 2.0
1984-01-21 19:00:00 19840121 1900 FM-12 310 54 N 140 4.0 2.0
2011-12-08 12:00:00 20111208 1200 FM-12 190 54 N 260 2.0 2.0
2008-01-09 05:00:00 20080109 500 FM-12 170 53 N 280 13.0 15.0
1996-11-06 05:00:00 19961106 500 FM-12 210 53 N 240 34.0 5.0

1.4.2 Direction re-aligment

For some dataset, the 16 sectors are not record properly,

e.g. the sectors are [0,20,50 ...], need to redistribute the angle into 22.5, e.g. [0, 22.5, 45...]

In [17]:
display(df['dir'].value_counts().sort_index())
effective_column = df.query('dir < 999')['dir'].value_counts()[df['dir'].value_counts() > 30].sort_index()
if integer_data:
    SECTOR_LENGTH = 360/len(effective_column) 
else: 
    SECTOR_LENGTH = 10
print(len(effective_column), SECTOR_LENGTH)
0       2920
10      2831
20      4235
30      4516
40      4623
50      4010
60      5202
70      6821
80      8805
90      8527
100     8698
110     7263
120     7665
130     7077
140     8106
150     8517
160    10467
170    10216
180    11171
190    11178
200    12361
210    11430
220    12151
230    10247
240    10279
250     9925
260    12179
270    11267
280    12124
290    11058
300    12298
310    10270
320     8852
330     5863
340     4178
350     2837
999     4034
Name: dir, dtype: int64
36 10.0
In [18]:
df=realign_direction(df, effective_column)

1.4.3 0 Speed

In [19]:
with_too_many_zero, null_wind_frequency = is_with_too_many_zero(df['2005':])
delete_zero = with_too_many_zero
if delete_zero:
    df = df.query('(speed > 0)')
print(delete_zero, null_wind_frequency)
False 0.00653928680903
In [20]:
print(df.query('dir == 999')['speed'].value_counts())
df=fill_direction_999(df, SECTOR_LENGTH)
0    4031
2       2
3       1
Name: speed, dtype: int64

1.5 Time Shift Comparison

In [21]:
DIR_REDISTRIBUTE = 'even'
if DIR_REDISTRIBUTE == 'even':
    DIR_BIN = arange(-5, 360, 10) 
elif DIR_REDISTRIBUTE == 'round_up':
    DIR_BIN = arange(0, 360+10, 10) 

# Comparison between mid_year, looking for: 
# 1. Odd Even Bias
# 2. Time Shift of Wind Speed Distribution
bins = arange(0, df.speed.max() + 1)
df[:str(MID_YEAR)]['speed'].plot(
    kind='hist', alpha=0.5,bins=bins, label='< %s' % MID_YEAR)

df[str(MID_YEAR+1):]['speed'].plot(
    kind='hist', alpha=0.5,bins=bins, label='> %s' % MID_YEAR)

plt.suptitle('Speed Comparison between year < %s, > %s ' % (MID_YEAR, MID_YEAR), fontsize = 14)
plt_configure(xlabel='Speed', ylabel='Frequency', legend=True, figsize=(8, 3))
In [22]:
df[:str(MID_YEAR)]['dir'].plot(
    kind='hist', alpha=0.5,bins=DIR_BIN, label='< %s' % MID_YEAR)

df[str(MID_YEAR+1):]['dir'].plot(
    kind='hist', alpha=0.5,bins=DIR_BIN, label='> %s' % MID_YEAR)

plt.suptitle('Dir Comparison between year < %s, and > %s ' % (MID_YEAR, MID_YEAR), fontsize = 14)
plt_configure(xlabel='Dir', ylabel='Frequency', legend={'loc':'best'}, figsize=(8, 3), tight='x')
In [23]:
display(df[df['dir'].isnull()])
df.dropna(subset=['dir'], inplace=True)
date HrMn type dir speed wind_type dir_windrose
time
In [24]:
# Inspect the time shift of speed and degree distribution, and odd-even bias
check_time_shift(df, speed_unit_text=speed_unit_text, dir_unit_text=dir_unit_text)
1979 - 1980
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
1981 - 1985
1986 - 1990
1991 - 1995
1996 - 2000
2001 - 2005
2006 - 2010
2011 - 2015
In [25]:
df.resample('A').mean().plot(y='speed', figsize=(4,3))
plt.gca().set_ylim(bottom=0)
df.resample('M').mean().plot(y='speed', figsize=(20,4))
plt.gca().set_ylim(bottom=0)
Out[25]:
(0, 25.0)
In [26]:
%%time
for column in ['speed', 'dir']:
    if column == 'speed':
        bins = arange(0, df[column].max()+1, 1)
    else:
        bins = arange(0, 361, 10)
    den, _ = np.histogram(df[column], bins=bins, density=True)
    y_top=max(den)*1.2
    for year in arange(1980, 2016):
        end_year = year
        sub_df = df[str(year):str(end_year)]
        if len(sub_df) > 1000:
            plt.figure()
            df[column].hist(bins=bins, alpha=0.3, normed=True)
            sub_df[column].hist(bins=bins, alpha=0.5, figsize=(3,1.5), normed=True)
            plt.gca().set_ylim(top=y_top)
            plt_configure(title=str(year))
    align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
Wall time: 28.7 s
In [27]:
for column in ['speed', 'dir']:
    if column == 'speed':
        bins = arange(0, df[column].max()+1, 1)
    else:
        bins = arange(0, 361, 10)
    density_all, _ = np.histogram(df[column], bins=bins, density=True)
    df[column].hist(bins=bins, figsize=(5,3))

    R_squares = []
    years = []
    for year in arange(1980, 2016):
        start_year, end_year = year-1, year+1
        sub_df = df[str(start_year):str(end_year)]
        if len(sub_df) > 1000:
            density, _ = np.histogram(sub_df[column], bins=bins, density=True)
            y_mean = np.mean(density_all)
            SS_tot = np.sum(np.power(density_all - y_mean, 2))
            SS_res = np.sum(np.power(density_all - density, 2))

            R_square = 1 - SS_res / SS_tot
            R_squares.append(R_square)
            years.append(year)

    plt.figure()
    plot(years, R_squares)
    ylim = max(min(plt.gca().get_ylim()[0],0.85),0)
    plt.gca().set_ylim(bottom=ylim, top=1)
    plt_configure(figsize=(5,3))
    align_figures()

1.6 Re-distribute Direction and Speed (Optional)

e.g. Dir 50 -> -45 ~ 55, to make KDE result better

In [28]:
if integer_data:
    df = randomize_angle(df, DIR_REDISTRIBUTE, SECTOR_LENGTH)
In [29]:
if integer_data:
    if delete_zero:
        redistribute_method = 'down'
    else:
        redistribute_method = 'up'

    df, speed_redistribution_info = randomize_speed(df, redistribute_method)
Redistribute upward, e.g. 0 -> [0,1]

1.7 Generate (x,y) from (speed,dir)

In [30]:
# Cook orientation
# df['dir']= (df['dir'] + 180)%360
In [31]:
# There might be a small dot in the centre, which is due to too many zero (more than 1 speed) in center
# Scatter plot in matplot has performance issue, the speed is very slow
df['x'] = df['speed'] * cos(df['dir'] * pi / 180.0)
df['y'] = df['speed'] * sin(df['dir'] * pi / 180.0)

2. Re-select Data and Overview

2.1 Data Overview

In [32]:
## Summery of the data selection
print('Knot unit?', knot_unit)
print('Report type used:', report_type_most_used)
print('Sampling time used:', sample_times)
if 'speed_redistribution_info' in globals():
    print('Speed redistribution info:', speed_redistribution_info )

df_all_years = df # for later across-year comparison
if 'ECMWF' in file_path:
    df = df_all_years['2006':'2015']
else:
    df = df_all_years['2011':'2015']
# df = df.query('(HrMn == 0) and (speed >= 0.5) and (date%10000 > 900) and (date%10000 < 1000)' )
df.describe()
Knot unit? True
Report type used: FM-12
Sampling time used: [0]
Speed redistribution info: Redistribute upward, e.g. 0 -> [0,1]
Out[32]:
date HrMn dir speed dir_windrose x y
count 4.294100e+04 42941.000000 42941.000000 42941.000000 42941.000000 42941.000000 42941.000000
mean 2.013064e+07 1148.766913 199.031797 14.710244 207.088074 -3.266533 -3.481062
std 1.416565e+04 693.144069 85.930142 7.518509 127.680651 10.807243 11.547260
min 2.011010e+07 0.000000 -4.974596 0.004901 0.000000 -55.838431 -48.462423
25% 2.012033e+07 500.000000 139.421820 9.023588 140.000000 -10.570633 -11.554922
50% 2.013063e+07 1100.000000 207.120174 13.931340 210.000000 -3.087732 -3.740448
75% 2.014100e+07 1800.000000 268.786679 19.512330 270.000000 4.124950 4.737477
max 2.015123e+07 2300.000000 354.989194 55.848205 999.000000 36.553061 39.072912
In [33]:
df.plot(y='speed',legend=True,figsize=(20,5))
Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0x2333c5c0>
In [34]:
# Accumulation by month
df.resample('M').count().plot(y='date', kind='bar',figsize=(20,4))
Out[34]:
<matplotlib.axes._subplots.AxesSubplot at 0x259e2400>
In [35]:
# 90 degree is east
ax = WindroseAxes.from_ax()
viridis = plt.get_cmap('viridis')
ax.bar(df.dir_windrose, df.speed, normed=True, opening=0.8, edgecolor='white', nsector=36, cmap=viridis)
ax.set_legend()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)
In [36]:
if len(df) > 1000000:
    bins=arange(0,362)
    df['dir'].hist(bins=bins, normed=True,alpha=0.5,label='min')
    
    df = df_all_years.sample(n=500000, replace=True)    
    df['dir'].hist(bins=bins, normed=True,alpha=0.5,label='min resmapled')
    plt_configure(legend=True, figsize=(20,4))
In [37]:
x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(df.speed)

# 1. Histogram comparison
fig = plt.figure()
df['speed'].hist(bins=arange(0, df.speed.max()), alpha=0.5, label='Data', normed=True)             
plot(x, y_weibull, '-', color='black',label='Weibull')   
plt_configure(figsize=(4,3),xlabel='V',ylabel='PDF', legend=True)

# 2. CDF comparison
fig = plt.figure()
plot(log(x), log(-log(1-y_ecdf)),'o', label='ECDF')
plot(log(x), log(-log(1-y_cdf_weibull)),'-', label='Weibull')
plt_configure(xlabel="ln(V)", ylabel="ln(-ln(1-P)",legend={'loc':'best'}, figsize=(4,3))
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:11: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:12: RuntimeWarning: divide by zero encountered in log
In [38]:
df.plot(kind='scatter', x='x', y='y', alpha=0.05, s=2)
plt.gca().set_aspect('equal')
plt_configure(figsize=(3.2,3.2),xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)

2.2 Overview by Direction

In [39]:
if len(effective_column) == 16:
    rebinned_angle = 22.5
else: 
    rebinned_angle = 10
In [40]:
%%time
original_incre, incre = SECTOR_LENGTH, rebinned_angle
start, end = -original_incre/2 + incre/2, 360

max_speed = df.speed.max()
max_count = max_count_for_angles(df, start, end, incre)
plot_range = [0, max_speed, 0, max_count*1.05]

for angle in arange(start, end, incre):
    start_angle, end_angle = angle-incre/2, angle+incre/2
    sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)   
    
    fig = plt.figure()
    sub_df['speed'].hist(bins=arange(0, max_speed), alpha=0.5, label='Data')
    title ='%s (%s - %s), %s' % (angle, start_angle, end_angle, len(sub_df)) 
    plt.axis(plot_range)
    plt_configure(figsize=(3,1.5), title=title)
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
Wall time: 10.5 s

2.3 Overview by Month

In [41]:
%%time
current_df = df.query('speed>=1')
for month in arange(1, 13): 
    sub_df = current_df[current_df.index.month == month]
    ax = WindroseAxes.from_ax()
    ax.bar(sub_df.dir_windrose, sub_df.speed, normed=True, opening=0.8, edgecolor='white', nsector=36, cmap=plt.get_cmap('viridis'))
    plt_configure(figsize=(3,3), title='Month: %s'%(month))
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)
Wall time: 18.7 s
In [42]:
df.describe()
Out[42]:
date HrMn dir speed dir_windrose x y
count 4.294100e+04 42941.000000 42941.000000 42941.000000 42941.000000 42941.000000 42941.000000
mean 2.013064e+07 1148.766913 199.031797 14.710244 207.088074 -3.266533 -3.481062
std 1.416565e+04 693.144069 85.930142 7.518509 127.680651 10.807243 11.547260
min 2.011010e+07 0.000000 -4.974596 0.004901 0.000000 -55.838431 -48.462423
25% 2.012033e+07 500.000000 139.421820 9.023588 140.000000 -10.570633 -11.554922
50% 2.013063e+07 1100.000000 207.120174 13.931340 210.000000 -3.087732 -3.740448
75% 2.014100e+07 1800.000000 268.786679 19.512330 270.000000 4.124950 4.737477
max 2.015123e+07 2300.000000 354.989194 55.848205 999.000000 36.553061 39.072912

3. Create input data and configuration

In [43]:
SPEED_SET = array(list(zip(df.x, df.y)))
if 'NUMBER_OF_GAUSSIAN' not in globals():
    NUMBER_OF_GAUSSIAN = 3
FIT_METHOD = 'square_error'
DEFAULT_BANDWDITH = 1.5 if knot_unit else 0.7
fig_list = []
In [44]:
fit_limit = ceil(df['speed'].quantile(.95))
fitting_axis_range = arange(-fit_limit, fit_limit+1, 1)
print(fitting_axis_range)

FITTING_RANGE = []
for i in fitting_axis_range:
    for j in fitting_axis_range:
        FITTING_RANGE.append([i,j])
        
plot_limit = ceil(df['speed'].quantile(.95))
PLOT_AXIS_RANGE = arange(-plot_limit, plot_limit+1, 1)
[-29 -28 -27 -26 -25 -24 -23 -22 -21 -20 -19 -18 -17 -16 -15 -14 -13 -12
 -11 -10  -9  -8  -7  -6  -5  -4  -3  -2  -1   0   1   2   3   4   5   6
   7   8   9  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24
  25  26  27  28  29]

4. Kernel Density Estimation

In [45]:
sample = SPEED_SET
KDE_KERNEL = 'gaussian'
# KDE_KERNEL, bandwidth = 'tophat', 1
In [46]:
%%time
if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWDITH
    from sklearn.grid_search import GridSearchCV
    # from sklearn.model_selection import GridSearchCV  ## too slow

    # The bandwidth value sometimes would be too radical
    if knot_unit:
        bandwidth_range = arange(0.7,2,0.2)
    else:
        bandwidth_range = arange(0.4,1.1,0.1)

    # Grid search is unable to deal with too many data (a long time is needed)
    if len(sample) > 50000:    
        df_resample=df.sample(n=50000, replace=True)
        bandwidth_search_sample = array(list(zip(df_resample.x, df_resample.y)))
    else:
        bandwidth_search_sample = sample

    grid = GridSearchCV(neighbors.KernelDensity(kernel = KDE_KERNEL),
                    {'bandwidth': bandwidth_range}, n_jobs=-1, cv=4) 

    grid.fit(bandwidth_search_sample)
    bandwidth = grid.best_params_['bandwidth']
    
print(bandwidth)
1.9
Wall time: 0 ns
In [47]:
if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWDITH
    
points = FITTING_RANGE    
config = {'bandwidth': bandwidth, 
          'fitting_range': FITTING_RANGE,
          'fit_limit': fit_limit,
          'kde_kernel': KDE_KERNEL}
In [48]:
# very slow if the dataset is too large, e.g. 100,000
# kde returns log prob, need to convert it
kde_result, kde = fit_kde(df, config)
print('bandwidth:', bandwidth, len(kde_result))
print(kde_result[:5])
bandwidth: 1.9 3481
[  3.85220082e-06   4.56204332e-06   5.36130546e-06   6.51760944e-06
   8.12649435e-06]
In [49]:
# Plot jPDF
X = Y = PLOT_AXIS_RANGE
# Can't work if pass as generate_Z_from_X_Y(X,Y, exp(kde.score_samples())), need to use lambda
# see http://stackoverflow.com/questions/21035437/passing-a-function-as-an-argument-in-python
kde_Z = generate_Z_from_X_Y(X,Y, lambda coords: exp(kde.score_samples(coords)))
colorbar_lim = 0, kde_Z.max()

plot_3d_prob_density(X,Y,kde_Z)

fig_kde,ax1 = plt.subplots(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, ax=ax1)

with sns.axes_style({'axes.grid' : False}):
    from matplotlib import ticker
    fig_hist,ax2 = plt.subplots(figsize=(3.5,2.5))
    _,_,_,image = ax2.hist2d(df.x, df.y, bins=PLOT_AXIS_RANGE, cmap='viridis',)
    ax2.set_aspect('equal')
    cb = plt.colorbar(image)
    tick_locator = ticker.MaxNLocator(nbins=6)
    cb.locator = tick_locator
    cb.update_ticks()
    plt_configure(ax=ax2, xlabel='x'+speed_unit_text,ylabel='y'+speed_unit_text)
align_figures()

4.1 Bootstrap GOF limit

In [50]:
%%time
gof_kde=Parallel(n_jobs=-1)(delayed(resampled_kde)(df, kde_result, config) 
                                       for i in arange(20)) 
Wall time: 40.1 s
In [51]:
for gof_name in [ 'R_square', 'K_S','Chi_square']:
    plt.figure(figsize=(4,3))
    pd.DataFrame(gof_kde)[gof_name].hist()
    plt_configure(title=gof_name)
align_figures()

4.2 Bivariate Empirical Limit

In [52]:
%%time
gofs_mean_set_bivar = []
fig1, ax1 = plt.subplots(figsize=(3,2.5))
fig2, ax2 = plt.subplots(figsize=(3,2.5))
gofs_bivar_set={}

for year_length in [5, 10]:
    start_year, end_year = df_all_years.index.year[0], 2015-year_length+1
    df_standard = df_all_years[str(2015-year_length+1):'2015']
    kde_result_standard, _ = fit_kde(df_standard, config)
    gofs_bivar=Parallel(n_jobs=-1)(delayed(kde_gofs)(df_all_years[str(start_year):str(start_year+year_length-1)], kde_result_standard, config) 
                                   for start_year in arange(start_year, end_year+1)) 
    gofs_bivar_set[year_length] = gofs_bivar = pd.DataFrame(gofs_bivar, index=arange(start_year, end_year+1))
    if len(gofs_bivar)>0:
        gofs_bivar.plot(y='R_square', ax=ax1, label=year_length)
        gofs_bivar.plot(y='K_S', ax=ax2, label=year_length)
        year_lim = end_year-year_length-5, end_year-year_length
        gofs_mean = gofs_bivar.query('index >= @year_lim[0] & index <= @year_lim[1]').mean().to_dict()
        gofs_mean['year_lim']=year_lim
        gofs_mean_set_bivar.append(gofs_mean)
    
plt_configure(ax=ax1, ylabel='$\ R^2$', xlabel='start year')
plt_configure(ax=ax2, ylabel='K-S', xlabel='start year')
align_figures()
display(pd.DataFrame(gofs_mean_set_bivar).set_index('year_lim'))
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
year_lim
(2001, 2006) 0.935304 0.041641 5.006552e-09 0.056988 0.251911 0.951559
(1991, 1996) 0.018954 0.035548 2.699932e-09 0.039571 0.184266 0.974143
Wall time: 2min 55s

4.3 Univariate GOF Limit

In [53]:
def yearly_gof(df_all_years, start_year, end_year, density, y_ecdf, x, density_dir):
    df_previous = df_all_years[str(start_year):str(end_year)]
    _, _, density_expected, y_ecdf_previous, density_dir_expected = empirical_marginal_distribution(df_previous, x)
    
    # 1. Speed
    r_square = sector_r_square(density, density_expected)
    k_s = max(np.abs(y_ecdf - y_ecdf_previous))
    
    # 2. Direction
    r_square_dir = sector_r_square(density_dir, density_dir_expected)
    return {'year': start_year, 'r_square': r_square, 'k_s': k_s, 'r_square_dir': r_square_dir}
In [54]:
%%time
x = arange(0, df.speed.max() + 1)
fig1, ax1 = plt.subplots(figsize=(2.8,2.5))
fig2, ax2 = plt.subplots(figsize=(2.8,2.5))
fig3, ax3 = plt.subplots(figsize=(2.8,2.5))
gofs_mean_set = []
gofs_univar_set={}

for year_length in [5, 7, 10]:
    start_year, end_year =df_all_years.index.year[0], 2015-year_length+1
    df_standard = df_all_years[str(2015-year_length+1):str(2015)]
    _, _, density, y_ecdf, density_dir = empirical_marginal_distribution(df_standard, x)
    gofs = [yearly_gof(df_all_years, start_year, start_year+year_length-1, density, y_ecdf, x, density_dir) 
            for start_year in arange(start_year, end_year+1)]
    if len(gofs)>0:
        gofs = pd.DataFrame(gofs).set_index(['year'])  
        gofs_univar_set[year_length]=gofs
        ax1.plot(gofs.r_square, label=year_length)
        ax2.plot(gofs.k_s, label=year_length)
        ax3.plot(gofs.r_square_dir, label=year_length)
        year_lim = end_year-year_length-5, end_year-year_length
        gofs_mean = gofs.query('index >= @year_lim[0] & index <= @year_lim[1]').mean().to_dict()
        gofs_mean['year_lim']=year_lim
        gofs_mean_set.append(gofs_mean)
plt.tight_layout()


plt_configure(ax=ax1, ylabel='$\ R^2$', tight='xtight', legend=True)
plt_configure(ax=ax2, ylabel='K-S', tight='xtight', legend=True)
plt_configure(ax=ax3, ylabel='$\ R^2$', tight='xtight', legend=True)
align_figures()
display(pd.DataFrame(gofs_mean_set).set_index('year_lim'))
k_s r_square r_square_dir
year_lim
(2001, 2006) 0.026264 0.982004 0.788685
(1997, 2002) 0.021312 0.990797 0.876860
(1991, 1996) 0.021696 0.975261 0.935257
Wall time: 1.88 s

5. GMM by Expectation-maximization

In [55]:
sample=SPEED_SET
clf = mixture.GaussianMixture(n_components=NUMBER_OF_GAUSSIAN, covariance_type='full')
clf.fit(sample)
print(clf.converged_)
True
In [56]:
gmm_em_result = read_gmm_em_result(clf)
pretty_print_gmm(gmm_em_result)
Out[56]:
weight mean_x mean_y sig_x sig_y corr
1 0.348 -12.391 -3.573 8.415 9.887 -0.207
2 0.328 4.195 -12.389 9.017 8.323 -0.116
3 0.324 -1.005 5.622 7.158 8.558 0.150
In [57]:
fig,ax = plt.subplots(figsize=(3.5,3.5))
plot_gmm_ellipses(gmm_em_result, ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
GMM Plot Result
0.348263166412 [[-12.39080554  -3.57311126]] [  7.90181799  10.3016881 ] -154.046496414
0.327632513488 [[  4.19469305 -12.38863155]] [ 8.04255073  9.26724564] -117.708251079
0.3241043201 [[-1.00458331  5.62238558]] [ 6.92149954  8.75086632] 160.062716515
In [58]:
X = Y = PLOT_AXIS_RANGE
pdf_Z = generate_Z_from_X_Y(X,Y, lambda coords: exp(clf.score_samples(coords)))

def residule_between_kde_and_gmm(points):
    kde_vals = exp(kde.score_samples(points))
    gmm_vals = exp(clf.score_samples(points))
    return kde_vals - gmm_vals 

residual_Z = generate_Z_from_X_Y(X,Y, residule_between_kde_and_gmm)

plot_3d_prob_density(X,Y,pdf_Z)
plot_3d_prob_density(X,Y,residual_Z)
align_figures()

fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar_lim=colorbar_lim)
fig_em = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,pdf_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar_lim=colorbar_lim)
fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,residual_Z,
                     xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
align_figures()

Goodness-of-fit Statistics

In [59]:
points = FITTING_RANGE
gmm_pdf_result = exp(clf.score_samples(points))
gof_df(gmm_pdf_result, kde_result)
Out[59]:
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.986 0.012 0.016 1.486589e-09 0.031 0.137
In [60]:
gmm_em = group_gmm_param_from_gmm_param_array(gmm_em_result, sort_group = True)
mixed_model_pdf_em = generate_gmm_pdf_from_grouped_gmm_param(gmm_em)

6. GMM by Optimization

In [61]:
sample = SPEED_SET
points = FITTING_RANGE
max_speed = df.speed.max()
print(FIT_METHOD)
square_error
In [62]:
# from GMM,EM 
# GMM format: weight, meanx, meany, sigx, sigy, rho
x0 = gmm_em_result

cons = [
        # sum of every 6th element, which is the fraction of each gaussian
        {'type': 'eq', 'fun': lambda x: sum(x[::6]) - 1},
        # # limit the width/height ratio of elliplse, optional
#         {'type': 'ineq', 'fun': lambda x: width_height_ratios_set(x) - 1/3},
#         {'type': 'ineq', 'fun': lambda x: 3 - width_height_ratios_set(x)},
]

bonds = [(0., 0.99),(-fit_limit, fit_limit),(-fit_limit, fit_limit),
         (0., fit_limit),(0., fit_limit),(-0.99, 0.99)]*(len(x0)//6)

result = sp.optimize.minimize(
    lambda x0: GMM_fit_score(x0, kde_result, points, FIT_METHOD),
    x0,
    bounds = bonds,
    constraints=cons,
    tol = 0.000000000001,
    options = {"maxiter": 500})
result
Out[62]:
     fun: -20.744744145623141
     jac: array([  5.91536760e-01,  -2.38418579e-07,   0.00000000e+00,
        -4.76837158e-07,  -2.38418579e-07,   2.38418579e-07,
         5.91538191e-01,   0.00000000e+00,  -2.38418579e-07,
         0.00000000e+00,  -2.38418579e-07,  -4.76837158e-07,
         5.91527462e-01,   0.00000000e+00,  -2.38418579e-07,
         0.00000000e+00,   2.38418579e-07,  -2.38418579e-07,
         0.00000000e+00])
 message: 'Optimization terminated successfully.'
    nfev: 1760
     nit: 87
    njev: 87
  status: 0
 success: True
       x: array([  0.23617014,  -0.40238567,   7.37544144,   7.28474655,
         7.77692707,   0.10835169,   0.04975056,  11.35518098,
       -12.45919888,   5.79229146,   4.36678382,  -0.31815735,
         0.7140793 ,  -5.64005108,  -6.34504276,  11.4299447 ,
        10.40638216,  -0.33269638])

6.1 GMM Result

In [63]:
gmm = group_gmm_param_from_gmm_param_array(result.x, sort_group = True)
mixed_model_pdf = generate_gmm_pdf_from_grouped_gmm_param(gmm)
gmm_pdf_result = mixed_model_pdf(points)
pretty_print_gmm(gmm)
Out[63]:
weight mean_x mean_y sig_x sig_y corr
1 0.714 -5.640 -6.345 11.430 10.406 -0.333
2 0.236 -0.402 7.375 7.285 7.777 0.108
3 0.050 11.355 -12.459 5.792 4.367 -0.318
In [64]:
fig_gmm, ax = plt.subplots(figsize=(3.5,3.5))
plot_gmm_ellipses(gmm, ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
GMM Plot Result
0.71407930211 [[-5.64005108 -6.34504276]] [  8.8514474   12.67234408] -127.115034158
0.236170140221 [[-0.40238567  7.37544144]] [ 7.042966    7.99654656] 150.562268139
0.0497505576687 [[ 11.35518098 -12.45919888]] [ 3.93500145  6.09386617] -114.009766785

6.2 Bivariate Goodness-of-fit statistics

In [65]:
gof_df(gmm_pdf_result, kde_result)
Out[65]:
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.991 0.006 0.014 9.787507e-10 0.025 0.111
In [66]:
# Empirical Power
pd.DataFrame(gofs_mean_set_bivar).set_index('year_lim')
Out[66]:
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
year_lim
(2001, 2006) 0.935304 0.041641 5.006552e-09 0.056988 0.251911 0.951559
(1991, 1996) 0.018954 0.035548 2.699932e-09 0.039571 0.184266 0.974143
In [67]:
%%time
# Solid is Empirical, Dash is against previous data
fig1, ax1 = plt.subplots(figsize=(3,2.5))
fig2, ax2 = plt.subplots(figsize=(3,2.5))
gofs_mean_set_bivar_previous = []
prop_cycle = iter(mpl.rcParams['axes.color_cycle'])
for year_length in [10]:
    color = next(prop_cycle)
    start_year, end_year = df_all_years.index.year[0], 2015-year_length+1
    gofs_bivar_previous=Parallel(n_jobs=-1)(delayed(gmm_gofs_in_previous)(df_all_years[str(sub_start_year):str(sub_start_year+year_length-1)], gmm_pdf_result, config) 
       for sub_start_year in arange(start_year, end_year+1)) 
    gofs_bivar_previous=pd.DataFrame(gofs_bivar_previous, index=arange(start_year, end_year+1))    
    
    gofs_bivar_set[year_length].plot(y='R_square',ax=ax1, style='--', color=color, label='')
    gofs_bivar_set[year_length].plot(y='K_S',ax=ax2, style='--', color=color, label='')
    gofs_bivar_previous.plot(y='R_square', color=color, ax=ax1, label=year_length)
    gofs_bivar_previous.plot(y='K_S', color=color, ax=ax2, label=year_length)
    year_lim = end_year-year_length-5, end_year-year_length
    gofs_mean = gofs_bivar_previous.query('index >= @year_lim[0] & index <= @year_lim[1]').mean().to_dict()
    gofs_mean['year_lim']=year_lim
    gofs_mean_set_bivar_previous.append(gofs_mean)
    
plt_configure(ax=ax1, ylabel='$\ R^2$', xlabel='start year', legend=True)
plt_configure(ax=ax2, ylabel='K-S', xlabel='start year', legend=True)
align_figures()
display(pd.DataFrame(gofs_mean_set_bivar_previous).set_index('year_lim'))
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
year_lim
(1991, 1996) 0.055729 0.048765 4.258783e-09 0.042808 0.232745 0.959424
Wall time: 1min 38s
In [68]:
X = Y = PLOT_AXIS_RANGE
pdf_Z = generate_Z_from_X_Y(X,Y, mixed_model_pdf)# passing a function as an argument

def residule_between_kde_and_gmm(points):
    kde_vals = exp(kde.score_samples(points))
    gmm_vals = mixed_model_pdf(points)
    return kde_vals - gmm_vals 

residual_Z = generate_Z_from_X_Y(X,Y, residule_between_kde_and_gmm)

plot_3d_prob_density(X,Y,pdf_Z)
plot_3d_prob_density(X,Y,residual_Z)
align_figures()

fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
fig_gmm = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,pdf_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,residual_Z,  xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
align_figures()

6.3 Univariate Goodness-of-fit

In [69]:
def f(V,theta):
    return (mixed_model_pdf([[V*cos(theta),V*sin(theta)]]))*V

def f_em(V,theta):
    return (mixed_model_pdf_em([[V*cos(theta),V*sin(theta)]]))*V
In [70]:
%%time
x = arange(0, max_speed, 0.5)
# 1. Fit Weibull
_, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(df.speed, x=x)

# 2. GMM Model
y_ =[integrate.nquad(f, [[x_-0.01, x_+0.01],[0, 2*pi]]) for x_ in x]
y_gmm = array(list(zip(*y_))[0])/0.02

# 3. Plot Comparison
df['speed'].hist(bins=arange(0, df.speed.max()), alpha=0.5, label='Data')
plot(x, y_gmm*len(df.speed),'-', color='black', label='GMM')
plot(x, y_weibull*len(df.speed), '--', color='black', label='Weibull') 
print('Speed Distribution Comparison')
plt_configure(xlabel='Speed'+speed_unit_text,
              ylabel='Frequency',legend=True, figsize=(4, 2))
plt.gca().set_ylim(bottom = 0)
plt.tight_layout()
plt.locator_params(axis='y', nbins=5)

# 4. R square for GMM, Weibull
print(R_square_for_speed(df['speed'], f, weibull_params, f_em))
Speed Distribution Comparison
(0.9835626493574906, 0.98441351856183845, 0.98270898106811155)
Wall time: 44.2 s
In [71]:
%%time
y_ = [integrate.nquad(f, [[0, x_val],[0, 2*pi]]) for x_val in x]
y_cdf_gmm = array(list(zip(*y_))[0])

# 5.2. CDF Comaprison
plot(x, y_ecdf,'o', alpha=0.8, label='Data')
plot(x, y_cdf_gmm,'-', color='black',label='GMM')
plot(x, y_cdf_weibull,'--', color='black',label='Weibull')
plt_configure(xlabel = "V", ylabel='P', legend=True, figsize=(4,3))

plt.figure()
plot(log(x), log(-log(1-y_ecdf)),'o', label = 'Empirical')
plot(log(x), log(-log(1-y_cdf_weibull)),'--', label = 'Weibull')
plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black', label = 'GMM')
plt_configure(xlabel='ln(V)',ylabel='ln(-ln(1-P))',legend={'loc':'best'}, figsize=(4,3))
align_figures()

cdf_diff, cdf_diff_weibull= np.abs(y_ecdf - y_cdf_gmm), np.abs(y_ecdf - y_cdf_weibull)
print(cdf_diff.max(), cdf_diff_weibull.max()) 
print(x[cdf_diff.argmax()], x[cdf_diff_weibull.argmax()])
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:11: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:12: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:13: RuntimeWarning: divide by zero encountered in log
0.0111565439951 0.0384549636265
10.0 12.0
Wall time: 30.5 s
In [72]:
# Calculate Angle Distribution
x = linspace(0,2*pi, num=36+1)
y_ =[integrate.nquad(f, [[0, inf],[x_-pi/36, x_+pi/36]]) for x_ in x]
y = array(list(zip(*y_))[0])
density, _ = dir_hist(df['dir'], bins=arange(-5, 375, 10), density=True)

plt.bar(arange(0, 360, 10), density*10*len(df['dir']), width=10, alpha=0.5, label='Data')
plot(x/pi*180, y*len(df['dir']) ,'-', color='black', label='GMM')
plt_configure(xlabel='Direction'+dir_unit_text, ylabel='Frequency', 
              legend={'loc': 'best'} ,tight='xtight',figsize = (4,2))
plt.tight_layout()
dir_fig = plt.gcf()
print('Direction Distribution Comparison')
sector_r_square(density*10, y[:-1])
Direction Distribution Comparison
Out[72]:
0.95885317500684397
In [73]:
pd.DataFrame(gofs_mean_set).set_index('year_lim')
Out[73]:
k_s r_square r_square_dir
year_lim
(2001, 2006) 0.026264 0.982004 0.788685
(1997, 2002) 0.021312 0.990797 0.876860
(1991, 1996) 0.021696 0.975261 0.935257
In [74]:
fig1, ax1 = plt.subplots(figsize=(2.8,2.5))
fig2, ax2 = plt.subplots(figsize=(2.8,2.5))
fig3, ax3 = plt.subplots(figsize=(2.8,2.5))
prop_cycle = iter(mpl.rcParams['axes.color_cycle'])
gofs_mean_set = []

x = bins = arange(0, df['speed'].max() + 1)
_, _, density_speed_expected_gmm, y_cdf_gmm, density_dir_expected = gmm_marginal_distribution(f, x)
_, _, density_expected_weibull, y_cdf_weibull = fit_weibull(df.speed, x, weibull_params)
    
for year_length in [10]:
    start_year, end_year = df_all_years.index.year[0], 2015-year_length+1
    gofs = []
    for sub_start_year in arange(start_year, end_year+1):
        df_previous = df_all_years[str(sub_start_year):str(sub_start_year+year_length-1)]
        
        _, _, density_speed, y_ecdf_previous, density_dir = empirical_marginal_distribution(df_previous, x) 

        r_square_speed = sector_r_square(density_speed, density_speed_expected_gmm)
        r_square_speed_weibull = sector_r_square(density_speed, density_expected_weibull)

        k_s_speed = (np.abs(y_ecdf_previous - y_cdf_gmm)).max()
        k_s_speed_weibull = (np.abs(y_ecdf_previous - y_cdf_weibull)).max()
        
        r_square_dir = sector_r_square(density_dir*10, density_dir_expected[:-1])
        gofs.append({'year': sub_start_year, 
                     'r_square': r_square_speed, 'k_s': k_s_speed, 'r_square_dir': r_square_dir,
                      'r_square_weibulll': r_square_speed_weibull, 'k_s_weibulll': k_s_speed_weibull})

    gofs=pd.DataFrame(gofs).set_index(['year'])  
    year_lim = end_year-year_length-5, end_year-year_length
    gofs_mean = gofs.query('index >= @year_lim[0] & index <= @year_lim[1]').mean().to_dict()
    gofs_mean['year_lim']=year_lim
    gofs_mean_set.append(gofs_mean)

    color = next(prop_cycle)
    # GMM agains previous data
    gofs.plot(y='r_square', ax=ax1, label=year_length, color=color)
    gofs.plot(y='k_s', ax=ax2, color=color, label=year_length)
    gofs.plot(y='r_square_dir', ax=ax3, label=year_length, color=color)
    # Empricial agains previous data
    ax1.plot(gofs_univar_set[year_length].r_square, label='', color=color, linestyle='--')
    ax2.plot(gofs_univar_set[year_length].k_s, label='', color=color, linestyle='--')
    ax3.plot(gofs_univar_set[year_length].r_square_dir, label='', color=color, linestyle='--')
    color = next(prop_cycle)
    gofs.plot(y='r_square_weibulll', ax=ax1, label='Weibull', color=color)
    gofs.plot(y='k_s_weibulll', ax=ax2, label='Weibull', color=color)
    
    plt_configure(ax=ax1, ylabel='$\ R^2$', xlabel='start year', legend=True)
    plt_configure(ax=ax2, ylabel='K-S', xlabel='start year', legend=True)
    plt_configure(ax=ax3, ylabel='$\ R^2$', xlabel='start year', legend=True)

align_figures()
display(pd.DataFrame(gofs_mean_set).set_index('year_lim'))
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
k_s k_s_weibulll r_square r_square_dir r_square_weibulll
year_lim
(1991, 1996) 0.032412 0.060228 0.961456 0.821333 0.965117

6.4 Sectoral Comaprison

In [75]:
%%time
incre = max(SECTOR_LENGTH, 10)
density_collection=Parallel(n_jobs=-1)(delayed(direction_compare)(gmm, df, angle, incre) 
                                        for angle in arange(0, 360, incre))  
# This R square is computed as in paper 
# Comparison of bivariate distribution constructionapproaches for analysing wind speed anddirection data
# http://onlinelibrary.wiley.com/doi/10.1002/we.400/full
print(true_R_square(density_collection))
0.906998995011
Wall time: 15.1 s
In [76]:
# Turn to parrallel would run 50% faster
# Calculate Speed Distribution
def model_data_comparison(df, original_incre = 10, incre = 10):
    start, end = -original_incre/2 + incre/2, 360
    curve_collection = []
    max_speed = df.speed.max()
    
    # Find a max count for plotting histogram
    max_count = max_count_for_angles(df, start, end, incre)
    plot_range = [0, max_speed, 0, max_count*1.05]
    
    for angle in arange(start, end, incre):
        angle_radian, incre_radian = np.radians([angle, incre])  
        start_angle, end_angle = angle-incre/2, angle+incre/2
        
        # 0. Select data from observation
        sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)
        data_size = len(sub_df.speed)
        # 1. Get Weibull and ECDF
        x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(sub_df.speed)
        # 2. Get GMM PDF, CDF
        _, y_gmm, y_cdf_gmm, direction_prob = gmm_integration_in_direction(f, angle_radian-incre_radian/2, angle_radian+incre_radian/2, x)
        
        # 3. R square for GMM, Weibull
        bins = arange(0, sub_df.speed.max()+1)
        density, _ = np.histogram(sub_df['speed'], bins=bins, normed=True)
        density_expected_gmm_ =[integrate.nquad(f, [[x_, x_+1],[angle_radian-incre_radian/2, angle_radian+incre_radian/2]]) 
                            for x_ in bins[:-1]]
        density_expected_gmm = array(list(zip(*density_expected_gmm_ ))[0])/direction_prob
        R_square_gmm = sector_r_square(density, density_expected_gmm)
        
        density_expected_weibull = sp.stats.weibull_min.cdf(bins[1:], *weibull_params) - sp.stats.weibull_min.cdf(bins[:-1], *weibull_params) 
        R_square_weibull = sector_r_square(density, density_expected_weibull)

        # 4. K-S for GMM, Weibull
        cdf_diff, cdf_diff_weibull= np.abs(y_ecdf - y_cdf_gmm), np.abs(y_ecdf - y_cdf_weibull)
                
        # 5. Make Plots
        fig = plt.figure(figsize=(10,1.9))
        # 5.1. Frequency Comparison
        ax1 = fig.add_subplot(1,3,1)        
        sub_df['speed'].hist(bins=arange(0, sub_max_speed), alpha=0.5, label='Data')                  
        plot(x, y_gmm*data_size,'-', color='black', label='GMM')
        plot(x, y_weibull*data_size, '--', color='black',label='Weibull')   
        plt_configure(xlabel = "V", ylabel='Frequency', legend=True)
        plt.axis(plot_range)
        
        # 5.2. CDF Comaprison
        ax2 = fig.add_subplot(1,3,2)
        plot(x, y_ecdf,'o', alpha=0.8, label='Data')
        plot(x, y_cdf_gmm,'-', color='black',label='GMM')
        plot(x, y_cdf_weibull,'--', color='black',label='Weibull')
        plt.gca().set_xlim(right = max_speed)
        plt_configure(xlabel = "V", ylabel='P', legend=True)
        
        curves = {'direction': angle, 'datasize': data_size, 'weight': direction_prob, 'x': x, 
                  'gmm_pdf': y_gmm, 'gmm_cdf': y_cdf_gmm,
                  'weibull_pdf': y_weibull, 'weibull_cdf': y_cdf_weibull, 'ecdf': y_ecdf,
                  'max_cdf_diff_gmm': cdf_diff.max(), 'max_cdf_diff_weibull': cdf_diff_weibull.max(), 
                  'r_square_gmm': R_square_gmm, 'r_square_weibull': R_square_weibull}
        curve_collection.append(curves)
        
        plt.tight_layout()
        plt.show()
        print('%s (%s - %s) degree' % (angle, start_angle, end_angle))
        print('data size:', len(sub_df), 'weight', len(sub_df)/len(df))
        print('GMM', 'Weibull')
        print('R square', R_square_gmm,  R_square_weibull)
        print('max diff:', cdf_diff.max(), cdf_diff_weibull.max(), 
              'speed value:', x[cdf_diff.argmax()], x[cdf_diff_weibull.argmax()], 'y gmm', y_cdf_gmm[cdf_diff.argmax()])
        print(' ')
    return curve_collection
In [77]:
%%time
if len(effective_column) == 16:
    rebinned_angle = 22.5
else: 
    rebinned_angle = 20
    
curve_collection = model_data_comparison(df, SECTOR_LENGTH, rebinned_angle)
5.0 (-5.0 - 15.0) degree
data size: 594 weight 0.013832933560000931
GMM Weibull
R square 0.788025484386 0.82147456985
max diff: 0.0573402658265 0.0900792534984 speed value: 8.72565279527 4.98608731158 y gmm 0.486430777945
 
25.0 (15.0 - 35.0) degree
data size: 944 weight 0.02198365198761091
GMM Weibull
R square 0.779090310598 0.845918812541
max diff: 0.0968559527875 0.0870285504522 speed value: 8.93674531266 5.95783020844 y gmm 0.481533877721
 
45.0 (35.0 - 55.0) degree
data size: 1054 weight 0.024545306350574045
GMM Weibull
R square 0.827884867792 0.898757891174
max diff: 0.103679247449 0.0428256730213 speed value: 9.99340033032 3.33113344344 y gmm 0.461785648187
 
65.0 (55.0 - 75.0) degree
data size: 1737 weight 0.04045085116788151
GMM Weibull
R square 0.963018100982 0.974676009845
max diff: 0.0514541292848 0.0117001775236 speed value: 15.3354042633 17.5261763009 y gmm 0.694516881156
 
85.0 (75.0 - 95.0) degree
data size: 1902 weight 0.04429333271232622
GMM Weibull
R square 0.944329292549 0.945771180447
max diff: 0.0329049557206 0.0357831285644 speed value: 6.04873417785 16.1299578076 y gmm 0.144366575069
 
105.0 (95.0 - 115.0) degree
data size: 1818 weight 0.04233716028969982
GMM Weibull
R square 0.933884227469 0.935443558186
max diff: 0.0433839996819 0.0462865524505 speed value: 5.75796819872 15.3545818632 y gmm 0.135793240606
 
125.0 (115.0 - 135.0) degree
data size: 2020 weight 0.047041289210777576
GMM Weibull
R square 0.955817771721 0.941079667655
max diff: 0.0361187398218 0.0348442774799 speed value: 4.96420231924 4.96420231924 y gmm 0.10097022497
 
145.0 (135.0 - 155.0) degree
data size: 2487 weight 0.05791667636990289
GMM Weibull
R square 0.952042734278 0.943840031934
max diff: 0.0340597468498 0.0330253150357 speed value: 16.0383200578 16.0383200578 y gmm 0.61411073968
 
165.0 (155.0 - 175.0) degree
data size: 3147 weight 0.0732866025476817
GMM Weibull
R square 0.950228871005 0.955941815595
max diff: 0.0384344496695 0.0183019160242 speed value: 24.3116312651 8.10387708838 y gmm 0.817936697455
 
185.0 (175.0 - 195.0) degree
data size: 3477 weight 0.08097156563657111
GMM Weibull
R square 0.936071312698 0.946825488814
max diff: 0.0617029396986 0.0691096745726 speed value: 20.575654556 20.575654556 y gmm 0.711377301889
 
205.0 (195.0 - 215.0) degree
data size: 3613 weight 0.08413870193987098
GMM Weibull
R square 0.951358938681 0.958962447718
max diff: 0.0325245721766 0.114491114391 speed value: 23.9836137313 15.9890758209 y gmm 0.861469338701
 
225.0 (215.0 - 235.0) degree
data size: 3655 weight 0.08511678815118419
GMM Weibull
R square 0.942673702944 0.946709019185
max diff: 0.0254128092316 0.0850490121314 speed value: 9.48859071086 16.605033744 y gmm 0.232042731124
 
245.0 (235.0 - 255.0) degree
data size: 2995 weight 0.06974686197340536
GMM Weibull
R square 0.895602488666 0.923145517783
max diff: 0.0647751460224 0.089107155351 speed value: 12.7904115795 12.7904115795 y gmm 0.399665588535
 
265.0 (255.0 - 275.0) degree
data size: 3614 weight 0.08416198970680701
GMM Weibull
R square 0.94248981415 0.947870317396
max diff: 0.0227943918807 0.102558767445 speed value: 20.4431636778 15.3323727583 y gmm 0.744432466037
 
285.0 (275.0 - 295.0) degree
data size: 3152 weight 0.07340304138236184
GMM Weibull
R square 0.939633081645 0.957227203443
max diff: 0.0419867574641 0.0472033323222 speed value: 12.3451781901 12.3451781901 y gmm 0.335234054719
 
305.0 (295.0 - 315.0) degree
data size: 3473 weight 0.080878414568827
GMM Weibull
R square 0.969042705553 0.975947402732
max diff: 0.0312836935742 0.0188496263964 speed value: 10.2827689492 23.1362301357 y gmm 0.211531318106
 
325.0 (315.0 - 335.0) degree
data size: 2229 weight 0.05190843250040754
GMM Weibull
R square 0.932919996283 0.955217154357
max diff: 0.0609199110404 0.0167309151211 speed value: 11.211513985 26.9076335641 y gmm 0.283441221045
 
345.0 (335.0 - 355.0) degree
data size: 858 weight 0.019980904031112457
GMM Weibull
R square 0.956952941715 0.946133656853
max diff: 0.0183427946863 0.0288578365133 speed value: 24.3127386526 13.8929935158 y gmm 0.944361168018
 
Wall time: 1min 4s
In [78]:
diff_df = pd.DataFrame(curve_collection) 

gmm_mean, weibull_mean = plot_sectoral_comparison(diff_df.r_square_gmm, diff_df.r_square_weibull, 
                                                  diff_df.direction, diff_df.datasize)
plt_configure(ylabel="$\ R^2$", xlabel='Direction'+dir_unit_text)
ylim = min(plt.gca().get_ylim()[0],0.75)
plt.gca().set_ylim(top=1, bottom=ylim)
plt.tight_layout()
print(gmm_mean, weibull_mean)
0.9349146033418321 0.9454992123132648
In [79]:
gmm_mean, weibull_mean = plot_sectoral_comparison(diff_df.max_cdf_diff_gmm, diff_df.max_cdf_diff_weibull, 
                                                  diff_df.direction, diff_df.datasize)
plt_configure(ylabel="K-S", xlabel='Direction'+dir_unit_text)
ylim = max(plt.gca().get_ylim()[1],0.25)
plt.gca().set_ylim(top=ylim, bottom=0)
plt.tight_layout()
print(gmm_mean, weibull_mean)
0.042976425019988836 0.05709926738942663
In [80]:
# Compare direction weight with previous figure
display(dir_fig)

6.5 Insufficient-fit Sector Investigation

(1) Data Variability, by Bootstrap (Resampling)

In [81]:
angle =  max_diff_angle = diff_df.ix[diff_df['max_cdf_diff_gmm'].idxmax()]['direction']
incre = rebinned_angle
In [82]:
FRACTION = 1

# Select data from observation
start_angle, end_angle = angle-incre/2, angle+incre/2
angle_radian, incre_radian = radians(angle), radians(incre)  
sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)
In [83]:
x = arange(0, sub_max_speed, 0.5)
_, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(sub_df.speed, x)
_, y_gmm, y_cdf_gmm, direction_prob = gmm_integration_in_direction(f, angle_radian-incre_radian/2, angle_radian+incre_radian/2, x)

fig = plt.figure(figsize=(10,1.9))
ax1 = fig.add_subplot(1,3,1)   
ax2 = fig.add_subplot(1,3,2)   

# 1. Data
bins=arange(0, sub_max_speed)
sub_df['speed'].hist(ax=ax1, bins=bins, alpha=0.5, label='Data', normed=True)  

# 2. GMM
ax1.plot(x, y_gmm,'-', color='black', label='GMM')
ax2.plot(x, y_cdf_gmm,'-', color = 'black', label='GMM')

# 3. Weilbull 
ax1.plot(x, y_weibull,'--',color='black',label='Weibull')
ax2.plot(x, y_cdf_weibull,'--',label='Weibull')

# 4. Data Resampled
count_collection = []
for i in range(1,100):
    sub_df_resampled = sub_df.sample(frac=FRACTION, replace=True)    
    resampled_count, _ = np.histogram(sub_df_resampled['speed'], bins=bins, normed=True) 
    count_collection.append(resampled_count)
    
    ecdf = sm.distributions.ECDF(sub_df_resampled.speed)
    y_ecdf = ecdf(x) 
    ax2.plot(x, y_ecdf,':', label='Data Resampled')
    if i == 1: 
        plt_configure(ax=ax2, xlabel = "V", ylabel='P', legend={'loc':'best'})

print('%s (%s - %s) Degree Speed Distribution' % (angle, start_angle, end_angle))
count_collection = np.array(count_collection)
mx, mn = np.max(count_collection,0), np.min(count_collection,0)
ax1.plot(bins[1:]-0.5, mx, ':', color='blue')
ax1.plot(bins[1:]-0.5, mn, ':', color='blue', label='Resample limit')
ax1.set_ylim(bottom = 0)
# plt_configure(ax=ax1, xlabel='$V$',ylabel='Frequency',legend={'loc':'best'})
plt_configure(ax=ax1, xlabel='V', ylabel='Frequency',legend={'loc':'best'})
ax1.locator_params(axis='y', nbins=5)
ax2.locator_params(axis='y', nbins=5)
plt.tight_layout()
diff = abs(y_ecdf - y_cdf_gmm)
print(diff.max(), x[diff.argmax()], y_cdf_gmm[diff.argmax()])
45.0 (35.0 - 55.0) Degree Speed Distribution
0.10988472772 10.0 0.462221534139

(2) Time Variability

In [84]:
fig_time_variability_3d = plt.figure()
ax1 = fig_time_variability_3d.gca(projection='3d')

fig_time_variability_cdf,ax2 = plt.subplots(figsize=(3,1.8))

ax2.plot(x, y_cdf_gmm,'-', color='black', label = 'GMM')
ax2.plot(x, y_cdf_weibull,'--', label='Weibull')

# 3. Data
prop_cycle=iter(mpl.rcParams['axes.color_cycle'])
for start_time in range(2001, 2015, 5):
    end_time = start_time + 4 
    df_other_years = df_all_years[str(start_time):str(end_time)]
    df_other_years_at_angle, sub_max_speed_other_year = select_df_by_angle(df_other_years, start_angle, end_angle)
    if len(df_other_years_at_angle) > 0 :
        
        ecdf = sm.distributions.ECDF(df_other_years_at_angle.speed)
        y_ecdf = ecdf(x)
        ax2.plot(x, y_ecdf,':', label = start_time)
        
        count, division = np.histogram(df_other_years_at_angle['speed'], normed=True,
                                       bins=arange(0, sub_max_speed_other_year))
        ax1.bar(left=division[:-1], height=count, zs=start_time, zdir='x', 
                color=next(prop_cycle), alpha=0.8)
        x_3d = start_time*np.ones_like(x)
        ax1.plot(x_3d, x, y_gmm, '-', color='black', label='GMM'  if start_time == 2011 else '')
        ax1.plot(x_3d, x, y_weibull, '--', color='blue', label='Weibull' if start_time == 2011 else '')
        
print('%s (%s - %s) Degree Speed Distribution' % (angle, start_angle, end_angle))
ax1.set_ylim(bottom = 0)
ax1.set_zlabel('Frequency')
plt_configure(ax=ax1, xlabel='Time',ylabel='V', legend=True)
plt_configure(ax=ax2, xlabel = "V", ylabel='P', legend={'loc':'best'})

ax1.set_zlim(bottom = 0)
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
45.0 (35.0 - 55.0) Degree Speed Distribution

(3) Adjacent Sector Variability

In [85]:
incre = rebinned_angle
angle_group = [max_diff_angle-incre, max_diff_angle, max_diff_angle+incre]
In [86]:
fig_adjecent_variability_3d = plt.figure()
ax1 = fig_adjecent_variability_3d.gca(projection='3d')
fig_adjecent_variability_cdf, ax2 = plt.subplots(figsize=(3,1.8))
fig_adjecent_variability_weibull, ax3 = plt.subplots(figsize=(3,1.8))

legend_3d = False
prop_cycle=iter(mpl.rcParams['axes.color_cycle'])

curve_df = pd.DataFrame(curve_collection)

for angle in angle_group:
    curves = curve_df.query('direction == @angle%360').T.to_dict()
    curves = curves[list(curves)[0]]
    data_size, x =  curves['datasize'], curves['x']
    y_gmm, y_cdf_gmm =  curves['gmm_pdf'], curves['gmm_cdf'] 
    y_weibull, y_cdf_weibull, y_cdf = curves['weibull_pdf'],  curves['weibull_cdf'], curves['ecdf']

    linestyle = '-' if angle == max_diff_angle else ':'
    alpha = 0.7 if angle == max_diff_angle else 0.3

    ax2.plot(x, y_gmm*data_size, linestyle, label=angle)        
    ax3.plot(x, y_weibull*data_size, linestyle, label=angle)

    start_angle, end_angle = angle-incre/2, angle+incre/2
    sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)

    x_3d = angle*np.ones_like(x)
    ax1.plot(x_3d, x, y_gmm*data_size, color='black', label='GMM')
    ax1.plot(x_3d, x, y_weibull*data_size, color='blue', linestyle='--',label='Weibull')

    count, division = np.histogram(sub_df['speed'], bins=arange(0, sub_max_speed))
    ax1.bar(left=division[:-1], height=count, zs=angle, zdir='x', color=next(prop_cycle), alpha=0.8)

    if legend_3d == False:
        ax1.legend()
        legend_3d = True
        
plt_configure(ax=ax1, xlabel='Direction', ylabel='Speed')   
plt_configure(ax=ax2, xlabel='V',ylabel='Frequency',legend={'loc':'best'})
plt_configure(ax=ax3, xlabel='V',ylabel='Frequency',legend={'loc':'best'})
ax1.set_zlabel('Frequency')
ax1.set_zlim(bottom = 0)
ylim = max(ax1.get_ylim()[1],ax3.get_ylim()[1])
ax2.set_ylim(bottom = 0, top=ylim)
ax3.set_ylim(bottom = 0, top=ylim)

print(max_diff_angle) 
print('GMM, Weibull, Histogram')
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
45.0
GMM, Weibull, Histogram

7. Result Variability & Cross-Validation

In [87]:
if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWDITH    
if 'FIT_METHOD' not in globals():
    FIT_METHOD = 'square_error'       
if 'KDE_KERNEL' not in globals():
    KDE_KERNEL = 'gaussian'
    
config = {'bandwidth': bandwidth, 
          'fitting_range': FITTING_RANGE,
          'fit_limit': fit_limit,
          'kde_kernel': KDE_KERNEL}

print(bandwidth, FIT_METHOD)
1.9 square_error

7.1 Variability of the Result

In [88]:
%%time
results = Parallel(n_jobs=-1)(delayed(resampled_fitting)(df, FIT_METHOD, NUMBER_OF_GAUSSIAN, config) for i in range(12))                        
for result in results:
    display(pretty_print_gmm(result['gmm']))
    fig,ax = plt.subplots(figsize=(3.5,3.5))
    plot_gmm_ellipses(result['gmm'],ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
    plt.show()
    
    display(gof_df(result['gmm_pdf_result'], result['kde_result']))
    display(gof_df(result['gmm_pdf_result'], kde_result))
    print('')
weight mean_x mean_y sig_x sig_y corr
1 0.688 -5.493 -6.704 11.390 10.450 -0.328
2 0.261 -1.045 6.785 7.872 8.186 0.186
3 0.052 11.263 -12.319 5.494 4.377 -0.257
GMM Plot Result
0.687559179281 [[-5.49329003 -6.70430955]] [  8.89430029  12.64205079] -127.628793007
0.260904739449 [[-1.04540022  6.7851384 ]] [ 7.22960477  8.75861197] 140.945702284
0.0515360812701 [[ 11.26304851 -12.31884172]] [ 4.04731886  5.74078196] -114.149233342
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.989 0.005 0.013 1.127741e-09 0.026 0.120
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.007 0.012 1.052174e-09 0.026 0.116

weight mean_x mean_y sig_x sig_y corr
1 0.727 -5.523 -6.177 11.392 10.417 -0.327
2 0.224 -0.293 7.933 7.230 7.863 0.115
3 0.049 11.755 -12.594 5.709 4.318 -0.274
GMM Plot Result
0.726995009657 [[-5.5233348  -6.17686125]] [  8.88295523  12.62517954] -127.341160038
0.224266226959 [[-0.29343737  7.9326475 ]] [ 6.99522931  8.0725642 ] 153.05170633
0.0487387633838 [[ 11.75473982 -12.59359142]] [ 3.98776651  5.94412341] -112.058245066
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.988 0.006 0.015 1.194037e-09 0.027 0.123
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.005 0.013 1.012398e-09 0.026 0.113

weight mean_x mean_y sig_x sig_y corr
1 0.471 -8.047 -8.280 11.279 10.828 -0.508
2 0.406 -2.102 4.562 8.274 8.639 0.132
3 0.123 9.256 -11.278 7.142 5.448 -0.334
GMM Plot Result
0.470994819752 [[-8.04698014 -8.2799435 ]] [  7.74182277  13.5840478 ] -132.705767109
0.406146856734 [[-2.10240155  4.5619939 ]] [ 7.85087378  9.02515585] 144.076941563
0.122858323513 [[  9.25640056 -11.27753783]] [ 4.8496647   7.56111862] -115.331773162
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.989 0.00492 0.018 1.100566e-09 0.027 0.118
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.989 0.008 0.016 1.091865e-09 0.027 0.118

weight mean_x mean_y sig_x sig_y corr
1 0.711 -5.534 -6.392 11.624 10.515 -0.344
2 0.238 -0.550 7.168 7.362 7.897 0.087
3 0.050 10.290 -11.897 6.873 4.158 -0.456
GMM Plot Result
0.711472541109 [[-5.53367465 -6.39220122]] [  8.88802334  12.91027449] -126.870983847
0.238088288229 [[-0.54986952  7.16843498]] [ 7.19499113  8.04943292] 154.415225813
0.0504391706617 [[ 10.29024966 -11.89709263]] [ 3.52258445  7.21941302] -110.513750284
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.006 0.015 1.018253e-09 0.025 0.114
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.006 0.015 1.041925e-09 0.026 0.115

weight mean_x mean_y sig_x sig_y corr
1 0.718 -5.660 -6.433 11.487 10.311 -0.339
2 0.235 -0.491 7.573 7.139 7.771 0.087
3 0.047 11.325 -12.399 5.436 4.277 -0.316
GMM Plot Result
0.717799058692 [[-5.65995447 -6.43344968]] [  8.77471621  12.69946836] -126.140535317
0.235267196757 [[-0.49061723  7.5733865 ]] [ 6.9955071   7.90053036] 157.195165042
0.0469337445506 [[ 11.3251964  -12.39917248]] [ 3.8303001   5.76002268] -116.26359409
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.989 0.006 0.016 1.160819e-09 0.027 0.121
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.007 0.016 1.000897e-09 0.026 0.113

weight mean_x mean_y sig_x sig_y corr
1 0.686 -5.785 -6.763 11.417 10.454 -0.347
2 0.260 -0.606 6.715 7.493 8.040 0.131
3 0.054 11.549 -12.574 5.871 4.672 -0.283
GMM Plot Result
0.686413662077 [[-5.78459534 -6.76276286]] [  8.777548    12.75059687] -127.874597338
0.259592187002 [[-0.60593959  6.71480855]] [ 7.17208252  8.32746973] 149.152429676
0.0539941509207 [[ 11.54883962 -12.57398983]] [ 4.25801887  6.17764642] -115.454323383
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.005 0.014 1.044346e-09 0.026 0.115
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.007 0.013 1.016373e-09 0.026 0.114

weight mean_x mean_y sig_x sig_y corr
1 0.670 -5.649 -6.891 11.850 10.372 -0.359
2 0.281 -1.105 6.573 7.685 8.032 0.134
3 0.050 10.471 -11.822 6.481 4.151 -0.399
GMM Plot Result
0.669615397861 [[-5.64866351 -6.89053898]] [  8.77237428  13.07863354] -124.788844121
0.280609398202 [[-1.10492454  6.57269365]] [ 7.28700308  8.39464909] 144.171927445
0.0497752039368 [[ 10.47055786 -11.82219967]] [ 3.63767728  6.78216933] -110.442693578
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.005 0.015 1.059466e-09 0.026 0.116
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.009 0.014 1.013753e-09 0.026 0.113

weight mean_x mean_y sig_x sig_y corr
1 0.740 -5.471 -6.038 11.375 10.370 -0.324
2 0.217 -0.229 7.907 7.206 7.703 0.099
3 0.043 11.797 -12.951 5.222 4.234 -0.269
GMM Plot Result
0.739799354217 [[-5.47110501 -6.03786666]] [  8.86896683  12.58025123] -127.030930747
0.217430387332 [[-0.22917049  7.90731797]] [ 7.00076002  7.88969024] 152.008366584
0.0427702584514 [[ 11.79685116 -12.9511928 ]] [ 3.87635669  5.49250352] -115.946597962
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.989 0.006 0.015 1.106198e-09 0.026 0.118
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.007 0.014 1.000428e-09 0.025 0.113

weight mean_x mean_y sig_x sig_y corr
1 0.712 -5.429 -6.553 11.496 10.437 -0.359
2 0.246 -0.537 7.293 7.476 7.850 0.103
3 0.042 11.507 -12.306 5.726 4.241 -0.350
GMM Plot Result
0.711554602417 [[-5.42881656 -6.55267156]] [  8.71603571  12.84996358] -127.452986067
0.246037813129 [[-0.53689663  7.29257933]] [ 7.21596571  8.08936792] 147.687187163
0.0424075844537 [[ 11.50699345 -12.30625488]] [ 3.75791229  6.05426749] -114.475876621
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.006 0.015 1.079032e-09 0.026 0.117
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.006 0.013 1.006971e-09 0.026 0.113

weight mean_x mean_y sig_x sig_y corr
1 0.493 -3.175 3.093 8.636 9.358 0.192
2 0.335 -9.169 -9.076 11.221 11.445 -0.565
3 0.172 7.840 -11.305 8.273 5.968 -0.356
GMM Plot Result
0.49308145703 [[-3.17493847  3.09286815]] [ 8.01657253  9.89318909] 146.370637065
0.334800773492 [[-9.16939347 -9.07604371]] [  7.46966817  14.18146993] -136.001941189
0.172117769478 [[  7.84019158 -11.3053554 ]] [ 5.28940911  8.72237173] -113.482854662
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.007 0.015 1.064294e-09 0.026 0.116
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.989 0.008 0.015 1.131268e-09 0.027 0.120

weight mean_x mean_y sig_x sig_y corr
1 0.717 -5.707 -6.126 11.355 10.374 -0.337
2 0.232 -0.172 7.496 7.082 7.834 0.070
3 0.051 11.172 -12.975 5.658 4.452 -0.312
GMM Plot Result
0.717178825775 [[-5.70663744 -6.12621104]] [  8.7850832   12.62454306] -127.490155075
0.231838279034 [[-0.1723078   7.49594719]] [ 6.99467207  7.91132403] 162.592138959
0.0509828951913 [[ 11.17229324 -12.9751107 ]] [ 3.99748122  5.98821699] -116.074663228
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.989 0.006 0.017 1.127022e-09 0.026 0.119
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.007 0.016 1.014841e-09 0.026 0.114

weight mean_x mean_y sig_x sig_y corr
1 0.710 -5.653 -6.356 11.453 10.447 -0.345
2 0.241 -0.237 7.328 7.420 7.804 0.094
3 0.049 11.147 -12.296 6.192 4.462 -0.363
GMM Plot Result
0.709731045908 [[-5.65312428 -6.35588469]] [  8.79998278  12.76167073] -127.530423941
0.241262787269 [[-0.2374763   7.32791786]] [ 7.19887845  8.0089058 ] 149.155611219
0.0490061668234 [[ 11.14678717 -12.29562043]] [ 3.93700372  6.53833403] -113.708571623
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.988 0.007 0.017 1.190130e-09 0.028 0.123
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.007 0.014 1.002483e-09 0.026 0.113
Wall time: 1min 8s

7.2 Cross-validation, to select the number of Gaussian

In [89]:
%%time
from sklearn.cross_validation import train_test_split, KFold

## 5-fold cross validation
gaussian_number_range = arange(1,6)
CV_result_train_all,CV_result_test_all =[],[]
number_of_fold = 4
print('Number of train/test dataset', len(df)*(number_of_fold-1)/number_of_fold, len(df)/number_of_fold) 

for number_of_gaussian in gaussian_number_range:
    print( '  ')
    print('Number of gaussian', number_of_gaussian)
    
    kf = KFold(len(df), n_folds=number_of_fold, shuffle=True) 

    CV_result = Parallel(n_jobs=-1)(delayed(fit_per_fold)(df, train_index, test_index, FIT_METHOD, number_of_gaussian, config) for train_index, test_index in kf)                        

    CV_result_train, CV_result_test = list(zip(*CV_result))
    CV_result_train, CV_result_test = list(CV_result_train), list(CV_result_test)
        
    CV_result_train_all.append(CV_result_train)
    CV_result_test_all.append(CV_result_test)
    
    print('Train')
    pretty_pd_display(CV_result_train)
    print('Test')
    pretty_pd_display(CV_result_test)
D:\ProgramData\Anaconda3\lib\site-packages\sklearn\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
Number of train/test dataset 32205.75 10735.25
  
Number of gaussian 1
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.062469 0.019924 5.350377e-09 0.059632 0.260615 0.947810
1 0.062292 0.018399 5.362382e-09 0.058142 0.261058 0.948375
2 0.066086 0.019802 5.657632e-09 0.059947 0.267918 0.945807
3 0.061779 0.019452 5.525019e-09 0.060576 0.264820 0.946274
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.067863 0.021880 6.255553e-09 0.061044 0.281867 0.941254
1 0.070187 0.027470 6.207802e-09 0.065657 0.280302 0.939451
2 0.060856 0.021016 5.478100e-09 0.059449 0.263995 0.945651
3 0.072897 0.020582 5.727103e-09 0.057661 0.269744 0.945708
  
Number of gaussian 2
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.034445 0.010712 1.809173e-09 0.034250 0.151604 0.982512
1 0.034708 0.010830 1.943468e-09 0.035132 0.157047 0.981370
2 0.034889 0.011027 1.997945e-09 0.035628 0.159322 0.980714
3 0.033935 0.010626 1.866860e-09 0.035378 0.153885 0.981739
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.045143 0.013985 2.596174e-09 0.041072 0.181382 0.974932
1 0.041531 0.015904 2.198200e-09 0.038831 0.167165 0.978223
2 0.043766 0.014074 2.191555e-09 0.034918 0.166631 0.978776
3 0.034190 0.014944 2.560080e-09 0.038210 0.180528 0.976146
  
Number of gaussian 3
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.015700 0.005900 9.380340e-10 0.025020 0.109096 0.990910
1 0.012876 0.006096 1.024627e-09 0.025406 0.114074 0.990121
2 0.014344 0.005492 1.031631e-09 0.026054 0.114407 0.989919
3 0.015784 0.005896 1.031165e-09 0.025592 0.114474 0.990117
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.018196 0.009982 1.548447e-09 0.030192 0.140342 0.985150
1 0.019255 0.013347 1.382486e-09 0.030189 0.132421 0.986543
2 0.027109 0.008208 1.535017e-09 0.030004 0.139739 0.985661
3 0.018467 0.012120 1.335067e-09 0.029987 0.130005 0.986763
  
Number of gaussian 4
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.013403 0.006921 8.238057e-10 0.022796 0.102244 0.992037
1 0.013063 0.004751 8.226715e-10 0.022889 0.102227 0.992078
2 0.012740 0.004917 8.432076e-10 0.023635 0.103495 0.991847
3 0.013797 0.005853 8.594867e-10 0.023893 0.104429 0.991643
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.018029 0.011554 1.471643e-09 0.031647 0.136791 0.985801
1 0.021355 0.009679 1.282428e-09 0.028673 0.127493 0.987464
2 0.020548 0.016413 1.381238e-09 0.029006 0.132316 0.986677
3 0.017311 0.008149 1.197104e-09 0.026713 0.123394 0.988626
  
Number of gaussian 5
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.020477 0.004130 6.559205e-10 0.020464 0.091271 0.993668
1 0.023143 0.006479 7.239591e-10 0.021584 0.095870 0.992977
2 0.023781 0.005210 6.805455e-10 0.020936 0.092956 0.993446
3 0.013801 0.007078 7.350556e-10 0.022123 0.096586 0.992869
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.024174 0.009123 1.281137e-09 0.029368 0.127472 0.987585
1 0.020568 0.013988 1.006147e-09 0.025647 0.113027 0.990390
2 0.037469 0.005840 9.964278e-10 0.025549 0.112465 0.990261
3 0.026880 0.010732 1.420613e-09 0.028975 0.134372 0.986428
Wall time: 3min 6s
In [90]:
train_scores_mean, train_scores_std = generate_mean_std_gof(CV_result_train_all)
print('Train gof mean, std')
display(train_scores_mean)

test_scores_mean, test_scores_std = generate_mean_std_gof(CV_result_test_all)
print('Test gof mean, std')
display(test_scores_mean)
Train gof mean, std
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
1 0.063156 0.019394 5.473853e-09 0.059574 0.263603 0.947067
2 0.034494 0.010799 1.904362e-09 0.035097 0.155464 0.981584
3 0.014676 0.005846 1.006364e-09 0.025518 0.113013 0.990267
4 0.013251 0.005611 8.372929e-10 0.023303 0.103099 0.991901
5 0.020300 0.005724 6.988702e-10 0.021277 0.094171 0.993240
Test gof mean, std
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
1 0.067951 0.022737 5.917140e-09 0.060953 0.273977 0.943016
2 0.041157 0.014727 2.386502e-09 0.038258 0.173926 0.977019
3 0.020757 0.010914 1.450254e-09 0.030093 0.135627 0.986029
4 0.019311 0.011449 1.333103e-09 0.029010 0.129999 0.987142
5 0.027273 0.009921 1.176081e-09 0.027384 0.121834 0.988666
In [91]:
prop_cycle=mpl.rcParams['axes.color_cycle']
gaussian_number_range = train_scores_mean.index
for column, column_name in zip(['R_square','K_S','Chi_square'],["$\ R^2$", "K-S", "$\widetilde{\chi^2} $"]):
    plot(gaussian_number_range, train_scores_mean[column],
             '--', label = 'training', color=prop_cycle[0])
    plt.fill_between(gaussian_number_range, 
                     train_scores_mean[column] - train_scores_std[column],
                     train_scores_mean[column] + train_scores_std[column], 
                     alpha=0.2, color=prop_cycle[0])
    
    plot(gaussian_number_range, test_scores_mean[column],
             '-', label = 'test',color=prop_cycle[1])
    plt.fill_between(gaussian_number_range, 
                 test_scores_mean[column] - test_scores_std[column],
                 test_scores_mean[column] + test_scores_std[column], 
                 alpha=0.2,color=prop_cycle[1])
    plt.xticks(gaussian_number_range)
    print(column)
    plt.locator_params(axis='y', nbins=5)
    plt_configure(xlabel='Number of Gaussian Distributions', ylabel=column_name, 
                  figsize=(3,2), legend={'loc':'best'})
    if column == 'R_square':
        plt.gca().set_ylim(top=1)
    if column == 'K_S' or column == 'Chi_square':
        plt.gca().set_ylim(bottom=0)
    plt.show()
R_square
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
K_S
Chi_square
In [ ]:
# for fig in [fig_hist, fig_kde, fig_em, fig_gmm]:
#     display(fig)
# for fig in [fig_time_variability_3d, fig_time_variability_cdf, fig_time_variability_weibull, 
#             fig_adjecent_variability_3d, fig_adjecent_variability_cdf, fig_adjecent_variability_weibull,]:
#     display(fig)
In [ ]:
import time
save_notebook()
time.sleep(3)
location_name = get_location_name(file_path)
print(location_name)
current_file = 'GMM.ipynb'
output_file = './output_HTML/'+location_name+'.html' 

output_HTML(current_file, output_file)