Modeling the Joint Distribution of Wind Speed and Direction using Gaussian Mixture Models

OEN method: Harris & Cook, "The parent wind speed distribution: Why Weibull?", http://www.sciencedirect.com/science/article/pii/S0167610514001056

Gaussian Mixture Models, http://scikit-learn.org/stable/modules/mixture.html
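
The approach used throughout this notebook: convert each (speed, direction) record into Cartesian wind-vector components (x, y) and fit a bivariate Gaussian Mixture Model to the resulting point cloud, with a kernel density estimate as the reference density. A minimal, self-contained sketch of that idea on synthetic data (the notebook itself tunes the number of components and the KDE bandwidth per station):

import numpy as np
from sklearn.mixture import GaussianMixture

# Synthetic (speed, direction) records, only to illustrate the shape of the pipeline
rng = np.random.RandomState(0)
speed = rng.weibull(2.0, size=1000) * 6        # speeds, arbitrary units
direction = rng.uniform(0, 360, size=1000)     # directions in degrees

# Convert to Cartesian wind-vector components
x = speed * np.cos(np.radians(direction))
y = speed * np.sin(np.radians(direction))
sample = np.column_stack([x, y])

# Fit a bivariate GMM to the wind-vector cloud (3-4 components are used below)
gmm = GaussianMixture(n_components=3, covariance_type='full').fit(sample)
print(gmm.weights_)
print(gmm.means_)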

1. Set up

1.1 Environment

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

from import_file import *
from helpers.parallel_helper import *
load_libs()

plt.rcParams['axes.autolimit_mode'] = 'round_numbers'
plt.rcParams['axes.xmargin'] = 0.
plt.rcParams['axes.ymargin'] = 0.
mpl.rcParams['patch.force_edgecolor'] = True

1.2 Read Data

In [2]:
# file_path, bandwidth= './data/NCDC/europe/uk/marham/dat.txt', 1.7
# file_path, bandwidth= './data/NCDC/europe/uk/tiree/dat.txt', 1.9
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/NCDC/europe/uk/boscombe_down/dat.txt', 1.5, 4
# file_path, bandwidth= './data/NCDC/europe/uk/middle_wallop/dat.txt', 1.3
# file_path, bandwidth= './data/NCDC/europe/uk/bournemouth/dat.txt',1.3 # 4?
# file_path= "./data/NCDC/europe/uk/weybourne/dat.txt"
# file_path= "./data/NCDC/europe/uk/skye_lusa/dat.txt" # 
# file_path= "./data/NCDC/europe/uk/wattisham/dat.txt"
# file_path= "./data/NCDC/europe/uk/south_uist_range/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/europe/uk/holbeach/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/europe/uk/cambridge/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/europe/us/baltimore/dat.txt" # time too short
# file_path= "./data/NCDC/europe/uk/bealach_na_ba/dat.txt" # time too short
# file_path= "./data/NCDC/europe/uk/benbecula/dat.txt" # truncate (untruncate in m/s), 4?
# file_path= './data/NCDC/europe/uk/southhamption/dat.txt' # high 0, trend

file_path, bandwidth, NUMBER_OF_GAUSSIAN = "./data/NCDC/europe/germany/landsberg_lech/dat.txt", 0.9, 4 
# file_path, bandwidth= "./data/NCDC/europe/germany/neuburg/dat.txt", 0.7
# file_path, bandwidth= "./data/NCDC/europe/germany/laupheim/dat.txt", 0.7 # double peak, 4?, trend
# file_path, bandwidth= './data/NCDC/europe/germany/niederstetten/dat.txt', 0.9 # get the peak
# file_path, bandwidth= "./data/NCDC/europe/germany/holzdorf/dat.txt", 0.9 # 2008 year
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= './data/NCDC/europe/france/nantes/dat.txt', 0.9, 4 # unit shift, one direction deviates strongly
# file_path, convert_to_knot= './data/NCDC/europe/france/pau_pyrenees/dat.txt', True # unit shift, 2; force using knot 
# file_path= "./data/NCDC/europe/france/avord/dat.txt" # try 4, initial speed (should be good with m/s), incompete dataset
# file_path= "./data/NCDC/europe/france/vatry/dat.txt"  # double peak, initial speed, incompete dataset
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= "./data/NCDC/europe/spain/valladolid/dat.txt", 1.1, 4
# file_path= './data/NCDC/europe/spain/jerez/dat.txt' # high 0
# file_path, bandwidth= "./data/NCDC/europe/spain/barayas/dat.txt", 0.7 # not good fit
# file_path, bandwidth= './data/NCDC/europe/spain/malaga/dat.txt', 0.7 # directions blocked?
# file_path, bandwidth= './data/NCDC/europe/spain/tenerife_sur/dat.txt', 0.7 # directions blocked?
# file_path, bandwidth= './data/NCDC/europe/spain/almeria/dat.txt', 0.7 # negative dimensions?
# file_path, bandwidth= './data/NCDC/europe/greece/eleftherios_intl/dat.txt',0.7 # some direction might be blocked
# file_path= './data/NCDC/europe/ciampino/dat.txt' # try 4, bandwidth?
# file_path= "./data/NCDC/europe/huspel_aws/dat.txt"  # integer, 4?
# file_path= './data/NCDC/gibraltar/dat.txt' # bad fit

# MidEast
# file_path, bandwidth= './data/NCDC/mideast/uae/al_maktoum/dat.txt', 1.1
# file_path= './data/NCDC/mideast/uae/sharjah_intl/dat.txt' 
# file_path= './data/NCDC/mideast/uae/dubai_intl/dat.txt' 
# file_path= './data/NCDC/mideast/uae/abu_dhabi_intl/dat.txt' # Time shift
# file_path= './data/NCDC/mideast/uae/bateen/dat.txt' # Time shift
# file_path= './data/NCDC/mideast/buraimi/dat.txt' # not good dataset
# file_path= './data/NCDC/mideast/turkey/konya/dat.txt' 
# file_path= './data/NCDC/mideast/turkey/sivas/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/turkey/balikesir/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/turkey/bartin/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/iran/chahbahar/dat.txt'
# file_path= './data/NCDC/mideast/iran/zabol/dat.txt' # Problematic data
# file_path= './data/NCDC/mideast/iran/torbat_heydarieh/dat.txt' # Unusable

# file_path, bandwidth = "./data/NCDC/cn/shanghai/hongqiao_intl/dat.txt", 0.6
# file_path, bandwidth= "./data/NCDC/cn/shanghai/pudong/dat.txt", 0.8
# file_path, bandwidth= "./data/NCDC/cn/hefei_luogang/dat.txt", 0.6 # few 0, trend
# file_path, bandwidth= "./data/NCDC/cn/nanjing_lukou/dat.txt", 0.5
# file_path= "./data/NCDC/cn/zhengzhou_xinzheng/dat.txt" 
# file_path= "./data/NCDC/cn/tianjin/binhai/dat.txt" # few 0, trend, stationary speed, unstationary direction
# file_path= "./data/NCDC/cn/tianjin/tianjing/dat.txt" # 16 sectors
# file_path= "./data/NCDC/cn/shijiazhuang_zhengding/dat.txt" 
# file_path= "./data/NCDC/cn/henan_gushi/dat.txt" # 16 sectors, fit not very good
# file_path= "./data/NCDC/cn/nanning_wuxu/dat.txt" # numpy priblem, unstationary speed
# file_path= './data/NCDC/cn/macau/dat.txt'  
# file_path= "./data/NCDC/cn/hk_intl/dat.txt" # few 0
# file_path= './data/NCDC/cn/gaoqi/dat.txt' 

# file_path= './data/NCDC/southeast_asia/malaysia/mersing/dat.txt' # 2 mode, paper comparison
# file_path= './data/NCDC/southeast_asia/malaysia/penang/dat.txt'
# file_path= './data/NCDC/southeast_asia/malaysia/butterworth/dat.txt' # 2 mode 
# file_path= "./data/NCDC/southeast_asia/malaysia/bsultan_mahmud/dat.txt" # stable
# file_path= "./data/NCDC/southeast_asia/malaysia/bsultan_ismail/dat.txt" # 
# file_path= "./data/NCDC/southeast_asia/singapore/changi/dat.txt" # trend, no 0, questionary data
# file_path= "./data/NCDC/southeast_asia/singapore/paya_lebar/dat.txt" # questionary data
# file_path= "./data/NCDC/southeast_asia/singapore/seletar/dat.txt"
# file_path= "./data/NCDC/east_asia/cheongju_intl/dat.txt" # 2005-2009  may have problem, fit is good; numpy problem
# file_path= "./data/NCDC/east_asia/daegu_ab/dat.txt" # recent 5 year may have problem, but fit is generally good; numpy problem

# file_path, bandwidth= "./data/NCDC/oceania/auckland_intl/dat.txt", 0.9  # Good data, double mode
# file_path= "./data/NCDC/oceania/brisbane_archerfield/dat.txt" # high 0, few data 
# file_path= "./data/NCDC/oceania/narrandera/dat.txt" # high 0, few data
# file_path, bandwidth= "./data/NCDC/oceania/canberra/dat.txt", 0.7 # high 0, bad fit
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= './data/NCDC/oceania/horsham/dat.txt', 0.9, 4 # get the peak

# file_path, bandwidth= './data/NCDC/us/boston_16nm/dat.txt', 0.9 # Offshore, mixed type

# file_path, bandwidth, NUMBER_OF_GAUSSIAN  = './data/asos/bismarck_ND/hr_avg.csv', 1.1, 4
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/asos/aberdeen_SD/hr_avg.csv', 1.7, 2 # only to 2012
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/asos/minneapolis/hr_avg.csv', 1.1, 4
# file_path, bandwidth = './data/asos/lincoln_NE/hr_avg.csv', 0.9
# file_path, bandwidth = './data/asos/des_moines_IA/hr_avg.csv', 1.3
# file_path, bandwidth = './data/asos/springfield_IL/hr_avg.csv', 1.1 
# file_path, bandwidth = './data/asos/topeka/hr_avg.csv', 0.7 # High 0
# file_path, bandwidth = './data/asos/denver/hr_avg.csv', 1.3
# file_path, bandwidth= './data/asos/olympia/hr_avg.csv', 0.5 # some directions might be blocked

# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/NDAWN/baker/hr_avg.csv', 0.7, 4 
# file_path, bandwidth = './data/NDAWN/dickinson/hr_avg.csv', 0.6
# file_path = './data/NDAWN/rugby/hr_avg.csv'
# file_path = './data/NDAWN/bowman/hr_avg.csv'
# file_path = './data/NDAWN/grand_forks/hr_avg.csv'
# file_path = './data/NDAWN/williston/hr_avg.csv'
# file_path = './data/NDAWN/jamestown/hr_avg.csv'

# file_path, bandwidth, NUMBER_OF_GAUSSIAN = 'data/ECMWF/usa/47N123W/dat.csv', 0.7, 4 #good 
# file_path, bandwidth = 'data/ECMWF/venezuela/8N67W/dat.csv', 0.7 # good, but the data might be problematic.
# file_path, bandwidth = 'data/ECMWF/chile/52S75W/dat.csv', 1.9 # good
# file_path, bandwidth= 'data/ECMWF/iceland/65N17W/dat.csv', 1.9 # good
# file_path, bandwidth, NUMBER_OF_GAUSSIAN  = 'data/ECMWF/germany/49N9E/dat.csv', 0.9, 4 # good
# file_path, bandwidth = 'data/ECMWF/sudan/18N32E/dat.csv', 1.1 # good
# file_path, bandwidth = 'data/ECMWF/china/24N121E/dat.csv', 0.9 # good
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = 'data/ECMWF/australia/37S142E/dat.csv', 0.7, 4 # miss the peak, force bandwidth 0.7, default 0.9
In [3]:
if "cn_database" in file_path: 
    df = read_cn_database(file_path)
elif 'NCDC' in file_path:
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df.rename(columns={'Date':'date','Dir':'dir','Spd':'speed','Type':'type','I.1':'wind_type'}, inplace=True)
    df = df[['date','HrMn','type','dir','speed','wind_type' ]]
    df.dropna(subset=['dir','speed'], inplace=True)
    integer_data = True
elif 'NDAWN' in file_path:
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df['type']='default'
    df['wind_type']='default'
    df = df.dropna()
    convert_to_knot = False
    integer_data = False
elif 'asos' in file_path:
    # ASOS
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df['type']='default'
    df['wind_type']='default'
    df = df.dropna()
    convert_to_knot = False
    integer_data = False
    knot_unit = True
else:
    df = pd.read_csv(file_path, header=0, skipinitialspace=True)
    df.rename(columns={'U':'x','V':'y'}, inplace=True)
    df.x=-df.x
    df.y=-df.y
    df['speed']=np.sqrt(df.x**2+df.y**2)
    df['dir']=np.degrees(np.arctan2(df.y, df.x))%360
    df['time']=pd.to_datetime('1979-01-01T00:00:00Z')+pd.to_timedelta(df['time'], unit='h')
    df['date']=df['time'].dt.strftime('%Y%m%d')
    df['date']=df['date'].astype(int)
    df['HrMn']=df['time'].dt.strftime('%H00')
    df['type']='default'
    df['wind_type']='default'
    convert_to_knot = True
    integer_data = False
    cartesian = True
In [4]:
df
Out[4]:
date HrMn type dir speed wind_type
0 19900101 0000 FM-12 999 0.0 C
1 19900101 0050 FM-15 999 0.0 C
2 19900101 0150 FM-15 999 0.0 C
3 19900101 0300 SY-MT 999 0.0 C
4 19900101 0400 FM-15 999 0.0 C
5 19900101 0450 FM-15 999 0.0 C
6 19900101 0600 FM-15 999 0.0 C
7 19900101 0650 FM-15 999 0.0 C
8 19900101 0750 FM-15 999 0.0 C
9 19900101 0900 FM-15 999 0.0 C
10 19900101 1050 FM-15 999 0.0 C
11 19900101 1150 FM-15 360 0.5 N
12 19900101 1250 FM-15 20 1.5 N
13 19900101 1400 FM-15 30 1.0 N
14 19900101 1450 FM-15 30 0.5 N
15 19900101 1600 FM-15 999 0.0 C
16 19900101 1630 FM-15 999 0.0 C
17 19900101 1650 FM-15 60 0.5 N
18 19900101 1750 FM-15 190 0.5 N
19 19900101 1900 FM-15 180 0.5 N
20 19900101 2000 FM-15 170 1.0 N
21 19900101 2100 SY-MT 170 0.5 N
22 19900101 2200 FM-15 180 0.5 N
23 19900101 2250 FM-15 190 1.0 N
24 19900101 2350 FM-15 180 0.5 N
25 19900102 0000 SY-MT 190 1.0 N
26 19900102 0150 FM-15 260 0.5 N
27 19900102 0250 FM-15 190 1.5 N
28 19900102 0300 FM-12 190 1.5 N
29 19900102 0400 FM-15 200 1.0 N
... ... ... ... ... ... ...
333361 20160131 2320 FM-15 250 9.8 N
333362 20160131 2346 FM-16 240 11.8 N
333363 20160201 0020 FM-15 240 12.9 N
333364 20160201 0120 FM-15 240 13.4 N
333365 20160201 0220 FM-15 240 13.4 N
333366 20160201 0320 FM-15 240 13.4 N
333367 20160201 0420 FM-15 240 13.4 N
333368 20160201 0520 FM-15 240 13.9 N
333369 20160201 0620 FM-15 250 14.9 N
333370 20160201 0625 FM-16 250 13.9 N
333371 20160201 0720 FM-15 250 14.9 N
333372 20160201 0820 FM-15 240 11.3 N
333373 20160201 0920 FM-15 240 11.8 N
333374 20160201 1020 FM-15 240 11.8 N
333375 20160201 1048 FM-16 240 11.8 N
333376 20160201 1120 FM-15 240 11.8 N
333377 20160201 1211 FM-16 240 9.3 N
333378 20160201 1220 FM-15 240 12.4 N
333379 20160201 1320 FM-15 250 12.4 N
333380 20160201 1420 FM-15 250 12.9 N
333381 20160201 1520 FM-15 240 12.4 N
333382 20160201 1604 FM-16 240 9.3 N
333383 20160201 1620 FM-15 240 10.3 N
333384 20160201 1720 FM-15 240 6.7 N
333385 20160201 1820 FM-15 270 7.2 N
333386 20160201 1920 FM-15 260 6.7 N
333387 20160201 2020 FM-15 260 8.8 N
333388 20160201 2120 FM-15 250 9.8 N
333389 20160201 2220 FM-15 250 8.8 N
333390 20160201 2320 FM-15 240 9.8 N

333391 rows × 6 columns

In [5]:
if 'NCDC' in file_path:
    lat, long = get_lat_long(file_path)
    print(lat,long)
    map_osm = folium.Map(location=[lat, long], zoom_start=4)
    folium.Marker([lat, long]).add_to(map_osm)
    display(map_osm)
48.071 10.906
In [6]:
df['time']=pd.to_datetime(df["date"].astype(str).map(str) + df["HrMn"], format='%Y%m%d%H%M')
df.set_index(['time'], inplace=True)
df['HrMn']=df['HrMn'].astype(int)
df = df.query("(dir <= 999) & (speed < 100) ")['1970':'2016']
In [7]:
plot_speed_and_angle_distribution(df.speed, df.dir)
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
In [8]:
# Dir [10,360]=> [0,350]
df['dir'] = df['dir'].apply(lambda x: x%360 if x < 999 else x) 
# Convert windrose coordinates to polar coordinates
if 'cartesian' in globals():
    df['dir_windrose'] = df['dir'].apply(lambda x: (90 - x)%360 if x < 999 else x)
else:
    df['dir_windrose'] = df['dir']
    df['dir'] = df['dir'].apply(lambda x: (90 - x)%360 if x < 999 else x)
display(df.describe())
df.plot(y='speed',legend=True,figsize=(20,5))
date HrMn dir speed dir_windrose
count 3.332600e+05 333260.000000 333260.000000 333260.000000 333260.000000
mean 2.003340e+07 1166.548146 206.404069 3.225488 208.309536
std 7.020275e+04 687.794956 169.049772 2.319920 167.356881
min 1.990010e+07 0.000000 0.000000 0.000000 0.000000
25% 1.998072e+07 600.000000 110.000000 1.500000 110.000000
50% 2.004020e+07 1120.000000 200.000000 2.600000 210.000000
75% 2.009082e+07 1720.000000 250.000000 4.100000 250.000000
max 2.016020e+07 2359.000000 999.000000 34.500000 999.000000
Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0xcc40390>

1.3 General Data Info

1.3.1 Unit Detection

In [9]:
df['decimal'] = df.speed % 1
df.decimal.hist(alpha=0.5, label='m/s', figsize=(4, 3))

if 'convert_to_knot' not in globals():
    convert_to_knot = True if len(df.query('decimal >= 0.2')) / len(df) > 0.3 else False
    
if convert_to_knot:
    knot_unit = True
    df['speed'] = df['speed'] * 1.943845
    df['decimal'] = df.speed % 1
    df.decimal.hist(alpha=0.5, label='knot')
    # needs more elaboration: some values are not near an integer
    if integer_data:
        df['speed'] = df['speed'].apply(lambda x: int(round(x)))
    plt_configure(xlabel='Decimal', ylabel='Frequency', legend={'loc': 'best'}, title='Decimal Distribution')
else:
    if 'knot_unit' not in globals():
        knot_unit = False
    
df.drop(['decimal'], 1,inplace=True)
print(knot_unit)
True
In [10]:
dir_unit_text = ' (degree)'
if knot_unit == True:
    speed_unit_text = ' (knot)'
else: 
    speed_unit_text = ' (m/s)'

1.3.2 Sampling Type Selection

In [11]:
sample_type = df.query('date > 20000000')['type']
sample_type.value_counts().plot(
    kind = 'bar', title = 'Report Type Composition', figsize=(4,3))

report_type_most_used = sample_type.value_counts().idxmax()
df = df.query("type==@report_type_most_used")

1.3.3 Sampling Time Selection

In [12]:
MID_YEAR = int(np.average(df.index.year))

df['HrMn'].value_counts().sort_index().plot(kind='bar', alpha=0.5,label='Overall')
df[str(MID_YEAR):]['HrMn'].value_counts().sort_index().plot(
    kind='bar', alpha=0.5, label='>= %s' %  MID_YEAR )

plt_configure(xlabel='Sampling Time', ylabel='Frequency', legend={'loc':'best'}, figsize=(8, 4), 
              title = 'Sampling Time Distribution, Overall and > %s ' %  MID_YEAR)
In [13]:
df['sample_time'] = df.HrMn % 100 
sample_time = df['2000':]['sample_time']
sample_times = sample_time.value_counts()[sample_time.value_counts() > 2000]
sample_times = sample_times.index.tolist()
# df = df.query("sample_time in @sample_times")
df = df.query("sample_time == @sample_times[0]")
df.drop(['sample_time'], 1,inplace=True)
print(sample_times)

df['HrMn'].value_counts().sort_index().plot(kind='bar', alpha=0.5, figsize=(10, 4))
[20]
Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0xce09dd8>

1.4 Error Data Handling and Adjustment

1.4.1 Artefacts

wrong direction record

In [14]:
if integer_data:
    display(df.query("(dir % 10 >= 0.1) & (dir != 999)"))
    df = df.query('(dir % 10 <= 0.1) | (dir == 999)')
date HrMn type dir speed wind_type dir_windrose
time
1994-03-22 18:20:00 19940322 1820 FM-15 176 4 N 274
1996-04-18 17:20:00 19960418 1720 FM-15 88 4 N 2
1998-02-05 13:20:00 19980205 1320 FM-15 87 2 N 3
1998-11-11 00:20:00 19981111 20 FM-15 159 14 N 291
2000-02-20 15:20:00 20000220 1520 FM-15 157 7 N 293
2000-05-20 08:20:00 20000520 820 FM-15 193 13 N 257
2000-10-27 00:20:00 20001027 20 FM-15 229 7 N 221
2000-12-06 01:20:00 20001206 120 FM-15 85 3 N 5
2001-03-02 12:20:00 20010302 1220 FM-15 221 4 N 229
2001-05-21 08:20:00 20010521 820 FM-15 29 34 N 61
2002-03-16 16:20:00 20020316 1620 FM-15 32 8 N 58
2002-04-06 03:20:00 20020406 320 FM-15 83 5 N 7
2002-06-09 17:20:00 20020609 1720 FM-15 206 4 N 244
2002-08-13 09:20:00 20020813 920 FM-15 159 2 N 291
2002-12-30 02:20:00 20021230 220 FM-15 198 25 N 252
2003-01-27 05:20:00 20030127 520 FM-15 209 2 N 241
2003-02-09 03:20:00 20030209 320 FM-15 85 3 N 5
2003-02-21 16:20:00 20030221 1620 FM-15 1 4 N 89
2003-12-06 08:20:00 20031206 820 FM-15 163 14 N 287
2005-07-24 10:20:00 20050724 1020 FM-15 195 5 N 255
2005-11-25 15:20:00 20051125 1520 FM-15 239 8 N 211
2005-12-31 11:20:00 20051231 1120 FM-15 216 14 N 234
2006-01-13 05:20:00 20060113 520 FM-15 229 2 N 221
2006-03-10 02:20:00 20060310 220 FM-15 209 67 N 241
2006-03-10 03:20:00 20060310 320 FM-15 209 67 N 241
2006-12-04 00:20:00 20061204 20 FM-15 178 28 N 272

sudden increase in speed

In [15]:
# sudden increase
df['incre'] = df.speed.diff(1)
df['incre'].fillna(0, inplace=True)
df['incre_reverse'] = df.speed.diff(-1)
df['incre_reverse'].fillna(0, inplace=True)

display(df.sort_values(by='speed',ascending=False).head(10))
df['incre'].plot(kind='hist', bins=arange(-15, 15), legend=True, figsize=(8, 3))
date HrMn type dir speed wind_type dir_windrose incre incre_reverse
time
2006-10-13 22:20:00 20061013 2220 FM-15 60 53 N 30 50.0 48.0
2015-11-15 11:20:00 20151115 1120 FM-15 200 52 N 250 27.0 22.0
2001-06-17 00:20:00 20010617 20 FM-15 270 47 N 180 42.0 39.0
1998-02-14 17:20:00 19980214 1720 FM-15 40 46 N 50 40.0 43.0
2007-01-25 03:20:00 20070125 320 FM-15 150 45 N 300 40.0 42.0
2002-10-03 03:20:00 20021003 320 FM-15 350 44 N 100 40.0 39.0
2006-06-14 06:20:00 20060614 620 FM-15 250 44 N 200 40.0 41.0
2006-04-29 00:20:00 20060429 20 FM-15 170 42 N 280 40.0 39.0
2005-12-21 17:20:00 20051221 1720 FM-15 270 42 N 180 40.0 38.0
1995-01-26 22:20:00 19950126 2220 FM-15 200 40 N 250 18.0 13.0
Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0xcd463c8>
In [16]:
incre_threshold = 20 if knot_unit else 10
print('sudden increase number', len(df.query('(incre > @incre_threshold )&(incre_reverse > @incre_threshold )')))
df = df.query('(incre < @incre_threshold )|(incre_reverse < @incre_threshold )')

# Check the max speed
display(df.sort_values(by='speed',ascending=False).head(10))
df.drop(['incre', 'incre_reverse'], 1, inplace=True)
sudden increase number 17
date HrMn type dir speed wind_type dir_windrose incre incre_reverse
time
1995-01-26 22:20:00 19950126 2220 FM-15 200 40 N 250 18.0 13.0
1999-12-26 13:20:00 19991226 1320 FM-15 190 39 N 260 6.0 1.0
2015-03-31 09:20:00 20150331 920 FM-15 200 39 N 250 1.0 4.0
2015-03-31 11:20:00 20150331 1120 FM-15 190 39 N 260 4.0 2.0
2015-03-31 08:20:00 20150331 820 FM-15 210 38 N 240 1.0 -1.0
2005-12-16 14:20:00 20051216 1420 FM-15 210 38 N 240 6.0 4.0
1999-12-26 14:20:00 19991226 1420 FM-15 190 38 N 260 -1.0 8.0
2007-01-18 22:20:00 20070118 2220 FM-15 210 38 N 240 3.0 6.0
2015-03-31 14:20:00 20150331 1420 FM-15 200 38 N 250 2.0 6.0
2003-01-02 20:20:00 20030102 2020 FM-15 200 38 N 250 5.0 8.0

1.4.2 Direction Re-alignment

For some datasets, the 16 sectors are not recorded properly,

e.g. the recorded sectors are [0, 20, 50, ...] and the angles need to be redistributed onto the regular 22.5° grid, e.g. [0, 22.5, 45, ...]
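
realign_direction is a project helper (loaded via import_file) whose implementation is not shown here; a minimal sketch of the kind of re-mapping it presumably performs, assuming the goal is to snap each recorded direction onto the nearest centre of a regular sector grid while leaving the 999 calm/variable marker untouched (the actual helper may redistribute counts differently; this is only an illustration):

def realign_direction_sketch(df, sector_length=22.5):
    # Snap each recorded direction to the nearest multiple of sector_length,
    # keeping the 999 calm/variable marker unchanged.
    def snap(d):
        if d == 999:
            return d
        return (round(d / sector_length) * sector_length) % 360
    df = df.copy()
    df['dir'] = df['dir'].apply(snap)
    return df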

In [17]:
display(df['dir'].value_counts().sort_index())
effective_column = df.query('dir < 999')['dir'].value_counts()[df['dir'].value_counts() > 30].sort_index()
if integer_data:
    SECTOR_LENGTH = 360/len(effective_column) 
else: 
    SECTOR_LENGTH = 10
print(len(effective_column), SECTOR_LENGTH)
0       2604
10      3772
20      4141
30      4787
40      3730
50      3786
60      3238
70      3294
80      2445
90      2218
100     1621
110     1844
120     1687
130     1865
140     1811
150     2383
160     2799
170     4118
180     4730
190     6997
200     9256
210    12291
220     9079
230     7033
240     4321
250     4491
260     4051
270     4781
280     3931
290     4075
300     3375
310     3279
320     2623
330     2466
340     2004
350     2434
999     4194
Name: dir, dtype: int64
36 10.0
In [18]:
df=realign_direction(df, effective_column)

1.4.3 0 Speed
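
is_with_too_many_zero is another project helper; a minimal sketch of the check it presumably performs, assuming it returns the calm-wind (zero-speed) frequency together with a flag set when that frequency exceeds some threshold (the 0.5 used here is an assumed value, not necessarily the helper's):

def is_with_too_many_zero_sketch(df, threshold=0.5):
    # Fraction of records reported with zero wind speed (calms)
    null_wind_frequency = (df['speed'] == 0).mean()
    return null_wind_frequency > threshold, null_wind_frequency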

In [19]:
with_too_many_zero, null_wind_frequency = is_with_too_many_zero(df['2005':])
delete_zero = with_too_many_zero
if delete_zero:
    df = df.query('(speed > 0)')
print(delete_zero, null_wind_frequency)
False 0.00677323785894
In [20]:
print(df.query('dir == 999')['speed'].value_counts())
df=fill_direction_999(df, SECTOR_LENGTH)
2    1718
1    1129
0     945
3     395
4       5
9       1
5       1
Name: speed, dtype: int64

1.5 Time Shift Comparison

In [21]:
DIR_REDISTRIBUTE = 'even'
if DIR_REDISTRIBUTE == 'even':
    DIR_BIN = arange(-5, 360, 10) 
elif DIR_REDISTRIBUTE == 'round_up':
    DIR_BIN = arange(0, 360+10, 10) 

# Comparison between mid_year, looking for: 
# 1. Odd Even Bias
# 2. Time Shift of Wind Speed Distribution
bins = arange(0, df.speed.max() + 1)
df[:str(MID_YEAR)]['speed'].plot(
    kind='hist', alpha=0.5,bins=bins, label='< %s' % MID_YEAR)

df[str(MID_YEAR+1):]['speed'].plot(
    kind='hist', alpha=0.5,bins=bins, label='> %s' % MID_YEAR)

plt.suptitle('Speed Comparison between year < %s, > %s ' % (MID_YEAR, MID_YEAR), fontsize = 14)
plt_configure(xlabel='Speed', ylabel='Frequency', legend=True, figsize=(8, 3))
In [22]:
df[:str(MID_YEAR)]['dir'].plot(
    kind='hist', alpha=0.5,bins=DIR_BIN, label='< %s' % MID_YEAR)

df[str(MID_YEAR+1):]['dir'].plot(
    kind='hist', alpha=0.5,bins=DIR_BIN, label='> %s' % MID_YEAR)

plt.suptitle('Dir Comparison between year < %s, and > %s ' % (MID_YEAR, MID_YEAR), fontsize = 14)
plt_configure(xlabel='Dir', ylabel='Frequency', legend={'loc':'best'}, figsize=(8, 3), tight='x')
In [23]:
display(df[df['dir'].isnull()])
df.dropna(subset=['dir'], inplace=True)
date HrMn type dir speed wind_type dir_windrose
time
In [24]:
# Inspect the time shift of the speed and direction distributions, and the odd-even bias
check_time_shift(df, speed_unit_text=speed_unit_text, dir_unit_text=dir_unit_text)
1993 - 1995
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
1996 - 2000
2001 - 2005
2006 - 2010
2011 - 2015
In [25]:
df.resample('A').mean().plot(y='speed', figsize=(4,3))
plt.gca().set_ylim(bottom=0)
df.resample('M').mean().plot(y='speed', figsize=(20,4))
plt.gca().set_ylim(bottom=0)
Out[25]:
(0, 22.5)
In [26]:
%%time
for column in ['speed', 'dir']:
    if column == 'speed':
        bins = arange(0, df[column].max()+1, 1)
    else:
        bins = arange(0, 361, 10)
    den, _ = np.histogram(df[column], bins=bins, density=True)
    y_top=max(den)*1.2
    for year in arange(1980, 2016):
        end_year = year
        sub_df = df[str(year):str(end_year)]
        if len(sub_df) > 1000:
            plt.figure()
            df[column].hist(bins=bins, alpha=0.3, normed=True)
            sub_df[column].hist(bins=bins, alpha=0.5, figsize=(3,1.5), normed=True)
            plt.gca().set_ylim(top=y_top)
            plt_configure(title=str(year))
    align_figures()
Wall time: 13.7 s
In [27]:
for column in ['speed', 'dir']:
    if column == 'speed':
        bins = arange(0, df[column].max()+1, 1)
    else:
        bins = arange(0, 361, 10)
    density_all, _ = np.histogram(df[column], bins=bins, density=True)
    df[column].hist(bins=bins, figsize=(5,3))

    R_squares = []
    years = []
    for year in arange(1980, 2016):
        start_year, end_year = year-1, year+1
        sub_df = df[str(start_year):str(end_year)]
        if len(sub_df) > 1000:
            density, _ = np.histogram(sub_df[column], bins=bins, density=True)
            y_mean = np.mean(density_all)
            SS_tot = np.sum(np.power(density_all - y_mean, 2))
            SS_res = np.sum(np.power(density_all - density, 2))

            R_square = 1 - SS_res / SS_tot
            R_squares.append(R_square)
            years.append(year)

    plt.figure()
    plot(years, R_squares)
    ylim = max(min(plt.gca().get_ylim()[0],0.85),0)
    plt.gca().set_ylim(bottom=ylim, top=1)
    plt_configure(figsize=(5,3))
    align_figures()

1.6 Re-distribute Direction and Speed (Optional)

e.g. Dir 50 -> 45 ~ 55, to make the KDE result smoother
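
randomize_angle and randomize_speed are project helpers; a minimal sketch of the idea, assuming each integer-recorded value is spread uniformly over its reporting bin (directions over one sector around the recorded value; speeds either upward, 0 -> [0, 1), or downward, 1 -> (0, 1], matching the message printed below):

import numpy as np

def randomize_angle_sketch(df, sector_length=10):
    # Spread each recorded direction uniformly over its sector, e.g. 50 -> [45, 55)
    df = df.copy()
    jitter = np.random.uniform(-sector_length / 2, sector_length / 2, size=len(df))
    df['dir'] = (df['dir'] + jitter) % 360
    return df

def randomize_speed_sketch(df, method='up'):
    # Spread each integer speed uniformly over its bin
    df = df.copy()
    u = np.random.uniform(0, 1, size=len(df))
    df['speed'] = df['speed'] + u if method == 'up' else df['speed'] - u
    return df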

In [28]:
if integer_data:
    df = randomize_angle(df, DIR_REDISTRIBUTE, SECTOR_LENGTH)
In [29]:
if integer_data:
    if delete_zero:
        redistribute_method = 'down'
    else:
        redistribute_method = 'up'

    df, speed_redistribution_info = randomize_speed(df, redistribute_method)
Redistribute upward, e.g. 0 -> [0,1]

1.7 Generate (x,y) from (speed,dir)

In [30]:
# Cook orientation
# df['dir']= (df['dir'] + 180)%360
In [31]:
# There might be a small dot in the centre, caused by the many zero-speed records that all map to the origin
# Scatter plots in matplotlib have performance issues; rendering is very slow for large datasets
df['x'] = df['speed'] * cos(df['dir'] * pi / 180.0)
df['y'] = df['speed'] * sin(df['dir'] * pi / 180.0)

2. Re-select Data and Overview

2.1 Data Overview

In [32]:
## Summary of the data selection
print('Knot unit?', knot_unit)
print('Report type used:', report_type_most_used)
print('Sampling time used:', sample_times)
if 'speed_redistribution_info' in globals():
    print('Speed redistribution info:', speed_redistribution_info )

df_all_years = df # for later across-year comparison
df = df_all_years['2006':'2015']
# df = df.query('(HrMn == 0) and (speed >= 0.5) and (date%10000 > 900) and (date%10000 < 1000)' )
df.describe()
Knot unit? True
Report type used: FM-15
Sampling time used: [20]
Speed redistribution info: Redistribute upward, e.g. 0 -> [0,1]
Out[32]:
date HrMn dir speed dir_windrose x y
count 8.640000e+04 86400.000000 86400.000000 86400.000000 86400.000000 86400.000000 86400.000000
mean 2.010585e+07 1171.596065 184.688148 6.791369 206.554780 -1.946480 -1.379277
std 2.864623e+04 690.812702 95.187640 4.161544 175.873185 6.016332 4.642646
min 2.006010e+07 20.000000 -4.986791 0.001042 0.000000 -38.398653 -25.648915
25% 2.008071e+07 620.000000 103.095277 3.877681 100.000000 -5.593108 -4.447410
50% 2.011011e+07 1220.000000 206.829021 5.772452 210.000000 -1.098342 -1.852899
75% 2.013070e+07 1820.000000 252.098174 8.575966 250.000000 2.346723 2.113300
max 2.015123e+07 2320.000000 354.986789 39.430212 999.000000 20.796761 19.250923
In [33]:
df.plot(y='speed',legend=True,figsize=(20,5))
Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0x115cc908>
In [34]:
# Accumulation by month
df.resample('M').count().plot(y='date', kind='bar',figsize=(20,4))
Out[34]:
<matplotlib.axes._subplots.AxesSubplot at 0x19aaa208>
In [35]:
# 90 degree is east
ax = WindroseAxes.from_ax()
viridis = plt.get_cmap('viridis')
ax.bar(df.dir_windrose, df.speed, normed=True, opening=0.8, edgecolor='white', nsector=36, cmap=viridis)
ax.set_legend()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)
In [36]:
if len(df) > 1000000:
    bins=arange(0,362)
    df['dir'].hist(bins=bins, normed=True,alpha=0.5,label='min')
    
    df = df_all_years.sample(n=500000, replace=True)    
    df['dir'].hist(bins=bins, normed=True,alpha=0.5,label='min resampled')
    plt_configure(legend=True, figsize=(20,4))
In [37]:
x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(df.speed)

# 1. Histogram comparison
fig = plt.figure()
df['speed'].hist(bins=arange(0, df.speed.max()), alpha=0.5, label='Data', normed=True)             
plot(x, y_weibull, '-', color='black',label='Weibull')   
plt_configure(figsize=(4,3),xlabel='V',ylabel='PDF', legend=True)

# 2. CDF comparison
fig = plt.figure()
plot(log(x), log(-log(1-y_ecdf)),'o', label='ECDF')
plot(log(x), log(-log(1-y_cdf_weibull)),'-', label='Weibull')
plt_configure(xlabel="ln(V)", ylabel="ln(-ln(1-P)",legend={'loc':'best'}, figsize=(4,3))
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:11: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:12: RuntimeWarning: divide by zero encountered in log
In [38]:
df.plot(kind='scatter', x='x', y='y', alpha=0.05, s=2)
plt.gca().set_aspect('equal')
plt_configure(figsize=(3.2,3.2),xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)

2.2 Overview by Direction

In [39]:
if len(effective_column) == 16:
    rebinned_angle = 22.5
else: 
    rebinned_angle = 10
In [40]:
%%time
original_incre, incre = SECTOR_LENGTH, rebinned_angle
start, end = -original_incre/2 + incre/2, 360

max_speed = df.speed.max()
max_count = max_count_for_angles(df, start, end, incre)
plot_range = [0, max_speed, 0, max_count*1.05]

for angle in arange(start, end, incre):
    start_angle, end_angle = angle-incre/2, angle+incre/2
    sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)   
    
    fig = plt.figure()
    sub_df['speed'].hist(bins=arange(0, max_speed), alpha=0.5, label='Data')
    title ='%s (%s - %s), %s' % (angle, start_angle, end_angle, len(sub_df)) 
    plt.axis(plot_range)
    plt_configure(figsize=(3,1.5), title=title)
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
Wall time: 8.35 s

2.3 Overview by Month

In [41]:
%%time
current_df = df.query('speed>=1')
for month in arange(1, 13): 
    sub_df = current_df[current_df.index.month == month]
    ax = WindroseAxes.from_ax()
    ax.bar(sub_df.dir_windrose, sub_df.speed, normed=True, opening=0.8, edgecolor='white', nsector=36, cmap=plt.get_cmap('viridis'))
    plt_configure(figsize=(3,3), title='Month: %s'%(month))
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)
Wall time: 18.2 s
In [42]:
df.describe()
Out[42]:
date HrMn dir speed dir_windrose x y
count 8.640000e+04 86400.000000 86400.000000 86400.000000 86400.000000 86400.000000 86400.000000
mean 2.010585e+07 1171.596065 184.688148 6.791369 206.554780 -1.946480 -1.379277
std 2.864623e+04 690.812702 95.187640 4.161544 175.873185 6.016332 4.642646
min 2.006010e+07 20.000000 -4.986791 0.001042 0.000000 -38.398653 -25.648915
25% 2.008071e+07 620.000000 103.095277 3.877681 100.000000 -5.593108 -4.447410
50% 2.011011e+07 1220.000000 206.829021 5.772452 210.000000 -1.098342 -1.852899
75% 2.013070e+07 1820.000000 252.098174 8.575966 250.000000 2.346723 2.113300
max 2.015123e+07 2320.000000 354.986789 39.430212 999.000000 20.796761 19.250923

3. Create input data and configuration

In [43]:
SPEED_SET = array(list(zip(df.x, df.y)))
if 'NUMBER_OF_GAUSSIAN' not in globals():
    NUMBER_OF_GAUSSIAN = 3
FIT_METHOD = 'square_error'
DEFAULT_BANDWIDTH = 1.5 if knot_unit else 0.7
fig_list = []
In [44]:
fit_limit = ceil(df['speed'].quantile(.95))
fitting_axis_range = arange(-fit_limit, fit_limit+1, 1)
print(fitting_axis_range)

FITTING_RANGE = []
for i in fitting_axis_range:
    for j in fitting_axis_range:
        FITTING_RANGE.append([i,j])
        
plot_limit = ceil(df['speed'].quantile(.95))
PLOT_AXIS_RANGE = arange(-plot_limit, plot_limit+1, 1)
[-16 -15 -14 -13 -12 -11 -10  -9  -8  -7  -6  -5  -4  -3  -2  -1   0   1
   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16]

4. Kernel Density Estimation

In [45]:
sample = SPEED_SET
KDE_KERNEL = 'gaussian'
# KDE_KERNEL, bandwidth = 'tophat', 1
In [46]:
%%time
if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWIDTH
    from sklearn.grid_search import GridSearchCV
    # from sklearn.model_selection import GridSearchCV  ## too slow

    # The selected bandwidth can sometimes be too extreme
    if knot_unit:
        bandwidth_range = arange(0.7,2,0.2)
    else:
        bandwidth_range = arange(0.4,1.1,0.1)

    # Grid search cannot handle too much data (it takes a long time)
    if len(sample) > 50000:    
        df_resample=df.sample(n=50000, replace=True)
        bandwidth_search_sample = array(list(zip(df_resample.x, df_resample.y)))
    else:
        bandwidth_search_sample = sample

    grid = GridSearchCV(neighbors.KernelDensity(kernel = KDE_KERNEL),
                    {'bandwidth': bandwidth_range}, n_jobs=-1, cv=4) 

    grid.fit(bandwidth_search_sample)
    bandwidth = grid.best_params_['bandwidth']
    
print(bandwidth)
0.9
Wall time: 0 ns
In [47]:
if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWIDTH

kde = neighbors.KernelDensity(bandwidth=bandwidth, kernel = KDE_KERNEL).fit(sample)

points = FITTING_RANGE
# very slow if the dataset is too large, e.g. 100,000
# kde returns log prob, need to convert it
kde_result = exp(kde.score_samples(points))
print('bandwidth:', bandwidth, len(kde_result))
print(kde_result[:5])
bandwidth: 0.9 1089
[  8.00485424e-05   1.24002059e-04   1.89606224e-04   2.49645743e-04
   3.08579248e-04]
In [48]:
# Plot jPDF
X = Y = PLOT_AXIS_RANGE
# Passing exp(kde.score_samples(...)) directly as generate_Z_from_X_Y(X, Y, ...) does not work; a lambda (or function) must be passed instead
# see http://stackoverflow.com/questions/21035437/passing-a-function-as-an-argument-in-python
kde_Z = generate_Z_from_X_Y(X,Y, lambda coords: exp(kde.score_samples(coords)))
colorbar_lim = 0, kde_Z.max()

plot_3d_prob_density(X,Y,kde_Z)

fig_kde,ax1 = plt.subplots(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, ax=ax1)

with sns.axes_style({'axes.grid' : False}):
    from matplotlib import ticker
    fig_hist,ax2 = plt.subplots(figsize=(3.5,2.5))
    _,_,_,image = ax2.hist2d(df.x, df.y, bins=PLOT_AXIS_RANGE, cmap='viridis',)
    ax2.set_aspect('equal')
    cb = plt.colorbar(image)
    tick_locator = ticker.MaxNLocator(nbins=6)
    cb.locator = tick_locator
    cb.update_ticks()
    plt_configure(ax=ax2, xlabel='x'+speed_unit_text,ylabel='y'+speed_unit_text)
align_figures()

4.1 Bootstrap GOF Limit
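
resampled_kde is a project helper; a minimal sketch of the bootstrap idea it implements, assuming each iteration resamples the records with replacement, refits a KDE with the same bandwidth and kernel, and scores its density on the fitting grid against the full-data KDE (the R² and K-S stand-ins below are generic; the notebook's own GOF metrics come from its helpers):

import numpy as np
from sklearn import neighbors

def resampled_kde_sketch(df, kde_result, config):
    # Bootstrap resample, refit the KDE, and compare against the reference density
    resampled = df.sample(n=len(df), replace=True)
    sample_xy = np.column_stack([resampled.x, resampled.y])
    kde = neighbors.KernelDensity(bandwidth=config['bandwidth'],
                                  kernel=config['kde_kernel']).fit(sample_xy)
    density = np.exp(kde.score_samples(config['fitting_range']))

    kde_result = np.asarray(kde_result)
    r_square = 1 - np.sum((kde_result - density) ** 2) / np.sum((kde_result - kde_result.mean()) ** 2)
    k_s = np.abs(np.cumsum(kde_result) / kde_result.sum()
                 - np.cumsum(density) / density.sum()).max()
    return {'R_square': r_square, 'K_S': k_s}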

In [49]:
kde_cdf = cdf_from_pdf(kde_result)
config = {'bandwidth': bandwidth, 
          'fitting_range': FITTING_RANGE,
          'fit_limit': fit_limit,
          'kde_kernel': KDE_KERNEL}
In [50]:
%%time
gof_kde=Parallel(n_jobs=-1)(delayed(resampled_kde)(df, kde_result, config) 
                                       for i in arange(20)) 
Wall time: 29.7 s
In [51]:
for gof_name in [ 'R_square', 'K_S','Chi_square']:
    plt.figure(figsize=(4,3))
    pd.DataFrame(gof_kde)[gof_name].hist()
    plt_configure(title=gof_name)
align_figures()

4.2 Bivariate Empirical Limit

In [52]:
%%time
gofs_mean_set_bivar = []
fig1, ax1 = plt.subplots(figsize=(3,2.5))
fig2, ax2 = plt.subplots(figsize=(3,2.5))

for year_length in [5, 10]:
    start_year, end_year = df_all_years.index.year[0], 2015-year_length+1
    df_standard = df_all_years[str(2015-year_length+1):'2015']
    speed_ = array(list(zip(df_standard.x, df_standard.y)))
    kde_current = neighbors.KernelDensity(bandwidth=bandwidth, kernel=KDE_KERNEL).fit(speed_)
    kde_result_standard = exp(kde_current.score_samples(points))
    gofs_bivar=Parallel(n_jobs=-1)(delayed(kde_gofs)(df_all_years, start_year, start_year+year_length-1, kde_result_standard, config) 
                                   for start_year in arange(start_year, end_year+1)) 
    gofs_bivar=pd.DataFrame(gofs_bivar, index=arange(start_year, end_year+1))
    if len(gofs_bivar)>0:
        gofs_bivar.plot(y='R_square', ax=ax1, label=year_length)
        gofs_bivar.plot(y='K_S', ax=ax2, label=year_length)
        year_lim = end_year-year_length-5, end_year-year_length
        gofs_mean = gofs_bivar.query('index >= @year_lim[0] & index <= @year_lim[1]').mean().to_dict()
        gofs_mean['year_lim']=year_lim
        gofs_mean_set_bivar.append(gofs_mean)
    
plt_configure(ax=ax1, ylabel='$\ R^2$', xlabel='start year')
plt_configure(ax=ax2, ylabel='K-S', xlabel='start year')
align_figures()
display(pd.DataFrame(gofs_mean_set_bivar).set_index('year_lim'))
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
year_lim
(2001, 2006) 9.139418e+06 0.034562 4.967776e-08 0.020889 0.245696 0.981110
(1991, 1996) 9.410803e+04 0.049701 1.276477e-07 0.033806 0.396092 0.948317
Wall time: 38.7 s

4.3 Univariate GOF Limit

In [53]:
def yearly_gof(df_all_years, start_year, end_year, density, y_ecdf, x, density_dir):
    df_previous = df_all_years[str(start_year):str(end_year)]
    # 1. Speed
    density_expected, _ = np.histogram(df_previous['speed'], bins=x, density=True)
    r_square = sector_r_square(density, density_expected)
    
    y_ecdf_previous = sm.distributions.ECDF(df_previous['speed'])(x)
    k_s = max(np.abs(y_ecdf - y_ecdf_previous))
    
    # 2. Direction
    density_dir_expected, _ = dir_hist(df_previous['dir'], bins=arange(-5,370,10), density=True)
    r_square_dir = sector_r_square(density_dir, density_dir_expected)
    return {'year': start_year, 'r_square': r_square, 'k_s': k_s, 'r_square_dir': r_square_dir}
In [54]:
%%time
x = arange(0, df.speed.max() + 1)
fig1, ax1 = plt.subplots(figsize=(2.8,2.5))
fig2, ax2 = plt.subplots(figsize=(2.8,2.5))
fig3, ax3 = plt.subplots(figsize=(2.8,2.5))
gofs_mean_set = []

for year_length in [5, 7, 10]:
    start_year, end_year =df_all_years.index.year[0], 2015-year_length+1
    df_standard = df_all_years[str(2015-year_length+1):str(2015)]
    density, _ = np.histogram(df_standard['speed'], bins=x, density=True)
    density_dir, _ = dir_hist(df_standard['dir'], bins=arange(-5,370,10), density=True)
    y_ecdf = sm.distributions.ECDF(df_standard.speed)(x)
    gofs = [yearly_gof(df_all_years, start_year, start_year+year_length-1, density, y_ecdf, x, density_dir) 
            for start_year in arange(start_year, end_year+1)]
    if len(gofs)>0:
        gofs = pd.DataFrame(gofs)
        gofs.set_index(['year'], inplace=True)  
        ax1.plot(gofs.r_square, label=year_length)
        ax2.plot(gofs.k_s, label=year_length)
        ax3.plot(gofs.r_square_dir, label=year_length)
        year_lim = end_year-year_length-5, end_year-year_length
        gofs_mean = gofs.query('index >= @year_lim[0] & index <= @year_lim[1]').mean().to_dict()
        gofs_mean['year_lim']=year_lim
        gofs_mean_set.append(gofs_mean)
plt.tight_layout()

plt_configure(ax=ax1, ylabel='$\ R^2$', tight='xtight', legend=True)
plt_configure(ax=ax2, ylabel='K-S', tight='xtight', legend=True)
plt_configure(ax=ax3, ylabel='$\ R^2$', tight='xtight', legend=True)
align_figures()
display(pd.DataFrame(gofs_mean_set).set_index('year_lim'))
k_s r_square r_square_dir
year_lim
(2001, 2006) 0.045466 0.991445 0.911881
(1997, 2002) 0.042175 0.992234 0.721662
(1991, 1996) 0.021687 0.996530 0.576464
Wall time: 1.32 s

5. GMM by Expectation-maximization

In [55]:
sample=SPEED_SET
clf = mixture.GaussianMixture(n_components=NUMBER_OF_GAUSSIAN, covariance_type='full')
clf.fit(sample)
print(clf.converged_)
True
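
read_gmm_em_result and pretty_print_gmm are project helpers; a minimal sketch of the parameter extraction the first one presumably performs on a fitted scikit-learn GaussianMixture with covariance_type='full', producing the (weight, mean_x, mean_y, sig_x, sig_y, corr) values shown in the table below:

import numpy as np

def read_gmm_em_result_sketch(clf):
    # Flatten a fitted GaussianMixture into [weight, mean_x, mean_y, sig_x, sig_y, corr] per component
    params = []
    for weight, mean, cov in zip(clf.weights_, clf.means_, clf.covariances_):
        sig_x, sig_y = np.sqrt(cov[0, 0]), np.sqrt(cov[1, 1])
        corr = cov[0, 1] / (sig_x * sig_y)
        params.extend([weight, mean[0], mean[1], sig_x, sig_y, corr])
    return np.array(params)
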
In [56]:
gmm_em_result = read_gmm_em_result(clf)
pretty_print_gmm(gmm_em_result)
Out[56]:
weight mean_x mean_y sig_x sig_y corr
1 0.319 0.626 -3.233 2.820 2.183 0.325
2 0.277 2.987 3.638 3.704 2.685 0.070
3 0.233 -5.384 -1.683 2.882 3.792 0.191
4 0.171 -10.041 -5.640 5.468 4.517 0.257
In [57]:
fig,ax = plt.subplots(figsize=(3.5,3.5))
plot_gmm_ellipses(gmm_em_result, ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
GMM Plot Result
0.318577216602 [[ 0.62630225 -3.23285053]] [ 1.9489722   2.98585641] -64.2518494058
0.277016421177 [[ 2.98730136  3.63822642]] [ 2.67163829  3.71338702] -83.9770661341
0.233422494675 [[-5.38385875 -1.68271483]] [ 2.76801562  3.87597629] 162.772003755
0.170983867547 [[-10.04084984  -5.64047207]] [ 4.14922288  5.75151438] -63.3888448453
In [58]:
X = Y = PLOT_AXIS_RANGE
pdf_Z = generate_Z_from_X_Y(X,Y, lambda coords: exp(clf.score_samples(coords)))

def residual_between_kde_and_gmm(points):
    kde_vals = exp(kde.score_samples(points))
    gmm_vals = exp(clf.score_samples(points))
    return kde_vals - gmm_vals

residual_Z = generate_Z_from_X_Y(X,Y, residual_between_kde_and_gmm)

plot_3d_prob_density(X,Y,pdf_Z)
plot_3d_prob_density(X,Y,residual_Z)
align_figures()

fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar_lim=colorbar_lim)
fig_em = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,pdf_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar_lim=colorbar_lim)
fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,residual_Z,
                     xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
align_figures()

Goodness-of-fit Statistics
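
The table below comes from the gof_df helper, which compares the fitted GMM density with the KDE density evaluated on the same FITTING_RANGE grid. With grid densities $k_i$ (KDE) and $\hat p_i$ (GMM), plausible definitions consistent with the R_square and MSE columns are shown below; K_S is presumably the maximum absolute difference between the corresponding discretised CDFs, and the exact formulas live in the helper.

$$ R^2 = 1 - \frac{\sum_i (k_i - \hat p_i)^2}{\sum_i (k_i - \bar k)^2}, \qquad \mathrm{MSE} = \frac{1}{N}\sum_i (k_i - \hat p_i)^2 $$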

In [59]:
points = FITTING_RANGE
gmm_pdf_result = exp(clf.score_samples(points))
gof_df(gmm_pdf_result, kde_result)
Out[59]:
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.976 0.013 0.090 5.862592e-08 0.023 0.269
In [60]:
gmm_em = group_gmm_param_from_gmm_param_array(gmm_em_result, sort_group = True)
mixed_model_pdf_em = generate_gmm_pdf_from_grouped_gmm_param(gmm_em)

6. GMM by Optimization
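
Instead of EM, the GMM parameters are fitted here by constrained optimisation against the KDE reference: starting from the EM solution x0, sp.optimize.minimize adjusts the flattened (weight, mean_x, mean_y, sig_x, sig_y, rho) vector to minimise GMM_fit_score, subject to the component weights summing to one. GMM_fit_score is a project helper; with FIT_METHOD = 'square_error' it presumably behaves like the sketch below (log of the mean squared error against the KDE densities, which would be consistent with the result.fun value reported in In [62] and the MSE in section 6.2):

import numpy as np
from scipy.stats import multivariate_normal

def gmm_fit_score_sketch(params, kde_result, points, fit_method='square_error'):
    # Hypothetical stand-in for the notebook's GMM_fit_score helper:
    # evaluate the candidate mixture on the fitting grid and score it against the KDE
    points = np.asarray(points, dtype=float)
    density = np.zeros(len(points))
    for w, mx, my, sx, sy, rho in np.asarray(params).reshape(-1, 6):
        cov = [[sx**2, rho*sx*sy], [rho*sx*sy, sy**2]]
        density += w * multivariate_normal.pdf(points, mean=[mx, my], cov=cov)
    if fit_method == 'square_error':
        return np.log(np.mean((density - np.asarray(kde_result)) ** 2))
    raise ValueError(fit_method)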

In [61]:
sample = SPEED_SET
points = FITTING_RANGE
max_speed = df.speed.max()
print(FIT_METHOD)
square_error
In [62]:
# Initial guess taken from the GMM fitted by EM
# GMM format: weight, meanx, meany, sigx, sigy, rho
x0 = gmm_em_result

cons = [
        # sum of every 6th element, which is the fraction of each gaussian
        {'type': 'eq', 'fun': lambda x: sum(x[::6]) - 1},
        # # limit the width/height ratio of the ellipse, optional
#         {'type': 'ineq', 'fun': lambda x: width_height_ratios_set(x) - 1/3},
#         {'type': 'ineq', 'fun': lambda x: 3 - width_height_ratios_set(x)},
]

param_bounds = [(0., 0.99),(-fit_limit, fit_limit),(-fit_limit, fit_limit),
                (0., fit_limit),(0., fit_limit),(-0.99, 0.99)]*(len(x0)//6)

result = sp.optimize.minimize(
    lambda params: GMM_fit_score(params, kde_result, points, FIT_METHOD),
    x0,
    bounds = param_bounds,
    constraints=cons,
    tol = 0.000000000001,
    options = {"maxiter": 500})
result
Out[62]:
     fun: -17.459085085829351
     jac: array([  3.06992292e+00,   4.76837158e-07,   4.76837158e-07,
         2.38418579e-07,   2.38418579e-07,  -2.38418579e-07,
         3.06992459e+00,   2.38418579e-07,   0.00000000e+00,
         2.38418579e-07,  -2.38418579e-07,  -2.38418579e-07,
         3.06991625e+00,   2.38418579e-07,  -2.38418579e-07,
         4.76837158e-07,   7.15255737e-07,   0.00000000e+00,
         3.06992579e+00,   0.00000000e+00,   0.00000000e+00,
        -2.38418579e-07,   0.00000000e+00,  -4.76837158e-07,
         0.00000000e+00])
 message: 'Optimization terminated successfully.'
    nfev: 1739
     nit: 66
    njev: 66
  status: 0
 success: True
       x: array([ 0.14615741,  1.58485346, -2.73907552,  1.92697508,  1.75687609,
        0.38770181,  0.37896714, -5.34074838, -4.62861409,  6.32093165,
        3.31964506,  0.36410206,  0.2688106 ,  2.79332753,  3.65136353,
        3.54455637,  2.58694109, -0.16396335,  0.20606485, -3.93590089,
       -1.18692852,  3.18843986,  3.55531158, -0.33038242])

6.1 GMM Result

In [63]:
gmm = group_gmm_param_from_gmm_param_array(result.x, sort_group = True)
mixed_model_pdf = generate_gmm_pdf_from_grouped_gmm_param(gmm)
gmm_pdf_result = mixed_model_pdf(points)
pretty_print_gmm(gmm)
Out[63]:
weight mean_x mean_y sig_x sig_y corr
1 0.379 -5.341 -4.629 6.321 3.320 0.364
2 0.269 2.793 3.651 3.545 2.587 -0.164
3 0.206 -3.936 -1.187 3.188 3.555 -0.330
4 0.146 1.585 -2.739 1.927 1.757 0.388
In [64]:
fig_gmm, ax = plt.subplots(figsize=(3.5,3.5))
plot_gmm_ellipses(gmm, ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
GMM Plot Result
0.378967142322 [[-5.34074838 -4.62861409]] [ 3.02102627  6.46897369] -76.0807538496
0.268810596731 [[ 2.79332753  3.65136353]] [ 2.5158854  3.5953393] -103.558835983
0.206064854237 [[-3.93590089 -1.18692852]] [ 2.73111714  3.91757429] -144.139281083
0.146157406709 [[ 1.58485346 -2.73907552]] [ 1.43195659  2.17929964] -51.7127366459

6.2 Bivariate Goodness-of-fit statistics

In [65]:
gof_df(gmm_pdf_result, kde_result)
Out[65]:
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.989 0.013 0.669 2.615867e-08 0.015 0.180
In [66]:
pd.DataFrame(gofs_mean_set_bivar).set_index('year_lim')
Out[66]:
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
year_lim
(2001, 2006) 9.139418e+06 0.034562 4.967776e-08 0.020889 0.245696 0.981110
(1991, 1996) 9.410803e+04 0.049701 1.276477e-07 0.033806 0.396092 0.948317
In [67]:
X = Y = PLOT_AXIS_RANGE
pdf_Z = generate_Z_from_X_Y(X,Y, mixed_model_pdf)# passing a function as an argument

def residual_between_kde_and_gmm(points):
    kde_vals = exp(kde.score_samples(points))
    gmm_vals = mixed_model_pdf(points)
    return kde_vals - gmm_vals

residual_Z = generate_Z_from_X_Y(X,Y, residual_between_kde_and_gmm)

plot_3d_prob_density(X,Y,pdf_Z)
plot_3d_prob_density(X,Y,residual_Z)
align_figures()

fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
fig_gmm = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,pdf_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,residual_Z,  xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
align_figures()

6.3 Univariate Goodness-of-fit
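
The functions f and f_em below express the fitted Cartesian joint pdf in polar coordinates; the extra factor $V$ is the Jacobian of the change of variables from $(x, y)$ to $(V, \theta)$:

$$ f_{V,\Theta}(V,\theta) = f_{X,Y}(V\cos\theta,\, V\sin\theta)\, V, \qquad f_V(V) = \int_0^{2\pi} f_{X,Y}(V\cos\theta,\, V\sin\theta)\, V\, d\theta $$

The nquad integrations in the following cells evaluate exactly these integrals over intervals of $V$ and sectors of $\theta$.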

In [68]:
def f(V,theta):
    return (mixed_model_pdf([[V*cos(theta),V*sin(theta)]]))*V

def f_em(V,theta):
    return (mixed_model_pdf_em([[V*cos(theta),V*sin(theta)]]))*V
In [69]:
%%time
x = arange(0, max_speed, 0.5)
# 1. Fit Weibull
_, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(df.speed, x=x)

# 2. GMM Model
y_ =[integrate.nquad(f, [[x_-0.01, x_+0.01],[0, 2*pi]]) for x_ in x]
y_gmm = array(list(zip(*y_))[0])/0.02

# 3. Plot Comparison
df['speed'].hist(bins=arange(0, df.speed.max()), alpha=0.5, label='Data')
plot(x, y_gmm*len(df.speed),'-', color='black', label='GMM')
plot(x, y_weibull*len(df.speed), '--', color='black', label='Weibull') 
print('Speed Distribution Comparison')
plt_configure(xlabel='Speed'+speed_unit_text,
              ylabel='Frequency',legend=True, figsize=(4, 2))
plt.gca().set_ylim(bottom = 0)
plt.tight_layout()
plt.locator_params(axis='y', nbins=5)

# 4. R square for GMM, Weibull
print(R_square_for_speed(df['speed'], f, weibull_params, f_em))
Speed Distribution Comparison
(0.98198880665470267, 0.90533873684142596, 0.97753624137698691)
Wall time: 59 s
In [70]:
%%time
y_ = [integrate.nquad(f, [[0, x_val],[0, 2*pi]]) for x_val in x]
y_cdf_gmm = array(list(zip(*y_))[0])

# 5.2. CDF Comparison
plot(x, y_ecdf,'o', alpha=0.8, label='Data')
plot(x, y_cdf_gmm,'-', color='black',label='GMM')
plot(x, y_cdf_weibull,'--', color='black',label='Weibull')
plt_configure(xlabel = "V", ylabel='P', legend=True, figsize=(4,3))

plt.figure()
plot(log(x), log(-log(1-y_ecdf)),'o', label = 'Empirical')
plot(log(x), log(-log(1-y_cdf_weibull)),'--', label = 'Weibull')
plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black', label = 'GMM')
plt_configure(xlabel='ln(V)',ylabel='ln(-ln(1-P))',legend={'loc':'best'}, figsize=(4,3))
align_figures()

cdf_diff, cdf_diff_weibull= np.abs(y_ecdf - y_cdf_gmm), np.abs(y_ecdf - y_cdf_weibull)
print(cdf_diff.max(), cdf_diff_weibull.max()) 
print(x[cdf_diff.argmax()], x[cdf_diff_weibull.argmax()])
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:11: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:12: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:13: RuntimeWarning: divide by zero encountered in log
0.0224296017124 0.057969842225
2.0 7.0
Wall time: 59.2 s
In [71]:
# Calculate Angle Distribution
x = linspace(0,2*pi, num=36+1)
y_ =[integrate.nquad(f, [[0, inf],[x_-pi/36, x_+pi/36]]) for x_ in x]
y = array(list(zip(*y_))[0])
density, _ = dir_hist(df['dir'], bins=arange(-5, 370, 10), density=True)

plt.bar(arange(0, 360, 10), density*10*len(df['dir']), width=10, alpha=0.5, label='Data')
plot(x/pi*180, y*len(df['dir']) ,'-', color='black', label='GMM')
plt_configure(xlabel='Direction'+dir_unit_text, ylabel='Frequency', 
              legend={'loc': 'best'} ,tight='xtight',figsize = (4,2))
plt.tight_layout()
dir_fig = plt.gcf()
print('Direction Distribution Comparison')
sector_r_square(density*10, y[:-1])
Direction Distribution Comparison
Out[71]:
0.91845309681583098
In [72]:
pd.DataFrame(gofs_mean_set).set_index('year_lim')
Out[72]:
k_s r_square r_square_dir
year_lim
(2001, 2006) 0.045466 0.991445 0.911881
(1997, 2002) 0.042175 0.992234 0.721662
(1991, 1996) 0.021687 0.996530 0.576464

6.4 Sectoral Comparison

In [73]:
%%time
incre = max(SECTOR_LENGTH, 10)
density_collection=Parallel(n_jobs=-1)(delayed(direction_compare)(gmm, df, angle, incre) 
                                        for angle in arange(0, 360, incre))  
# This R square is computed as in the paper:
# "Comparison of bivariate distribution construction approaches for analysing wind speed and direction data"
# http://onlinelibrary.wiley.com/doi/10.1002/we.400/full
print(true_R_square(density_collection))
0.9262141629
Wall time: 11.6 s
In [74]:
# Running this in parallel would be ~50% faster
# Calculate Speed Distribution
def model_data_comparison(df, original_incre = 10, incre = 10):
    start, end = -original_incre/2 + incre/2, 360
    curve_collection = []
    max_speed = df.speed.max()
    
    # Find a max count for plotting histogram
    max_count = max_count_for_angles(df, start, end, incre)
    plot_range = [0, max_speed, 0, max_count*1.05]
    
    for angle in arange(start, end, incre):
        angle_radian, incre_radian = np.radians([angle, incre])  
        start_angle, end_angle = angle-incre/2, angle+incre/2
        
        # 0. Select data from observation
        sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)
        data_size = len(sub_df.speed)
        # 1. Get Weibull and ECDF
        x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(sub_df.speed)
        # 2. Get GMM PDF, CDF
        _, y_gmm, y_cdf_gmm, direction_prob = gmm_integration_in_direction(f, angle_radian-incre_radian/2, angle_radian+incre_radian/2, x)
        
        # 3. R square for GMM, Weibull
        bins = arange(0, sub_df.speed.max()+1)
        density, _ = np.histogram(sub_df['speed'], bins=bins, normed=True)
        density_expected_gmm_ =[integrate.nquad(f, [[x_, x_+1],[angle_radian-incre_radian/2, angle_radian+incre_radian/2]]) 
                            for x_ in bins[:-1]]
        density_expected_gmm = array(list(zip(*density_expected_gmm_ ))[0])/direction_prob
        R_square_gmm = sector_r_square(density, density_expected_gmm)
        
        density_expected_weibull = sp.stats.weibull_min.cdf(bins[1:], *weibull_params) - sp.stats.weibull_min.cdf(bins[:-1], *weibull_params) 
        R_square_weibull = sector_r_square(density, density_expected_weibull)

        # 4. K-S for GMM, Weibull
        cdf_diff, cdf_diff_weibull= np.abs(y_ecdf - y_cdf_gmm), np.abs(y_ecdf - y_cdf_weibull)
                
        # 5. Make Plots
        fig = plt.figure(figsize=(10,1.9))
        # 5.1. Frequency Comparison
        ax1 = fig.add_subplot(1,3,1)        
        sub_df['speed'].hist(bins=arange(0, sub_max_speed), alpha=0.5, label='Data')                  
        plot(x, y_gmm*data_size,'-', color='black', label='GMM')
        plot(x, y_weibull*data_size, '--', color='black',label='Weibull')   
        plt_configure(xlabel = "V", ylabel='Frequency', legend=True)
        plt.axis(plot_range)
        
        # 5.2. CDF Comparison
        ax2 = fig.add_subplot(1,3,2)
        plot(x, y_ecdf,'o', alpha=0.8, label='Data')
        plot(x, y_cdf_gmm,'-', color='black',label='GMM')
        plot(x, y_cdf_weibull,'--', color='black',label='Weibull')
        plt.gca().set_xlim(right = max_speed)
        plt_configure(xlabel = "V", ylabel='P', legend=True)
        
        curves = {'direction': angle, 'datasize': data_size, 'weight': direction_prob, 'x': x, 
                  'gmm_pdf': y_gmm, 'gmm_cdf': y_cdf_gmm,
                  'weibull_pdf': y_weibull, 'weibull_cdf': y_cdf_weibull, 'ecdf': y_ecdf,
                  'max_cdf_diff_gmm': cdf_diff.max(), 'max_cdf_diff_weibull': cdf_diff_weibull.max(), 
                  'r_square_gmm': R_square_gmm, 'r_square_weibull': R_square_weibull}
        curve_collection.append(curves)
        
        plt.tight_layout()
        plt.show()
        print('%s (%s - %s) degree' % (angle, start_angle, end_angle))
        print('data size:', len(sub_df), 'weight', len(sub_df)/len(df))
        print('GMM', 'Weibull')
        print('R square', R_square_gmm,  R_square_weibull)
        print('max diff:', cdf_diff.max(), cdf_diff_weibull.max(), 
              'speed value:', x[cdf_diff.argmax()], x[cdf_diff_weibull.argmax()], 'y gmm', y_cdf_gmm[cdf_diff.argmax()])
        print(' ')
    return curve_collection
In [75]:
%%time
if len(effective_column) == 16:
    rebinned_angle = 22.5
else: 
    rebinned_angle = 20
    
curve_collection = model_data_comparison(df, SECTOR_LENGTH, rebinned_angle)
5.0 (-5.0 - 15.0) degree
data size: 2719 weight 0.031469907407407405
GMM Weibull
R square 0.903299974879 0.933308434914
max diff: 0.0800030875027 0.0572249265947 speed value: 7.03484373418 3.01493302893 y gmm 0.677260612387
 
25.0 (15.0 - 35.0) degree
data size: 5229 weight 0.060520833333333336
GMM Weibull
R square 0.947731552504 0.954545572495
max diff: 0.0677978899277 0.0346742756716 speed value: 9.66795359012 7.25096519259 y gmm 0.887266239517
 
45.0 (35.0 - 55.0) degree
data size: 4934 weight 0.05710648148148148
GMM Weibull
R square 0.95571847187 0.950316354164
max diff: 0.0642426854665 0.0368400185821 speed value: 9.34387330932 7.00790498199 y gmm 0.907371992317
 
65.0 (55.0 - 75.0) degree
data size: 4233 weight 0.048993055555555554
GMM Weibull
R square 0.970761840798 0.976884972092
max diff: 0.0575818537394 0.0248247016266 speed value: 7.44524510896 3.30899782621 y gmm 0.784017006114
 
85.0 (75.0 - 95.0) degree
data size: 2993 weight 0.0346412037037037
GMM Weibull
R square 0.934253686302 0.950088476599
max diff: 0.0461398328308 0.0454183418769 speed value: 6.52826634412 6.52826634412 y gmm 0.703275469541
 
105.0 (95.0 - 115.0) degree
data size: 2137 weight 0.024733796296296295
GMM Weibull
R square 0.909996883429 0.953427971267
max diff: 0.0684885336365 0.0382573228409 speed value: 4.95830873231 4.95830873231 y gmm 0.486963033982
 
125.0 (115.0 - 135.0) degree
data size: 2125 weight 0.02459490740740741
GMM Weibull
R square 0.927300967414 0.944020813311
max diff: 0.0671882082454 0.0546412384179 speed value: 2.97173800159 5.94347600318 y gmm 0.202247031775
 
145.0 (135.0 - 155.0) degree
data size: 2401 weight 0.027789351851851853
GMM Weibull
R square 0.980574073634 0.966091439639
max diff: 0.0299518078376 0.0390285204042 speed value: 6.38401833044 7.98002291306 y gmm 0.604367225898
 
165.0 (155.0 - 175.0) degree
data size: 3834 weight 0.044375
GMM Weibull
R square 0.963815177 0.95085417013
max diff: 0.0586732837589 0.0413224472374 speed value: 10.3029858824 6.86865725495 y gmm 0.915219971292
 
185.0 (175.0 - 195.0) degree
data size: 6256 weight 0.0724074074074074
GMM Weibull
R square 0.981776511479 0.984835564186
max diff: 0.0400305054579 0.0190498307782 speed value: 8.28459153303 8.28459153303 y gmm 0.59821464868
 
205.0 (195.0 - 215.0) degree
data size: 11551 weight 0.13369212962962962
GMM Weibull
R square 0.990681155407 0.990381849228
max diff: 0.0116941199229 0.0120742596667 speed value: 20.7527430646 2.07527430646 y gmm 0.967022662906
 
225.0 (215.0 - 235.0) degree
data size: 11054 weight 0.1279398148148148
GMM Weibull
R square 0.97344897415 0.954142760222
max diff: 0.0437277500601 0.0344046885782 speed value: 15.1233892655 9.45211829091 y gmm 0.926123262997
 
245.0 (235.0 - 255.0) degree
data size: 5418 weight 0.06270833333333334
GMM Weibull
R square 0.979964923459 0.978963716849
max diff: 0.0264043516621 0.0204319080259 speed value: 3.92683871781 9.16262367489 y gmm 0.222026803746
 
265.0 (255.0 - 275.0) degree
data size: 5340 weight 0.06180555555555556
GMM Weibull
R square 0.965527497337 0.961545983288
max diff: 0.0553858645427 0.0387522215433 speed value: 5.1371552508 2.05486210032 y gmm 0.54124334894
 
285.0 (275.0 - 295.0) degree
data size: 5210 weight 0.060300925925925924
GMM Weibull
R square 0.914589515392 0.89461030797
max diff: 0.10020128117 0.0627563096873 speed value: 4.06914951683 2.03457475841 y gmm 0.447783363744
 
305.0 (295.0 - 315.0) degree
data size: 4320 weight 0.05
GMM Weibull
R square 0.945554525419 0.879779604748
max diff: 0.0568036602628 0.0655799416031 speed value: 3.95135443117 1.97567721559 y gmm 0.480233376774
 
325.0 (315.0 - 335.0) degree
data size: 3266 weight 0.037800925925925925
GMM Weibull
R square 0.963453105519 0.86658893413
max diff: 0.0294178635777 0.0596800970511 speed value: 1.86635216441 4.66588041103 y gmm 0.0833064122611
 
345.0 (335.0 - 355.0) degree
data size: 2697 weight 0.03121527777777778
GMM Weibull
R square 0.873060793509 0.876845553338
max diff: 0.116180339816 0.0678088719625 speed value: 4.94338728813 4.94338728813 y gmm 0.591272385434
 
Wall time: 1min 21s
In [76]:
diff_df = pd.DataFrame(curve_collection) 

gmm_mean, weibull_mean = plot_sectoral_comparison(diff_df.r_square_gmm, diff_df.r_square_weibull, 
                                                  diff_df.direction, diff_df.datasize)
plt_configure(ylabel="$\ R^2$", xlabel='Direction'+dir_unit_text)
ylim = min(plt.gca().get_ylim()[0],0.75)
plt.gca().set_ylim(top=1, bottom=ylim)
plt.tight_layout()
print(gmm_mean, weibull_mean)
0.9585012192583655 0.9502059223140222
In [77]:
gmm_mean, weibull_mean = plot_sectoral_comparison(diff_df.max_cdf_diff_gmm, diff_df.max_cdf_diff_weibull, 
                                                  diff_df.direction, diff_df.datasize)
plt_configure(ylabel="K-S", xlabel='Direction'+dir_unit_text)
ylim = max(plt.gca().get_ylim()[1],0.25)
plt.gca().set_ylim(top=ylim, bottom=0)
plt.tight_layout()
print(gmm_mean, weibull_mean)
0.05064821620094836 0.03679963920141404
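As a cross-check on the sector-averaged scores returned by plot_sectoral_comparison, the data-size-weighted means can be recomputed directly from diff_df. A minimal sketch, using only the columns stored in curve_collection above (the numbers should match the printed gmm_mean/weibull_mean values if the helper weights by datasize):
In [ ]:
# Sketch: data-size-weighted sector averages, recomputed directly from diff_df.
for col in ['r_square_gmm', 'r_square_weibull', 'max_cdf_diff_gmm', 'max_cdf_diff_weibull']:
    print(col, np.average(diff_df[col], weights=diff_df.datasize))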
In [78]:
# Compare direction weight with previous figure
display(dir_fig)

6.5 Investigation of the Insufficiently Fitted Sector

(1) Data Variability, by Bootstrap Resampling

In [79]:
angle =  max_diff_angle = diff_df.ix[diff_df['max_cdf_diff_gmm'].idxmax()]['direction']
incre = rebinned_angle
In [80]:
FRACTION = 1

# Select data from observation
start_angle, end_angle = angle-incre/2, angle+incre/2
angle_radian, incre_radian = radians(angle), radians(incre)  
sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)
In [81]:
x = arange(0, sub_max_speed, 0.5)
_, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(sub_df.speed, x)
_, y_gmm, y_cdf_gmm, direction_prob = gmm_integration_in_direction(f, angle_radian-incre_radian/2, angle_radian+incre_radian/2, x)

fig = plt.figure(figsize=(10,1.9))
ax1 = fig.add_subplot(1,3,1)   
ax2 = fig.add_subplot(1,3,2)   
ax3 = fig.add_subplot(1,3,3)   

# 1. Data
bins=arange(0, sub_max_speed)
sub_df['speed'].hist(ax=ax1, bins=bins, alpha=0.5, label='Data', normed=True)  

# 2. GMM
ax1.plot(x, y_gmm,'-', color='black', label='GMM')
ax2.plot(x, y_cdf_gmm,'-', color = 'black', label='GMM')
ax3.plot(log(x), log(-log(1-y_cdf_gmm)),'-', color = 'black',label='GMM')

# 3. Weibull
ax1.plot(x, y_weibull,'--',color='black',label='Weibull')
ax2.plot(x, y_cdf_weibull,'--',label='Weibull')
ax3.plot(log(x), log(-log(1-y_cdf_weibull)),'--',label='Weibull')

# 4. Data Resampled
count_collection = []
for i in range(1,100):
    sub_df_resampled = sub_df.sample(frac=FRACTION, replace=True)    
    resampled_count, _ = np.histogram(sub_df_resampled['speed'], bins=bins, normed=True) 
    count_collection.append(resampled_count)
    
    ecdf = sm.distributions.ECDF(sub_df_resampled.speed)
    y_ecdf = ecdf(x) 
    ax2.plot(x, y_ecdf,':', label='Data Resampled')
    ax3.plot(log(x), log(-log(1-y_ecdf)),':', label='Data Resampled')
    if i == 1: 
#         plt_configure(ax=ax2, xlabel = "$V$", ylabel='$P$', legend={'loc':'best'})
#         plt_configure(ax=ax3, xlabel="ln($V$)", ylabel="ln(-ln(1-$P$)",legend={'loc':'best'})
        plt_configure(ax=ax2, xlabel = "V", ylabel='P', legend={'loc':'best'})
        plt_configure(ax=ax3, xlabel="ln(V)", ylabel="ln(-ln(1-P)",legend={'loc':'best'})

print('%s (%s - %s) Degree Speed Distribution' % (angle, start_angle, end_angle))
count_collection = np.array(count_collection)
mx, mn = np.max(count_collection,0), np.min(count_collection,0)
ax1.plot(bins[1:]-0.5, mx, ':', color='blue')
ax1.plot(bins[1:]-0.5, mn, ':', color='blue', label='Resample limit')
ax1.set_ylim(bottom = 0)
# plt_configure(ax=ax1, xlabel='$V$',ylabel='Frequency',legend={'loc':'best'})
plt_configure(ax=ax1, xlabel='V', ylabel='Frequency',legend={'loc':'best'})
ax1.locator_params(axis='y', nbins=5)
ax2.locator_params(axis='y', nbins=5)
ax3.locator_params(axis='y', nbins=5)
plt.tight_layout()
# Note: y_ecdf here is the ECDF of the last bootstrap resample (the loop above overwrites it),
# so this is that resample's maximum CDF difference against the GMM, not the original data's.
diff = abs(y_ecdf - y_cdf_gmm)
print(diff.max(), x[diff.argmax()], y_cdf_gmm[diff.argmax()])
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:17: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:22: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:34: RuntimeWarning: divide by zero encountered in log
345.0 (335.0 - 355.0) Degree Speed Distribution
0.119370631716 6.0 0.730462516226
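The resampling above visualises the spread of the histogram and ECDF; a complementary check is the spread of the K-S-type statistic itself. A minimal sketch, reusing sub_df, x, y_cdf_gmm and FRACTION from the cells above:
In [ ]:
# Sketch: bootstrap spread of max |ECDF - GMM CDF| for the worst-fitting sector.
# sm is statsmodels.api, as imported earlier in the notebook.
ks_resampled = []
for _ in range(100):
    resampled_speed = sub_df.sample(frac=FRACTION, replace=True).speed
    y_ecdf_resampled = sm.distributions.ECDF(resampled_speed)(x)
    ks_resampled.append(np.abs(y_ecdf_resampled - y_cdf_gmm).max())
ks_resampled = np.array(ks_resampled)
print('max CDF diff over resamples: mean %.4f, std %.4f, 95%% range (%.4f, %.4f)' %
      (ks_resampled.mean(), ks_resampled.std(),
       np.percentile(ks_resampled, 2.5), np.percentile(ks_resampled, 97.5)))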

(2) Time Variability

In [82]:
fig_time_variability_3d = plt.figure()
ax1 = fig_time_variability_3d.gca(projection='3d')

fig_time_variability_cdf,ax2 = plt.subplots(figsize=(3,1.8))
fig_time_variability_weibull, ax3 = plt.subplots(figsize=(3,1.8))

ax2.plot(x, y_cdf_gmm,'-', color='black', label = 'GMM')
ax2.plot(x, y_cdf_weibull,'--', label='Weibull')

ax3.plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black',label='GMM')
ax3.plot(log(x), log(-log(1-y_cdf_weibull)), '--', label='Weibull')

# 3. Data
prop_cycle=iter(mpl.rcParams['axes.color_cycle'])
for start_time in range(2001, 2015, 5):
    end_time = start_time + 4 
    df_other_years = df_all_years[str(start_time):str(end_time)]
    df_other_years_at_angle, sub_max_speed_other_year = select_df_by_angle(df_other_years, start_angle, end_angle)
    if len(df_other_years_at_angle) > 0 :
        
        ecdf = sm.distributions.ECDF(df_other_years_at_angle.speed)
        y_ecdf = ecdf(x)
        ax2.plot(x, y_ecdf,':', label = start_time)
        ax3.plot(log(x), log(-log(1-y_ecdf)),':', label = start_time)
        
        count, division = np.histogram(df_other_years_at_angle['speed'], normed=True,
                                       bins=arange(0, sub_max_speed_other_year))
        ax1.bar(left=division[:-1], height=count, zs=start_time, zdir='x', 
                color=next(prop_cycle), alpha=0.8)
        x_3d = start_time*np.ones_like(x)
        ax1.plot(x_3d, x, y_gmm, '-', color='black', label='GMM'  if start_time == 2011 else '')
        ax1.plot(x_3d, x, y_weibull, '--', color='blue', label='Weibull' if start_time == 2011 else '')
        
print('%s (%s - %s) Degree Speed Distribution' % (angle, start_angle, end_angle))
ax1.set_ylim(bottom = 0)
ax1.set_zlabel('Frequency')
plt_configure(ax=ax1, xlabel='Time',ylabel='V', legend=True)
plt_configure(ax=ax2, xlabel = "V", ylabel='P', legend={'loc':'best'})
plt_configure(ax=ax3, xlabel="ln(V)", ylabel="ln(-ln(1-P)", legend={'loc':'best'})

ax1.set_zlim(bottom = 0)
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:10: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:11: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:24: RuntimeWarning: divide by zero encountered in log
345.0 (335.0 - 355.0) Degree Speed Distribution
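To quantify the time variability shown above, each five-year slice can be tested against the full record with a two-sample Kolmogorov-Smirnov test. A minimal sketch, assuming scipy is available and reusing df_all_years, start_angle and end_angle from the cell above:
In [ ]:
# Sketch: two-sample K-S test of each period's sectoral speeds against the full record.
from scipy.stats import ks_2samp

full_speed = select_df_by_angle(df, start_angle, end_angle)[0].speed
for start_time in range(2001, 2015, 5):
    df_period = df_all_years[str(start_time):str(start_time + 4)]
    period_speed = select_df_by_angle(df_period, start_angle, end_angle)[0].speed
    if len(period_speed) > 0:
        statistic, p_value = ks_2samp(period_speed, full_speed)
        print('%s-%s: K-S statistic %.4f, p-value %.3g' % (start_time, start_time + 4, statistic, p_value))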

(3) Adjacent Sector Variability

In [83]:
incre = rebinned_angle
angle_group = [max_diff_angle-incre, max_diff_angle, max_diff_angle+incre]
In [84]:
fig_adjecent_variability_3d = plt.figure()
ax1 = fig_adjecent_variability_3d.gca(projection='3d')
fig_adjecent_variability_cdf, ax2 = plt.subplots(figsize=(3,1.8))
fig_adjecent_variability_weibull, ax3 = plt.subplots(figsize=(3,1.8))

legend_3d = False
prop_cycle=iter(mpl.rcParams['axes.color_cycle'])

curve_df = pd.DataFrame(curve_collection)

for angle in angle_group:
    curves = curve_df.query('direction == @angle%360').T.to_dict()
    curves = curves[list(curves)[0]]
    data_size, x =  curves['datasize'], curves['x']
    y_gmm, y_cdf_gmm =  curves['gmm_pdf'], curves['gmm_cdf'] 
    y_weibull, y_cdf_weibull, y_cdf = curves['weibull_pdf'],  curves['weibull_cdf'], curves['ecdf']

    linestyle = '-' if angle == max_diff_angle else ':'
    alpha = 0.7 if angle == max_diff_angle else 0.3

    ax2.plot(x, y_gmm*data_size, linestyle, label=angle)        
    ax3.plot(x, y_weibull*data_size, linestyle, label=angle)

    start_angle, end_angle = angle-incre/2, angle+incre/2
    sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)

    x_3d = angle*np.ones_like(x)
    ax1.plot(x_3d, x, y_gmm*data_size, color='black', label='GMM')
    ax1.plot(x_3d, x, y_weibull*data_size, color='blue', linestyle='--',label='Weibull')

    count, division = np.histogram(sub_df['speed'], bins=arange(0, sub_max_speed))
    ax1.bar(left=division[:-1], height=count, zs=angle, zdir='x', color=next(prop_cycle), alpha=0.8)

    if legend_3d == False:
        ax1.legend()
        legend_3d = True
        
plt_configure(ax=ax1, xlabel='Direction', ylabel='Speed')   
plt_configure(ax=ax2, xlabel='V',ylabel='Frequency',legend={'loc':'best'})
plt_configure(ax=ax3, xlabel='V',ylabel='Frequency',legend={'loc':'best'})
ax1.set_zlabel('Frequency')
ax1.set_zlim(bottom = 0)
ylim = max(ax1.get_ylim()[1],ax3.get_ylim()[1])
ax2.set_ylim(bottom = 0, top=ylim)
ax3.set_ylim(bottom = 0, top=ylim)

print(max_diff_angle) 
print('GMM, Weibull, Histogram')
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
345.0
GMM, Weibull, Histogram
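A two-sample Kolmogorov-Smirnov test between the worst-fitting sector and its neighbours gives a quantitative counterpart to the visual comparison above. A minimal sketch, assuming scipy is available and reusing select_df_by_angle, max_diff_angle and incre:
In [ ]:
# Sketch: two-sample K-S test of the worst-fitting sector against each adjacent sector.
from scipy.stats import ks_2samp

centre_speed = select_df_by_angle(df, max_diff_angle - incre/2, max_diff_angle + incre/2)[0].speed
for neighbour in [max_diff_angle - incre, max_diff_angle + incre]:
    neighbour_speed = select_df_by_angle(df, neighbour - incre/2, neighbour + incre/2)[0].speed
    statistic, p_value = ks_2samp(centre_speed, neighbour_speed)
    print('%s vs %s degree: K-S statistic %.4f, p-value %.3g'
          % (max_diff_angle, neighbour % 360, statistic, p_value))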

7. Result Variability & Cross-Validation

In [85]:
if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWDITH    
if 'FIT_METHOD' not in globals():
    FIT_METHOD = 'square_error'       
if 'KDE_KERNEL' not in globals():
    KDE_KERNEL = 'gaussian'
    
config = {'bandwidth': bandwidth, 
          'fitting_range': FITTING_RANGE,
          'fit_limit': fit_limit,
          'kde_kernel': KDE_KERNEL}

print(bandwidth, FIT_METHOD)
0.9 square_error

7.1 Variability of the Result

In [86]:
%%time
results = Parallel(n_jobs=-1)(delayed(resampled_fitting)(df, FIT_METHOD, NUMBER_OF_GAUSSIAN, config) for i in range(12))                        
for result in results:
    display(pretty_print_gmm(result['gmm']))
    fig,ax = plt.subplots(figsize=(3.5,3.5))
    plot_gmm_ellipses(result['gmm'],ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
    plt.show()
    
    display(gof_df(result['gmm_pdf_result'], result['kde_result']))
    display(gof_df(result['gmm_pdf_result'], kde_result))
    print('')
weight mean_x mean_y sig_x sig_y corr
1 0.375 -5.376 -4.662 6.389 3.353 0.369
2 0.266 2.857 3.640 3.527 2.595 -0.180
3 0.213 -3.911 -1.210 3.209 3.596 -0.302
4 0.146 1.577 -2.768 1.910 1.732 0.374
GMM Plot Result
0.375448053484 [[-5.37594574 -4.66235194]] [ 3.04371082  6.54217909] -75.9549086503
0.26602238796 [[ 2.85664815  3.64014117]] [ 2.50854996  3.58874796] -104.983081227
0.212585666936 [[-3.91060429 -1.2104057 ]] [ 2.80873011  3.91616331] -145.357625383
0.14594389162 [[ 1.5771826  -2.76750495]] [ 1.42940973  2.14591925] -52.3617713338
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.989 0.013 0.316 2.709750e-08 0.015 0.183
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.989 0.014 0.619 2.638573e-08 0.015 0.180

weight mean_x mean_y sig_x sig_y corr
1 0.639 -3.415 -1.343 6.586 5.380 0.525
2 0.179 1.363 -3.032 2.041 1.913 0.428
3 0.110 3.184 3.563 2.681 2.089 -0.368
4 0.072 -4.236 -3.709 2.062 2.192 0.363
GMM Plot Result
0.639490976765 [[-3.41496652 -1.34251593]] [ 4.0267695   7.48995103] -55.6099944253
0.178519766856 [[ 1.36298829 -3.03234838]] [ 1.49065553  2.36686602] -49.3238063512
0.11017247954 [[ 3.18358905  3.56316332]] [ 1.81085481  2.87632023] -117.798558473
0.0718167768389 [[-4.23616436 -3.7086695 ]] [ 1.69244608  2.48876111] 139.760769668
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.009 0.027 2.607479e-08 0.015 0.179
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.008 0.027 2.552474e-08 0.015 0.177

weight mean_x mean_y sig_x sig_y corr
1 0.604 -3.792 -1.417 6.638 5.462 0.547
2 0.194 1.327 -3.040 2.126 1.962 0.411
3 0.123 3.302 3.570 2.858 2.188 -0.340
4 0.079 -4.193 -3.559 1.948 2.483 0.357
GMM Plot Result
0.604257341238 [[-3.79216734 -1.41677057]] [ 3.98633886  7.61658738] -54.8694809847
0.193825099153 [[ 1.3267304  -3.04024178]] [ 1.56171999  2.43501587] -50.5581543227
0.123138711771 [[ 3.30244037  3.56961851]] [ 1.93904306  3.03287411] -115.76491804
0.0787788478381 [[-4.19277349 -3.55861435]] [ 1.69819776  2.65974022] 152.214778199
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.008 0.037 2.475309e-08 0.015 0.175
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.007 0.030 2.519418e-08 0.015 0.176

weight mean_x mean_y sig_x sig_y corr
1 0.371 -5.448 -4.669 6.139 3.319 0.372
2 0.278 2.769 3.612 3.663 2.587 -0.163
3 0.216 -3.667 -1.377 3.488 3.521 -0.428
4 0.134 1.644 -2.701 1.909 1.698 0.394
GMM Plot Result
0.371465073157 [[-5.44805037 -4.66867073]] [ 3.00180572  6.30018748] -75.2008959545
0.277928044219 [[ 2.76878989  3.61185856]] [ 2.5211027   3.70873648] -102.351119403
0.216160186693 [[-3.6673569 -1.3768932]] [ 2.64983906  4.1885491 ] -135.6220456
0.134446695931 [[ 1.64355755 -2.70094079]] [ 1.3894515   2.14405705] -53.2840162484
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.988 0.014 3.584 2.930807e-08 0.016 0.190
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.989 0.014 0.600 2.662018e-08 0.015 0.181

weight mean_x mean_y sig_x sig_y corr
1 0.592 -3.823 -1.480 6.777 5.488 0.562
2 0.198 1.301 -3.040 2.162 1.977 0.420
3 0.122 3.131 3.621 2.928 2.163 -0.345
4 0.088 -4.192 -3.476 2.027 2.675 0.247
GMM Plot Result
0.591624571809 [[-3.82349257 -1.48014061]] [ 3.95746222  7.77120246] -55.3507383999
0.197946035123 [[ 1.3009184  -3.04025902]] [ 1.56660784  2.47533174] -51.0269458039
0.122251122296 [[ 3.13081076  3.62062166]] [ 1.92313703  3.09067761] -114.139923281
0.0881782707725 [[-4.19198043 -3.47593024]] [ 1.89803468  2.76830624] 159.33820348
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.008 0.028 2.487472e-08 0.015 0.175
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.010 0.031 2.542096e-08 0.015 0.177

weight mean_x mean_y sig_x sig_y corr
1 0.599 -3.621 -1.519 6.786 5.463 0.539
2 0.194 1.246 -3.042 2.154 1.938 0.426
3 0.124 3.123 3.595 2.846 2.133 -0.322
4 0.083 -4.310 -3.348 1.933 2.728 0.304
GMM Plot Result
0.598621034071 [[-3.62119947 -1.51938389]] [ 4.04707289  7.71512781] -56.031172083
0.194303387935 [[ 1.2464642  -3.04180945]] [ 1.53840747  2.4555009 ] -51.9842083939
0.124126092224 [[ 3.12317389  3.59454432]] [ 1.91907203  2.99449207] -113.901452554
0.0829494857705 [[-4.30984572 -3.34811682]] [ 1.77201382  2.83483948] 159.58408704
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.991 0.008 0.028 2.301821e-08 0.014 0.169
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.008 0.027 2.553200e-08 0.015 0.177

weight mean_x mean_y sig_x sig_y corr
1 0.621 -3.632 -1.449 6.533 5.390 0.527
2 0.189 1.371 -3.049 2.108 1.979 0.432
3 0.115 3.310 3.653 2.862 2.119 -0.315
4 0.075 -4.178 -3.615 2.125 2.272 0.387
GMM Plot Result
0.62073431314 [[-3.63212762 -1.44856034]] [ 4.01423212  7.45838457] -55.0887903518
0.189237399945 [[ 1.37100064 -3.0493021 ]] [ 1.53630195  2.45005296] -49.1534485667
0.115014571059 [[ 3.31029824  3.65305488]] [ 1.91794567  2.99988278] -112.970302733
0.0750137158559 [[-4.17778315 -3.61478833]] [ 1.71559341  2.59470264] 139.901422135
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.009 0.028 2.522015e-08 0.015 0.176
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.010 0.026 2.555685e-08 0.015 0.178

weight mean_x mean_y sig_x sig_y corr
1 0.609 -3.687 -1.423 6.650 5.470 0.547
2 0.190 1.349 -3.022 2.131 2.001 0.447
3 0.118 3.164 3.627 2.882 2.096 -0.345
4 0.083 -4.191 -3.504 2.049 2.497 0.321
GMM Plot Result
0.609404454969 [[-3.68715492 -1.42300719]] [ 3.99229663  7.62877993] -54.8893931553
0.190112617127 [[ 1.34914084 -3.02235485]] [ 1.53219555  2.48956966] -49.0206982367
0.117638006667 [[ 3.16395455  3.62663548]] [ 1.86794125  3.03461598] -113.4215505
0.0828449212371 [[-4.19072016 -3.50398623]] [ 1.81257081  2.6733997 ] 150.925548939
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.989 0.008 0.028 2.656539e-08 0.016 0.181
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.010 0.029 2.527411e-08 0.015 0.177

weight mean_x mean_y sig_x sig_y corr
1 0.617 -3.498 -1.297 6.625 5.427 0.530
2 0.185 1.324 -3.057 2.071 1.929 0.411
3 0.115 3.284 3.469 2.657 2.146 -0.369
4 0.083 -4.264 -3.584 2.125 2.372 0.326
GMM Plot Result
0.617408827799 [[-3.498061  -1.2971891]] [ 4.03513284  7.55351301] -55.3660210335
0.185181969714 [[ 1.32429114 -3.05718152]] [ 1.5300652   2.38153737] -49.9059576334
0.114810304632 [[ 3.28398109  3.46873157]] [ 1.84279656  2.87582228] -119.885973691
0.0825988978546 [[-4.26363714 -3.58351267]] [ 1.82608789  2.60856446] 144.3250854
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.008 0.028 2.618556e-08 0.015 0.180
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.009 0.028 2.547773e-08 0.015 0.177

weight mean_x mean_y sig_x sig_y corr
1 0.615 -3.666 -1.321 6.596 5.455 0.551
2 0.195 1.319 -3.038 2.177 2.000 0.444
3 0.115 3.326 3.498 2.861 2.120 -0.373
4 0.075 -4.295 -3.722 2.044 2.404 0.378
GMM Plot Result
0.614724249195 [[-3.66570392 -1.32138525]] [ 3.95644635  7.59013157] -54.5619274221
0.195225601144 [[ 1.31926155 -3.03839469]] [ 1.55017405  2.51722431] -50.4301307075
0.114614162042 [[ 3.3258696   3.49782488]] [ 1.84878951  3.04308134] -115.404791838
0.0754359876188 [[-4.29485681 -3.72168705]] [ 1.71919831  2.64621573] 146.662552871
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.989 0.008 0.035 2.660832e-08 0.016 0.181
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.008 0.032 2.526040e-08 0.015 0.177

weight mean_x mean_y sig_x sig_y corr
1 0.391 -5.365 -4.515 6.286 3.358 0.310
2 0.261 2.897 3.594 3.362 2.620 -0.193
3 0.194 -4.007 -0.918 2.856 3.660 -0.255
4 0.155 1.469 -2.811 1.971 1.754 0.357
GMM Plot Result
0.39051310033 [[-5.36543371 -4.5147183 ]] [ 3.13516784  6.40045596] -77.5558371284
0.260998972069 [[ 2.89732959  3.59395166]] [ 2.50736308  3.446292  ] -108.741231626
0.193560662456 [[-4.00700874 -0.91805557]] [ 2.65351999  3.80922807] -157.23889151
0.154927265146 [[ 1.46886508 -2.81073827]] [ 1.47710329  2.18583166] -54.0604256321
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.989 0.015 0.298 2.811234e-08 0.016 0.186
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.989 0.017 0.896 2.667552e-08 0.015 0.181

weight mean_x mean_y sig_x sig_y corr
1 0.626 -3.511 -1.382 6.664 5.409 0.545
2 0.183 1.341 -3.052 2.075 1.922 0.422
3 0.116 3.205 3.506 2.774 2.163 -0.356
4 0.076 -4.239 -3.707 2.047 2.357 0.324
GMM Plot Result
0.625562295697 [[-3.51128692 -1.3820564 ]] [ 3.97300328  7.60833224] -55.5512561999
0.182919431445 [[ 1.34136957 -3.05232251]] [ 1.51311984  2.38921208] -50.1616693454
0.115654914064 [[ 3.20485691  3.5057561 ]] [ 1.889127    2.96726538] -117.386711879
0.0758633587938 [[-4.23924269 -3.7069459 ]] [ 1.77992596  2.56523654] 146.791002555
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.008 0.033 2.578717e-08 0.015 0.178
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.007 0.029 2.512363e-08 0.015 0.176
Wall time: 43.6 s
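The twelve resampled fits are displayed individually above; their goodness-of-fit against the full-data KDE can also be summarised in one table. A minimal sketch, reusing results, gof_df and kde_result from the cell above and assuming gof_df returns a one-row DataFrame as displayed:
In [ ]:
# Sketch: summarise the GoF of the 12 resampled fits against the full-data KDE.
gof_resampled = pd.concat([gof_df(result['gmm_pdf_result'], kde_result) for result in results],
                          ignore_index=True)
display(gof_resampled.describe().loc[['mean', 'std', 'min', 'max']])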

7.2 Cross-Validation to Select the Number of Gaussians

In [87]:
%%time
from sklearn.cross_validation import train_test_split, KFold

## 4-fold cross-validation
gaussian_number_range = arange(1,6)
CV_result_train_all,CV_result_test_all =[],[]
number_of_fold = 4
print('Number of train/test dataset', len(df)*(number_of_fold-1)/number_of_fold, len(df)/number_of_fold) 

for number_of_gaussian in gaussian_number_range:
    print( '  ')
    print('Number of gaussian', number_of_gaussian)
    
    kf = KFold(len(df), n_folds=number_of_fold, shuffle=True) 

    CV_result = Parallel(n_jobs=-1)(delayed(fit_per_fold)(df, train_index, test_index, FIT_METHOD, number_of_gaussian, config) for train_index, test_index in kf)                        

    CV_result_train, CV_result_test = list(zip(*CV_result))
    CV_result_train, CV_result_test = list(CV_result_train), list(CV_result_test)
        
    CV_result_train_all.append(CV_result_train)
    CV_result_test_all.append(CV_result_test)
    
    print('Train')
    pretty_pd_display(CV_result_train)
    print('Test')
    pretty_pd_display(CV_result_test)
D:\ProgramData\Anaconda3\lib\site-packages\sklearn\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
Number of train/test dataset 64800.0 21600.0
  
Number of gaussian 1
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.176884 0.074135 3.878059e-07 0.058818 0.691612 0.843048
1 0.188004 0.075327 3.903619e-07 0.058975 0.694287 0.841985
2 0.147722 0.073739 3.761285e-07 0.058262 0.681272 0.847264
3 0.194849 0.074582 3.886130e-07 0.059470 0.692341 0.843088
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.158188 0.072191 3.786549e-07 0.059010 0.683959 0.846734
1 0.161473 0.069775 3.711260e-07 0.058531 0.675957 0.849828
2 0.400425 0.078487 4.176790e-07 0.060905 0.717859 0.832612
3 0.175761 0.077330 3.857625e-07 0.057812 0.690320 0.842759
  
Number of gaussian 2
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.097518 0.027092 9.745246e-08 0.029703 0.346822 0.960542
1 0.090336 0.026964 9.927790e-08 0.029933 0.349997 0.959877
2 0.118123 0.028223 1.001892e-07 0.029457 0.351516 0.959630
3 0.092103 0.028640 1.025926e-07 0.030770 0.355836 0.958208
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.108317 0.028759 1.106624e-07 0.031202 0.369354 0.955269
1 0.151458 0.026761 1.037106e-07 0.030346 0.357744 0.957823
2 0.078305 0.025324 1.043670e-07 0.032452 0.359131 0.957217
3 0.162889 0.030428 9.734076e-08 0.028461 0.346455 0.961379
  
Number of gaussian 3
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.068710 0.025547 4.540130e-08 0.020376 0.236601 0.981606
1 0.072838 0.026597 4.598658e-08 0.020414 0.238236 0.981320
2 0.064041 0.026661 4.416640e-08 0.019915 0.233523 0.982091
3 0.057280 0.025609 4.485194e-08 0.019837 0.235238 0.981946
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.069924 0.024775 4.696082e-08 0.020029 0.240988 0.981047
1 0.078546 0.026479 4.620981e-08 0.020135 0.238708 0.981500
2 0.072029 0.024602 5.168073e-08 0.021585 0.252284 0.979201
3 0.113478 0.029428 4.909832e-08 0.021803 0.246181 0.979794
  
Number of gaussian 4
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.790776 0.014052 2.675520e-08 0.015484 0.181751 0.989118
1 0.270129 0.013748 2.702811e-08 0.015655 0.182638 0.989048
2 0.030087 0.008200 2.381264e-08 0.014633 0.171409 0.990395
3 0.030497 0.007856 2.444087e-08 0.014776 0.173599 0.990124
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.435547 0.011291 2.845708e-08 0.016066 0.187218 0.988649
1 10.859439 0.015400 2.763835e-08 0.015557 0.184621 0.988850
2 0.039884 0.011268 3.413733e-08 0.017505 0.205259 0.986049
3 0.033414 0.009308 3.005793e-08 0.016591 0.192792 0.987772
  
Number of gaussian 5
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.420206 0.011472 1.752818e-08 0.012426 0.147077 0.992930
1 0.200281 0.009313 1.760153e-08 0.012656 0.147374 0.992861
2 0.265383 0.014130 1.827162e-08 0.012857 0.150181 0.992604
3 0.137819 0.015933 1.803788e-08 0.012743 0.149148 0.992684
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.362061 0.013057 2.269030e-08 0.014730 0.167287 0.990721
1 0.134426 0.009454 2.468141e-08 0.014626 0.174511 0.990080
2 0.185746 0.013282 1.969058e-08 0.013175 0.155785 0.992032
3 0.216990 0.015787 2.149176e-08 0.013867 0.162980 0.991354
Wall time: 1min 8s
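The DeprecationWarning above comes from sklearn.cross_validation; on scikit-learn 0.18+ the same folds are produced by sklearn.model_selection, where KFold takes the number of splits at construction and yields indices via split(). A minimal sketch of the equivalent fold generation for the loop body above:
In [ ]:
# Sketch: equivalent fold generation with the non-deprecated sklearn.model_selection API.
from sklearn.model_selection import KFold

kf = KFold(n_splits=number_of_fold, shuffle=True)
CV_result = Parallel(n_jobs=-1)(
    delayed(fit_per_fold)(df, train_index, test_index, FIT_METHOD, number_of_gaussian, config)
    for train_index, test_index in kf.split(df))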
In [88]:
train_scores_mean, train_scores_std = generate_mean_std_gof(CV_result_train_all)
print('Train gof mean, std')
display(train_scores_mean)

test_scores_mean, test_scores_std = generate_mean_std_gof(CV_result_test_all)
print('Test gof mean, std')
display(test_scores_mean)
Train gof mean, std
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
1 0.176865 0.074446 3.857273e-07 0.058881 0.689878 0.843846
2 0.099520 0.027730 9.987805e-08 0.029966 0.351043 0.959564
3 0.065718 0.026104 4.510155e-08 0.020136 0.235899 0.981741
4 0.280372 0.010964 2.550920e-08 0.015137 0.177349 0.989671
5 0.255922 0.012712 1.785980e-08 0.012670 0.148445 0.992770
Test gof mean, std
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
1 0.223962 0.074446 3.883056e-07 0.059064 0.692024 0.842983
2 0.125242 0.027818 1.040202e-07 0.030615 0.358171 0.957922
3 0.083495 0.026321 4.848742e-08 0.020888 0.244540 0.980386
4 2.842071 0.011817 3.007267e-08 0.016430 0.192473 0.987830
5 0.224806 0.012895 2.213851e-08 0.014099 0.165141 0.991047
In [89]:
prop_cycle=mpl.rcParams['axes.color_cycle']
gaussian_number_range = train_scores_mean.index
for column, column_name in zip(['R_square','K_S','Chi_square'],["$\ R^2$", "K-S", "$\widetilde{\chi^2} $"]):
    plot(gaussian_number_range, train_scores_mean[column],
             '--', label = 'training', color=prop_cycle[0])
    plt.fill_between(gaussian_number_range, 
                     train_scores_mean[column] - train_scores_std[column],
                     train_scores_mean[column] + train_scores_std[column], 
                     alpha=0.2, color=prop_cycle[0])
    
    plot(gaussian_number_range, test_scores_mean[column],
             '-', label = 'test',color=prop_cycle[1])
    plt.fill_between(gaussian_number_range, 
                 test_scores_mean[column] - test_scores_std[column],
                 test_scores_mean[column] + test_scores_std[column], 
                 alpha=0.2,color=prop_cycle[1])
    plt.xticks(gaussian_number_range)
    print(column)
    plt.locator_params(axis='y', nbins=5)
    plt_configure(xlabel='Number of Gaussian Distributions', ylabel=column_name, 
                  figsize=(3,2), legend={'loc':'best'})
    if column == 'R_square':
        plt.gca().set_ylim(top=1)
    if column == 'K_S' or column == 'Chi_square':
        plt.gca().set_ylim(bottom=0)
    plt.show()
R_square
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
K_S
Chi_square
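An alternative, likelihood-based way to choose the number of components is the Bayesian information criterion of sklearn.mixture.GaussianMixture fitted directly to the wind-vector samples. A minimal sketch, assuming the wind-vector components are stored in df columns named 'x' and 'y' (as the ellipse-plot axis labels above suggest; adjust the names if they differ):
In [ ]:
# Sketch: BIC-based selection of the number of Gaussian components.
# Assumes df has wind-vector columns 'x' and 'y' (an assumption; rename as needed).
from sklearn.mixture import GaussianMixture

samples = df[['x', 'y']].dropna().values
for n in gaussian_number_range:
    gm = GaussianMixture(n_components=n, covariance_type='full', n_init=3).fit(samples)
    print('n_components = %d, BIC = %.1f' % (n, gm.bic(samples)))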
In [ ]:
# for fig in [fig_hist, fig_kde, fig_em, fig_gmm]:
#     display(fig)
# for fig in [fig_time_variability_3d, fig_time_variability_cdf, fig_time_variability_weibull, 
#             fig_adjecent_variability_3d, fig_adjecent_variability_cdf, fig_adjecent_variability_weibull,]:
#     display(fig)
In [ ]:
import time
save_notebook()
time.sleep(3)
location_name = get_location_name(file_path)
print(location_name)
current_file = 'GMM.ipynb'
output_file = './output_HTML/'+location_name+'.html' 

output_HTML(current_file, output_file)