Modeling the Joint Distribution of Wind Speed and Direction using Gaussain Mixture Models¶

OEN Method: Harris, Cook The parent wind speed distribution: Why Weibull? http://www.sciencedirect.com/science/article/pii/S0167610514001056

Gaussian Mixture Models, http://scikit-learn.org/stable/modules/mixture.html

1. Set up¶

1.1 Environment¶

%matplotlib inline
%load_ext autoreload
%autoreload 2

from import_file import *
from helpers.parallel_helper import *
load_libs()

plt.rcParams['axes.autolimit_mode'] = 'round_numbers'
plt.rcParams['axes.xmargin'] = 0.
plt.rcParams['axes.ymargin'] = 0.
mpl.rcParams['patch.force_edgecolor'] = True

1.2 Read Data¶

# file_path, bandwidth= './data/NCDC/europe/uk/marham/dat.txt', 1.7
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= './data/NCDC/europe/uk/tiree/dat.txt', 1.9, 4 
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/NCDC/europe/uk/boscombe_down/dat.txt', 1.5, 4
# file_path, bandwidth= './data/NCDC/europe/uk/middle_wallop/dat.txt', 1.3
# file_path, bandwidth= './data/NCDC/europe/uk/bournemouth/dat.txt',1.3 # 4?
# file_path= "./data/NCDC/europe/uk/weybourne/dat.txt"
# file_path= "./data/NCDC/europe/uk/skye_lusa/dat.txt" # 
# file_path= "./data/NCDC/europe/uk/wattisham/dat.txt"
# file_path= "./data/NCDC/europe/uk/south_uist_range/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/europe/uk/holbeach/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/europe/uk/cambridge/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/europe/us/baltimore/dat.txt" # time too short
# file_path= "./data/NCDC/europe/uk/bealach_na_ba/dat.txt" # time too short
# file_path= "./data/NCDC/europe/uk/benbecula/dat.txt" # truncate (untruncate in m/s), 4?
# file_path= './data/NCDC/europe/uk/southhamption/dat.txt' # high 0, trend

# file_path, bandwidth, NUMBER_OF_GAUSSIAN = "./data/NCDC/europe/germany/landsberg_lech/dat.txt", 0.9, 4 
# file_path, bandwidth= "./data/NCDC/europe/germany/neuburg/dat.txt", 0.7
# file_path, bandwidth= "./data/NCDC/europe/germany/laupheim/dat.txt", 0.7 # double peak, 4?, trend
# file_path, bandwidth= './data/NCDC/europe/germany/niederstetten/dat.txt', 0.9 # get the peak
# file_path, bandwidth= "./data/NCDC/europe/germany/holzdorf/dat.txt", 0.9 # 2008 year
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= './data/NCDC/europe/france/nantes/dat.txt', 0.9, 4 # unit shift, one direction deviate big
# file_path= './data/NCDC/europe/france/pau_pyrenees/dat.txt' # unit shift, 2; force using knot 
# file_path= "./data/NCDC/europe/france/avord/dat.txt" # try 4, initial speed (should be good with m/s), incompete dataset
# file_path= "./data/NCDC/europe/france/vatry/dat.txt"  # double peak, initial speed, incompete dataset
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= "./data/NCDC/europe/spain/valladolid/dat.txt", 1.1, 4
# file_path= './data/NCDC/europe/spain/jerez/dat.txt' # high 0
# file_path, bandwidth= "./data/NCDC/europe/spain/barayas/dat.txt", 0.7 # not good fit
# file_path, bandwidth= './data/NCDC/europe/spain/malaga/dat.txt', 0.7 # directions blocked?
# file_path, bandwidth= './data/NCDC/europe/spain/tenerife_sur/dat.txt', 0.7 # directions blocked?
# file_path, bandwidth= './data/NCDC/europe/spain/almeria/dat.txt', 0.7 # negative dimensions?
# file_path, bandwidth= './data/NCDC/europe/greece/eleftherios_intl/dat.txt',0.7 # some direction might be blocked
# file_path= './data/NCDC/europe/ciampino/dat.txt' # try 4, bandwidth?
# file_path= "./data/NCDC/europe/huspel_aws/dat.txt"  # integer, 4?
# file_path= './data/NCDC/gibraltar/dat.txt' # bad fit

# MidEast
# file_path, bandwidth= './data/NCDC/mideast/uae/al_maktoum/dat.txt', 1.1
# file_path= './data/NCDC/mideast/uae/sharjah_intl/dat.txt' 
# file_path= './data/NCDC/mideast/uae/dubai_intl/dat.txt' 
# file_path= './data/NCDC/mideast/uae/abu_dhabi_intl/dat.txt' # Time shift
# file_path= './data/NCDC/mideast/uae/bateen/dat.txt' # Time shift
# file_path= './data/NCDC/mideast/buraimi/dat.txt' # not good dataset
# file_path= './data/NCDC/mideast/turkey/konya/dat.txt' 
# file_path= './data/NCDC/mideast/turkey/sivas/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/turkey/balikesir/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/turkey/bartin/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/iran/chahbahar/dat.txt'
# file_path= './data/NCDC/mideast/iran/zabol/dat.txt' # Problematic data
# file_path= './data/NCDC/mideast/iran/torbat_heydarieh/dat.txt' # Unusable

# file_path, bandwidth = "./data/NCDC/cn/shanghai/hongqiao_intl/dat.txt", 0.6
# file_path, bandwidth= "./data/NCDC/cn/shanghai/pudong/dat.txt", 0.8
# file_path, bandwidth= "./data/NCDC/cn/hefei_luogang/dat.txt", 0.6 # few 0, trend, try 2
# file_path, bandwidth= "./data/NCDC/cn/nanjing_lukou/dat.txt", 0.5
# file_path= "./data/NCDC/cn/zhengzhou_xinzheng/dat.txt" 
# file_path= "./data/NCDC/cn/tianjin/binhai/dat.txt" # few 0, trend, stationary speed, unstationary direction
# file_path= "./data/NCDC/cn/tianjin/tianjing/dat.txt" # 16 sectors
# file_path= "./data/NCDC/cn/shijiazhuang_zhengding/dat.txt" 
# file_path= "./data/NCDC/cn/henan_gushi/dat.txt" # 16 sectors, fit not very good
# file_path= "./data/NCDC/cn/nanning_wuxu/dat.txt" # numpy priblem, unstationary speed
# file_path= './data/NCDC/cn/macau/dat.txt'  
# file_path= "./data/NCDC/cn/hk_intl/dat.txt" # few 0
# file_path= './data/NCDC/cn/gaoqi/dat.txt' 

# file_path= './data/NCDC/southeast_asia/malaysia/mersing/dat.txt' # 2 mode, paper comparison
# file_path= './data/NCDC/southeast_asia/malaysia/penang/dat.txt'
# file_path= './data/NCDC/southeast_asia/malaysia/butterworth/dat.txt' # 2 mode 
# file_path= "./data/NCDC/southeast_asia/malaysia/bsultan_mahmud/dat.txt" # stable
# file_path= "./data/NCDC/southeast_asia/malaysia/bsultan_ismail/dat.txt" # 
# file_path= "./data/NCDC/southeast_asia/singapore/changi/dat.txt" # trend, no 0, questionary data
# file_path= "./data/NCDC/southeast_asia/singapore/paya_lebar/dat.txt" # questionary data
# file_path= "./data/NCDC/southeast_asia/singapore/seletar/dat.txt"
# file_path= "./data/NCDC/east_asia/cheongju_intl/dat.txt" # 2005-2009  may have problem, fit is good; numpy problem
# file_path= "./data/NCDC/east_asia/daegu_ab/dat.txt" # recent 5 year may have problem, but fit is generally good; numpy problem

# file_path, bandwidth= "./data/NCDC/oceania/auckland_intl/dat.txt", 0.9  # Good data, double mode
# file_path= "./data/NCDC/oceania/brisbane_archerfield/dat.txt" # high 0, few data 
# file_path= "./data/NCDC/oceania/narrandera/dat.txt" # high 0, few data
# file_path, bandwidth= "./data/NCDC/oceania/canberra/dat.txt", 0.7 # high 0, bad fit
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= './data/NCDC/oceania/horsham/dat.txt', 0.9, 4 # get the peak

# file_path, bandwidth= './data/NCDC/us/boston_16nm/dat.txt', 0.9 # Offshore, mixed type

# file_path, bandwidth= './data/asos/olympia/hr_avg.csv', 0.5 # might block
# file_path, bandwidth, NUMBER_OF_GAUSSIAN  = './data/asos/bismarck_ND/hr_avg.csv', 1.1, 4
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/asos/aberdeen_SD/hr_avg.csv', 1.7, 2 # only to 2012
file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/asos/minneapolis/hr_avg.csv', 1.1, 4
# file_path, bandwidth = './data/asos/lincoln_NE/hr_avg.csv', 0.9
# file_path, bandwidth = './data/asos/des_moines_IA/hr_avg.csv', 1.3
# file_path, bandwidth = './data/asos/springfield_IL/hr_avg.csv', 1.1 
# file_path, bandwidth = './data/asos/topeka/hr_avg.csv', 0.7 # High 0
# file_path, bandwidth = './data/asos/denver/hr_avg.csv', 1.3

# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/NDAWN/baker/hr_avg.csv', 0.7, 4 
# file_path, bandwidth = './data/NDAWN/dickinson/hr_avg.csv', 0.6
# file_path = './data/NDAWN/rugby/hr_avg.csv'
# file_path = './data/NDAWN/bowman/hr_avg.csv'
# file_path = './data/NDAWN/grand_forks/hr_avg.csv'
# file_path = './data/NDAWN/williston/hr_avg.csv'
# file_path = './data/NDAWN/jamestown/hr_avg.csv'

# file_path, bandwidth, NUMBER_OF_GAUSSIAN = 'data/ECMWF/usa/47N123W/dat.csv', 0.7, 4 #good 
# file_path, bandwidth = 'data/ECMWF/venezuela/8N67W/dat.csv', 0.7 # good, but the data might be problematic.
# file_path, bandwidth = 'data/ECMWF/chile/52S75W/dat.csv', 1.9 # good
# file_path, bandwidth= 'data/ECMWF/iceland/65N17W/dat.csv', 1.9 # good
# file_path, bandwidth, NUMBER_OF_GAUSSIAN  = 'data/ECMWF/germany/49N9E/dat.csv', 1.1, 4 # miss peak
# file_path, bandwdith = 'data/ECMWF/sudan/18N32E/dat.csv', 1.1 # good
# file_path, bandwidth = 'data/ECMWF/china/24N121E/dat.csv', 0.9 # good
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = 'data/ECMWF/australia/37S142E/dat.csv', 0.7, 4 # miss the peak, force bandwidth 0.7

if "cn_database" in file_path: 
    df = read_cn_database(file_path)
elif 'NCDC' in file_path:
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df.rename(columns={'Date':'date','Dir':'dir','Spd':'speed','Type':'type','I.1':'wind_type'}, inplace=True)
    df = df[['date','HrMn','type','dir','speed','wind_type' ]]
    df.dropna(subset=['dir','speed'], inplace=True)
    integer_data = True
elif 'NDAWN' in file_path:
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df['type']='default'
    df['wind_type']='default'
    df = df.dropna()
    convert_to_knot = False
    integer_data = False
elif 'asos' in file_path:
    # ASOS
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df['type']='default'
    df['wind_type']='default'
    df = df.dropna()
    convert_to_knot = False
    integer_data = False
    knot_unit = True
else:
    df = pd.read_csv(file_path, header=0, skipinitialspace=True)
    df.rename(columns={'U':'x','V':'y'}, inplace=True)
    df.x=-df.x
    df.y=-df.y
    df['speed']=np.sqrt(df.x**2+df.y**2)
    df['dir']=np.degrees(np.arctan2(df.y, df.x))%360
    df['time']=pd.to_datetime('1979-01-01T00:00:00Z')+pd.to_timedelta(df['time'], unit='h')
    df['date']=df['time'].dt.strftime('%Y%m%d')
    df['date']=df['date'].astype(int)
    df['HrMn']=df['time'].dt.strftime('%H00')
    df['type']='default'
    df['wind_type']='default'
    convert_to_knot = True
    integer_data = False
    cartesian = True

df

if 'NCDC' in file_path:
    lat, long = get_lat_long(file_path)
    print(lat,long)
    map_osm = folium.Map(location=[lat, long], zoom_start=4)
    folium.Marker([lat, long]).add_to(map_osm)
    display(map_osm)

df['time']=pd.to_datetime(df["date"].astype(str).map(str) + df["HrMn"], format='%Y%m%d%H%M')
df.set_index(['time'], inplace=True)
df['HrMn']=df['HrMn'].astype(int)
df = df.query("(dir <= 999) & (speed < 100) ")['1970':'2016']

plot_speed_and_angle_distribution(df.speed, df.dir)

D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))

# Dir [10,360]=> [0,350]
df['dir'] = df['dir'].apply(lambda x: x%360 if x < 999 else x) 
# Convert Windrose coordianates to Polar Cooridinates 
if 'cartesian' in globals():
    df['dir_windrose'] = df['dir'].apply(lambda x: (90 - x)%360 if x < 999 else x)
else:
    df['dir_windrose'] = df['dir']
    df['dir'] = df['dir'].apply(lambda x: (90 - x)%360 if x < 999 else x)
display(df.describe())
df.plot(y='speed',legend=True,figsize=(20,5))

<matplotlib.axes._subplots.AxesSubplot at 0xc19e048>

1.3 General Data Info¶

1.3.1 Unit Detection¶

df['decimal'] = df.speed % 1
df.decimal.hist(alpha=0.5, label='m/s', figsize=(4, 3))

if 'convert_to_knot' not in globals():
    convert_to_knot = True if len(df.query('decimal >= 0.2')) / len(df) > 0.3 else False
    
if convert_to_knot:
    knot_unit = True
    df['speed'] = df['speed'] * 1.943845
    df['decimal'] = df.speed % 1
    df.decimal.hist(alpha=0.5, label='knot')
    # need more elaboration, some is not near an integer
    if integer_data:
        df['speed'] = df['speed'].apply(lambda x: int(round(x)))
    plt_configure(xlabel='Decimal', ylabel='Frequency', legend={'loc': 'best'}, title='Decimal Distribution')
else:
    if 'knot_unit' not in globals():
        knot_unit = False
    
df.drop(['decimal'], 1,inplace=True)
print(knot_unit)

True

dir_unit_text = ' (degree)'
if knot_unit == True:
    speed_unit_text = ' (knot)'
else: 
    speed_unit_text = ' (m/s)'

1.3.2 Sampling Type Selection¶

sample_type = df.query('date > 20000000')['type']
sample_type.value_counts().plot(
    kind = 'bar', title = 'Report Types Comprisement', figsize=(4,3))

report_type_most_used = sample_type.value_counts().argmax()
df = df.query("type==@report_type_most_used")

1.3.3 Sampling Time Selection¶

MID_YEAR = int(np.average(df.index.year))

df['HrMn'].value_counts().sort_index().plot(kind='bar', alpha=0.5,label='Overall')
df[str(MID_YEAR):]['HrMn'].value_counts().sort_index().plot(
    kind='bar', alpha=0.5, label='>= %s' %  MID_YEAR )

plt_configure(xlabel='Sampling Time', ylabel='Frequency', legend={'loc':'best'}, figsize=(8, 4), 
              title = 'Sampling Time Distribution, Overall and > %s ' %  MID_YEAR)

df['sample_time'] = df.HrMn % 100 
sample_time = df['2000':]['sample_time']
sample_times = sample_time.value_counts()[sample_time.value_counts() > 2000]
sample_times = sample_times.index.tolist()
# df = df.query("sample_time in @sample_times")
df = df.query("sample_time == @sample_times[0]")
df.drop(['sample_time'], 1,inplace=True)
print(sample_times)

df['HrMn'].value_counts().sort_index().plot(kind='bar', alpha=0.5, figsize=(10, 4))

[0]

<matplotlib.axes._subplots.AxesSubplot at 0xc184cf8>

1.4 Error Data handling and Adjustment¶

1.4.1 Artefacts¶

wrong direction record

if integer_data:
    display(df.query("(dir % 10 >= 0.1) & (dir != 999)"))
    df = df.query('(dir % 10 <= 0.1) | (dir == 999)')

sudden increase in speed

# sudden increse
df['incre'] = df.speed.diff(1)
df['incre'].fillna(0, inplace=True)
df['incre_reverse'] = df.speed.diff(-1)
df['incre_reverse'].fillna(0, inplace=True)

display(df.sort_values(by='speed',ascending=False).head(10))
df['incre'].plot(kind='hist', bins=arange(-15, 15), legend=True, figsize=(8, 3))

<matplotlib.axes._subplots.AxesSubplot at 0xd2203c8>

incre_threshold = 20 if knot_unit else 10
print('sudden increase number', len(df.query('(incre > @incre_threshold )&(incre_reverse > @incre_threshold )')))
df = df.query('(incre < @incre_threshold )|(incre_reverse < @incre_threshold )')

# Check the max speed
display(df.sort_values(by='speed',ascending=False).head(10))
df.drop(['incre', 'incre_reverse'], 1, inplace=True)

sudden increase number 0

1.4.2 Direction re-aligment¶

For some dataset, the 16 sectors are not record properly,

e.g. the sectors are [0,20,50 ...], need to redistribute the angle into 22.5, e.g. [0, 22.5, 45...]

display(df['dir'].value_counts().sort_index())
effective_column = df.query('dir < 999')['dir'].value_counts()[df['dir'].value_counts() > 30].sort_index()
if integer_data:
    SECTOR_LENGTH = 360/len(effective_column) 
else: 
    SECTOR_LENGTH = 10
print(len(effective_column), SECTOR_LENGTH)

0.00      4
0.01      1
0.02      1
0.03      1
0.05      3
0.06      1
0.07      4
0.08      1
0.09      1
0.10      1
0.11      3
0.12      1
0.13      2
0.14      1
0.15      1
0.16      1
0.17      2
0.19      1
0.20      2
0.21      5
0.22      1
0.23      1
0.24      1
0.25      6
0.26      4
0.27      2
0.28      1
0.29      3
0.31      2
0.32      3
         ..
359.66    1
359.69    3
359.70    1
359.71    3
359.72    1
359.73    1
359.74    3
359.75    2
359.76    2
359.77    1
359.78    1
359.79    3
359.80    2
359.81    4
359.82    3
359.84    4
359.85    3
359.86    4
359.87    2
359.88    4
359.89    1
359.90    3
359.91    3
359.92    1
359.93    1
359.94    2
359.95    3
359.96    3
359.97    5
359.98    1
Name: dir, dtype: int64

1 10

df=realign_direction(df, effective_column)

1.4.3 0 Speed¶

with_too_many_zero, null_wind_frequency = is_with_too_many_zero(df['2005':])
delete_zero = with_too_many_zero
if delete_zero:
    df = df.query('(speed > 0)')
print(delete_zero, null_wind_frequency)

False 0.0160740718664

print(df.query('dir == 999')['speed'].value_counts())
df=fill_direction_999(df, SECTOR_LENGTH)

Series([], Name: speed, dtype: int64)

1.5 Time Shift Comparison¶

DIR_REDISTRIBUTE = 'even'
if DIR_REDISTRIBUTE == 'even':
    DIR_BIN = arange(-5, 360, 10) 
elif DIR_REDISTRIBUTE == 'round_up':
    DIR_BIN = arange(0, 360+10, 10) 

# Comparison between mid_year, looking for: 
# 1. Odd Even Bias
# 2. Time Shift of Wind Speed Distribution
bins = arange(0, df.speed.max() + 1)
df[:str(MID_YEAR)]['speed'].plot(
    kind='hist', alpha=0.5,bins=bins, label='< %s' % MID_YEAR)

df[str(MID_YEAR+1):]['speed'].plot(
    kind='hist', alpha=0.5,bins=bins, label='> %s' % MID_YEAR)

plt.suptitle('Speed Comparison between year < %s, > %s ' % (MID_YEAR, MID_YEAR), fontsize = 14)
plt_configure(xlabel='Speed', ylabel='Frequency', legend=True, figsize=(8, 3))

df[:str(MID_YEAR)]['dir'].plot(
    kind='hist', alpha=0.5,bins=DIR_BIN, label='< %s' % MID_YEAR)

df[str(MID_YEAR+1):]['dir'].plot(
    kind='hist', alpha=0.5,bins=DIR_BIN, label='> %s' % MID_YEAR)

plt.suptitle('Dir Comparison between year < %s, and > %s ' % (MID_YEAR, MID_YEAR), fontsize = 14)
plt_configure(xlabel='Dir', ylabel='Frequency', legend={'loc':'best'}, figsize=(8, 3), tight='x')

display(df[df['dir'].isnull()])
df.dropna(subset=['dir'], inplace=True)

# Inspect the time shift of speed and degree distribution, and odd-even bias
check_time_shift(df, speed_unit_text=speed_unit_text, dir_unit_text=dir_unit_text)

2001 - 2005

D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))

2006 - 2010

2011 - 2015

df.resample('A').mean().plot(y='speed')
plt.gca().set_ylim(bottom=0)
df.resample('M').mean().plot(y='speed', figsize=(20,4))
plt.gca().set_ylim(bottom=0)

(0, 11.0)

%%time
for column in ['speed', 'dir']:
    if column == 'speed':
        bins = arange(0, df[column].max()+1, 1)
    else:
        bins = arange(0, 361, 10)
    den, _ = np.histogram(df[column], bins=bins, density=True)
    y_top=max(den)*1.2
    for year in arange(1980, 2016):
        end_year = year
        sub_df = df[str(year):str(end_year)]
        if len(sub_df) > 1000:
            plt.figure()
            df[column].hist(bins=bins, alpha=0.3, normed=True)
            sub_df[column].hist(bins=bins, alpha=0.5, figsize=(3,1.5), normed=True)
            plt.gca().set_ylim(top=y_top)
            plt_configure(title=str(year))
    align_figures()

Wall time: 10.3 s

for column in ['speed', 'dir']:
    if column == 'speed':
        bins = arange(0, df[column].max()+1, 1)
    else:
        bins = arange(0, 361, 10)
    density_all, _ = np.histogram(df[column], bins=bins, density=True)
    df[column].hist(bins=bins, figsize=(5,3))

    R_squares = []
    years = []
    for year in arange(1980, 2016):
        start_year, end_year = year-1, year+1
        sub_df = df[str(start_year):str(end_year)]
        if len(sub_df) > 1000:
            density, _ = np.histogram(sub_df[column], bins=bins, density=True)
            y_mean = np.mean(density_all)
            SS_tot = np.sum(np.power(density_all - y_mean, 2))
            SS_res = np.sum(np.power(density_all - density, 2))

            R_square = 1 - SS_res / SS_tot
            R_squares.append(R_square)
            years.append(year)

    plt.figure()
    plot(years, R_squares)
    ylim = max(min(plt.gca().get_ylim()[0],0.85),0)
    plt.gca().set_ylim(bottom=ylim, top=1)
    plt_configure(figsize=(5,3))
    align_figures()

1.6 Re-distribute Direction and Speed (Optional)¶

e.g. Dir 50 -> -45 ~ 55, to make KDE result better

if integer_data:
    df = randomize_angle(df, DIR_REDISTRIBUTE, SECTOR_LENGTH)

if integer_data:
    if delete_zero:
        redistribute_method = 'down'
    else:
        redistribute_method = 'up'

    df, speed_redistribution_info = randomize_speed(df, redistribute_method)

1.7 Generate (x,y) from (speed,dir)¶

# Cook orientation
# df['dir']= (df['dir'] + 180)%360

# There might be a small dot in the centre, which is due to too many zero (more than 1 speed) in center
# Scatter plot in matplot has performance issue, the speed is very slow
df['x'] = df['speed'] * cos(df['dir'] * pi / 180.0)
df['y'] = df['speed'] * sin(df['dir'] * pi / 180.0)

2. Re-select Data and Overview¶

2.1 Data Overview¶

## Summery of the data selection
print('Knot unit?', knot_unit)
print('Report type used:', report_type_most_used)
print('Sampling time used:', sample_times)
if 'speed_redistribution_info' in globals():
    print('Speed redistribution info:', speed_redistribution_info )

df_all_years = df # for later across-year comparison
df = df_all_years['2011':'2015']
# df = df.query('(HrMn == 0) and (speed >= 0.5) and (date%10000 > 900) and (date%10000 < 1000)' )
df.describe()

Knot unit? True
Report type used: default
Sampling time used: [0]

df.plot(y='speed',legend=True,figsize=(20,5))

<matplotlib.axes._subplots.AxesSubplot at 0x11f8c0b8>

# Accumulation by month
df.resample('M').count().plot(y='date', kind='bar',figsize=(20,4))

<matplotlib.axes._subplots.AxesSubplot at 0x16d92320>

# 90 degree is east
ax = WindroseAxes.from_ax()
viridis = plt.get_cmap('viridis')
ax.bar(df.dir_windrose, df.speed, normed=True, opening=0.8, edgecolor='white', nsector=36, cmap=viridis)
ax.set_legend()

D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

if len(df) > 1000000:
    bins=arange(0,362)
    df['dir'].hist(bins=bins, normed=True,alpha=0.5,label='min')
    
    df = df_all_years.sample(n=500000, replace=True)    
    df['dir'].hist(bins=bins, normed=True,alpha=0.5,label='min resmapled')
    plt_configure(legend=True, figsize=(20,4))

x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(df.speed)

# 1. Histogram comparison
fig = plt.figure()
df['speed'].hist(bins=arange(0, df.speed.max()), alpha=0.5, label='Data', normed=True)             
plot(x, y_weibull, '-', color='black',label='Weibull')   
plt_configure(figsize=(4,3),xlabel='V',ylabel='PDF', legend=True)

# 2. CDF comparison
fig = plt.figure()
plot(log(x), log(-log(1-y_ecdf)),'o', label='ECDF')
plot(log(x), log(-log(1-y_cdf_weibull)),'-', label='Weibull')
plt_configure(xlabel="ln(V)", ylabel="ln(-ln(1-P)",legend={'loc':'best'}, figsize=(4,3))
align_figures()

D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:11: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:12: RuntimeWarning: divide by zero encountered in log

df.plot(kind='scatter', x='x', y='y', alpha=0.05, s=2)
plt.gca().set_aspect('equal')
plt_configure(figsize=(3.2,3.2),xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)

2.2 Overview by Direction¶

if len(effective_column) == 16:
    rebinned_angle = 22.5
else: 
    rebinned_angle = 10

%%time
original_incre, incre = SECTOR_LENGTH, rebinned_angle
start, end = -original_incre/2 + incre/2, 360

max_speed = df.speed.max()
max_count = max_count_for_angles(df, start, end, incre)
plot_range = [0, max_speed, 0, max_count*1.05]

for angle in arange(start, end, incre):
    start_angle, end_angle = angle-incre/2, angle+incre/2
    sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)   
    
    fig = plt.figure()
    sub_df['speed'].hist(bins=arange(0, max_speed), alpha=0.5, label='Data')
    title ='%s (%s - %s), %s' % (angle, start_angle, end_angle, len(sub_df)) 
    plt.axis(plot_range)
    plt_configure(figsize=(3,1.5), title=title)
align_figures()

D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)

Wall time: 7.58 s

2.3 Overview by Month¶

%%time
current_df = df.query('speed>=1')
for month in arange(1, 13): 
    sub_df = current_df[current_df.index.month == month]
    ax = WindroseAxes.from_ax()
    ax.bar(sub_df.dir_windrose, sub_df.speed, normed=True, opening=0.8, edgecolor='white', nsector=36, cmap=plt.get_cmap('viridis'))
    plt_configure(figsize=(3,3), title='Month: %s'%(month))
align_figures()

D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

Wall time: 18.6 s

df.describe()

3. Create input data and configuration¶

SPEED_SET = array(list(zip(df.x, df.y)))
if 'NUMBER_OF_GAUSSIAN' not in globals():
    NUMBER_OF_GAUSSIAN = 3
FIT_METHOD = 'square_error'
DEFAULT_BANDWDITH = 1.5 if knot_unit else 0.7
fig_list = []

fit_limit = ceil(df['speed'].quantile(.95))
fitting_axis_range = arange(-fit_limit, fit_limit+1, 1)
print(fitting_axis_range)

FITTING_RANGE = []
for i in fitting_axis_range:
    for j in fitting_axis_range:
        FITTING_RANGE.append([i,j])

[-16 -15 -14 -13 -12 -11 -10  -9  -8  -7  -6  -5  -4  -3  -2  -1   0   1
   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16]

plot_limit = ceil(df['speed'].quantile(.95))
PLOT_AXIS_RANGE = arange(-plot_limit, plot_limit+1, 1)

4. Kernel Density Estimation¶

sample = SPEED_SET
KDE_KERNEL = 'gaussian'
# KDE_KERNEL, bandwidth = 'tophat', 1

%%time
if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWDITH
    from sklearn.grid_search import GridSearchCV
    # from sklearn.model_selection import GridSearchCV  ## too slow

    # The bandwidth value sometimes would be too radical
    if knot_unit:
        bandwidth_range = arange(0.7,2,0.2)
    else:
        bandwidth_range = arange(0.4,1,0.1)

    # Grid search is unable to deal with too many data (a long time is needed)
    if len(sample) > 50000:    
        df_resample=df.sample(n=50000, replace=True)
        bandwidth_search_sample = array(list(zip(df_resample.x, df_resample.y)))
    else:
        bandwidth_search_sample = sample

    grid = GridSearchCV(neighbors.KernelDensity(kernel = KDE_KERNEL),
                    {'bandwidth': bandwidth_range}, n_jobs=-1, cv=4) 

    grid.fit(bandwidth_search_sample)
    bandwidth = grid.best_params_['bandwidth']
    
print(bandwidth)

1.1
Wall time: 0 ns

if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWDITH

kde = neighbors.KernelDensity(bandwidth=bandwidth, kernel = KDE_KERNEL).fit(sample)

points = FITTING_RANGE
# very slow if the dataset is too large, e.g. 100,000
# kde returns log prob, need to convert it
kde_result = exp(kde.score_samples(points))
print('bandwidth:', bandwidth, len(kde_result))
print(kde_result[:5])

bandwidth: 1.1 1089
[  2.68203979e-07   1.10250923e-06   2.73466253e-06   4.13605396e-06
   6.86072023e-06]

# Plot jPDF
X = Y = PLOT_AXIS_RANGE
# Can't work if pass as generate_Z_from_X_Y(X,Y, exp(kde.score_samples())), need to use lambda
# see http://stackoverflow.com/questions/21035437/passing-a-function-as-an-argument-in-python
kde_Z = generate_Z_from_X_Y(X,Y, lambda coords: exp(kde.score_samples(coords)))
colorbar_lim = 0, kde_Z.max()

plot_3d_prob_density(X,Y,kde_Z)

fig_kde,ax1 = plt.subplots(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, ax=ax1)

with sns.axes_style({'axes.grid' : False}):
    from matplotlib import ticker
    fig_hist,ax2 = plt.subplots(figsize=(3.5,2.5))
    _,_,_,image = ax2.hist2d(df.x, df.y, bins=PLOT_AXIS_RANGE, cmap='viridis',)
    ax2.set_aspect('equal')
    cb = plt.colorbar(image)
    tick_locator = ticker.MaxNLocator(nbins=6)
    cb.locator = tick_locator
    cb.update_ticks()
    plt_configure(ax=ax2, xlabel='x'+speed_unit_text,ylabel='y'+speed_unit_text)
align_figures()

4.1 Bootstrap GOF limit¶

kde_cdf = cdf_from_pdf(kde_result)
config = {'bandwidth': bandwidth, 
          'fitting_range': FITTING_RANGE,
          'fit_limit': fit_limit,
          'kde_kernel': KDE_KERNEL}

%%time
gof_kde=Parallel(n_jobs=-1)(delayed(resampled_kde)(df, kde_result, config) 
                                       for i in arange(20))

Wall time: 17.9 s

for gof_name in [ 'R_square', 'K_S','Chi_square']:
    plt.figure(figsize=(4,3))
    pd.DataFrame(gof_kde)[gof_name].hist()
    plt_configure(title=gof_name)
align_figures()

4.2 Bivariate Empirical Limit¶

%%time
gofs_mean_set_bivar = []
fig1, ax1 = plt.subplots(figsize=(4,3))
fig2, ax2 = plt.subplots(figsize=(4,3))

for year_length in [5, 10]:
    start_year, end_year = df_all_years.index.year[0], 2015-year_length+1
    df_standard = df_all_years[str(2015-year_length+1):'2015']
    speed_ = array(list(zip(df_standard.x, df_standard.y)))
    kde_current = neighbors.KernelDensity(bandwidth=bandwidth, kernel=KDE_KERNEL).fit(speed_)
    kde_result_standard = exp(kde_current.score_samples(points))
    gofs_bivar=Parallel(n_jobs=-1)(delayed(kde_gofs)(df_all_years, start_year, start_year+year_length-1, kde_result_standard, config) 
                                   for start_year in arange(start_year, end_year+1)) 
    gofs_bivar=pd.DataFrame(gofs_bivar, index=arange(start_year, end_year+1))

    gofs_bivar.plot(y='R_square', ax=ax1, label=year_length)
    gofs_bivar.plot(y='K_S', ax=ax2, label=year_length)
    year_lim = end_year-year_length-10, end_year-year_length
    gofs_mean = gofs_bivar.query('index >= @year_lim[0] & index <= @year_lim[1]').mean().to_dict()
    gofs_mean['year_lim']=year_lim
    gofs_mean_set_bivar.append(gofs_mean)
    
align_figures()
display(pd.DataFrame(gofs_mean_set_bivar).set_index('year_lim'))

Wall time: 35.2 s

4.3 Univariate GOF Limit¶

def yearly_gof(df_all_years, start_year, end_year, density, y_ecdf, x, density_dir):
    df_previous = df_all_years[str(start_year):str(end_year)]
    # 1. Speed
    density_expected, _ = np.histogram(df_previous['speed'], bins=x, density=True)
    r_square = sector_r_square(density, density_expected)
    
    y_ecdf_previous = sm.distributions.ECDF(df_previous['speed'])(x)
    k_s = max(np.abs(y_ecdf - y_ecdf_previous))
    
    # 2. Direction
    density_dir_expected, _ = dir_hist(df_previous['dir'], bins=arange(-5,370,10), density=True)
    r_square_dir = sector_r_square(density_dir, density_dir_expected)
    return {'year': start_year, 'r_square': r_square, 'k_s': k_s, 'r_square_dir': r_square_dir}

%%time
x = arange(0, df.speed.max() + 1)
fig = plt.figure(figsize=(9,2.5))
ax1 = fig.add_subplot(1,3,1)
ax2 = fig.add_subplot(1,3,2)
ax3 = fig.add_subplot(1,3,3)
gofs_mean_set = []

for year_length in [5, 7, 10]:
    start_year, end_year =df_all_years.index.year[0], 2015-year_length+1
    df_standard = df_all_years[str(2015-year_length+1):str(2015)]
    density, _ = np.histogram(df_standard['speed'], bins=x, density=True)
    density_dir, _ = dir_hist(df_standard['dir'], bins=arange(-5,370,10), density=True)
    y_ecdf = sm.distributions.ECDF(df_standard.speed)(x)
    gofs = [yearly_gof(df_all_years, start_year, start_year+year_length-1, density, y_ecdf, x, density_dir) 
            for start_year in arange(start_year, end_year+1)]

    gofs = pd.DataFrame(gofs)
    gofs.set_index(['year'], inplace=True)    
    if len(gofs)>0:
        ax1.plot(gofs.r_square, label=year_length)
        ax2.plot(gofs.k_s, label=year_length)
        ax3.plot(gofs.r_square_dir, label=year_length)
    year_lim = end_year-year_length-10, end_year-year_length
    gofs_mean = gofs.query('index >= @year_lim[0] & index <= @year_lim[1]').mean().to_dict()
    gofs_mean['year_lim']=year_lim
    gofs_mean_set.append(gofs_mean)
plt.tight_layout()
plt.legend()
for ax in [ax1, ax2, ax3]:
    plt_configure(ax=ax, tight='xtight')
    
display(pd.DataFrame(gofs_mean_set).set_index('year_lim'))

Wall time: 695 ms

5. GMM by Expectation-maximization¶

sample=SPEED_SET
clf = mixture.GaussianMixture(n_components=NUMBER_OF_GAUSSIAN, covariance_type='full')
clf.fit(sample)
print(clf.converged_)

True

gmm_em_result = read_gmm_em_result(clf)
pretty_print_gmm(gmm_em_result)

fig,ax = plt.subplots(figsize=(3.5,3.5))
plot_gmm_ellipses(gmm_em_result, ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)

GMM Plot Result
0.312630014029 [[ 4.51290978 -5.21290091]] [ 3.87450629  4.76529591] 144.964878149
0.291739241464 [[-1.89970307 -1.77517239]] [ 3.17428683  3.78499925] 142.306069023
0.203350969155 [[-8.3090732   5.07688153]] [ 4.25007981  5.17578797] 137.753201485
0.192279775351 [[ 0.60786534  5.1489511 ]] [ 4.17406949  4.97784268] -112.622861223

X = Y = PLOT_AXIS_RANGE
pdf_Z = generate_Z_from_X_Y(X,Y, lambda coords: exp(clf.score_samples(coords)))

def residule_between_kde_and_gmm(points):
    kde_vals = exp(kde.score_samples(points))
    gmm_vals = exp(clf.score_samples(points))
    return kde_vals - gmm_vals 

residual_Z = generate_Z_from_X_Y(X,Y, residule_between_kde_and_gmm)

plot_3d_prob_density(X,Y,pdf_Z)
plot_3d_prob_density(X,Y,residual_Z)
align_figures()

fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar_lim=colorbar_lim)
fig_em = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,pdf_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar_lim=colorbar_lim)
fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,residual_Z,
                     xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
align_figures()

Goodness-of-fit Statistics¶

points = FITTING_RANGE
gmm_pdf_result = exp(clf.score_samples(points))
gof_df(gmm_pdf_result, kde_result)

gmm_em = group_gmm_param_from_gmm_param_array(gmm_em_result, sort_group = True)
mixed_model_pdf_em = generate_gmm_pdf_from_grouped_gmm_param(gmm_em)

6. GMM by Optimization¶

sample = SPEED_SET
points = FITTING_RANGE
max_speed = df.speed.max()
print(FIT_METHOD)

square_error

# from GMM,EM 
# GMM format: weight, meanx, meany, sigx, sigy, rho
x0 = gmm_em_result

cons = [
        # sum of every 6th element, which is the fraction of each gaussian
        {'type': 'eq', 'fun': lambda x: sum(x[::6]) - 1},
        # # limit the width/height ratio of elliplse, optional
#         {'type': 'ineq', 'fun': lambda x: width_height_ratios_set(x) - 1/3},
#         {'type': 'ineq', 'fun': lambda x: 3 - width_height_ratios_set(x)},
]

bonds = [(0., 0.99),(-fit_limit, fit_limit),(-fit_limit, fit_limit),
         (0., fit_limit),(0., fit_limit),(-0.99, 0.99)]*(len(x0)//6)

result = sp.optimize.minimize(
    lambda x0: GMM_fit_score(x0, kde_result, points, FIT_METHOD),
    x0,
    bounds = bonds,
    constraints=cons,
    tol = 0.000000000001,
    options = {"maxiter": 500})
result

     fun: -18.145800460354977
     jac: array([  3.27015877e-01,   0.00000000e+00,   2.38418579e-07,
         0.00000000e+00,   0.00000000e+00,   2.38418579e-07,
         3.27017546e-01,   0.00000000e+00,  -2.38418579e-07,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         3.27015638e-01,  -2.38418579e-07,   2.38418579e-07,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         3.27015400e-01,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,  -2.38418579e-07,
         0.00000000e+00])
 message: 'Optimization terminated successfully.'
    nfev: 3197
     nit: 122
    njev: 122
  status: 0
 success: True
       x: array([ 0.05269188,  3.59385182, -3.49138001,  1.70260288,  1.65883103,
       -0.23526118,  0.08708669, -1.76610211, -2.31547049,  2.11897388,
        2.38881994,  0.31322415,  0.13836439,  4.04900683, -7.1953246 ,
        3.67914482,  3.68451123,  0.22193421,  0.72185704, -1.98716665,
        2.08182959,  6.66157766,  6.05984777, -0.2308473 ])

6.1 GMM Result¶

gmm = group_gmm_param_from_gmm_param_array(result.x, sort_group = True)
mixed_model_pdf = generate_gmm_pdf_from_grouped_gmm_param(gmm)
gmm_pdf_result = mixed_model_pdf(points)
pretty_print_gmm(gmm)

fig_gmm, ax = plt.subplots(figsize=(3.5,3.5))
plot_gmm_ellipses(gmm, ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)

GMM Plot Result
0.721857044771 [[-1.98716665  2.08182959]] [ 5.52040997  7.11501551] -123.835580363
0.138364387248 [[ 4.04900683 -7.1953246 ]] [ 3.24765962  4.06994307] 135.188140841
0.0870866860649 [[-1.76610211 -2.31547049]] [ 1.8439385   2.60699095] 145.493567172
0.0526918819165 [[ 3.59385182 -3.49138001]] [ 1.46859554  1.86917204] -131.840989651

6.2 Bivariate Goodness-of-fit statistics¶

gof_df(gmm_pdf_result, kde_result)

pd.DataFrame(gofs_mean_set_bivar).set_index('year_lim')

X = Y = PLOT_AXIS_RANGE
pdf_Z = generate_Z_from_X_Y(X,Y, mixed_model_pdf)# passing a function as an argument

def residule_between_kde_and_gmm(points):
    kde_vals = exp(kde.score_samples(points))
    gmm_vals = mixed_model_pdf(points)
    return kde_vals - gmm_vals 

residual_Z = generate_Z_from_X_Y(X,Y, residule_between_kde_and_gmm)

plot_3d_prob_density(X,Y,pdf_Z)
plot_3d_prob_density(X,Y,residual_Z)
align_figures()

fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
fig_gmm = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,pdf_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,residual_Z,  xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
align_figures()

6.3 Univariate Goodness-of-fit¶

def f(V,theta):
    return (mixed_model_pdf([[V*cos(theta),V*sin(theta)]]))*V

def f_em(V,theta):
    return (mixed_model_pdf_em([[V*cos(theta),V*sin(theta)]]))*V

%%time
x = arange(0, max_speed, 0.5)
_, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(df.speed, x=x)

Wall time: 16.7 s

%%time
# Calculate Speed Distribution
# 1. GMM Model
y_ =[integrate.nquad(f, [[x_-0.01, x_+0.01],[0, 2*pi]]) for x_ in x]
y_gmm = array(list(zip(*y_))[0])/0.02

# 2. Weibull
y_weibul = sp.stats.weibull_min.pdf(x, *weibull_params)

# 3. Plot Comparison
df['speed'].hist(bins=arange(0, df.speed.max()), alpha=0.5, label='Data')
plot(x, y_gmm*len(df.speed),'-', color='black', label='GMM')
plot(x, y_weibul*len(df.speed), '--', color='black', label='Weibull') 
print('Speed Distribution Comparison')
plt_configure(xlabel='Speed'+speed_unit_text,
              ylabel='Frequency',legend=True, figsize=(4, 2))
plt.gca().set_ylim(bottom = 0)
plt.tight_layout()
plt.locator_params(axis='y', nbins=5)

# 4. R square for GMM, Weibull
print(R_square_for_speed(df['speed'], f, weibull_params, f_em))

Speed Distribution Comparison
(0.99844193657154823, 0.99788392167246953, 0.99548459276050827)
Wall time: 18.3 s

%%time
y_ = [integrate.nquad(f, [[0, x_val],[0, 2*pi]]) for x_val in x]
y_cdf_gmm = array(list(zip(*y_))[0])

# 5.2. CDF Comaprison
plot(x, y_ecdf,'o', alpha=0.8, label='Data')
plot(x, y_cdf_gmm,'-', color='black',label='GMM')
plot(x, y_cdf_weibull,'--', color='black',label='Weibull')
plt_configure(xlabel = "V", ylabel='P', legend=True, figsize=(4,3))

plt.figure()
plot(log(x), log(-log(1-y_ecdf)),'o', label = 'Empirical')
plot(log(x), log(-log(1-y_cdf_weibull)),'--', label = 'Weibull')
plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black', label = 'GMM')
plt_configure(xlabel='ln(V)',ylabel='ln(-ln(1-P))',legend={'loc':'best'}, figsize=(4,3))
align_figures()

cdf_diff, cdf_diff_weibull= np.abs(y_ecdf - y_cdf_gmm), np.abs(y_ecdf - y_cdf_weibull)
print(cdf_diff.max(), cdf_diff_weibull.max()) 
print(x[cdf_diff.argmax()], x[cdf_diff_weibull.argmax()])

D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:11: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:12: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:13: RuntimeWarning: divide by zero encountered in log

0.0121000439786 0.00798759423705
7.0 6.0
Wall time: 50.5 s

# Calculate Angle Distribution
x = linspace(0,2*pi, num=36+1)
y_ =[integrate.nquad(f, [[0, inf],[x_-pi/36, x_+pi/36]]) for x_ in x]
y = array(list(zip(*y_))[0])
density, _ = dir_hist(df['dir'], bins=arange(-5, 370, 10), density=True)

plt.bar(arange(0, 360, 10), density*10*len(df['dir']), width=10, alpha=0.5, label='Data')
plot(x/pi*180, y*len(df['dir']) ,'-', color='black', label='GMM')
plt_configure(xlabel='Direction'+dir_unit_text, ylabel='Frequency', 
              legend={'loc': 'best'} ,tight='xtight',figsize = (4,2))
plt.tight_layout()
dir_fig = plt.gcf()
print('Direction Distribution Comparison')
sector_r_square(density*10, y[:-1])

Direction Distribution Comparison

0.86567646508011631

pd.DataFrame(gofs_mean_set).set_index('year_lim')

6.4 Sectoral Comaprison¶

%%time
incre = max(SECTOR_LENGTH, 10)
density_collection=Parallel(n_jobs=-1)(delayed(direction_compare)(gmm, df, angle, incre) 
                                        for angle in arange(0, 360, incre))  
# This R square is computed as in paper 
# Comparison of bivariate distribution constructionapproaches for analysing wind speed anddirection data
# http://onlinelibrary.wiley.com/doi/10.1002/we.400/full
print(true_R_square(density_collection))

0.918080688176
Wall time: 15.9 s

# %%time
# curve_collection=Parallel(n_jobs=-1)(delayed(direction_compare2)
#                                      (gmm, df, angle, incre, complex=True) for angle in arange(start, end, incre))

# Calculate Speed Distribution
def model_data_comparison(df, original_incre = 10, incre = 10):
    start, end = -original_incre/2 + incre/2, 360
    curve_collection = []
    max_speed = df.speed.max()
    
    # Find a max count for plotting histogram
    max_count = max_count_for_angles(df, start, end, incre)
    plot_range = [0, max_speed, 0, max_count*1.05]
    
    for angle in arange(start, end, incre):
        angle_radian, incre_radian = np.radians([angle, incre])  
        start_angle, end_angle = angle-incre/2, angle+incre/2
        
        # 0. Select data from observation
        sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)
        data_size = len(sub_df.speed)
        # 1. Get Weibull and ECDF
        x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(sub_df.speed)
        # 2. Get GMM PDF, CDF
        _, y_gmm, y_cdf_gmm, direction_prob = gmm_integration_in_direction(f, angle_radian-incre_radian/2, angle_radian+incre_radian/2, x)
        
        # 3. R square for GMM, Weibull
        bins = arange(0, sub_df.speed.max()+1)
        density, _ = np.histogram(sub_df['speed'], bins=bins, normed=True)
        density_expected_gmm_ =[integrate.nquad(f, [[x_, x_+1],[angle_radian-incre_radian/2, angle_radian+incre_radian/2]]) 
                            for x_ in bins[:-1]]
        density_expected_gmm = array(list(zip(*density_expected_gmm_ ))[0])/direction_prob
        R_square_gmm = sector_r_square(density, density_expected_gmm)
        
        density_expected_weibull = sp.stats.weibull_min.cdf(bins[1:], *weibull_params) - sp.stats.weibull_min.cdf(bins[:-1], *weibull_params) 
        R_square_weibull = sector_r_square(density, density_expected_weibull)

        # 4. K-S for GMM, Weibull
        cdf_diff, cdf_diff_weibull= np.abs(y_ecdf - y_cdf_gmm), np.abs(y_ecdf - y_cdf_weibull)
                
        # 5. Make Plots
        fig = plt.figure(figsize=(10,1.9))
        # 5.1. Frequency Comparison
        ax1 = fig.add_subplot(1,3,1)        
        sub_df['speed'].hist(bins=arange(0, sub_max_speed), alpha=0.5, label='Data')                  
        plot(x, y_gmm*data_size,'-', color='black', label='GMM')
        plot(x, y_weibull*data_size, '--', color='black',label='Weibull')   
        plt_configure(xlabel = "V", ylabel='Frequency', legend=True)
        plt.axis(plot_range)
        
        # 5.2. CDF Comaprison
        ax2 = fig.add_subplot(1,3,2)
        plot(x, y_ecdf,'o', alpha=0.8, label='Data')
        plot(x, y_cdf_gmm,'-', color='black',label='GMM')
        plot(x, y_cdf_weibull,'--', color='black',label='Weibull')
        plt.gca().set_xlim(right = max_speed)
        plt_configure(xlabel = "V", ylabel='P', legend=True)
        
        curves = {'direction': angle, 'datasize': data_size, 'weight': direction_prob, 'x': x, 
                  'gmm_pdf': y_gmm, 'gmm_cdf': y_cdf_gmm,
                  'weibull_pdf': y_weibull, 'weibull_cdf': y_cdf_weibull, 'ecdf': y_ecdf,
                  'max_cdf_diff_gmm': cdf_diff.max(), 'max_cdf_diff_weibull': cdf_diff_weibull.max(), 
                  'r_square_gmm': R_square_gmm, 'r_square_weibull': R_square_weibull}
        curve_collection.append(curves)
        
        plt.tight_layout()
        plt.show()
        print('%s (%s - %s) degree' % (angle, start_angle, end_angle))
        print('data size:', len(sub_df), 'weight', len(sub_df)/len(df))
        print('GMM', 'Weibull')
        print('R square', R_square_gmm,  R_square_weibull)
        print('max diff:', cdf_diff.max(), cdf_diff_weibull.max(), 
              'speed value:', x[cdf_diff.argmax()], x[cdf_diff_weibull.argmax()], 'y gmm', y_cdf_gmm[cdf_diff.argmax()])
        print(' ')
    return curve_collection

%%time
if len(effective_column) == 16:
    rebinned_angle = 22.5
else: 
    rebinned_angle = 20
    
curve_collection = model_data_comparison(df, SECTOR_LENGTH, rebinned_angle)

5.0 (-5.0 - 15.0) degree
data size: 1341 weight 0.031524754337298413
GMM Weibull
R square 0.907998138223 0.915033038554
max diff: 0.0678767787835 0.0442676968056 speed value: 5.27578947368 5.27578947368 y gmm 0.360941655741

25.0 (15.0 - 35.0) degree
data size: 1517 weight 0.03566223141661573
GMM Weibull
R square 0.892774121723 0.97610117746
max diff: 0.0985603935574 0.0411808158201 speed value: 4.70947368421 7.06421052632 y gmm 0.319390980242

45.0 (35.0 - 55.0) degree
data size: 991 weight 0.023296816963656024
GMM Weibull
R square 0.944329003752 0.960219819659
max diff: 0.0498829566765 0.0244745173611 speed value: 5.21578947368 7.30210526316 y gmm 0.36787688187

65.0 (55.0 - 75.0) degree
data size: 1525 weight 0.0358502985565847
GMM Weibull
R square 0.960116305982 0.968608050447
max diff: 0.0453897122432 0.0185429393915 speed value: 6.49894736842 2.78526315789 y gmm 0.471331599232

85.0 (75.0 - 95.0) degree
data size: 2307 weight 0.054233861488551416
GMM Weibull
R square 0.972113091836 0.990250792721
max diff: 0.0418004259792 0.024396523695 speed value: 11.2684210526 3.38052631579 y gmm 0.803019686721

105.0 (95.0 - 115.0) degree
data size: 2310 weight 0.054304386666039775
GMM Weibull
R square 0.947433165716 0.944150551899
max diff: 0.0588969935647 0.0727713700075 speed value: 12.8294736842 5.83157894737 y gmm 0.817726383059

125.0 (115.0 - 135.0) degree
data size: 2926 weight 0.06878555644365038
GMM Weibull
R square 0.961836454973 0.961571635445
max diff: 0.048108511341 0.0497299943378 speed value: 12.1642105263 5.40631578947 y gmm 0.709581167401

145.0 (135.0 - 155.0) degree
data size: 3859 weight 0.09071888664253139
GMM Weibull
R square 0.969430169049 0.973812795512
max diff: 0.0394004219453 0.0489905483637 speed value: 9.79631578947 5.59789473684 y gmm 0.519552674712

165.0 (155.0 - 175.0) degree
data size: 2859 weight 0.06721049414641027
GMM Weibull
R square 0.979182874152 0.967522281159
max diff: 0.0174706731806 0.0514497210677 speed value: 18.4463157895 5.67578947368 y gmm 0.954547514997

185.0 (175.0 - 195.0) degree
data size: 2027 weight 0.0476515115896375
GMM Weibull
R square 0.964304452651 0.977100528177
max diff: 0.0377779501722 0.0291901908249 speed value: 13.8426315789 11.3257894737 y gmm 0.891749336457

diff_df = pd.DataFrame(curve_collection) 

gmm_mean, weibull_mean = plot_sectoral_comparison(diff_df.r_square_gmm, diff_df.r_square_weibull, 
                                                  diff_df.direction, diff_df.datasize)
plt_configure(ylabel="$\ R^2$", xlabel='Direction'+dir_unit_text)
ylim = min(plt.gca().get_ylim()[0],0.75)
plt.gca().set_ylim(top=1, bottom=ylim)
plt.tight_layout()
print(gmm_mean, weibull_mean)

0.9565375311352659 0.9700882214387582

gmm_mean, weibull_mean = plot_sectoral_comparison(diff_df.max_cdf_diff_gmm, diff_df.max_cdf_diff_weibull, 
                                                  diff_df.direction, diff_df.datasize)
plt_configure(ylabel="K-S", xlabel='Direction'+dir_unit_text)
ylim = max(plt.gca().get_ylim()[1],0.25)
plt.gca().set_ylim(top=ylim, bottom=0)
plt.tight_layout()
print(gmm_mean, weibull_mean)

0.049870055725556466 0.03838967506972987

# Compare direction weight with previous figure
display(dir_fig)

6.5 Insufficient-fit Sector Investigation¶

(1) Data Variability, by Bootstrap (Resampling)¶

angle =  max_diff_angle = diff_df.ix[diff_df['max_cdf_diff_gmm'].idxmax()]['direction']
incre = rebinned_angle

FRACTION = 1

# Select data from observation
start_angle, end_angle = angle-incre/2, angle+incre/2
angle_radian, incre_radian = radians(angle), radians(incre)  
sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)

x = arange(0, sub_max_speed, 0.5)
_, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(sub_df.speed, x)
_, y_gmm, y_cdf_gmm, direction_prob = gmm_integration_in_direction(f, angle_radian-incre_radian/2, angle_radian+incre_radian/2, x)

fig = plt.figure(figsize=(10,1.9))
ax1 = fig.add_subplot(1,3,1)   
ax2 = fig.add_subplot(1,3,2)   
ax3 = fig.add_subplot(1,3,3)   

# 1. Data
bins=arange(0, sub_max_speed)
sub_df['speed'].hist(ax=ax1, bins=bins, alpha=0.5, label='Data', normed=True)  

# 2. GMM
ax1.plot(x, y_gmm,'-', color='black', label='GMM')
ax2.plot(x, y_cdf_gmm,'-', color = 'black', label='GMM')
ax3.plot(log(x), log(-log(1-y_cdf_gmm)),'-', color = 'black',label='GMM')

# 3. Weilbull 
ax1.plot(x, y_weibull,'--',color='black',label='Weibull')
ax2.plot(x, y_cdf_weibull,'--',label='Weibull')
ax3.plot(log(x), log(-log(1-y_cdf_weibull)),'--',label='Weibull')

# 4. Data Resampled
count_collection = []
for i in range(1,100):
    sub_df_resampled = sub_df.sample(frac=FRACTION, replace=True)    
    resampled_count, _ = np.histogram(sub_df_resampled['speed'], bins=bins, normed=True) 
    count_collection.append(resampled_count)
    
    ecdf = sm.distributions.ECDF(sub_df_resampled.speed)
    y_ecdf = ecdf(x) 
    ax2.plot(x, y_ecdf,':', label='Data Resampled')
    ax3.plot(log(x), log(-log(1-y_ecdf)),':', label='Data Resampled')
    if i == 1: 
#         plt_configure(ax=ax2, xlabel = "$V$", ylabel='$P$', legend={'loc':'best'})
#         plt_configure(ax=ax3, xlabel="ln($V$)", ylabel="ln(-ln(1-$P$)",legend={'loc':'best'})
        plt_configure(ax=ax2, xlabel = "V", ylabel='P', legend={'loc':'best'})
        plt_configure(ax=ax3, xlabel="ln(V)", ylabel="ln(-ln(1-P)",legend={'loc':'best'})

print('%s (%s - %s) Degree Speed Distribution' % (angle, start_angle, end_angle))
count_collection = np.array(count_collection)
mx, mn = np.max(count_collection,0), np.min(count_collection,0)
ax1.plot(bins[1:]-0.5, mx, ':', color='blue')
ax1.plot(bins[1:]-0.5, mn, ':', color='blue', label='Resample limit')
ax1.set_ylim(bottom = 0)
# plt_configure(ax=ax1, xlabel='$V$',ylabel='Frequency',legend={'loc':'best'})
plt_configure(ax=ax1, xlabel='V', ylabel='Frequency',legend={'loc':'best'})
ax1.locator_params(axis='y', nbins=5)
ax2.locator_params(axis='y', nbins=5)
ax3.locator_params(axis='y', nbins=5)
plt.tight_layout()
diff = abs(y_ecdf - y_cdf_gmm)
print(diff.max(), x[diff.argmax()], y_cdf_gmm[diff.argmax()])

D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:17: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:22: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:34: RuntimeWarning: divide by zero encountered in log

25.0 (15.0 - 35.0) Degree Speed Distribution
0.104601718684 6.0 0.451338699567

(2) Time Variability¶

fig_time_variability_3d = plt.figure()
ax1 = fig_time_variability_3d.gca(projection='3d')

fig_time_variability_cdf,ax2 = plt.subplots(figsize=(3,1.8))
fig_time_variability_weibull, ax3 = plt.subplots(figsize=(3,1.8))

ax2.plot(x, y_cdf_gmm,'-', color='black', label = 'GMM')
ax2.plot(x, y_cdf_weibull,'--', label='Weibull')

ax3.plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black',label='GMM')
ax3.plot(log(x), log(-log(1-y_cdf_weibull)), '--', label='Weibull')

# 3. Data
prop_cycle=iter(mpl.rcParams['axes.color_cycle'])
for start_time in range(2001, 2015, 5):
    end_time = start_time + 4 
    df_other_years = df_all_years[str(start_time):str(end_time)]
    df_other_years_at_angle, sub_max_speed_other_year = select_df_by_angle(df_other_years, start_angle, end_angle)
    if len(df_other_years_at_angle) > 0 :
        
        ecdf = sm.distributions.ECDF(df_other_years_at_angle.speed)
        y_ecdf = ecdf(x)
        ax2.plot(x, y_ecdf,':', label = start_time)
        ax3.plot(log(x), log(-log(1-y_ecdf)),':', label = start_time)
        
        count, division = np.histogram(df_other_years_at_angle['speed'], normed=True,
                                       bins=arange(0, sub_max_speed_other_year))
        ax1.bar(left=division[:-1], height=count, zs=start_time, zdir='x', 
                color=next(prop_cycle), alpha=0.8)
        x_3d = start_time*np.ones_like(x)
        ax1.plot(x_3d, x, y_gmm, '-', color='black', label='GMM'  if start_time == 2011 else '')
        ax1.plot(x_3d, x, y_weibull, '--', color='blue', label='Weibull' if start_time == 2011 else '')
        
print('%s (%s - %s) Degree Speed Distribution' % (angle, start_angle, end_angle))
ax1.set_ylim(bottom = 0)
ax1.set_zlabel('Frequency')
plt_configure(ax=ax1, xlabel='Time',ylabel='V', legend=True)
plt_configure(ax=ax2, xlabel = "V", ylabel='P', legend={'loc':'best'})
plt_configure(ax=ax3, xlabel="ln(V)", ylabel="ln(-ln(1-P)", legend={'loc':'best'})

ax1.set_zlim(bottom = 0)
align_figures()

D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:10: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:11: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:24: RuntimeWarning: divide by zero encountered in log

25.0 (15.0 - 35.0) Degree Speed Distribution

(3) Adjacent Sector Variability¶

incre = rebinned_angle
angle_group = [max_diff_angle-incre, max_diff_angle, max_diff_angle+incre]

fig_adjecent_variability_3d = plt.figure()
ax1 = fig_adjecent_variability_3d.gca(projection='3d')
fig_adjecent_variability_cdf, ax2 = plt.subplots(figsize=(3,1.8))
fig_adjecent_variability_weibull, ax3 = plt.subplots(figsize=(3,1.8))

legend_3d = False
prop_cycle=iter(mpl.rcParams['axes.color_cycle'])

curve_df = pd.DataFrame(curve_collection)

for angle in angle_group:
    curves = curve_df.query('direction == @angle%360').T.to_dict()
    curves = curves[list(curves)[0]]
    data_size, x =  curves['datasize'], curves['x']
    y_gmm, y_cdf_gmm =  curves['gmm_pdf'], curves['gmm_cdf'] 
    y_weibull, y_cdf_weibull, y_cdf = curves['weibull_pdf'],  curves['weibull_cdf'], curves['ecdf']

    linestyle = '-' if angle == max_diff_angle else ':'
    alpha = 0.7 if angle == max_diff_angle else 0.3

    ax2.plot(x, y_gmm*data_size, linestyle, label=angle)        
    ax3.plot(x, y_weibull*data_size, linestyle, label=angle)

    start_angle, end_angle = angle-incre/2, angle+incre/2
    sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)

    x_3d = angle*np.ones_like(x)
    ax1.plot(x_3d, x, y_gmm*data_size, color='black', label='GMM')
    ax1.plot(x_3d, x, y_weibull*data_size, color='blue', linestyle='--',label='Weibull')

    count, division = np.histogram(sub_df['speed'], bins=arange(0, sub_max_speed))
    ax1.bar(left=division[:-1], height=count, zs=angle, zdir='x', color=next(prop_cycle), alpha=0.8)

    if legend_3d == False:
        ax1.legend()
        legend_3d = True
        
plt_configure(ax=ax1, xlabel='Direction', ylabel='Speed')   
plt_configure(ax=ax2, xlabel='V',ylabel='Frequency',legend={'loc':'best'})
plt_configure(ax=ax3, xlabel='V',ylabel='Frequency',legend={'loc':'best'})
ax1.set_zlabel('Frequency')
ax1.set_zlim(bottom = 0)
ylim = max(ax1.get_ylim()[1],ax3.get_ylim()[1])
ax2.set_ylim(bottom = 0, top=ylim)
ax3.set_ylim(bottom = 0, top=ylim)

print(max_diff_angle) 
print('GMM, Weibull, Histogram')
align_figures()

D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))

25.0
GMM, Weibull, Histogram

7. Result Variability & Cross-Validation¶

if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWDITH    
if 'FIT_METHOD' not in globals():
    FIT_METHOD = 'square_error'       
if 'KDE_KERNEL' not in globals():
    KDE_KERNEL = 'gaussian'
    
config = {'bandwidth': bandwidth, 
          'fitting_range': FITTING_RANGE,
          'fit_limit': fit_limit,
          'kde_kernel': KDE_KERNEL}

print(bandwidth, FIT_METHOD)

1.1 square_error

7.1 Variability of the Result¶

%%time
results = Parallel(n_jobs=-1)(delayed(resampled_fitting)(df, FIT_METHOD, NUMBER_OF_GAUSSIAN, config) for i in range(12))                        
for result in results:
    display(pretty_print_gmm(result['gmm']))
    fig,ax = plt.subplots(figsize=(3.5,3.5))
    plot_gmm_ellipses(result['gmm'],ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
    plt.show()
    
    display(gof_df(result['gmm_pdf_result'], result['kde_result']))
    display(gof_df(result['gmm_pdf_result'], kde_result))
    print('')

GMM Plot Result
0.717379533086 [[-1.95235695  2.08725179]] [ 5.5792963   7.01389502] -124.983421205
0.142011948168 [[ 4.24772668 -7.11531049]] [ 3.18394763  4.0544438 ] 136.669938768
0.0903497316488 [[-1.68809679 -2.36527641]] [ 1.87432764  2.57465517] 142.958333329
0.0502587870981 [[ 3.58886578 -3.4520917 ]] [ 1.44586102  1.78018195] -142.210622269

GMM Plot Result
0.753928139351 [[-1.8056343   1.74784391]] [ 5.48827053  7.23919836] -126.3680161
0.115592315561 [[ 4.12062304 -7.30701211]] [ 3.022196   3.7548244] 137.21403402
0.0804265544783 [[-1.61613243 -2.30161941]] [ 1.7469888   2.45308194] 144.03635144
0.0500529906096 [[ 3.49162558 -3.45362678]] [ 1.43625673  1.83153726] -127.988700083

GMM Plot Result
0.712054262138 [[-2.1285901   2.23498881]] [ 5.55101509  7.05671877] -123.623506048
0.148926744822 [[ 4.13056418 -7.02173661]] [ 3.3347092   4.26413016] -48.9565440129
0.0860176384681 [[-1.62682263 -2.26506255]] [ 1.822764    2.55967516] 145.787101367
0.0530013545717 [[ 3.60412351 -3.55602307]] [ 1.47578532  1.87948698] -136.471636532

GMM Plot Result
0.645288541113 [[ 0.33121404  0.34092577]] [ 5.23650455  7.58426706] -151.869320268
0.258918404759 [[-4.36587455  0.39268147]] [ 3.94048552  8.51861368] -134.108294755
0.0572050027957 [[ 3.99246757 -3.91346479]] [ 1.49879201  2.13461486] -136.530410264
0.0385880513322 [[-1.76054052 -2.35444259]] [ 1.37728676  2.05574652] 138.335632457

GMM Plot Result
0.756718575819 [[-1.84943936  1.73326921]] [ 5.52172381  7.39626771] -128.442937538
0.11211005897 [[ 4.15087248 -7.14182566]] [ 2.92815968  3.74684553] 143.512903622
0.0825762871693 [[-1.71857697 -2.32049724]] [ 1.81544644  2.54854911] 147.393330388
0.0485950780414 [[ 3.597616   -3.52156309]] [ 1.44142921  1.8787526 ] -136.275087177

7.2 Cross-validation, to select the number of Gaussian¶

%%time
from sklearn.cross_validation import train_test_split, KFold

## 5-fold cross validation
gaussian_number_range = arange(1,6)
CV_result_train_all,CV_result_test_all =[],[]
number_of_fold = 4
print('Number of train/test dataset', len(df)*(number_of_fold-1)/number_of_fold, len(df)/number_of_fold) 

for number_of_gaussian in gaussian_number_range:
    print( '  ')
    print('Number of gaussian', number_of_gaussian)
    
    kf = KFold(len(df), n_folds=number_of_fold, shuffle=True) 

    CV_result = Parallel(n_jobs=-1)(delayed(fit_per_fold)(df, train_index, test_index, FIT_METHOD, number_of_gaussian, config) for train_index, test_index in kf)                        

    CV_result_train, CV_result_test = list(zip(*CV_result))
    CV_result_train, CV_result_test = list(CV_result_train), list(CV_result_test)
        
    CV_result_train_all.append(CV_result_train)
    CV_result_test_all.append(CV_result_test)
    
    print('Train')
    pretty_pd_display(CV_result_train)
    print('Test')
    pretty_pd_display(CV_result_test)

Number of train/test dataset

D:\ProgramData\Anaconda3\lib\site-packages\sklearn\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

 31903.5 10634.5
  
Number of gaussian 1
Train

Test

  
Number of gaussian 2
Train

Test

  
Number of gaussian 3
Train

Test

  
Number of gaussian 4
Train

Test

train_scores_mean, train_scores_std = generate_mean_std_gof(CV_result_train_all)
print('Train gof mean, std')
display(train_scores_mean)

test_scores_mean, test_scores_std = generate_mean_std_gof(CV_result_test_all)
print('Test gof mean, std')
display(test_scores_mean)

Train gof mean, std

Test gof mean, std

prop_cycle=mpl.rcParams['axes.color_cycle']
gaussian_number_range = train_scores_mean.index
for column, column_name in zip(['R_square','K_S','Chi_square'],["$\ R^2$", "K-S", "$\widetilde{\chi^2} $"]):
    plot(gaussian_number_range, train_scores_mean[column],
             '--', label = 'training', color=prop_cycle[0])
    plt.fill_between(gaussian_number_range, 
                     train_scores_mean[column] - train_scores_std[column],
                     train_scores_mean[column] + train_scores_std[column], 
                     alpha=0.2, color=prop_cycle[0])
    
    plot(gaussian_number_range, test_scores_mean[column],
             '-', label = 'test',color=prop_cycle[1])
    plt.fill_between(gaussian_number_range, 
                 test_scores_mean[column] - test_scores_std[column],
                 test_scores_mean[column] + test_scores_std[column], 
                 alpha=0.2,color=prop_cycle[1])
    plt.xticks(gaussian_number_range)
    print(column)
    plt.locator_params(axis='y', nbins=5)
    plt_configure(xlabel='Number of Gaussian Distributions', ylabel=column_name, 
                  figsize=(3,2), legend={'loc':'best'})
    if column == 'R_square':
        plt.gca().set_ylim(top=1)
    if column == 'K_S' or column == 'Chi_square':
        plt.gca().set_ylim(bottom=0)
    plt.show()

R_square

D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))

K_S

Chi_square

fig = plt.figure(figsize=(4.2,2.4))
ax1 = fig.add_subplot(1,2,1) 
plot_2d_prob_density(X, Y, kde_Z, ax=ax1,
                     xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar=False)
ax1.grid(False)
ax2 = fig.add_subplot(1,2,2) 
plot_2d_prob_density(X, Y, pdf_Z, ax=ax2,
                     xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar=False)
ax2.grid(False)
ax2.get_yaxis().set_visible(False)

for fig in [fig_hist, fig_kde, fig_em, fig_gmm]:
    display(fig)
for fig in [fig_time_variability_3d, fig_time_variability_cdf, fig_time_variability_weibull, 
            fig_adjecent_variability_3d, fig_adjecent_variability_cdf, fig_adjecent_variability_weibull,]:
    display(fig)

import time
save_notebook()
time.sleep(3)
location_name = get_location_name(file_path)
print(location_name)
current_file = 'GMM.ipynb'
output_file = './output_HTML/'+location_name+'.html' 

output_HTML(current_file, output_file)

	date	speed_max	speed	dir	HrMn	type	wind_type
0	20000101	10.0	7.25	265.54	0000	default	default
1	20000101	11.0	7.54	282.57	0100	default	default
2	20000101	9.0	5.99	300.36	0200	default	default
3	20000101	13.0	7.57	316.85	0300	default	default
4	20000101	11.0	7.55	326.70	0400	default	default
5	20000101	10.0	5.72	333.54	0500	default	default
6	20000101	13.0	7.84	359.81	0600	default	default
7	20000101	17.0	7.66	9.60	0700	default	default
8	20000101	15.0	9.01	10.83	0800	default	default
9	20000101	15.0	8.97	22.07	0900	default	default
10	20000101	17.0	11.09	43.60	1000	default	default
11	20000101	15.0	8.08	59.34	1100	default	default
12	20000101	16.0	9.98	64.34	1200	default	default
13	20000101	13.0	8.63	73.64	1300	default	default
14	20000101	14.0	9.11	73.27	1400	default	default
15	20000101	16.0	9.43	76.12	1500	default	default
16	20000101	15.0	8.78	68.38	1600	default	default
17	20000101	14.0	9.30	63.63	1700	default	default
18	20000101	15.0	9.25	68.60	1800	default	default
19	20000101	17.0	9.10	68.59	1900	default	default
20	20000101	15.0	9.75	61.73	2000	default	default
21	20000101	15.0	9.66	65.37	2100	default	default
22	20000101	12.0	7.53	48.52	2200	default	default
23	20000101	12.0	7.94	45.52	2300	default	default
24	20000102	9.0	6.92	31.84	0000	default	default
25	20000102	13.0	8.38	38.74	0100	default	default
26	20000102	12.0	9.04	41.45	0200	default	default
27	20000102	13.0	8.53	46.76	0300	default	default
28	20000102	13.0	8.94	49.42	0400	default	default
29	20000102	17.0	10.15	47.74	0500	default	default
...	...	...	...	...	...	...	...
140860	20161230	12.0	8.29	148.22	1500	default	default
140861	20161230	18.0	10.55	163.86	1600	default	default
140862	20161230	17.0	11.02	171.45	1700	default	default
140863	20161230	18.0	11.54	176.00	1800	default	default
140864	20161230	19.0	10.65	173.49	1900	default	default
140865	20161230	8.0	3.75	196.33	2000	default	default
140866	20161230	10.0	2.98	240.04	2100	default	default
140867	20161230	16.0	8.37	271.96	2200	default	default
140868	20161230	21.0	11.02	291.91	2300	default	default
140869	20161231	21.0	11.02	278.03	0000	default	default
140870	20161231	22.0	12.44	285.56	0100	default	default
140871	20161231	22.0	12.03	290.84	0200	default	default
140872	20161231	21.0	12.82	300.28	0300	default	default
140873	20161231	20.0	11.11	300.06	0400	default	default
140874	20161231	22.0	13.55	333.92	0500	default	default
140875	20161231	21.0	12.67	327.34	0600	default	default
140876	20161231	24.0	13.80	322.47	0700	default	default
140877	20161231	24.0	15.98	324.02	0800	default	default
140878	20161231	22.0	13.95	318.25	0900	default	default
140879	20161231	15.0	9.97	313.95	1000	default	default
140880	20161231	16.0	10.38	308.63	1100	default	default
140881	20161231	15.0	9.10	288.69	1200	default	default
140882	20161231	14.0	9.22	274.42	1300	default	default
140883	20161231	12.0	7.33	263.12	1400	default	default
140884	20161231	14.0	8.47	239.63	1500	default	default
140885	20161231	14.0	8.32	234.34	1600	default	default
140886	20161231	13.0	7.68	225.05	1700	default	default
140887	20161231	13.0	5.99	205.11	1800	default	default
140888	20161231	13.0	7.02	204.43	1900	default	default
140889	20161231	13.0	7.69	206.57	2000	default	default

	date	speed_max	speed	dir	HrMn	dir_windrose
count	1.408900e+05	140890.000000	140890.000000	140890.000000	140890.000000	140890.000000
mean	2.008173e+07	13.783746	7.817171	195.383235	1150.363404	196.856811
std	4.882175e+04	6.476898	4.134382	96.638849	692.392299	99.056700
min	2.000010e+07	0.000000	0.000000	0.000000	0.000000	0.000000
25%	2.004052e+07	9.000000	4.700000	123.570000	500.000000	126.100000
50%	2.008091e+07	13.000000	7.340000	192.690000	1200.000000	196.990000
75%	2.012111e+07	18.000000	10.450000	285.880000	1800.000000	288.680000
max	2.016123e+07	76.000000	31.900000	359.980000	2300.000000	359.990000

	date	speed_max	speed	dir	HrMn	type	wind_type	dir_windrose	incre	incre_reverse
time
2014-06-14 12:00:00	20140614	56.0	31.90	293.63	1200	default	default	156.37	8.04	7.09
2001-04-07 09:00:00	20010407	45.0	29.69	253.22	900	default	default	196.78	4.57	1.22
2000-04-05 20:00:00	20000405	51.0	29.36	165.55	2000	default	default	284.45	3.80	0.82
2016-11-18 15:00:00	20161118	51.0	28.81	118.13	1500	default	default	331.87	4.33	5.72
2000-04-05 21:00:00	20000405	43.0	28.54	155.62	2100	default	default	294.38	-0.82	6.01
2001-04-07 10:00:00	20010407	43.0	28.47	246.85	1000	default	default	203.15	-1.22	0.18
2001-04-07 11:00:00	20010407	42.0	28.29	236.30	1100	default	default	213.70	-0.18	0.46
2001-04-07 12:00:00	20010407	40.0	27.83	228.62	1200	default	default	221.38	-0.46	0.08
2001-04-07 13:00:00	20010407	42.0	27.75	224.03	1300	default	default	225.97	-0.08	2.55
2010-10-26 19:00:00	20101026	52.0	27.29	209.15	1900	default	default	240.85	0.45	0.92

	date	speed_max	speed	dir	HrMn	type	wind_type	dir_windrose	incre	incre_reverse
time
2014-06-14 12:00:00	20140614	56.0	31.90	293.63	1200	default	default	156.37	8.04	7.09
2001-04-07 09:00:00	20010407	45.0	29.69	253.22	900	default	default	196.78	4.57	1.22
2000-04-05 20:00:00	20000405	51.0	29.36	165.55	2000	default	default	284.45	3.80	0.82
2016-11-18 15:00:00	20161118	51.0	28.81	118.13	1500	default	default	331.87	4.33	5.72
2000-04-05 21:00:00	20000405	43.0	28.54	155.62	2100	default	default	294.38	-0.82	6.01
2001-04-07 10:00:00	20010407	43.0	28.47	246.85	1000	default	default	203.15	-1.22	0.18
2001-04-07 11:00:00	20010407	42.0	28.29	236.30	1100	default	default	213.70	-0.18	0.46
2001-04-07 12:00:00	20010407	40.0	27.83	228.62	1200	default	default	221.38	-0.46	0.08
2001-04-07 13:00:00	20010407	42.0	27.75	224.03	1300	default	default	225.97	-0.08	2.55
2010-10-26 19:00:00	20101026	52.0	27.29	209.15	1900	default	default	240.85	0.45	0.92

	date	speed_max	speed	dir	HrMn	dir_windrose	x	y
count	4.253800e+04	42538.000000	42538.000000	42538.000000	42538.000000	42538.000000	42538.000000	42538.000000
mean	2.013078e+07	14.087522	7.755293	197.030299	1150.303258	195.962790	-0.716125	-0.125169
std	1.410164e+04	6.613855	4.157751	97.192196	691.598382	99.198001	6.240054	6.161663
min	2.011010e+07	1.000000	0.000000	0.020000	0.000000	0.000000	-25.695173	-29.225280
25%	2.012040e+07	9.000000	4.600000	124.220000	600.000000	125.120000	-4.660698	-4.296425
50%	2.013071e+07	13.000000	7.190000	194.155000	1200.000000	193.390000	-0.470672	-0.637495
75%	2.014100e+07	18.000000	10.430000	288.790000	1700.000000	288.630000	3.539646	4.221872
max	2.015123e+07	59.000000	31.900000	359.970000	2300.000000	359.990000	25.003446	21.725100

	Chi_square	K_S	MSE	RMSE / Max	RMSE / Mean	R_square
year_lim
(1996, 2006)	0.028431	0.02286	3.429543e-08	0.032227	0.200005	0.972496
(1986, 1996)	NaN	NaN	NaN	NaN	NaN	NaN

	k_s	r_square	r_square_dir
year_lim
(1996, 2006)	0.029316	0.988133	0.910595
(1992, 2002)	0.045826	0.978682	0.887397
(1986, 1996)	NaN	NaN	NaN

	weight	mean_x	mean_y	sig_x	sig_y	corr
1	0.313	4.513	-5.213	4.189	4.491	0.192
2	0.292	-1.900	-1.775	3.415	3.569	0.169
3	0.203	-8.309	5.077	4.691	4.780	0.194
4	0.192	0.608	5.149	4.867	4.302	-0.125

	weight	mean_x	mean_y	sig_x	sig_y	corr
1	0.722	-1.987	2.082	6.662	6.060	-0.231
2	0.138	4.049	-7.195	3.679	3.685	0.222
3	0.087	-1.766	-2.315	2.119	2.389	0.313
4	0.053	3.594	-3.491	1.703	1.659	-0.235

	weight	mean_x	mean_y	sig_x	sig_y	corr
1	0.717	-1.952	2.087	6.577	6.088	-0.212
2	0.142	4.248	-7.115	3.620	3.670	0.237
3	0.090	-1.688	-2.365	2.155	2.345	0.296
4	0.050	3.589	-3.452	1.580	1.663	-0.199

	weight	mean_x	mean_y	sig_x	sig_y	corr
1	0.754	-1.806	1.748	6.676	6.161	-0.259
2	0.116	4.121	-7.307	3.380	3.436	0.213
3	0.080	-1.616	-2.302	2.019	2.235	0.312
4	0.050	3.492	-3.454	1.693	1.598	-0.232

	weight	mean_x	mean_y	sig_x	sig_y	corr
1	0.712	-2.129	2.235	6.631	6.053	-0.218
2	0.149	4.131	-7.022	3.891	3.764	0.239
3	0.086	-1.627	-2.265	2.084	2.352	0.306
4	0.053	3.604	-3.556	1.679	1.700	-0.237

	weight	mean_x	mean_y	sig_x	sig_y	corr
1	0.645	0.331	0.341	5.841	7.130	-0.301
2	0.259	-4.366	0.393	6.703	6.570	-0.647
3	0.057	3.992	-3.913	1.828	1.861	-0.339
4	0.039	-1.761	-2.354	1.711	1.788	0.378

	weight	mean_x	mean_y	sig_x	sig_y	corr
1	0.757	-1.849	1.733	6.734	6.313	-0.277
2	0.112	4.151	-7.142	3.241	3.479	0.232
3	0.083	-1.719	-2.320	2.055	2.359	0.299
4	0.049	3.598	-3.522	1.665	1.684	-0.259

	weight	mean_x	mean_y	sig_x	sig_y	corr
1	0.766	-1.731	1.636	6.608	6.315	-0.264
2	0.107	4.128	-7.354	3.248	3.501	0.232
3	0.075	-1.742	-2.343	2.033	2.345	0.329
4	0.053	3.505	-3.451	1.766	1.674	-0.293

	weight	mean_x	mean_y	sig_x	sig_y	corr
1	0.764	-1.825	1.639	6.664	6.290	-0.257
2	0.113	3.933	-7.296	3.540	3.510	0.174
3	0.070	-1.469	-2.074	1.830	2.201	0.291
4	0.053	3.633	-3.460	1.732	1.671	-0.241

	weight	mean_x	mean_y	sig_x	sig_y	corr
1	0.683	-2.331	2.501	6.673	5.809	-0.200
2	0.174	3.806	-6.861	4.102	3.790	0.175
3	0.085	-1.837	-2.178	2.177	2.399	0.391
4	0.058	3.572	-3.450	1.801	1.704	-0.229

	weight	mean_x	mean_y	sig_x	sig_y	corr
1	0.673	-2.246	2.474	6.676	5.821	-0.208
2	0.163	4.023	-7.124	4.046	3.813	0.194
3	0.103	-1.871	-2.180	2.210	2.573	0.295
4	0.061	3.571	-3.530	1.754	1.746	-0.210

	weight	mean_x	mean_y	sig_x	sig_y	corr
1	0.766	-1.657	1.638	6.604	6.285	-0.267
2	0.106	4.141	-7.322	3.308	3.574	0.276
3	0.078	-1.767	-2.447	2.037	2.213	0.266
4	0.050	3.534	-3.526	1.685	1.604	-0.234

	Chi_square	K_S	MSE	RMSE / Max	RMSE / Mean	R_square
0	0.062881	0.040116	1.044577e-07	0.057044	0.358324	0.916029
1	0.063242	0.041119	1.071397e-07	0.058749	0.362885	0.914213
2	0.061770	0.040594	1.022566e-07	0.057859	0.354501	0.917589
3	0.060986	0.039814	1.029396e-07	0.057012	0.355731	0.918030

	Chi_square	K_S	MSE	RMSE / Max	RMSE / Mean	R_square
0	0.062800	0.040132	1.027361e-07	0.056943	0.355341	0.918572
1	0.066735	0.039878	1.038718e-07	0.056769	0.357331	0.916755
2	0.067629	0.042427	1.156597e-07	0.058522	0.377117	0.908896
3	0.072293	0.039206	1.099804e-07	0.059301	0.367594	0.910236

	Chi_square	K_S	MSE	RMSE / Max	RMSE / Mean	R_square
0	0.116276	0.029846	4.892689e-08	0.039318	0.245127	0.961122
1	0.102920	0.029538	4.922599e-08	0.039395	0.245986	0.960564
2	0.084077	0.026925	4.625098e-08	0.038828	0.238638	0.962702
3	0.091153	0.027785	4.727884e-08	0.038862	0.240949	0.961951

	Chi_square	K_S	MSE	RMSE / Max	RMSE / Mean	R_square
0	0.110037	0.027574	4.894945e-08	0.040122	0.245596	0.959774
1	0.132051	0.033242	4.784835e-08	0.039802	0.242492	0.961643
2	0.099101	0.024287	5.863741e-08	0.041929	0.267764	0.953962
3	0.102979	0.029437	5.376091e-08	0.041357	0.257431	0.957495

	Chi_square	K_S	MSE	RMSE / Max	RMSE / Mean	R_square
0	0.031160	0.016733	2.074779e-08	0.025045	0.159663	0.983613
1	0.031022	0.017709	2.223113e-08	0.026767	0.165330	0.982039
2	0.039419	0.013965	3.324912e-08	0.032685	0.202129	0.973180
3	0.027681	0.019396	2.983835e-08	0.031456	0.191540	0.976054

	Chi_square	K_S	MSE	RMSE / Max	RMSE / Mean	R_square
0	0.042980	0.013796	3.140130e-08	0.032578	0.196570	0.973778
1	0.031857	0.023917	2.371326e-08	0.027107	0.170641	0.981477
2	0.042412	0.011721	3.732897e-08	0.034163	0.214292	0.970696
3	0.034448	0.022107	3.759423e-08	0.032766	0.214857	0.970016

	Chi_square	K_S	MSE	RMSE / Max	RMSE / Mean	R_square
0	0.019118	0.008322	1.340876e-08	0.020796	0.128466	0.989240
1	0.018300	0.008994	1.325183e-08	0.020343	0.127626	0.989450
2	0.020394	0.007648	1.389994e-08	0.021523	0.130743	0.988728
3	0.018419	0.008638	1.297173e-08	0.020027	0.126153	0.989657

	Chi_square	K_S	MSE	RMSE / Max	RMSE / Mean	R_square
0	0.024760	0.010435	1.804386e-08	0.023618	0.148624	0.985613
1	0.023978	0.012943	1.818668e-08	0.024902	0.149512	0.985142
2	0.024505	0.015037	1.810431e-08	0.022583	0.149060	0.986013
3	0.025804	0.017421	2.108869e-08	0.026794	0.161450	0.982873

	Chi_square	K_S	MSE	RMSE / Max	RMSE / Mean	R_square
0	0.014164	0.007468	9.829535e-09	0.017478	0.109931	0.992159
1	0.014833	0.006343	9.495345e-09	0.017444	0.108049	0.992305
2	0.012383	0.008664	9.250510e-09	0.017428	0.106595	0.992560
3	0.013491	0.009163	9.189944e-09	0.017075	0.106289	0.992697

	Chi_square	K_S	MSE	RMSE / Max	RMSE / Mean	R_square
0	0.018180	0.013453	1.502565e-08	0.022632	0.135850	0.987816
1	0.022175	0.010648	1.490864e-08	0.021421	0.135308	0.988448
2	0.019399	0.008604	1.551715e-08	0.021347	0.138246	0.987713
3	0.016721	0.009931	1.451212e-08	0.021711	0.133528	0.988073

	Chi_square	K_S	MSE	RMSE / Max	RMSE / Mean	R_square
1	0.062220	0.040411	1.041984e-07	0.057666	0.357860	0.916465
2	0.098607	0.028524	4.792068e-08	0.039100	0.242675	0.961585
3	0.032320	0.016951	2.651660e-08	0.028988	0.179666	0.978722
4	0.019058	0.008400	1.338306e-08	0.020672	0.128247	0.989269
5	0.013718	0.007910	9.441333e-09	0.017356	0.107716	0.992430

	Chi_square	K_S	MSE	RMSE / Max	RMSE / Mean	R_square
1	0.067364	0.040411	1.080620e-07	0.057884	0.364346	0.913615
2	0.111042	0.028635	5.229903e-08	0.040803	0.253321	0.958219
3	0.037924	0.017886	3.250944e-08	0.031654	0.199090	0.973992
4	0.024762	0.013959	1.885589e-08	0.024474	0.152161	0.984910
5	0.019119	0.010659	1.499089e-08	0.021778	0.135733	0.988012