In [1]:
%load_ext load_style
%load_style talk.css

k-means clustering of SST anomalies in the Pacific with scikit-learn

In [2]:
%matplotlib inline
from matplotlib import pyplot as plt
In [3]:
import os
import numpy as np
import pandas as pd
from mpl_toolkits.basemap import Basemap as bm
In [4]:
import xray

Define a helper function to plot a 2D field on a map; it relies on the global Basemap instance m defined further below.

In [5]:
def plot_field(X, lats, lons, vmin, vmax, step, cmap=plt.get_cmap('jet'), ax=False, title=False, grid=False):
    # plots the 2D field X on the global Basemap instance `m` (defined below),
    # using the lats / lons meshgrids passed as arguments
    if not ax: 
        f, ax = plt.subplots(figsize=(10, (X.shape[0] / float(X.shape[1])) * 10))
    m.ax = ax
    im = m.contourf(lons, lats, X, np.arange(vmin, vmax + step, step),
                    latlon=True, cmap=cmap, extend='both', ax=ax)
    m.drawcoastlines()
    if grid: 
        m.drawmeridians(np.arange(0, 360, 60), labels=[0, 0, 0, 1])
        m.drawparallels([-40, 0, 40], labels=[1, 0, 0, 0])
    m.colorbar(im)
    if title: 
        ax.set_title(title)

Read the PCs (principal components) obtained previously (see sklearn_EOF_decomposition.ipynb).

In [6]:
PCs = pd.read_csv('../data/EOF_ERSST_PCs.csv', index_col=0, parse_dates=True)
In [7]:
PCs.head()
Out[7]:
EOF1 EOF2 EOF3 EOF4 EOF5 EOF6 EOF7 EOF8 EOF9
time
1980-01-15 -1.212233 -0.494918 0.972392 0.281485 -0.203608 2.072378 -0.632693 1.473872 -1.271845
1980-02-15 -1.213163 -0.715776 1.258066 0.741005 0.068926 2.731611 -0.182779 2.636705 -1.737925
1980-03-15 -0.898226 -0.562729 0.855915 1.316607 0.514087 2.650314 -0.001962 1.904691 -0.073639
1980-04-15 -1.037391 -1.086395 0.485100 0.374338 0.952949 1.320651 -0.340679 1.921813 0.549309
1980-05-15 -1.010956 -1.156914 0.006662 0.309884 0.534675 0.655073 -0.508158 1.677837 -0.253518
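
As a reminder, these PCs come from the EOF (PCA) decomposition performed in sklearn_EOF_decomposition.ipynb. A minimal sketch of that step, assuming a 2D anomalies matrix X_anoms of shape (time, space) (the variable name and the exact preprocessing, e.g. scaling and latitude weighting, are assumptions here, not shown in this notebook):

from sklearn.decomposition import PCA

pca = PCA(n_components=9)               # keep the 9 leading EOFs / PCs
PCs_array = pca.fit_transform(X_anoms)  # rows: time steps, columns: PC1..PC9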

Import the KMeans class from scikit-learn.

In [33]:
from sklearn.cluster import KMeans

How many clusters do we want?

In [34]:
nclusters = 6
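
The choice of 6 clusters is somewhat arbitrary. One common sanity check (an addition here, not part of the original analysis) is to plot the k-means inertia for a range of cluster numbers and look for an "elbow":

inertias = []
ks = range(2, 11)
for k in ks:
    km = KMeans(init='k-means++', n_clusters=k, n_init=10)
    km.fit(PCs.values)
    inertias.append(km.inertia_)       # within-cluster sum of squares
plt.plot(list(ks), inertias, marker='o')
plt.xlabel('number of clusters k')
plt.ylabel('inertia')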

Instantiate the KMeans class.

In [35]:
KMeans?
In [42]:
kmeans = KMeans(init='k-means++', n_clusters=nclusters, n_init=10)
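
Note that k-means depends on a random initialisation, so the cluster numbering (and occasionally the partition itself) can change between runs; passing random_state makes the result reproducible, e.g. (the seed value below is arbitrary):

kmeans = KMeans(init='k-means++', n_clusters=nclusters, n_init=10, random_state=42)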

Fit the model to the data (the matrix of PCs).

In [43]:
X = PCs.values
In [44]:
type(X)
Out[44]:
numpy.ndarray
In [45]:
kmeans.fit(X) 
Out[45]:
KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=6, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

The cluster assignments (labels) are contained in the .labels_ attribute.

In [46]:
kmeans.labels_
Out[46]:
array([4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 1, 4, 4, 5, 5, 5, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 1, 1, 1, 1, 1, 1, 1,
       5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4,
       4, 4, 4, 5, 5, 3, 4, 3, 3, 3, 4, 3, 3, 5, 5, 5, 3, 3, 4, 4, 4, 4, 4,
       4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 3, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 5,
       5, 5, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1, 5, 1, 1,
       5, 5, 5, 4, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1, 4, 1,
       1, 5, 5, 2, 1, 5, 5, 5, 5, 5, 5, 2, 5, 5, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 4, 4, 2, 2, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2,
       2, 2, 2, 5, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       1, 1, 1, 5, 5, 2, 2, 2, 2, 5, 5, 5, 5, 5, 5, 5, 3, 3, 4, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5], dtype=int32)
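
A quick way to see how many months fall into each cluster (a small addition, not in the original notebook):

np.bincount(kmeans.labels_)   # number of months assigned to clusters 0..5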

Calculate composite anomalies for each of the clusters

In [47]:
ncfname = '../data/ersst.realtime.nc'
In [48]:
dset = xray.open_dataset(ncfname)
In [49]:
dsub = dset.sel(time=slice('1980','2014'),lat=slice(-40,40), lon=slice(120,290))
In [50]:
dsub
Out[50]:
<xray.Dataset>
Dimensions:  (lat: 41, lon: 86, time: 420, zlev: 1)
Coordinates:
  * time     (time) datetime64[ns] 1980-01-15 1980-02-15 1980-03-15 1980-04-15 1980-05-15 ...
  * zlev     (zlev) float32 0.0
  * lat      (lat) float32 -40.0 -38.0 -36.0 -34.0 -32.0 -30.0 -28.0 -26.0 -24.0 -22.0 -20.0 ...
  * lon      (lon) float32 120.0 122.0 124.0 126.0 128.0 130.0 132.0 134.0 136.0 138.0 140.0 ...
Data variables:
    sst      (time, zlev, lat, lon) float64 15.19 15.12 15.04 15.04 15.1 15.12 15.08 15.09 15.18 ...
    anom     (time, zlev, lat, lon) float64 -0.13 -0.18 -0.24 -0.27 -0.25 -0.29 -0.4 -0.5 -0.56 ...
    err      (time, zlev, lat, lon) float64 0.46 0.42 0.38 0.35 0.37 0.36 0.32 0.31 0.27 0.27 ...
Attributes:
    title: ERSST V3b in situ only
    history: Wed Apr  8 09:45:04 2015: /usr/local/bin/ncrcat -O ersst.194802_ft.nc ersst.194803_ft.nc ersst.194804_ft.nc ersst.194805_ft.nc ersst.194806_ft.nc ersst.194807_ft.nc ersst.194808_ft.nc ersst.194809_ft.nc ersst.194810_ft.nc ersst.194811_ft.nc ersst.194812_ft.nc ersst.194901_ft.nc ersst.194902_ft.nc ersst.194903_ft.nc ersst.194904_ft.nc ersst.194905_ft.nc ersst.194906_ft.nc ersst.194907_ft.nc ersst.194908_ft.nc ersst.194909_ft.nc ersst.194910_ft.nc ersst.194911_ft.nc ersst.194912_ft.nc ersst.195...
    creation_date: 2010-03-23
    description: Smith, T.M., and R.W. Reynolds, 2003: Extended Reconstruction of Global Sea Surface Temperatures Based on COADS Data (1854-1997). Journal of Climate, 16, 1495-1510. A vailable at http://www.ncdc.noaa.gov/img/climate/research/sst/ersst.pdf  Climatology is based on 1971-2000 OI.v2 SST, In situ data: ICOADS2.4 before 2006 and NCEP in situ data from 2007 to present
    source: NOAA/National Climatic Data Center
    contact: Dick Reynolds, email: Richard.W.Reynolds@noaa.gov & Chunying Liu, email: Chunying.liu@noaa.gov
    Conventions: CF-1.0
    nco_openmp_thread_number: 1
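
The subset has 420 monthly time steps (January 1980 to December 2014), which must match the number of rows in the PCs and hence the number of k-means labels; a quick sanity check (an addition, not in the original notebook):

assert len(dsub['time']) == len(kmeans.labels_) == len(PCs)   # 420 monthly time steps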
In [51]:
lat = dsub['lat'].values
lon = dsub['lon'].values
lons, lats = np.meshgrid(lon, lat)
In [52]:
labels = pd.DataFrame(kmeans.labels_, index=dsub['time'], columns=['cluster'])
In [53]:
labels.head()
Out[53]:
cluster
time
1980-01-15 4
1980-02-15 4
1980-03-15 4
1980-04-15 4
1980-05-15 1
In [54]:
pd.unique(labels.cluster)
Out[54]:
array([4, 1, 5, 3, 0, 2])
In [55]:
c = 0
In [56]:
clussub = labels.query('cluster == {}'.format(c))
In [57]:
clussub.index
Out[57]:
<class 'pandas.tseries.index.DatetimeIndex'>
[1982-11-15, ..., 1998-09-15]
Length: 33, Freq: None, Timezone: None
In [58]:
cluster = dsub.sel(time=clussub.index).mean('time')
In [59]:
plt.imshow(cluster['anom'][0,::-1,:])
Out[59]:
<matplotlib.image.AxesImage at 0x106e4df90>
In [60]:
m = bm(projection='cyl',llcrnrlat=-40,urcrnrlat=40,\
            llcrnrlon=120,urcrnrlon=290,\
            lat_ts=0,resolution='c')
In [61]:
f, axes = plt.subplots(nrows=3,ncols=2, figsize=(14,10))
f.subplots_adjust(hspace=0.1, wspace=0.1)
axes = axes.flatten()
for c in xrange(nclusters):
    index = labels.query('cluster == {}'.format(c))
    cluster = dsub.sel(time=index.index).mean('time')
    ax = axes[c]
    plot_field(cluster['anom'][0,:,:], lats, lons, -2, 2, 0.1, \
               ax=ax, cmap=plt.get_cmap('RdBu_r'), \
               title="Cluster #{}: {} months".format(c+1, len(index)))
In [ ]: