In [1]:
%load_ext load_style
%load_style talk.css

PCA of SST anomalies in the Pacific with scikit-learn

In [2]:
import pandas as pd
import numpy as np
from numpy import ma
from matplotlib import pyplot as plt
from mpl_toolkits.basemap import Basemap
In [3]:
%matplotlib inline

load the SST data

The file (74 Mb) can be downloaded at ftp://ftp.niwa.co.nz/incoming/fauchereaun/ersst.realtime.nc
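
If the file is not yet in ../data, it can be fetched with the standard library; a minimal sketch using Python 2's urllib (which handles FTP URLs), saving to the path used below:

import urllib

# fetch the ERSST file (~74 Mb) into the data directory used below
url = 'ftp://ftp.niwa.co.nz/incoming/fauchereaun/ersst.realtime.nc'
urllib.urlretrieve(url, '../data/ersst.realtime.nc')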

In [6]:
import xray
In [7]:
dset = xray.open_dataset('../data/ersst.realtime.nc')
In [8]:
dset
Out[8]:
<xray.Dataset>
Dimensions:  (lat: 89, lon: 180, time: 806, zlev: 1)
Coordinates:
  * time     (time) datetime64[ns] 1948-02-15 1948-03-15 1948-04-15 1948-05-15 1948-06-15 ...
  * zlev     (zlev) float32 0.0
  * lat      (lat) float32 -88.0 -86.0 -84.0 -82.0 -80.0 -78.0 -76.0 -74.0 -72.0 -70.0 -68.0 ...
  * lon      (lon) float32 0.0 2.0 4.0 6.0 8.0 10.0 12.0 14.0 16.0 18.0 20.0 22.0 24.0 26.0 28.0 ...
Data variables:
    sst      (time, zlev, lat, lon) float64 nan nan nan nan nan nan nan nan nan nan nan nan nan ...
    anom     (time, zlev, lat, lon) float64 nan nan nan nan nan nan nan nan nan nan nan nan nan ...
    err      (time, zlev, lat, lon) float64 nan nan nan nan nan nan nan nan nan nan nan nan nan ...
Attributes:
    title: ERSST V3b in situ only
    history: Wed Apr  8 09:45:04 2015: /usr/local/bin/ncrcat -O ersst.194802_ft.nc ersst.194803_ft.nc ersst.194804_ft.nc ersst.194805_ft.nc ersst.194806_ft.nc ersst.194807_ft.nc ersst.194808_ft.nc ersst.194809_ft.nc ersst.194810_ft.nc ersst.194811_ft.nc ersst.194812_ft.nc ersst.194901_ft.nc ersst.194902_ft.nc ersst.194903_ft.nc ersst.194904_ft.nc ersst.194905_ft.nc ersst.194906_ft.nc ersst.194907_ft.nc ersst.194908_ft.nc ersst.194909_ft.nc ersst.194910_ft.nc ersst.194911_ft.nc ersst.194912_ft.nc ersst.195...
    creation_date: 2010-03-23
    description: Smith, T.M., and R.W. Reynolds, 2003: Extended Reconstruction of Global Sea Surface Temperatures Based on COADS Data (1854-1997). Journal of Climate, 16, 1495-1510. A vailable at http://www.ncdc.noaa.gov/img/climate/research/sst/ersst.pdf  Climatology is based on 1971-2000 OI.v2 SST, In situ data: ICOADS2.4 before 2006 and NCEP in situ data from 2007 to present
    source: NOAA/National Climatic Data Center
    contact: Dick Reynolds, email: Richard.W.Reynolds@noaa.gov & Chunying Liu, email: Chunying.liu@noaa.gov
    Conventions: CF-1.0
    nco_openmp_thread_number: 1

select the period 1980-2014 and the tropical Pacific domain

In [9]:
dsub = dset.sel(time=slice('1980','2014'), zlev=0, lat=slice(-40,40), lon=slice(120,290))
In [10]:
lat = dsub['lat'].values
lon = dsub['lon'].values
sst = dsub['anom'].values
In [11]:
sst.shape
Out[11]:
(420, 41, 86)

reshape to 2D (time, space)

In [12]:
X = np.reshape(sst, (sst.shape[0], len(lat) * len(lon)), order='F')
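
As a quick sanity check, the Fortran-order reshape can be undone to recover the original (time, lat, lon) array (the equal_nan argument assumes a reasonably recent NumPy):

# reshaping back with the same (Fortran) order should reproduce sst,
# treating NaNs as equal
np.allclose(np.reshape(X, sst.shape, order='F'), sst, equal_nan=True)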
In [13]:
np.any(np.isnan(X))
Out[13]:
True

Mask the land points

In [14]:
type(X)
Out[14]:
numpy.ndarray
In [15]:
X = ma.masked_array(X, np.isnan(X))
In [16]:
type(X)
Out[16]:
numpy.ma.core.MaskedArray
In [17]:
land = X.sum(0).mask
In [18]:
ocean = ~land

keep only oceanic grid-points

In [19]:
X = X[:,ocean]

Standardize SST using the fit and transform methods of sklearn.preprocessing.StandardScaler

In [20]:
from sklearn import preprocessing
scaler  = preprocessing.StandardScaler()
In [21]:
scaler_sst = scaler.fit(X)
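
For reference, the scaler learns the per-grid-point mean and standard deviation during fit, and its transform (used below) applies (x - mean) / std column-wise; a rough equivalent by hand (X_manual is just an illustrative name):

# column-wise standardization, equivalent to what scaler_sst.transform applies
X_manual = (X - X.mean(axis=0)) / X.std(axis=0)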

Once the scaler object has been 'trained' on the data, we can save it as a pickle object

In [22]:
from sklearn.externals import joblib
In [23]:
joblib.dump(scaler_sst, '../data/scaler_sst.pkl', compress=9)
Out[23]:
['../data/scaler_sst.pkl']
In [24]:
scaler_sst = joblib.load('../data/scaler_sst.pkl')

scale the data using the transform method of the scaler object

In [25]:
X = scaler_sst.transform(X)

verify that mean = 0 and std = 1

In [26]:
X.mean()
Out[26]:
1.0151690477930695e-18
In [27]:
X.std()
Out[27]:
0.99999999999999967
In [28]:
X.shape
Out[28]:
(420, 3133)

EOF decomposition

In [29]:
from sklearn.decomposition import PCA

instantiate the PCA object

In [30]:
skpca = PCA()

fit

In [31]:
skpca.fit(X)
Out[31]:
PCA(copy=True, n_components=None, whiten=False)

Now save the (fitted) PCA object for reuse in later operations

In [34]:
joblib.dump(skpca, '../data/EOF.pkl', compress=9)
Out[34]:
['../data/EOF.pkl']
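
For the operational reuse mentioned above, the saved scaler and PCA can be reloaded and applied to a new anomaly field; a hedged sketch, where X_new is a hypothetical array with the same ocean grid points as X:

# reload the fitted scaler and PCA objects
scaler_sst = joblib.load('../data/scaler_sst.pkl')
skpca = joblib.load('../data/EOF.pkl')
# project a new (hypothetical) anomaly field onto the EOFs
# PCs_new = skpca.transform(scaler_sst.transform(X_new))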
In [35]:
from matplotlib import style
style.use('fivethirtyeight')
In [36]:
style.available
Out[36]:
[u'dark_background', u'bmh', u'grayscale', u'ggplot', u'fivethirtyeight']
In [37]:
f, ax = plt.subplots(figsize=(6,6))
ax.plot(skpca.explained_variance_ratio_[0:10]*100)
ax.plot(skpca.explained_variance_ratio_[0:10]*100,'ro')
Out[37]:
[<matplotlib.lines.Line2D at 0x10e24fa50>]

keep the number of PCs sufficient to explain 70% of the original variance

In [38]:
ipc = np.where(skpca.explained_variance_ratio_.cumsum() >= 0.70)[0][0]
In [39]:
ipc
Out[39]:
9
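
ipc is thus the (zero-based) index at which the cumulative explained variance first reaches 70%; printing the cumulative sum for the leading components makes the choice explicit:

# cumulative fraction of variance explained by the leading components
print(skpca.explained_variance_ratio_.cumsum()[:ipc + 1])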

The Principal Components (PCs) are obtained by using the transform method of the pca object (skpca)

In [40]:
PCs = skpca.transform(X)
In [41]:
PCs = PCs[:,:ipc]

The Empirical Orthogonal Functions (EOFs) are contained in the components_ attribute of the pca object (skpca)

In [42]:
EOFs = skpca.components_
In [43]:
EOFs = EOFs[:ipc,:]
In [44]:
EOFs.shape
Out[44]:
(9, 3133)
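
Since the PCA was fitted with whiten=False, the standardized field can be approximated by the product of the PCs and EOFs plus the mean removed by the PCA (approximate here, because only the first ipc components are kept); a quick check:

# approximate reconstruction of the standardized SST field from the
# truncated PCs and EOFs; skpca.mean_ holds the mean removed before projection
X_approx = np.dot(PCs, EOFs) + skpca.mean_
np.abs(X - X_approx).max()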

we can then reconstruct the 2D fields (maps)

In [45]:
EOF_recons = np.ones((ipc, len(lat) * len(lon))) * -999.
In [46]:
for i in xrange(ipc): 
    EOF_recons[i,ocean] = EOFs[i,:]
In [47]:
EOF_recons = ma.masked_values(np.reshape(EOF_recons, (ipc, len(lat), len(lon)), order='F'), -999.)
In [48]:
EOF_recons.shape
Out[48]:
(9, 41, 86)
In [49]:
plt.imshow(EOF_recons[0,:,:], origin='lower', interpolation='nearest', aspect='auto')
plt.colorbar();
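
Basemap was imported at the top but not used so far; a sketch of how the leading EOF could be drawn on a map of the domain (projection and resolution choices are illustrative):

# plot EOF 1 on a simple cylindrical map of the selected domain
m = Basemap(projection='cyl', llcrnrlon=120, urcrnrlon=290,
            llcrnrlat=-40, urcrnrlat=40, resolution='c')
lons, lats = np.meshgrid(lon, lat)
x, y = m(lons, lats)
m.pcolormesh(x, y, EOF_recons[0, :, :])
m.drawcoastlines()
m.colorbar();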

scale the Principal Components

In [50]:
from sklearn.preprocessing import StandardScaler
In [51]:
scaler_PCs = StandardScaler()
In [52]:
scaler_PCs.fit(PCs)
Out[52]:
StandardScaler(copy=True, with_mean=True, with_std=True)
In [53]:
PCs_std = scaler_PCs.transform(PCs)
In [56]:
joblib.dump(scaler_PCs, '../data/scaler_PCs.pkl')
Out[56]:
['../data/scaler_PCs.pkl',
 '../data/scaler_PCs.pkl_01.npy',
 '../data/scaler_PCs.pkl_02.npy']
In [57]:
PCdf = pd.DataFrame(PCs_std, index = dsub['time'], \
                    columns = ["EOF%s" % (x) for x in xrange(1, PCs_std.shape[1] +1)])
In [58]:
PCdf.head()
Out[58]:
EOF1 EOF2 EOF3 EOF4 EOF5 EOF6 EOF7 EOF8 EOF9
time
1980-01-15 -1.212233 -0.494918 0.972392 0.281485 -0.203608 2.072378 -0.632693 1.473872 -1.271845
1980-02-15 -1.213163 -0.715776 1.258066 0.741005 0.068926 2.731611 -0.182779 2.636705 -1.737925
1980-03-15 -0.898226 -0.562729 0.855915 1.316607 0.514087 2.650314 -0.001962 1.904691 -0.073639
1980-04-15 -1.037391 -1.086395 0.485100 0.374338 0.952949 1.320651 -0.340679 1.921813 0.549309
1980-05-15 -1.010956 -1.156914 0.006662 0.309884 0.534675 0.655073 -0.508158 1.677837 -0.253518
In [59]:
PCdf.to_csv('../data/EOF_ERSST_PCs.csv')
In [60]:
from scipy.signal import detrend
In [61]:
f, ax = plt.subplots(figsize=(12,6))
PCdf.iloc[:,0].plot(ax=ax, color='k', label='PC1')
#ax.set_xlabel('period', fontsize=18)
ax.plot(PCdf.index, detrend(PCdf.iloc[:,0].values), 'r',  label='PC1 (trend removed)')
ax.grid(False)
ax.legend(loc=1); 
In [62]:
!ipython nbconvert sklearn_EOF_decomposition.ipynb --to html
[NbConvertApp] Converting notebook sklearn_EOF_decomposition.ipynb to html
[NbConvertApp] Writing 379833 bytes to sklearn_EOF_decomposition.html