%load_ext load_style
%load_style talk.css
from IPython.display import YouTubeVideo, Image
xray has been developed by scientists / engineers working at the Climate Corporation
It is an open source project and Python package that aims to bring
the labeled data power of pandas to the
physical sciences, by providing N-dimensional variants of the core
pandas data structures, Series
and
DataFrame
: the xray DataArray
and Dataset
.
the goal is to provide a pandas-like and pandas-compatible toolkit for
analytics on multi-dimensional arrays, rather than the tabular data for
which pandas excels. The approach adopts the Common Data
Model
for self-describing scientific data in widespread use in the Earth
sciences (e.g., netCDF
and OPeNDAP): xray.Dataset
is an in-memory
representation of a netCDF file.
The main advantages of using xray versus netCDF4 are:
To install the latest version of xray (via conda):
ᐅ conda install xray
or if you want the bleeding edge:
ᐅ pip install git+https://github.com/xray/xray
There's too much to see in the context of this talk, to know more about all the cool xray features, watch:
PyData talk by Stefan Hoyer: https://www.youtube.com/watch?v=T5CZyNwBa9c
YouTubeVideo('T5CZyNwBa9c', width=500, height=400, start=0)
%matplotlib inline
from matplotlib import pyplot as plt
import os
import numpy as np
import pandas as pd
from mpl_toolkits.basemap import Basemap as bm
import xray; print(xray.__version__)
The file (74 Mb) can be downloaded at ftp://ftp.niwa.co.nz/incoming/fauchereaun/ersst.realtime.nc
dset = xray.open_dataset('../data/ersst.realtime.nc')
dset is a xray.Dataset, It is a dict-like container of labeled arrays (DataArray objects) with aligned dimensions. It is designed as an in-memory representation of the data model from the netCDF file format.
Image('http://xray.readthedocs.org/en/stable/_images/dataset-diagram.png', width=700)
dset
dset.dims.keys()
dset.dims
dset.attrs.keys()
dset.keys()
lat = dset['lat']
type(lat)
lat = dset['lat']
lon = dset['lon']
lons, lats = np.meshgrid(lon, lat)
dset.sel(time=('1998-1-15'), zlev=0)
dset.sel(time=slice('1998-1-15','2000-12-15'), zlev=0, lat=slice(-40,40))
m = bm(projection='cyl',llcrnrlat=-90,urcrnrlat=90,\
llcrnrlon=0,urcrnrlon=360,\
lat_ts=0,resolution='c')
def plot_field(X, lat, lon, vmin, vmax, step, cmap=plt.get_cmap('jet'), ax=False, title=False, grid=False):
if not ax:
f, ax = plt.subplots(figsize=(10, (X.shape[0] / float(X.shape[1])) * 10))
m.ax = ax
im = m.contourf(lons, lats, X, np.arange(vmin, vmax+step, step), latlon=True, cmap=cmap, extend='both', ax=ax)
m.drawcoastlines()
if grid:
m.drawmeridians(np.arange(0, 360, 60), labels=[0,0,0,1])
m.drawparallels([-40, 0, 40], labels=[1,0,0,0])
m.colorbar(im)
if title:
ax.set_title(title)
plot_field(dset.sel(time=('1998-1-15'), zlev=0)['sst'], lats, lons, 0, 30, 1, grid=True)
plot_field(dset.sel(time=('1998-1-15'), zlev=0)['anom'], lats, lons, -4, 4, 0.1, \
cmap=plt.get_cmap('RdBu_r'), grid=True)
mat = dset.sel(lon=slice(0, 10), time=('1998-1-15'), zlev=0)['sst']
Image(filename='images/split-apply-combine.png', width=500)
sst = dset[['sst']]
sst
clim = sst.groupby('time.month').mean('time')
clim[['sst']]
from calendar import month_name
f, axes = plt.subplots(nrows=4,ncols=3, figsize=(14,10))
f.subplots_adjust(hspace=0.1, wspace=0.1)
axes = axes.flatten()
for i, month in enumerate(xrange(1,13)):
ax = axes[i]
plot_field(clim['sst'][i,0,:,:].values, lats, lons, 0, 30, 1, ax=ax, title=month_name[month])
f.savefig('./images/clim_sst.png')
NOTE: If you have DAILY data, you can calculate a daily climatolgy using the dayofyear
attribute, e.g.:
clim = dset.groupby('time.dayofyear').mean('time')
seas_clim = sst.groupby('time.season').mean('time')
seas_clim
f, axes = plt.subplots(nrows=2,ncols=2, figsize=(10,5))
f.subplots_adjust(hspace=0.1, wspace=0.1)
axes = axes.flatten('F')
for i, seas in enumerate(seas_clim['season'].values):
ax = axes[i]
plot_field(seas_clim['sst'][i,0,:,:].values, lats, lons, 0, 30, 1, ax=ax, title=seas)
f.savefig('./images/seas_clim_sst.png')
def get_dpm(time):
"""
return a array of days per month corresponding to the months provided in `time`
"""
import calendar as cal
month_length = np.zeros(len(time), dtype=np.float)
for i, (month, year) in enumerate(zip(time.month, time.year)):
month_length[i] = cal.monthrange(year, month)[1]
return month_length
def season_mean(ds, calendar='standard'):
# Make a DataArray of season/year groups
year_season = xray.DataArray(ds.time.to_index().to_period(freq='Q-NOV').to_timestamp(how='E'),
coords=[ds.time], name='year_season')
# Make a DataArray with the number of days in each month, size = len(time)
month_length = xray.DataArray(get_dpm(ds.time.to_index()),
coords=[ds.time], name='month_length')
# Calculate the weights by grouping by 'time.season'
weights = month_length.groupby('time.season') / month_length.groupby('time.season').sum()
# Test that the sum of the weights for each season is 1.0
np.testing.assert_allclose(weights.groupby('time.season').sum().values, np.ones(4))
# Calculate the weighted average
return (ds * weights).groupby('time.season').sum(dim='time')
sst_seas = season_mean(sst)
sst_seas
f, axes = plt.subplots(nrows=2,ncols=2, figsize=(10,5))
f.subplots_adjust(hspace=0.1, wspace=0.1)
axes = axes.flatten('F')
for i, seas in enumerate(seas_clim['season'].values):
ax = axes[i]
plot_field(seas_clim['sst'][i,0,:,:].values, lats, lons, 0, 30, 1, ax=ax, title=seas)
f.savefig('./images/seas_clim_sst.png')
diff_seas = seas_clim - sst_seas
f, axes = plt.subplots(nrows=2,ncols=2, figsize=(10,5))
f.subplots_adjust(hspace=0.1, wspace=0.1)
axes = axes.flatten('F')
for i, seas in enumerate(seas_clim['season'].values):
ax = axes[i]
plot_field(diff_seas['sst'][i,0,:,:].values, lats, lons, -0.1, 0.1, 0.01, ax=ax, title=seas)
def demean(x):
return x - x.sel(time=slice('1981-1-15','2010-12-15')).mean('time')
#sst_anoms = dset.groupby('time.month').apply(demean)
# or (will return a xray.DataArray)
sst_anoms = dset['sst'].groupby('time.month').apply(demean)
plot_field(sst_anoms.sel(time=('1998-1-15'), zlev=0), lats, lons, -4, 4, 0.1, \
cmap=plt.get_cmap('RdBu_r'), grid=True)
sst_anoms = sst_anoms.to_dataset('sst_anoms')
sst_anoms
sst_anoms.to_netcdf('../data/ersst_anoms.nc')
!ncdump -h ../data/ersst_anoms.nc
lon = np.linspace(0, 357.5, 144, endpoint=True)
lat = np.linspace(-90,90, 73, endpoint=True)
lons, lats = np.meshgrid(lon,lat)
lev = np.array([1000,925,850])
time = pd.date_range(start='2015-1-1',end='2015-1-3')
lat
arr = np.random.randn(3,3,73,144)
The dictionnary keys are the variables contained in the Dataset.
The Dictionnary values are tuples, with first the (or the list of) dimension(s) over which the array varies, then the array itself
d = {}
d['time'] = ('time',time)
d['latitudes'] = ('latitudes',lat)
d['longitudes'] = ('longitudes', lon)
d['level'] = ('level', lev)
d['var'] = (['time','level','latitudes','longitudes'], arr)
dset = xray.Dataset(d)
adding global attributes
dset.attrs['author'] = 'nicolas.fauchereau@gmail.com'
adding variables attributes
dset.longitudes.attrs['units'] = 'degrees_east'
dset.latitudes.attrs['units'] = 'degrees_north'
dset.sel(time='2015-1-2', level=1000)
lons, lats = np.meshgrid(dset['longitudes'], dset['latitudes'])
plot_field(dset.sel(time='2015-1-2', level=1000)['var'], \
lats, lons, -4, 4, 0.1, grid=True)
dset.to_netcdf('../data/dset_from_dict.nc')
!ncdump -h ../data/dset_from_dict.nc
import string
df = pd.DataFrame(np.random.randn(365,5), \
index=pd.date_range(start='2014-1-1', periods=365), \
columns=list(string.ascii_letters[:5]))
df.head()
df_ds = xray.Dataset.from_dataframe(df)
df_ds
group = df_ds.groupby('index.month').mean('index')
group
group_df = group.to_dataframe()
group_df.reindex_axis(list(string.ascii_letters[:5]), axis=1).head()
df.groupby(df.index.month).mean().head()
url = 'http://www.esrl.noaa.gov/psd/thredds/dodsC/Datasets/interp_OLR/olr.mon.mean.nc'
olr_dset = xray.open_dataset(url)
olr_dset
olr_sub = olr_dset.sel(time='1998-1-1',lat=slice(30,-30), lon=slice(170, 300))
olr_sub
m = bm(projection='merc',llcrnrlat=-30,urcrnrlat=30,\
llcrnrlon=170,urcrnrlon=300,\
lat_ts=0,resolution='c')
lons, lats = np.meshgrid(olr_sub['lon'], olr_sub['lat'])
plot_field(olr_sub['olr'].values, lats, lons, 80, 300, 10, grid=True)
!ipython nbconvert xray.ipynb --to html