In [ ]:
%load_ext load_style
%load_style talk.css
from IPython.display import YouTubeVideo, Image

xray has been developed by scientists / engineers working at the Climate Corporation

It is an open source project and Python package that aims to bring the labeled data power of pandas to the physical sciences, by providing N-dimensional variants of the core pandas data structures, Series and DataFrame: the xray DataArray and Dataset.

the goal is to provide a pandas-like and pandas-compatible toolkit for analytics on multi-dimensional arrays, rather than the tabular data for which pandas excels. The approach adopts the Common Data Model for self-describing scientific data in widespread use in the Earth sciences (e.g., netCDF and OPeNDAP): xray.Dataset is an in-memory representation of a netCDF file.

The main advantages of using xray versus netCDF4 are:

  • intelligent selection along labelled dimensions (and also indexes)
  • groupby operations !
  • data alignment
  • IO (netcdf)
  • conversion from / to Pandas.DataFrames

To install the latest version of xray (via conda):

ᐅ conda install xray

or if you want the bleeding edge:

ᐅ pip install git+https://github.com/xray/xray

There's too much to see in the context of this talk, to know more about all the cool xray features, watch:

PyData talk by Stefan Hoyer: https://www.youtube.com/watch?v=T5CZyNwBa9c

In [ ]:
YouTubeVideo('T5CZyNwBa9c', width=500, height=400, start=0)

Some examples

In [ ]:
%matplotlib inline
from matplotlib import pyplot as plt
In [ ]:
import os
import numpy as np
import pandas as pd
from mpl_toolkits.basemap import Basemap as bm
In [ ]:
import xray; print(xray.__version__)

Open a netcdf file (ERSST Version 4)

The file (74 Mb) can be downloaded at ftp://ftp.niwa.co.nz/incoming/fauchereaun/ersst.realtime.nc

In [ ]:
dset = xray.open_dataset('../data/ersst.realtime.nc')

dset is a xray.Dataset, It is a dict-like container of labeled arrays (DataArray objects) with aligned dimensions. It is designed as an in-memory representation of the data model from the netCDF file format.

In [ ]:
Image('http://xray.readthedocs.org/en/stable/_images/dataset-diagram.png', width=700)
In [ ]:
dset
In [ ]:
dset.dims.keys()
In [ ]:
dset.dims
In [ ]:
dset.attrs.keys()
In [ ]:
dset.keys()

accessing variables

In [ ]:
lat = dset['lat']
In [ ]:
type(lat)
In [ ]:
lat = dset['lat']
lon = dset['lon']
lons, lats = np.meshgrid(lon, lat)

selecting a Dataset along dimensions

In [ ]:
dset.sel(time=('1998-1-15'), zlev=0)
In [ ]:
dset.sel(time=slice('1998-1-15','2000-12-15'), zlev=0, lat=slice(-40,40))

defines the Basemap projection

In [ ]:
m = bm(projection='cyl',llcrnrlat=-90,urcrnrlat=90,\
            llcrnrlon=0,urcrnrlon=360,\
            lat_ts=0,resolution='c')

defines a function to plot a field (must be 2D)

In [ ]:
def plot_field(X, lat, lon, vmin, vmax, step, cmap=plt.get_cmap('jet'), ax=False, title=False, grid=False):
    if not ax: 
        f, ax = plt.subplots(figsize=(10, (X.shape[0] / float(X.shape[1])) * 10))
    m.ax = ax
    im = m.contourf(lons, lats, X, np.arange(vmin, vmax+step, step), latlon=True, cmap=cmap, extend='both', ax=ax)
    m.drawcoastlines()
    if grid: 
        m.drawmeridians(np.arange(0, 360, 60), labels=[0,0,0,1])
        m.drawparallels([-40, 0, 40], labels=[1,0,0,0])
    m.colorbar(im)
    if title: 
        ax.set_title(title)

plots

In [ ]:
plot_field(dset.sel(time=('1998-1-15'), zlev=0)['sst'], lats, lons, 0, 30, 1, grid=True)
In [ ]:
plot_field(dset.sel(time=('1998-1-15'), zlev=0)['anom'], lats, lons, -4, 4, 0.1, \
           cmap=plt.get_cmap('RdBu_r'), grid=True)
In [ ]:
mat = dset.sel(lon=slice(0, 10), time=('1998-1-15'), zlev=0)['sst']

calculates a monthly climatology using the groupby machinery

In [ ]:
Image(filename='images/split-apply-combine.png', width=500)
In [ ]:
sst = dset[['sst']]
In [ ]:
sst
In [ ]:
clim = sst.groupby('time.month').mean('time')
In [ ]:
clim[['sst']]
In [ ]:
from calendar import month_name
In [ ]:
f, axes = plt.subplots(nrows=4,ncols=3, figsize=(14,10))
f.subplots_adjust(hspace=0.1, wspace=0.1)
axes = axes.flatten()
for i, month in enumerate(xrange(1,13)): 
    ax = axes[i]
    plot_field(clim['sst'][i,0,:,:].values, lats, lons, 0, 30, 1, ax=ax, title=month_name[month])
f.savefig('./images/clim_sst.png')

NOTE: If you have DAILY data, you can calculate a daily climatolgy using the dayofyear attribute, e.g.:

clim = dset.groupby('time.dayofyear').mean('time')

New in version 0.4 (RC1 at 27/02/2015): calculates a seasonal (DJF, MAM, ...) climatology

In [ ]:
seas_clim = sst.groupby('time.season').mean('time')
In [ ]:
seas_clim
In [ ]:
f, axes = plt.subplots(nrows=2,ncols=2, figsize=(10,5))
f.subplots_adjust(hspace=0.1, wspace=0.1)
axes = axes.flatten('F')
for i, seas in enumerate(seas_clim['season'].values): 
    ax = axes[i]
    plot_field(seas_clim['sst'][i,0,:,:].values, lats, lons, 0, 30, 1, ax=ax, title=seas)
f.savefig('./images/seas_clim_sst.png')

calculates seasonal averages weigthed by the number of days in each month

In [ ]:
def get_dpm(time):
    """
    return a array of days per month corresponding to the months provided in `time`
    """
    import calendar as cal
    month_length = np.zeros(len(time), dtype=np.float)

    for i, (month, year) in enumerate(zip(time.month, time.year)):
        month_length[i] = cal.monthrange(year, month)[1]
    return month_length
In [ ]:
def season_mean(ds, calendar='standard'):
    # Make a DataArray of season/year groups
    year_season = xray.DataArray(ds.time.to_index().to_period(freq='Q-NOV').to_timestamp(how='E'),
                                 coords=[ds.time], name='year_season')

    # Make a DataArray with the number of days in each month, size = len(time)
    month_length = xray.DataArray(get_dpm(ds.time.to_index()),
                                  coords=[ds.time], name='month_length')
    # Calculate the weights by grouping by 'time.season'
    weights = month_length.groupby('time.season') / month_length.groupby('time.season').sum()

    # Test that the sum of the weights for each season is 1.0
    np.testing.assert_allclose(weights.groupby('time.season').sum().values, np.ones(4))

    # Calculate the weighted average
    return (ds * weights).groupby('time.season').sum(dim='time')
In [ ]:
sst_seas = season_mean(sst)
In [ ]:
sst_seas
In [ ]:
f, axes = plt.subplots(nrows=2,ncols=2, figsize=(10,5))
f.subplots_adjust(hspace=0.1, wspace=0.1)
axes = axes.flatten('F')
for i, seas in enumerate(seas_clim['season'].values): 
    ax = axes[i]
    plot_field(seas_clim['sst'][i,0,:,:].values, lats, lons, 0, 30, 1, ax=ax, title=seas)
f.savefig('./images/seas_clim_sst.png')

difference between non-weigthed and weighted seasonal climatologies

In [ ]:
diff_seas = seas_clim - sst_seas
In [ ]:
f, axes = plt.subplots(nrows=2,ncols=2, figsize=(10,5))
f.subplots_adjust(hspace=0.1, wspace=0.1)
axes = axes.flatten('F')
for i, seas in enumerate(seas_clim['season'].values): 
    ax = axes[i]
    plot_field(diff_seas['sst'][i,0,:,:].values, lats, lons, -0.1, 0.1, 0.01, ax=ax, title=seas)

calculates anomalies with respect to a specific climatological normal

1. defines the function

In [ ]:
def demean(x): 
    return x - x.sel(time=slice('1981-1-15','2010-12-15')).mean('time')

2. apply the function to the groupby object

In [ ]:
#sst_anoms = dset.groupby('time.month').apply(demean)

# or (will return a xray.DataArray)

sst_anoms = dset['sst'].groupby('time.month').apply(demean) 

should be very similar to the original anomalies

In [ ]:
plot_field(sst_anoms.sel(time=('1998-1-15'), zlev=0), lats, lons, -4, 4, 0.1, \
           cmap=plt.get_cmap('RdBu_r'), grid=True)

dumps a xray.Dataset object into a netcdf (Version 4) file (note: does not work for a xray.DataArray object)

In [ ]:
sst_anoms = sst_anoms.to_dataset('sst_anoms')
In [ ]:
sst_anoms
In [ ]:
sst_anoms.to_netcdf('../data/ersst_anoms.nc')
In [ ]:
!ncdump -h ../data/ersst_anoms.nc

Creates a xray dataset object from numpy arrays

In [ ]:
lon = np.linspace(0, 357.5, 144, endpoint=True)
lat = np.linspace(-90,90, 73, endpoint=True)

lons, lats = np.meshgrid(lon,lat)

lev = np.array([1000,925,850])
time = pd.date_range(start='2015-1-1',end='2015-1-3')
In [ ]:
lat
In [ ]:
arr = np.random.randn(3,3,73,144)

The dictionnary keys are the variables contained in the Dataset.

The Dictionnary values are tuples, with first the (or the list of) dimension(s) over which the array varies, then the array itself

In [ ]:
d = {}
d['time'] = ('time',time)
d['latitudes'] = ('latitudes',lat)
d['longitudes'] = ('longitudes', lon)
d['level'] = ('level', lev)
d['var'] = (['time','level','latitudes','longitudes'], arr)
In [ ]:
dset = xray.Dataset(d)

adding global attributes

In [ ]:
dset.attrs['author'] = 'nicolas.fauchereau@gmail.com'

adding variables attributes

In [ ]:
dset.longitudes.attrs['units'] = 'degrees_east'
dset.latitudes.attrs['units'] = 'degrees_north'
In [ ]:
dset.sel(time='2015-1-2', level=1000)
In [ ]:
lons, lats = np.meshgrid(dset['longitudes'], dset['latitudes'])
In [ ]:
plot_field(dset.sel(time='2015-1-2', level=1000)['var'], \
           lats, lons, -4, 4, 0.1, grid=True)
In [ ]:
dset.to_netcdf('../data/dset_from_dict.nc')
In [ ]:
!ncdump -h ../data/dset_from_dict.nc

Creates a xray dataset object from a Pandas DataFrame

In [ ]:
import string
df = pd.DataFrame(np.random.randn(365,5), \
                  index=pd.date_range(start='2014-1-1', periods=365), \
                  columns=list(string.ascii_letters[:5]))
In [ ]:
df.head()

from DataFrame

In [ ]:
df_ds = xray.Dataset.from_dataframe(df)
In [ ]:
df_ds
In [ ]:
group = df_ds.groupby('index.month').mean('index')
In [ ]:
group

converts TO a Pandas.DataFrame

In [ ]:
group_df = group.to_dataframe()
In [ ]:
group_df.reindex_axis(list(string.ascii_letters[:5]), axis=1).head()
In [ ]:
df.groupby(df.index.month).mean().head()

Opening a file throught the network with openDAP

In [ ]:
url = 'http://www.esrl.noaa.gov/psd/thredds/dodsC/Datasets/interp_OLR/olr.mon.mean.nc'
In [ ]:
olr_dset = xray.open_dataset(url)
In [ ]:
olr_dset

the dataset is not loaded in memory until one selects something

In [ ]:
olr_sub = olr_dset.sel(time='1998-1-1',lat=slice(30,-30), lon=slice(170, 300))
In [ ]:
olr_sub
In [ ]:
m = bm(projection='merc',llcrnrlat=-30,urcrnrlat=30,\
            llcrnrlon=170,urcrnrlon=300,\
            lat_ts=0,resolution='c')
In [ ]:
lons, lats = np.meshgrid(olr_sub['lon'], olr_sub['lat'])
In [ ]:
plot_field(olr_sub['olr'].values, lats, lons, 80, 300, 10, grid=True)
In [ ]:
!ipython nbconvert xray.ipynb --to html