from __future__ import print_function
from future.utils import string_types
import os
import re
import glob, copy
from collections import OrderedDict as odict
from functools import partial
import warnings
import numpy as np
import netCDF4 as nc
import dimarray as da
#from geo.data.index import to_slice, _slice3D
from dimarray.tools import format_doc, isscalar
from dimarray.dataset import Dataset, concatenate_ds, stack_ds
from dimarray.core import DimArray, Axis, Axes
from dimarray.config import get_option
from dimarray.core.bases import AbstractDimArray, AbstractDataset, AbstractAxis, GetSetDelAttrMixin, AbstractAxes
# from dimarray.core.metadata import _repr_metadata
from dimarray.prettyprinting import repr_axis, repr_axes, repr_dimarray, repr_dataset, repr_attrs
from .conventions import encode_cf_datetime, decode_cf_datetime, encode_cf_timedelta
__all__ = ['read_nc','summary_nc', 'write_nc']
#
# Global variables
#
FORMAT = get_option('io.nc.format') # for the doc
#
# Helper functions
#
def maybe_encode_values(values, format=None):
""" strings are given "object" type in Axis object
==> assume all objects are actually strings
NOTE: this will fail for other object-typed axes such as tuples
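
    Example (a minimal sketch; execution is skipped since exact dtypes vary
    with the installed numpy/netCDF4 versions):

    >>> encoded, dtype, cf_attrs = maybe_encode_values(['a','b']) # doctest: +SKIP
    >>> values = np.array(['2000-01-01'], dtype='datetime64[D]') # doctest: +SKIP
    >>> encoded, dtype, cf_attrs = maybe_encode_values(values) # doctest: +SKIP
    >>> sorted(cf_attrs) # CF time metadata to attach to the variable # doctest: +SKIP
    ['calendar', 'units']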
"""
# if dtype is np.dtype('O'):
values = np.asarray(values)
dtype = values.dtype
cf_attrs = {}
if dtype.kind in ('S','O'):
encoded = np.asarray(values, dtype=object)
dtype = str
elif dtype.kind == 'M': # 'datetime64'
# values = np.asarray(np.datetime_as_string(values), dtype=object)
# dtype = str
encoded, units, calendar = encode_cf_datetime(values)
dtype = encoded.dtype
cf_attrs['calendar'] = calendar
cf_attrs['units'] = units
elif dtype.kind == 'm': # 'timedelta64'
# elif np.issubdtype(dtype, np.timedelta64):
encoded, units = encode_cf_timedelta(values)
cf_attrs['units'] = units
dtype = encoded.dtype
else:
encoded = values
    # NETCDF3 formats do not support int64, so fall back to int32.
    # Note this also applies to NETCDF3_64BIT, which refers to 64-bit
    # file offsets, not to 64-bit integer support.
    if format in ("NETCDF3_CLASSIC", "NETCDF3_64BIT") and dtype == np.dtype("int64"):
        warnings.warn("convert int64 into int32 for writing to NETCDF3 format", RuntimeWarning)
        dtype = "int32"
return encoded, dtype, cf_attrs
# quick and dirty conversion from string
TIME_UNITS = ['years','months','days','hours','minutes', 'seconds']
def hastimeunits(ncvar):
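    """Return True if a netCDF variable has CF-like time units,
    e.g. "days since 2000-01-01" (heuristic check on the 'units' attribute).
    """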
regexpr = "{} +since".format("|".join(TIME_UNITS))
return hasattr(ncvar, 'units') and re.match(regexpr, ncvar.units)
def istimevariable(ncvar):
"""Return True if a netCDF4 variable represents time
"""
# return hastimeunits(ncvar) or \
    return (
        ncvar.size > 0 and isinstance(ncvar[0], string_types)
        and re.match(r'\d\d(\d\d)?-\d\d-\d\d', ncvar[0]))
class NCTimeVariableWrapper(object):
"""Subclass of netCDF4 variable that performs conversions
to datetime64 object when indexing. Should only be used for
object or string types
"""
def __init__(self, ncvar):
self.ncvar = ncvar
def __getitem__(self, idx):
arr = self.ncvar[idx]
if arr.dtype.kind in ('S','O'):
try:
arr = np.asarray(arr, dtype='datetime64')
except Exception as error:
warnings.warn("Failed to convert string time to datetime64\n"+str(error), RuntimeWarning)
else:
try:
arr = decode_cf_datetime(arr, getattr(self.ncvar, 'units',None), getattr(self.ncvar, 'calendar', None))
except Exception as error:
warnings.warn("Failed to convert {} to datetime64\n".format(arr.dtype)+str(error), RuntimeWarning)
return arr
def __getattr__(self, att):
return getattr(self.ncvar, att)
#
# Define an open_nc function that returns an on-disk Dataset object, more similar to
# netCDF4's Dataset.
#
#
# Specific to NetCDF I/O
#
class NetCDFOnDisk(object):
" DimArray wrapper for netCDF4 Variables and Datasets "
@property
def nc(self):
return self._ds
@property
def _obj(self):
        raise NotImplementedError("needs to be subclassed")
@property
def attrs(self):
return AttrsOnDisk(self._obj)
def close(self):
return self._ds.close()
    def __exit__(self, exception_type, exception_value, traceback):
self.close()
def __enter__(self):
return self
class DatasetOnDisk(GetSetDelAttrMixin, NetCDFOnDisk, AbstractDataset):
""" Dataset on Disk
.. versionadded :: 0.2
See open_nc for examples of use.
"""
@format_doc(format=FORMAT)
def __init__(self, f, mode="r", clobber=True, diskless=False, persist=False, format=FORMAT, **kwargs):
"""
Parameters
----------
f : file name or netCDF4.Dataset instance
mode : str, optional
        access mode. These include "r" (read, the default), "w" (write)
        and "a" (append).
See netCDF4.Dataset for full information.
clobber : `bool`, optional
if True (default), opening a file with mode='w'
        will clobber an existing file with the same name. If False, an
exception will be raised if a file with the same name already exists.
diskless : see netCDF4.Dataset help
persist : see netCDF4.Dataset help
format : `str`
netCDF file format. Default is '{format}' (only accounted for during file creation)
**kwargs : key-word arguments, used as default argument when creating a variable
Notes
-----
See the netCDF4-python module documentation for more information about the use
of keyword arguments to DatasetOnDisk
"""
if isinstance(f, nc.Dataset):
self._ds = f
else:
try:
self._ds = nc.Dataset(f, mode=mode, clobber=clobber, diskless=diskless, persist=persist, format=format)
except UserWarning as error:
print(error)
except Exception as error: # indicate file name when issuing error
raise IOError("{}\n=> failed to open {} (mode={}, clobber={})".format(str(error), f, mode, clobber)) # easier to handle
# assert self._kwargs is not None
self._kwargs = kwargs
@property
def _obj(self):
return self._ds
def read(self, names=None, indices=None, axis=0, indexing=None, tol=None, keepdims=False,
verbose=False, # back-compatibility
):
""" Read values from disk
Parameters
----------
names : list of variables to read, optional
indices : int or list or slice (single-dimensional indices)
or a tuple of those (multi-dimensional)
or `dict` of { axis name : axis indices }
Indices refer to Dataset axes. Any item that does not possess
one of the dimensions will not be indexed along that dimension.
For example, scalar items will be left unchanged whatever indices
are provided.
axis : None or int or str, optional
if specified and indices is a slice, scalar or an array, assumes
indexing is along this axis.
indexing : {'label', 'position'}, optional
Indexing mode.
- "label": indexing on axis labels (default)
- "position": use numpy-like position index
Default value can be changed in dimarray.rcParams['indexing.by']
tol : float, optional
tolerance when looking for numerical values, e.g. to use nearest
neighbor search, default `None`.
keepdims : bool, optional
keep singleton dimensions (default False)
Returns
-------
Dataset
See Also
--------
open_nc : examples of use
DimArrayOnDisk.read, DatasetOnDisk.write, DimArray.take
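
        Examples
        --------
        A minimal sketch (the file name and variable name are hypothetical):

        >>> with da.open_nc('data.nc') as ds: # doctest: +SKIP
        ...     full = ds.read() # whole dataset
        ...     sub = ds.read(['temp'], indices={'time': slice(2000, 2010)})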
"""
if verbose:
# print "Read ",self.filename
pass
        # automatically read all variables (dimension axes are handled separately)
if names is None:
dims = self.dims
names = self.keys()
elif isinstance(names, string_types):
return self[names].read(indices=indices, axis=axis, indexing=indexing, tol=tol, keepdims=keepdims)
else:
dims = []
# else:
# raise TypeError("Expected list or str for 'names=', got {}".format(names))
tuple_indices = self._get_indices(indices, axis=axis, tol=tol, keepdims=keepdims, indexing=indexing)
dict_indices = {dim:tuple_indices[i] for i, dim in enumerate(self.dims)}
data = Dataset()
# first load dimensions
for dim in dims:
data.axes.append(self.axes[dim][dict_indices[dim]])
# then normal variables
for nm in names:
data[nm] = self[nm].read(indices={dim:dict_indices[dim] for dim in self[nm].dims}, indexing='position')
data.attrs.update(self.attrs) # dataset's metadata
# reorder the axes in the dataset to match input
data.axes = Axes(data.axes[dim] for dim in self.dims if dim in data.dims)
return data
# so that loc, iloc, sel, isel, nloc work:
_getitem = read
def __getitem__(self, name):
if name in self.dims:
return DimArrayOnDisk(self._ds, name)
# raise KeyError("Use 'axes' property to access dimension variables: {}".format(name))
# return AxisOnDisk(self._ds, name)
if name not in self.keys():
raise KeyError("Variable not found in the dataset: {}.\nExisting variables: {}".format(name, self.keys()))
return DimArrayOnDisk(self._ds, name)
# def write(self, name, dataset, zlib=False, **kwargs):
def write(self, name, dima, **kwargs):
""" Write a variable to a netCDF4 dataset.
Parameters
----------
name : variable name
dima : DimArray instance
zlib : `bool`
Enable zlib compression if True. Default is False (no compression).
complevel : `int`
integer between 1 and 9 describing the level of compression desired. Ignored if zlib=False.
**kwargs : key-word arguments
Any additional keyword arguments accepted by `netCDF4.Dataset.createVariable`
Notes
-----
See the netCDF4-python module documentation for more information about the use
of keyword arguments to write_nc.
See also
--------
DimArrayOnDisk.write
Examples
--------
>>> import numpy as np
>>> import dimarray as da
Create a DimArray (in memory), with metadata
>>> dima = da.DimArray([[1,2,3],[4,5,6]], axes=[('time',[2000,2045.5]),('scenario',['a','b','c'])])
>>> dima.units = 'myunits' # metadata
>>> dima.axes['time'].units = 'metadata-dim-in-memory'
Write it to disk, and add some additional metadata
>>> ds = da.open_nc('/tmp/test.nc', mode='w')
>>> ds['myvar'] = dima
>>> ds['myvar'].bla = 'bla'
>>> ds['myvar'].axes['time'].yo = 'metadata-dim-on-disk'
>>> ds.axes['scenario'].ya = 'metadata-var-on-disk'
>>> ds.yi = 'metadata-dataset-on-disk'
>>> ds.close()
        Check the result with the ncdump utility from a terminal (ncdump needs to be installed for the test below to work)
:> ncdump -h /tmp/test.nc
netcdf test {
dimensions:
time = 2 ;
scenario = 3 ;
variables:
double time(time) ;
time:units = "metadata-dim-in-memory" ;
time:yo = "metadata-dim-on-disk" ;
string scenario(scenario) ;
scenario:ya = "metadata-var-on-disk" ;
int64 myvar(time, scenario) ;
myvar:units = "myunits" ;
myvar:bla = "bla" ;
// global attributes:
:yi = "metadata-dataset-on-disk" ;
}
"""
# if not isinstance(obj, da.DimArray):
# raise TypeError("Can only write Dataset, use `ds[name] = dima` to write a DimArray")
_, nctype, cf_attrs = maybe_encode_values(dima, format=self._ds.file_format)
name = name or getattr(self, "name", None)
if not name:
raise ValueError("Need to provide variable name")
kw = self._kwargs.copy()
kw.update(kwargs)
if name not in self._ds.variables.keys():
if np.isscalar(dima):
dima = da.DimArray(dima)
if not isinstance(dima, DimArray):
raise TypeError("Expected DimArray, got {}".format(type(dima)))
if dima.ndim > 0:
# create Dimension and associated variable
for ax in dima.axes:
if ax.name not in self.dims:
self.axes.append(ax, **kw)
            # map a _FillValue or missing_value attribute to the fill_value creation argument
if 'fill_value' not in kw and hasattr(dima, '_FillValue'):
kw['fill_value'] = dima._FillValue
elif 'fill_value' not in kw and hasattr(dima, 'missing_value'):
kw['fill_value'] = dima.missing_value
        # if the variable is already present (e.g. present both as a variable
        # AND a dimension in the dimarray Dataset), just skip creation
        if name in self._ds.variables.keys():
            pass
            # warnings.warn("dimension variable present both as variable and Axis in DimArray", RuntimeWarning)
else:
self._ds.createVariable(name, nctype, dima.dims, **kw)
dimaondisk = DimArrayOnDisk(self._ds, name)
dimaondisk[()] = dima
dimaondisk.attrs.update(cf_attrs) # calendar?
__setitem__ = write
def __delitem__(self, name):
" will raise an Exception if on-disk "
del self._ds.variables[name]
def keys(self):
" all variables except for dimensions"
return [nm for nm in self._ds.variables.keys() if nm not in self.dims]
def values(self):
return [self[k] for k in self.keys()]
def __len__(self):
return len(self.keys())
@property
def dims(self):
return tuple(self._ds.dimensions.keys())
@dims.setter
def dims(self, newdims):
self._set_dims(newdims)
@property
def axes(self):
return AxesOnDisk(self._ds, self.dims, **self._kwargs)
@axes.setter
def axes(self, newaxes):
for ax in newaxes:
if ax.name in self.dims:
self.axes[ax.name] = ax
else:
self.axes.append(ax)
_repr = repr_dataset
def __iter__(self):
for k in self.keys():
yield k
class NetCDFVariable(NetCDFOnDisk):
@property
def _obj(self):
return self.values
@property
def values(self):
        return self._ds.variables[self._name] # the underlying netCDF4 variable, indexed like in netCDF4
@values.setter
def values(self, values):
self[()] = values
    def __array__(self):
        # special method (not a property) so that np.asarray() returns a numpy array
        return np.asarray(self.values[()])
class DimArrayOnDisk(GetSetDelAttrMixin, NetCDFVariable, AbstractDimArray):
_constructor = DimArray
_broadcast = False
def __init__(self, ds, name, _indexing=None):
self._indexing = _indexing or get_option('indexing.by')
self._ds = ds
self._name = name
@property
def name(self):
return self._name
@name.setter
def name(self, newname):
        self._ds.renameVariable(self._name, newname)
self._name = newname
@property
def dims(self):
return tuple(self._obj.dimensions)
@property
def axes(self):
return AxesOnDisk(self._ds, self.dims)
def write(self, indices, values, axis=0, indexing=None, tol=None):
""" Write numpy array or DimArray to netCDF file (The
variable must have been created previously)
Parameters
----------
indices : int or list or slice (single-dimensional indices)
or a tuple of those (multi-dimensional)
or `dict` of { axis name : axis values }
values : np.ndarray or DimArray
axis : None or int or str, optional
if specified and indices is a slice, scalar or an array, assumes
indexing is along this axis.
indexing : {'label', 'position'}, optional
Indexing mode.
- "label": indexing on axis labels (default)
- "position": use numpy-like position index
Default value can be changed in dimarray.rcParams['indexing.by']
tol : float, optional
tolerance when looking for numerical values, e.g. to use nearest
neighbor search, default `None`.
See Also
--------
DimArray.put, DatasetOnDisk.write, DimArrayOnDisk.read
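
        Examples
        --------
        A minimal sketch (file and variable names are hypothetical):

        >>> ds = da.open_nc('data.nc', mode='a') # doctest: +SKIP
        >>> ds['temp'][2000] = values_for_2000 # assign along the first axis # doctest: +SKIP
        >>> ds.close() # doctest: +SKIP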
"""
        # the variable and dimensions must already have been created (see DatasetOnDisk.write)
        ds = self._ds
dima = values # internal convention: values is a numpy array
values, nctype, cf_attrs = maybe_encode_values(values, format=self._ds.file_format)
assert self._name in ds.variables.keys(), "variable does not exist, should have been created earlier!"
# add attributes
if hasattr(dima,'attrs'):
self.attrs.update(dima.attrs)
self.attrs.update(cf_attrs) # calendar?
# special case: index == slice(None) and self.ndim == 0
# This would fail with numpy, but not with netCDF4
if type(indices) is slice and indices == slice(None) and self.ndim == 0:
indices = ()
indices = self._get_indices(indices,axis=axis, indexing=indexing, tol=tol)
# Perform additional checks on axes if the Data to assign is a DimArray
if isinstance(dima, DimArray):
for i, ax in enumerate(self.axes):
idx = indices[i]
axis = dima.axes[ax.name]
# write unlimited dimensions
if self._ds.dimensions[ax.name].isunlimited():
self.axes[ax.name][idx] = axis
else:
# dimension variable already written, simple check
ondisk = self.axes[ax.name][idx if not (np.isscalar(idx) or np.ndim(idx)==0) else [idx]].values
inmemory = axis.values
# inmemory, _ = maybe_encode_values(axis.values)
if not np.all(ondisk == inmemory):
# assert np.all(ondisk == inmemory)
warnings.warn("axes values differ in the netCDF file, try using a numpy array instead, or re-index\
the dimarray prior to assigning to netCDF", RuntimeWarning)
# do all the indexing and assignment via IndexedArray class
# ==> it will set the values via _setvalues_ortho below
# super(DimArrayOnDisk)._setitem(self, indices, values, **kwargs)
AbstractDimArray._setitem(self, indices, values, indexing='position')
def read(self, indices=None, axis=0, indexing=None, tol=None, keepdims=False):
""" Read values from disk
Parameters
----------
indices : int or list or slice (single-dimensional indices)
or a tuple of those (multi-dimensional)
or `dict` of { axis name : axis values }
axis : None or int or str, optional
if specified and indices is a slice, scalar or an array, assumes
indexing is along this axis.
indexing : {'label', 'position'}, optional
Indexing mode.
- "label": indexing on axis labels (default)
- "position": use numpy-like position index
Default value can be changed in dimarray.rcParams['indexing.by']
tol : float, optional
tolerance when looking for numerical values, e.g. to use nearest
neighbor search, default `None`.
keepdims : bool, optional
keep singleton dimensions (default False)
Returns
-------
DimArray instance or scalar
See Also
--------
DimArray.take, DatasetOnDisk.read, DimArrayOnDisk.write
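
        Examples
        --------
        A minimal sketch (file and variable names are hypothetical):

        >>> with da.open_nc('data.nc') as ds: # doctest: +SKIP
        ...     dima = ds['temp'].read(indices={'time': slice(2000, 2010)})
        ...     last = ds['temp'].read(-1, axis='time', indexing='position')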
"""
if type(indices) is slice and indices == slice(None) and self.ndim == 0:
indices = ()
return AbstractDimArray._getitem(self, indices=indices, axis=axis,
indexing=indexing, tol=tol, keepdims=keepdims)
__setitem__ = write
__getitem__ = read
# so that loc, iloc, sel, isel, nloc work:
_getitem = read
_setitem = write
def _repr(self, **kwargs):
assert 'lazy' not in kwargs, "lazy parameter cannot be provided, it is always True"
return repr_dimarray(self, lazy=True, **kwargs)
def _setvalues_ortho(self, idx_tuple, values, cast=False):
if cast is True:
warnings.warn("`cast` parameter is ignored", DeprecationWarning)
# values = maybe_encode_values(values)
values, nctype, cf_attrs = maybe_encode_values(values)
self.values[idx_tuple] = values
self.attrs.update(cf_attrs)
def _getvalues_ortho(self, idx_tuple):
res = self.values[idx_tuple]
        # scalars become arrays with netCDF4:
        # convert to 0-d numpy arrays for consistency with axes
        if self.ndim == 0:
            try:
                res[0] + 1 # problem arises only for numerical types
                res = np.array(res[0])
            except Exception:
                res = np.array(res) # str and unicode
return res
def _getaxes_ortho(self, idx_tuple):
" idx: tuple of position indices of length = ndim (orthogonal indexing)"
axes = []
for i, ix in enumerate(idx_tuple):
ax = self.axes[i][ix]
if np.ndim(ax) != 0: # do not include scalar axes
axes.append(ax)
return axes
class AxisOnDisk(GetSetDelAttrMixin, NetCDFVariable, AbstractAxis):
def __init__(self, ds, name):
self._ds = ds
if not isinstance(name, string_types):
raise TypeError("only string names allowed")
self._name = name
def _isdefined(self):
return self._name in self._ds.variables
def _ncvar(self):
return self._ds.variables[self._name]
def _ncdim(self):
return self._ds.dimensions[self._name]
def _istime(self):
return self._isdefined() and istimevariable(self._ncvar())
@property
def values(self):
if self._isdefined():
ncvar = self._ncvar()
if self._istime():
values = NCTimeVariableWrapper(ncvar)
else:
values = ncvar
else:
# default, dummy dimension axis
msg = "'{}' dimension not found, define integer range".format(self._name)
warnings.warn(msg, RuntimeWarning)
values = np.arange(len(self._ncdim()))
return values
@property
def name(self):
return self._name
@name.setter
def name(self, new):
self._ds.renameDimension(self._name, new)
if self._name in self._ds.variables.keys():
self._ds.renameVariable(self._name, new)
self._name = new
@property
def attrs(self):
if self._istime():
return AttrsOnDisk(self._obj, exclude=['calendar','units'])
else:
return AttrsOnDisk(self._obj)
def __setitem__(self, indices, ax):
ds = self._ds
name = self._name
assert getattr(ax, 'name', name) == name, "inconsistent axis name"
values, nctype, cf_attrs = maybe_encode_values(ax, format=self._ds.file_format)
# assign value to variable (finer-grained control in AxesOnDisk.append)
if name not in self._ds.variables.keys():
ds.createVariable(name, nctype, name)
if np.isscalar(indices):
assert values.size == 1
            try:
                values = values.item() # np.asscalar was removed in recent numpy versions
            except AttributeError:
                values = values[()]
ds.variables[name][indices] = values
# add attributes
attrs = getattr(ax, 'attrs', {})
try:
self.attrs.update(attrs)
except Exception as error:
warnings.warn(str(error), RuntimeWarning)
self.attrs.update(cf_attrs)
def __getitem__(self, indices):
values = self.values[indices]
if isinstance(values, np.ma.MaskedArray):
values = values.filled(0)
# do not produce an Axis object
if np.isscalar(values):
return values
        elif np.ndim(values) == 0:
            return np.asarray(values).item() # np.asscalar was removed in recent numpy versions
ax = Axis(values, self._name)
# add metadata
if self._name in self._ds.variables.keys():
ax.attrs.update(self.attrs)
return ax
def __len__(self):
return len(self._ds.dimensions[self._name])
@property
def size(self):
return len(self)
def _bounds(self):
size = len(self) # does not work with unlimited dimensions?
if self.size == 0:
first, last = None, None
elif self._name in self._ds.variables.keys():
first, last = self._ds.variables[self._name][0], self._ds.variables[self._name][size-1]
else:
first, last = 0, size-1
return first, last
@property
def dtype(self):
if self._name in self._ds.variables.keys():
return np.dtype(self._ds.variables[self._name].dtype)
else:
return np.dtype('i')
class AxesOnDisk(AbstractAxes):
def __init__(self, ds, dims, **kwargs):
self._ds = ds
self._dims = dims
self._kwargs = kwargs # for variable creation
@property
def dims(self):
return self._dims
def __len__(self):
return len(self._dims)
def __getitem__(self, dim):
if not isinstance(dim, string_types):
dim = self.dims[dim]
elif dim not in self.dims:
msg = "{} not found in dimensions (dims={}).".format(dim, self.dims)
print("""An error ocurred? A new dimension can be created via `axes.append(axis)` syntax,
where axis can be a string (unlimited dimension), an Axis instance,
or a tuple (name, values). See help on `axes.append` for more information.
Low-level netCDF4 function is also available as `ds.nc.createDimension`
and `ds.nc.createVariable`""")
raise KeyError(msg)
return AxisOnDisk(self._ds, dim)
def __setitem__(self, dim, ax):
" modify existing axis and possibly create new associated variable "
if not isinstance(dim, string_types):
dim = self.dims[dim]
if dim not in self._ds.dimensions.keys():
raise ValueError("{} dimension does not exist. Use axes.append() to create a new axis".format(dim))
values, nctype, cf_attrs = maybe_encode_values(ax, format=self._ds.file_format)
        if dim not in self._ds.variables.keys():
            self._ds.createVariable(dim, nctype, dim, **self._kwargs)
        self[dim][:] = ax
        self[dim].attrs.update(cf_attrs)
def append(self, ax, size=None, **kwargs):
""" create new dimension
Parameters
----------
ax : str or Axis or (name, values) tuple
if str, simply create a dimension without writing the axis values
size : int or None, optional
if None, an unlimited dimension is created
if ax is provided as an array-like or Axis, size is taken
from the axis values by default (so size does not need to
be provided)
**kwargs : passed to createVariable (compression parameters)
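
        Examples
        --------
        A minimal sketch (dimension names are hypothetical):

        >>> ds.axes.append('time') # unlimited dimension # doctest: +SKIP
        >>> ds.axes.append(('station', [1, 2, 3])) # fixed-size, with values # doctest: +SKIP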
"""
if isinstance(ax, string_types):
self._ds.createDimension(ax, size)
self._dims += (ax,)
else:
            try:
                ax = Axis.as_axis(ax)
            except Exception:
                raise TypeError("can only append Axis instances or (name, values) tuples, "
                                "or provide a str to create an unlimited dimension")
# elif isinstance(ax, Axis):
size = size or ax.size
self._ds.createDimension(ax.name, size)
self._dims += (ax.name,)
self._kwargs.update(kwargs) # createVariable parameters
self[ax.name] = ax # assign values and attributes
def __iter__(self):
for dim in self.dims:
yield AxisOnDisk(self._ds, dim)
_repr = repr_axes
class AttrsOnDisk(object):
""" represent netCDF Dataset or Variable Attribute
"""
def __init__(self, obj, exclude=None):
self.obj = obj
self.exclude = exclude
    def __setitem__(self, name, value):
        if name == "_FillValue":
            return # _FillValue can only be set at variable creation time, skip silently
try:
self.obj.setncattr(name, value)
except TypeError as error:
warnings.warn(str(error), RuntimeWarning)
if type(value) is bool:
warnings.warn("convert boolean to integer", RuntimeWarning)
self.obj.setncattr(name, int(value))
def __getitem__(self, name):
return self.obj.getncattr(name)
def __delitem__(self, name):
return self.obj.delncattr(name)
def update(self, attrs):
for k in attrs.keys():
self[k] = attrs[k]
def keys(self):
keys = getattr(self.obj,'ncattrs', lambda :[])()
if self.exclude is not None:
keys = [k for k in keys if k not in self.exclude]
return keys
    def values(self):
        # use self.keys() so that excluded attributes stay consistent with keys()
        return [self.obj.getncattr(k) for k in self.keys()]
def todict(self):
return odict(zip(self.keys(), self.values()))
def __iter__(self):
for k in self.keys():
yield k
def __len__(self):
return len(self.keys())
def __repr__(self):
return repr_attrs(self)
###################################################
# Wrappers
###################################################
def open_nc(file_name, *args, **kwargs):
""" open a netCDF file a la netCDF4, for interactive access to its properties
Parameters
----------
file_name : netCDF file name
*args, **kwargs : passed to netCDF4.Dataset
Returns
-------
dimarray.io.nc.DatasetOnDisk class (see help on this class for more info)
Examples
--------
>>> ncfile = da.get_ncfile('greenland_velocity.nc')
>>> ds = da.open_nc(ncfile)
    Informative display, similar to an in-memory Dataset
>>> ds # doctest: +SKIP
DatasetOnDisk of 6 variables (NETCDF4)
0 / y1 (113): -3400000.0 to -600000.0
1 / x1 (61): -800000.0 to 700000.0
surfvelmag: (u'y1', u'x1')
lat: (u'y1', u'x1')
lon: (u'y1', u'x1')
surfvely: (u'y1', u'x1')
surfvelx: (u'y1', u'x1')
mapping: nan
    Load variables with the [:] syntax, like in the netCDF4 package
>>> ds['surfvelmag'][:] # load one variable
dimarray: 6893 non-null elements (0 null)
0 / y1 (113): -3400000.0 to -600000.0
1 / x1 (61): -800000.0 to 700000.0
array(...)
Indexing is similar to DimArray, and includes the sel, isel methods
>>> ds['surfvelmag'].ix[:10, -1] # load first 10 y1 values, and last x1 value
dimarray: 10 non-null elements (0 null)
0 / y1 (10): -3400000.0 to -3175000.0
array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)
>>> ds['surfvelmag'].sel(x1=700000, y1=-3400000)
dimarray: 1 non-null elements (0 null)
array(0., dtype=float32)
>>> ds['surfvelmag'].isel(x1=-1, y1=0)
dimarray: 1 non-null elements (0 null)
array(0., dtype=float32)
Need to close the Dataset at the end
>>> ds.close() # close
Also usable as context manager
>>> with da.open_nc(ncfile) as ds:
... dataset = ds.read() # load full data set, same as da.read_nc(ncfile)
>>> dataset
Dataset of 6 variables
0 / y1 (113): -3400000.0 to -600000.0
1 / x1 (61): -800000.0 to 700000.0
surfvelmag: (u'y1', u'x1')
lat: (u'y1', u'x1')
lon: (u'y1', u'x1')
surfvely: (u'y1', u'x1')
surfvelx: (u'y1', u'x1')
mapping: nan
"""
return DatasetOnDisk(file_name, *args, **kwargs)
def read_nc(f, names=None, *args, **kwargs):
""" Wrapper around DatasetOnDisk.read
    Read one or several variables from one or several netCDF files
Parameters
----------
    f : str or netCDF handle
        netCDF file to read from, or a glob pattern matching several files
names : None or list or str, optional
variable name(s) to read
default is None
indices : int or list or slice (single-dimensional indices)
or a tuple of those (multi-dimensional)
or `dict` of { axis name : axis indices }
Indices refer to Dataset axes. Any item that does not possess
one of the dimensions will not be indexed along that dimension.
For example, scalar items will be left unchanged whatever indices
are provided.
indexing : {'label', 'position'}, optional
Indexing mode.
- "label": indexing on axis labels (default)
- "position": use numpy-like position index
Default value can be changed in dimarray.rcParams['indexing.by']
tol : float, optional
tolerance when looking for numerical values, e.g. to use nearest
neighbor search, default `None`.
keepdims : bool, optional
keep singleton dimensions (default False)
    axis : str, optional
        When reading multiple files, axis along which to join
        the dimarrays or datasets. If the axis already exists,
        the resulting arrays will be concatenated, otherwise
        they will be stacked along a new axis (in the sense
        of the numpy functions `concatenate` and `stack`)
keys : sequence, optional
When reading multiple files, keys for the join axis. If the
axis already exists in the dataset, the concatenated dataset/dimarray
will be re-indexed along the provided key, otherwise the keys
will be used to create a new axis for stacking. In the latter case,
keys' length needs to exactly match the number of input files, and
if not provided, file names will be taken instead. Note you may
manually rename the axes later, or use the `set_axis` method.
align : bool, optional
When reading multiple files, passed to `stack` (new axis) or
`concatenate` (existing axis) to reindex all arrays onto common axes.
        (in `concatenate` mode, the concatenation axis itself is *not* re-indexed,
        only the secondary axes).
        Defaults to False.
    **kwargs : optional keyword arguments passed to `align`, if align is True.
        This includes: `sort` (False by default) and `join` ('outer' by default)
Returns
-------
obj : DimArray or Dataset
depending on whether a (single) variable name is passed as argument (names) or not
See Also
--------
DatasetOnDisk.read, stack, concatenate, stack_ds, concatenate_ds, align,
DimArray.write_nc, Dataset.write_nc
Examples
--------
>>> import os
>>> from dimarray import read_nc, get_datadir
Single netCDF file
>>> ncfile = os.path.join(get_datadir(), 'cmip5.CSIRO-Mk3-6-0.nc')
>>> data = read_nc(ncfile) # load full file
>>> data
Dataset of 2 variables
0 / time (451): 1850 to 2300
1 / scenario (5): u'historical' to u'rcp85'
tsl: (u'time', u'scenario')
temp: (u'time', u'scenario')
>>> data = read_nc(ncfile,'temp') # only one variable
>>> data = read_nc(ncfile,'temp', indices={"time":slice(2000,2100), "scenario":"rcp45"}) # load only a chunck of the data
>>> data = read_nc(ncfile,'temp', indices={"time":1950.3}, tol=0.5) # approximate matching, adjust tolerance
>>> data = read_nc(ncfile,'temp', indices={"time":-1}, indexing='position') # integer position indexing
Multiple files
Read variable 'temp' across multiple files (representing various climate models)
In this case the variable is a time series, whose length may
vary across experiments (thus align=True is passed to reindex axes before stacking)
>>> direc = get_datadir()
>>> temp = da.read_nc(direc+'/cmip5.*.nc', 'temp', align=True, axis='model')
A new 'model' axis is created labeled with file names. It is then
possible to rename it more appropriately, e.g. keeping only the part
directly relevant to identify the experiment:
>>> getmodel = lambda x: os.path.basename(x).split('.')[1] # extract model name from path
>>> temp.set_axis(getmodel, axis='model') # would return a copy if inplace is not specified
>>> temp
dimarray: 9114 non-null elements (6671 null)
0 / model (7): 'CSIRO-Mk3-6-0' to 'MPI-ESM-MR'
1 / time (451): 1850 to 2300
2 / scenario (5): u'historical' to u'rcp85'
array(...)
This works on datasets as well:
>>> ds = da.read_nc(direc+'/cmip5.*.nc', align=True, axis='model')
>>> ds.set_axis(getmodel, axis='model')
>>> ds
Dataset of 2 variables
0 / model (7): 'CSIRO-Mk3-6-0' to 'MPI-ESM-MR'
1 / time (451): 1850 to 2300
2 / scenario (5): u'historical' to u'rcp85'
tsl: ('model', u'time', u'scenario')
temp: ('model', u'time', u'scenario')
"""
    # check for a glob pattern
    if isinstance(f, string_types):
        test = glob.glob(f)
        if len(test) == 1:
            f = test[0]
        elif len(test) == 0:
            raise ValueError('no file matching: '+repr(f))
        else:
            f = test
            f.sort()
    # multi-file?
    mf = isinstance(f, list)
# Read a single file
if not mf:
f, close = _maybe_open_file(f, mode='r')
obj = DatasetOnDisk(f).read(names, *args, **kwargs)
if close: f.close()
# Read multiple files
else:
# single variable ==> DimArray (via Dataset)
        if names is not None and isinstance(names, string_types):
            obj = _read_multinc(f, [names], *args, **kwargs)
            obj = obj[names]
        # multiple variables ==> Dataset
else:
obj = _read_multinc(f, names, *args, **kwargs)
return obj
def _read_multinc(fnames, names=None, axis=None, keys=None, align=False, sort=False, join='outer', concatenate_only=False, **kwargs):
""" read multiple netCDF files
Parameters
----------
fnames : list of file names or file handles to be read
names : variable names to be read
axis : str, optional
dimension along which the files are concatenated
(created as new dimension if not already existing)
    keys : sequence, optional
        to be passed to stack_ds, if axis is not part of the dataset
align : `bool`, optional
if True, reindex axis prior to stacking / concatenating (default to False)
    sort : `bool`, optional
        if True, sort common axes prior to stacking / concatenating (default to False)
join : `str`, optional
join method in align (default to 'outer')
concatenate_only : `bool`, optional
if True, only concatenate along existing axis (and raise error if axis not existing)
**kwargs : keyword arguments passed to DatasetOnDisk.read (cannot
contain 'axis', though, but indices can be passed as a dictionary
if needed, e.g. {'time':2010})
Returns
-------
dimarray's Dataset instance
    This function reads several files and calls stack_ds or concatenate_ds
    depending on whether `axis` is absent from or present in the dataset, respectively.
See `dimarray.read_nc` for more complete documentation.
See Also
--------
read_nc
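
    Examples
    --------
    A minimal sketch (file names are hypothetical):

    >>> ds = _read_multinc(['run1.nc', 'run2.nc'], axis='run') # doctest: +SKIP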
"""
variables = None
dimensions = None
datasets = []
for fn in fnames:
with DatasetOnDisk(fn) as f:
ds = f.read(names, **kwargs)
            # check that the same variables are present in all files
if variables is None:
variables = ds.keys()
else:
variables_new = ds.keys()
            assert variables == variables_new, \
                ("netCDF files must contain the same "
                 "subset of variables to be concatenated/stacked")
            # check that the same dimensions are present in all files
if dimensions is None:
dimensions = ds.dims
else:
dimensions_new = ds.dims
            assert dimensions == dimensions_new, \
                ("netCDF files must contain the same "
                 "subset of dimensions to be concatenated/stacked")
datasets.append(ds)
    # check that the join axis is an existing dimension when concatenate_only is set
if concatenate_only and axis not in dimensions:
raise Exception('required axis {} not found, only got {}'.format(axis, dimensions))
# Join dataset
if axis in dimensions:
ds = concatenate_ds(datasets, axis=axis, align=align, sort=sort, join=join)
if keys is not None:
ds = ds.reindex_axis(keys, axis=axis)
# elif sort:
# ds = ds.sort_axis(axis=axis)
#warnings.warn('keys argument will be ignored.', RuntimeWarning)
else:
        # use file names as keys by default
if keys is None:
keys = [os.path.splitext(fn)[0] for fn in fnames]
ds = stack_ds(datasets, axis=axis, keys=keys, align=align, sort=sort, join=join)
return ds
#
# display summary information
#
def summary_nc(fname, name=None, metadata=False):
""" Print summary information about the content of a netCDF file
Deprecated, see dimarray.open_nc
"""
warnings.warn("Deprecated. Use dimarray.open_nc", FutureWarning)
with DatasetOnDisk(fname) as obj:
if name is not None:
obj = obj[name]
        print(obj.__repr__(metadata=metadata))
def _maybe_open_file(f, mode='r', clobber=None, verbose=False, format=None):
""" open a netCDF4 file
Parameters
----------
    f : file name (str) or netCDF file handle
    mode : `str`
        read or write access; extends netCDF4's mode & clobber combination:
        - 'r' : read
        - 'w' : write, overwrite if file is present (clobber=True)
        - 'w-': create new file, but raise Exception if file is present (clobber=False)
        - 'a' : append, raise Exception if file is not present
        - 'a+': append if file is present, otherwise create
    format: passed to netCDF4.Dataset, only relevant when mode in ('w', 'w-', 'a+')
        'NETCDF4', 'NETCDF4_CLASSIC', 'NETCDF3_CLASSIC', 'NETCDF3_64BIT'
Returns
-------
f : netCDF file handle
    close: `bool`, True if f was passed as a file name (the caller is then responsible for closing)
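
    Examples
    --------
    A minimal sketch (the file name is hypothetical):

    >>> f, close = _maybe_open_file('data.nc', mode='a+') # doctest: +SKIP
    >>> if close: f.close() # doctest: +SKIP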
"""
format = format or get_option('io.nc.format')
if mode == 'w-':
mode = 'w'
if clobber is None: clobber = False
# mode 'a+' appends if file exists, otherwise create new variable
elif mode == 'a+' and not isinstance(f, nc.Dataset):
if os.path.exists(f): mode = 'a'
else: mode = 'w'
if clobber is None: clobber=False
else:
if clobber is None: clobber=True
if not isinstance(f, nc.Dataset):
fname = f
# make sure the file does not exist if mode == "w"
if os.path.exists(fname) and clobber and mode == "w":
os.remove(fname)
try:
f = nc.Dataset(fname, mode, clobber=clobber, format=format)
except UserWarning as msg:
print(msg)
        except Exception as msg: # netCDF4 may raise a cryptic RuntimeError
            raise IOError("{} => failed to open {} in mode {}".format(msg, fname, mode)) # easier to handle
if verbose:
if 'r' in mode:
print("read from",fname)
elif 'a' in mode:
print("append to",fname)
else:
print("write to",fname)
close = True
else:
close = False # leave it open
return f, close