Source code for xmitgcm.mds_store

"""
Class to represent MITgcm mds file storage format.
"""
# python 3 compatibility
from __future__ import print_function, division

from glob import glob
import os
import re
import numpy as np
import warnings
from io import StringIO
import inspect
import xarray as xr
import dask.array as da
import sys

# we keep the metadata in its own module to keep this one cleaner
from .variables import dimensions, \
    horizontal_coordinates_spherical, horizontal_coordinates_cartesian, \
    horizontal_coordinates_curvcart, horizontal_coordinates_llc, \
    horizontal_coordinates_cs, \
    vertical_coordinates, horizontal_grid_variables, vertical_grid_variables, \
    volume_grid_variables, state_variables, aliases, package_state_variables, \
    extra_grid_variables, mask_variables
# would it be better to import mitgcm_variables and then automate the search
# for variable dictionaries?

from .utils import parse_meta_file, read_mds, parse_available_diagnostics,\
    get_extra_metadata

from .file_utils import listdir, listdir_startswith, listdir_endswith, \
    listdir_startsandendswith, listdir_fnmatch

# Python2/3 compatibility
if (sys.version_info > (3, 0)):
    stringtypes = [str]
else:
    stringtypes = [str, unicode]

# xarray>=0.12.0 compatibility
try:
    from xarray.core.pycompat import OrderedDict
except ImportError:
    from collections import OrderedDict

# should we hard code this?
LLC_NUM_FACES = 13
CS_NUM_FACES = 6
FACE_DIMNAME = 'face'


def open_mdsdataset(data_dir, grid_dir=None,
                    iters='all', prefix=None, read_grid=True,
                    delta_t=1, ref_date=None, calendar='gregorian',
                    levels=None, geometry='sphericalpolar',
                    grid_vars_to_coords=True, swap_dims=None,
                    endian=">", chunks=None,
                    ignore_unknown_vars=False, default_dtype=None,
                    nx=None, ny=None, nz=None,
                    llc_method="smallchunks", extra_metadata=None,
                    extra_variables=None):
    """Open MITgcm-style mds (.data / .meta) file output as xarray dataset.

    Parameters
    ----------
    data_dir : string
        Path to the directory where the mds .data and .meta files are stored
    grid_dir : string, optional
        Path to the directory where the mds .data and .meta files are stored,
        if different from ``data_dir``.
    iters : list, optional
        The iteration numbers of the files to be read. If ``None``, no data
        files will be read. If ``'all'`` (default), all iterations will be
        read.
    prefix : list, optional
        List of different filename prefixes to read. Default (``None``) is to
        read all available files.
    read_grid : bool, optional
        Whether to read the grid data
    delta_t : number, optional
        The timestep used in the model. (Can't be inferred.)
    ref_date : string, optional
        An ISO date string corresponding to the zero timestep,
        e.g. "1990-1-1 0:0:0" (See CF conventions [1]_)
    calendar : string, optional
        A calendar allowed by CF conventions [1]_
    levels : list or slice, optional
        A list or slice of the indexes of the grid levels to read.
        Same syntax as in the data.diagnostics file.
    geometry : {'sphericalpolar', 'cartesian', 'llc', 'curvilinear', 'cs'}
        MITgcm grid geometry specifier
    grid_vars_to_coords : boolean, optional
        Whether to promote grid variables to coordinate status
    swap_dims : boolean, optional
        Whether to swap the logical dimensions for physical ones. If ``None``,
        will be set to ``False`` for ``geometry==llc`` and ``True`` otherwise.
    endian : {'=', '>', '<'}, optional
        Endianness of variables. Default for MITgcm is ">" (big endian)
    chunks : int or dict, optional
        If chunks is provided, it is used to load the new dataset into dask
        arrays.
    ignore_unknown_vars : boolean, optional
        Don't raise an error if unknown variables are encountered while
        reading the dataset.
    default_dtype : numpy.dtype, optional
        A datatype to fall back on if the metadata can't be read.
    nx, ny, nz : int, optional
        The numerical dimensions of the model. These will be inferred from
        ``XC.meta`` and ``RC.meta`` if they are not specified. If
        ``geometry==llc``, ``ny`` does not have to be specified.
    llc_method : {"smallchunks", "bigchunks"}, optional
        Which routine to use for reading LLC data. "smallchunks" splits the
        file into an individual dask chunk of size (nx x nx) for each face of
        each level (i.e. the total number of chunks is 13 * nz). "bigchunks"
        loads the whole raw data file (either into memory or as a
        numpy.memmap), splits it into faces, and concatenates those faces
        together using ``dask.array.concatenate``. The different methods will
        have different memory and i/o performance depending on the details of
        the system configuration.
    extra_metadata : dict, optional
        Allows passing information on llc type grids (global or regional).
        The additional metadata typically looks like:

        aste = {'has_faces': True, 'ny': 1350, 'nx': 270,
                'ny_facets': [450, 0, 270, 180, 450],
                'pad_before_y': [90, 0, 0, 0, 0],
                'pad_after_y': [0, 0, 0, 90, 90],
                'face_facets': [0, 0, 2, 3, 4, 4],
                'facet_orders': ['C', 'C', 'C', 'F', 'F'],
                'face_offsets': [0, 1, 0, 0, 0, 1],
                'transpose_face': [False, False, False, True, True, True]}

        For global llc grids, no extra metadata is required and the code will
        default to the global llc configuration.
    extra_variables : dict, optional
        Allows passing variables not listed in variables.py or in
        available_diagnostics.log. extra_variables must be a dict containing
        the variable names as keys, with the corresponding values being a
        dict with the keys being dims and attrs.

        Syntax:

        extra_variables = dict(varname=dict(dims=list_of_dims,
                                            attrs=dict(optional_attrs)))

        where optional_attrs can contain standard_name, long_name, units as
        keys.

        Example:

        extra_variables = dict(
            ADJtheta=dict(
                dims=['k', 'j', 'i'],
                attrs=dict(
                    standard_name='Sensitivity_to_theta',
                    long_name='Sensitivity of cost function to theta',
                    units='[J]/degC'))
            )

    Returns
    -------
    dset : xarray.Dataset
        Dataset object containing all coordinates and variables.

    References
    ----------
    .. [1] http://cfconventions.org/Data/cf-conventions/cf-conventions-1.7/build/ch04s04.html
    """

    # get frame info for history
    frame = inspect.currentframe()
    _, _, _, arg_values = inspect.getargvalues(frame)
    del arg_values['frame']
    function_name = inspect.getframeinfo(frame)[2]

    # auto-detect whether to swap dims
    if swap_dims is None:
        if not read_grid:
            swap_dims = False
        else:
            swap_dims = False if geometry in (
                'llc', 'cs', 'curvilinear') else True

    # some checks for argument consistency
    if swap_dims and not read_grid:
        raise ValueError("If swap_dims==True, read_grid must be True.")

    # if prefix is passed as a string, force it to be a list
    if type(prefix) in stringtypes:
        prefix = [prefix]

    # if levels is a slice or a list, a subset of levels is needed
    if levels is not None and nz is not None:
        warnings.warn('levels has been set, nz will be ignored.')
        nz = None
    if isinstance(levels, slice):
        levels = np.arange(levels.start, levels.stop)

    # We either have a single iter, in which case we create a fresh store,
    # or a list of iters, in which case we combine.
    if iters == 'all':
        iters = _get_all_iternums(data_dir, file_prefixes=prefix)
    if iters is None:
        iternum = None
    else:
        try:
            iternum = int(iters)
        # if not, we probably have some kind of list
        except TypeError:
            if len(iters) == 1 and levels is None:
                iternum = int(iters[0])
            else:
                # We have to check to make sure we have the same prefixes at
                # each timestep...otherwise we can't combine the datasets.
                first_prefixes = prefix or _get_all_matching_prefixes(
                    data_dir, iters[0])
                for iternum in iters:
                    these_prefixes = _get_all_matching_prefixes(
                        data_dir, iternum, prefix
                    )
                    # don't care about order
                    if set(these_prefixes) != set(first_prefixes):
                        raise IOError("Could not find the expected file "
                                      "prefixes %s at iternum %g. (Instead "
                                      "found %s)" % (repr(first_prefixes),
                                                     iternum,
                                                     repr(these_prefixes)))

                # chunk at least by time
                chunks = chunks or {}

                # recursively open each dataset, one iteration at a time
                kwargs = dict(
                    grid_dir=grid_dir, delta_t=delta_t, swap_dims=False,
                    prefix=prefix, ref_date=ref_date, calendar=calendar,
                    geometry=geometry,
                    grid_vars_to_coords=False,
                    endian=endian, chunks=chunks,
                    ignore_unknown_vars=ignore_unknown_vars,
                    default_dtype=default_dtype,
                    nx=nx, ny=ny, nz=nz, llc_method=llc_method,
                    levels=levels, extra_metadata=extra_metadata,
                    extra_variables=extra_variables)
                datasets = [open_mdsdataset(
                        data_dir, iters=iternum, read_grid=False, **kwargs)
                    for iternum in iters]

                # drop all data variables not common to the datasets;
                # this should be done by xr.combine_by_coords, but
                # with the "override" option, it does not work properly
                if len(datasets) > 0:
                    all_data_vars = [set(ds.data_vars) for ds in datasets]
                    all_data_vars = set.intersection(*all_data_vars)
                    dropped_vars = set()
                    for k, ds in enumerate(datasets):
                        vars_to_drop = set(ds.data_vars) ^ all_data_vars
                        if len(vars_to_drop) > 0:
                            datasets[k] = ds.drop_vars(vars_to_drop)
                            dropped_vars = set(vars_to_drop) | dropped_vars
                    if len(dropped_vars) > 0:
                        warnings.warn(
                            "dropped these variables: %s" % dropped_vars)

                # now add the grid
                if read_grid:
                    if 'iters' in kwargs:
                        kwargs.pop('iters')
                    if 'read_grid' in kwargs:
                        kwargs.pop('read_grid')
                    if levels is not None:
                        kwargs.pop('nz')
                        kwargs.pop('levels')
                    grid_dataset = open_mdsdataset(data_dir, iters=None,
                                                   read_grid=True, **kwargs)
                    if levels is not None:
                        grid_dataset = grid_dataset.isel(
                            **{coord: levels for coord in
                               ['k', 'k_l', 'k_u', 'k_p1']})
                    datasets.insert(0, grid_dataset)

                # combine the datasets
                if sys.version_info[0] < 3:
                    ds = xr.auto_combine(datasets)
                elif xr.__version__ < '0.15.2':
                    ds = xr.combine_by_coords(datasets)
                elif xr.__version__ < '0.18.0':
                    ds = xr.combine_by_coords(
                        datasets, compat='override', coords='minimal',
                        combine_attrs='drop')
                else:
                    ds = xr.combine_by_coords(
                        datasets, compat='override', coords='minimal',
                        combine_attrs='drop_conflicts')

                if swap_dims:
                    ds = _swap_dimensions(ds, geometry)
                if grid_vars_to_coords:
                    ds = _set_coords(ds)
                return ds

    store = _MDSDataStore(data_dir, grid_dir, iternum, delta_t, read_grid,
                          prefix, ref_date, calendar,
                          geometry, endian,
                          ignore_unknown_vars=ignore_unknown_vars,
                          default_dtype=default_dtype,
                          nx=nx, ny=ny, nz=nz, llc_method=llc_method,
                          levels=levels, extra_metadata=extra_metadata,
                          extra_variables=extra_variables)

    ds = xr.Dataset.load_store(store)
    if swap_dims:
        ds = _swap_dimensions(ds, geometry)
    if grid_vars_to_coords:
        ds = _set_coords(ds)

    if 'time' in ds:
        ds['time'] = xr.decode_cf(ds[['time']])['time']

    # do we need more fancy logic (like open_dataset), or is this enough
    if chunks is not None:
        ds = ds.chunk(chunks)

    # set attributes for CF conventions
    ds.attrs['Conventions'] = "CF-1.6"
    ds.attrs['title'] = "netCDF wrapper of MITgcm MDS binary data"
    ds.attrs['source'] = "MITgcm"
    arg_string = ', '.join(['%s=%s' % (str(k), repr(v))
                            for (k, v) in arg_values.items()])
    ds.attrs['history'] = ('Created by calling '
                           '`%s(%s)`' % (function_name, arg_string))

    return ds
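
# Example usage of `open_mdsdataset` (a minimal sketch; the run directory
# "./run", the prefixes, and the timestep below are hypothetical):
#
#   from xmitgcm import open_mdsdataset
#   ds = open_mdsdataset('./run', prefix=['T', 'S'], delta_t=900,
#                        ref_date='1990-1-1 0:0:0')
#   ds['T'].isel(time=0, k=0).plot()
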
def _set_coords(ds):
    """Turn all variables without `time` dimensions into coordinates."""
    coords = set()
    for vname in ds.variables:
        if ('time' not in ds[vname].dims) or (ds[vname].dims == ('time',)):
            coords.add(vname)
    return ds.set_coords(list(coords))


def _swap_dimensions(ds, geometry, drop_old=True):
    """Replace logical coordinates with physical ones. Does not work for llc
    or cs geometries."""
    keep_attrs = ['axis', 'c_grid_axis_shift']

    # this fixes problems
    ds = ds.reset_coords()

    if geometry.lower() in ('llc', 'cs', 'curvilinear'):
        raise ValueError(
            "Can't swap dimensions if `geometry` is `llc` or `cs`")

    # first squeeze all the coordinates
    for orig_dim in ds.dims:
        if 'swap_dim' in ds[orig_dim].attrs:
            new_dim = ds[orig_dim].attrs['swap_dim']
            coord_var = ds[new_dim]
            for coord_dim in coord_var.dims:
                if coord_dim != orig_dim:
                    # dimension should be the same along all other axes, so
                    # just take the first row / column
                    coord_var = coord_var.isel(
                        **{coord_dim: 0}).drop(coord_dim)
            ds[new_dim] = coord_var
            for key in keep_attrs:
                if key in ds[orig_dim].attrs:
                    ds[new_dim].attrs[key] = ds[orig_dim].attrs[key]
    # then swap dims
    for orig_dim in ds.dims:
        if 'swap_dim' in ds[orig_dim].attrs:
            new_dim = ds[orig_dim].attrs['swap_dim']
            ds = ds.swap_dims({orig_dim: new_dim})
            if drop_old:
                if sys.version_info[0] < 3:
                    ds = ds.drop(orig_dim)
                else:
                    ds = ds.drop_vars(orig_dim)
    return ds
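
# A sketch of what `_swap_dimensions` does (hypothetical run directory):
# the logical index dimensions carry a `swap_dim` attribute naming their
# physical counterpart (e.g. 'i' -> 'XC', 'j' -> 'YC', 'k' -> 'Z'), so after
# the swap you can select by physical coordinate:
#
#   ds = open_mdsdataset('./run', swap_dims=False)   # dims: i, j, k, ...
#   ds = _swap_dimensions(ds, 'sphericalpolar')      # dims: XC, YC, Z, ...
#   ds['THETA'].sel(XC=200.5, method='nearest')
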
class _MDSDataStore(xr.backends.common.AbstractDataStore):
    """Representation of MITgcm mds binary file storage format for a
    specific model instance and a specific timestep iteration number."""

    def __init__(self, data_dir, grid_dir=None,
                 iternum=None, delta_t=1, read_grid=True,
                 file_prefixes=None, ref_date=None, calendar=None,
                 geometry='sphericalpolar',
                 endian='>', ignore_unknown_vars=False,
                 default_dtype=np.dtype('f4'),
                 nx=None, ny=None, nz=None, llc_method="smallchunks",
                 levels=None, extra_metadata=None, extra_variables=None):
        """
        This is not a user-facing class. See open_mdsdataset for argument
        documentation. The only arguments which are distinct are documented
        below.

        Parameters
        ----------
        iternum : int, optional
            The iteration timestep number to read.
        file_prefixes : list
            The prefixes of the data files to be read.
        """

        self.geometry = geometry.lower()
        allowed_geometries = ['cartesian', 'sphericalpolar', 'llc', 'cs',
                              'curvilinear']
        if self.geometry not in allowed_geometries:
            raise ValueError('Unexpected value for parameter `geometry`. '
                             'It must be one of the following: %s' %
                             allowed_geometries)

        # the directory where the files live
        self.data_dir = data_dir
        self.grid_dir = grid_dir if (grid_dir is not None) else data_dir
        self.extra_variables = extra_variables
        self._ignore_unknown_vars = ignore_unknown_vars

        # The endianness of the files
        # By default, MITgcm does big endian
        if endian not in ['>', '<', '=']:
            raise ValueError("Invalid byte order (endian=%s)" % endian)
        self.endian = endian
        if default_dtype is not None:
            self.default_dtype = np.dtype(default_dtype).newbyteorder(endian)
        else:
            self.default_dtype = default_dtype

        # storage dicts for variables and attributes
        self._variables = OrderedDict()
        self._attributes = OrderedDict()
        self._dimensions = []

        # the dimensions are theoretically the same for all datasets
        [self._dimensions.append(k) for k in dimensions]
        self.llc = (self.geometry == 'llc')
        self.cs = (self.geometry == 'cs')

        if nz is None:
            self.nz = _guess_model_nz(self.grid_dir)
        else:
            self.nz = nz

        # if user passes extra_metadata, this should have priority
        user_metadata = True if extra_metadata is not None else False
        # put in local variable to make it more readable
        if extra_metadata is not None and 'has_faces' in extra_metadata:
            has_faces = extra_metadata['has_faces']
        else:
            has_faces = False

        # --------------- LEGACY ----------------------
        if self.llc:
            has_faces = True
            if extra_metadata is None or 'ny_facets' not in extra_metadata:
                # default to llc90, we only need number of facets
                # and we cannot know nx at this point
                llc = get_extra_metadata(domain='llc', nx=90)
                extra_metadata = llc
        if self.cs:
            has_faces = True
            if extra_metadata is None or 'ny_facets' not in extra_metadata:
                # default to cs32, we only need number of facets
                # and we cannot know nx at this point
                cs = get_extra_metadata(domain='cs', nx=32)
                extra_metadata = cs
        # --------------- /LEGACY ----------------------

        # we don't need to know ny if using llc
        if has_faces and (nx is not None):
            ny = nx

        # Now we need to figure out the horizontal dimensions nx, ny
        # nface is the number of llc faces
        if (nx is not None) and (ny is not None):
            # we have been passed enough information to determine the
            # dimensions without reading any files
            self.ny, self.nx = ny, nx
            self.nface = len(extra_metadata['face_facets']) if has_faces \
                else None
        else:
            # have to peek at the grid file metadata
            self.nface, self.ny, self.nx = (
                _guess_model_horiz_dims(self.grid_dir, is_llc=self.llc,
                                        is_cs=self.cs))

        # --------------- LEGACY ----------------------
        if self.llc:
            if not user_metadata:
                # if user didn't provide metadata, we default to llc
                llc = get_extra_metadata(domain='llc', nx=self.nx)
                extra_metadata = llc
        if self.cs:
            if not user_metadata:
                # if user didn't provide metadata, we default to cs
                cs = get_extra_metadata(domain='cs', nx=self.nx)
                extra_metadata = cs
        # --------------- /LEGACY ----------------------

        self.layers = _guess_layers(data_dir)

        if has_faces:
            nyraw = self.nx*self.nface
        else:
            nyraw = self.ny
        self.default_shape_3D = (self.nz, nyraw, self.nx)
        self.default_shape_2D = (nyraw, self.nx)
        self.llc_method = llc_method

        # Now set up the corresponding coordinates.
        # Rather than assuming the dimension names, we use Comodo conventions
        # to parse the dimension metadata.
        # http://pycomodo.forge.imag.fr/norm.html
        irange = np.arange(self.nx)
        jrange = np.arange(self.ny)
        if levels is None:
            krange = np.arange(self.nz)
            krange_p1 = np.arange(self.nz+1)
        else:
            krange = levels
            krange_p1 = levels + [levels[-1] + 1]
        # the keys are `standard_name` attribute
        dimension_data = {
            "x_grid_index": irange,
            "x_grid_index_at_u_location": irange,
            "x_grid_index_at_f_location": irange,
            "y_grid_index": jrange,
            "y_grid_index_at_v_location": jrange,
            "y_grid_index_at_f_location": jrange,
            "z_grid_index": krange,
            "z_grid_index_at_lower_w_location": krange,
            "z_grid_index_at_upper_w_location": krange,
            "z_grid_index_at_w_location": krange_p1,
        }

        for dim in self._dimensions:
            dim_meta = dimensions[dim]
            dims = dim_meta['dims']
            attrs = dim_meta['attrs']
            data = dimension_data[attrs['standard_name']]
            dim_variable = xr.Variable(dims, data, attrs)
            self._variables[dim] = dim_variable

        # possibly add the llc dimension
        # seems sloppy to hard code this here
        # TODO: move this metadata to variables.py
        if has_faces:
            self._dimensions.append(FACE_DIMNAME)
            data = np.arange(self.nface)
            attrs = {'standard_name': 'face_index'}
            dims = [FACE_DIMNAME]
            self._variables[FACE_DIMNAME] = xr.Variable(dims, data, attrs)

        # do the same for layers
        for layer_name, n_layer in self.layers.items():
            for suffix, offset in zip(['bounds', 'center', 'interface'],
                                      [0, -1, -2]):
                # e.g. "layer_1RHO_bounds"
                # dimname = 'layer_' + layer_name + '_' + suffix
                # e.g. "l1_b"
                dimname = 'l' + layer_name[0] + '_' + suffix[0]
                self._dimensions.append(dimname)
                data = np.arange(n_layer + offset)
                # we should figure out a way to properly populate the layers
                # attributes
                attrs = {'standard_name':
                         layer_name + '_layer_grid_index_at_layer_' + suffix,
                         'swap_dim': 'layer_' + layer_name + '_' + suffix}
                dim_variable = xr.Variable([dimname], data, attrs)
                self._variables[dimname] = dim_variable

        # maybe add a time dimension
        if iternum is not None:
            self.time_dim_name = 'time'
            self._dimensions.append(self.time_dim_name)
            # a variable for iteration number
            self._variables['iter'] = xr.Variable(
                (self.time_dim_name,),
                [iternum],
                {'standard_name': 'timestep',
                 'long_name': 'model timestep number'})
            self._variables[self.time_dim_name] = \
                _iternum_to_datetime_variable(
                    iternum, delta_t, ref_date, calendar, self.time_dim_name)

        # build lookup tables for variable metadata
        self._all_grid_variables = _get_all_grid_variables(self.geometry,
                                                           self.grid_dir,
                                                           self.layers)
        self._all_data_variables = _get_all_data_variables(
            self.data_dir, self.grid_dir, self.layers, self.extra_variables)

        # The rest of the data has to be read from disk.
        # The list `prefixes` specifies file prefixes from which to infer
        # the data variables.
        # The problem with this is that some prefixes are single variables
        # while some are multi-variable diagnostics files.
        prefixes = []
        if read_grid:
            prefixes = prefixes + list(self._all_grid_variables.keys())

        # add data files
        prefixes = (prefixes +
                    _get_all_matching_prefixes(
                        data_dir, iternum, file_prefixes))

        for p in prefixes:
            # use a generator to loop through the variables in each file
            for (vname, dims, data, attrs) in \
                    self.load_from_prefix(p, iternum, extra_metadata):
                # print(vname, dims, data.shape)
                # Sizes of grid variables can vary between mitgcm versions.
                # Check for such inconsistency and correct if so
                (vname, dims, data, attrs) = self.fix_inconsistent_variables(
                    vname, dims, data, attrs)

                # Create masks from hFac variables
                data = self.calc_masks(vname, data)

                thisvar = xr.Variable(dims, data, attrs)
                self._variables[vname] = thisvar
                # print(type(data), type(thisvar._data), thisvar._in_memory)

    def fix_inconsistent_variables(self, vname, dims, data, attrs):
        if vname == 'drC':
            # check to see if the drC variable has the wrong length
            if len(data) == self.nz:
                # create a new array which will replace it
                drc_data = np.zeros(self.nz + 1)
                drc_data[:-1] = np.asarray(data)
                # fill in the missing value
                drc_data[-1] = 0.5 * data[-1]
                data = drc_data
        return vname, dims, data, attrs

    def calc_masks(self, vname, data):
        """Compute mask as True where hFac is nonzero, otherwise False."""
        if vname[0:4] == 'mask':
            data = data > 0
        return data
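
    # Sketch of the mask convention implemented by `calc_masks` above
    # (illustrative values only, `store` being an _MDSDataStore instance):
    # hFac fractions of 0 mark dry cells, anything greater than 0 is wet:
    #
    #   store.calc_masks('maskC', np.array([0., 0.3, 1.0]))
    #   # -> array([False,  True,  True])
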
    def load_from_prefix(self, prefix, iternum=None, extra_metadata=None):
        """Read data and look up metadata for the variables stored under
        file prefix `prefix`.

        Parameters
        ----------
        prefix : string
            The file prefix to read.
        iternum : int, optional
            MITgcm iteration number

        Yields
        ------
        varname : string
            The name of the variable
        dims : list
            The dimension list
        data : arraylike
            The raw data
        attrs : dict
            The metadata attributes
        """

        fname_base = prefix

        # some special logic is required for grid variables
        if prefix in self._all_grid_variables:
            # grid variables don't have an iteration number suffix
            iternum = None
            # some grid variables have a different filename than their
            # varname
            if 'filename' in self._all_grid_variables[prefix]:
                fname_base = self._all_grid_variables[prefix]['filename']
            ddir = self.grid_dir
        else:
            assert iternum is not None
            ddir = self.data_dir

        basename = os.path.join(ddir, fname_base)
        chunks = "CS" if self.cs else "3D"
        try:
            vardata = read_mds(basename, iternum, endian=self.endian,
                               llc=self.llc, llc_method=self.llc_method,
                               extra_metadata=extra_metadata, chunks=chunks)
        except IOError as ioe:
            # that might have failed because there was no meta file present;
            # we can try to get around this by specifying the shape and dtype
            try:
                ndims = len(self._all_data_variables[prefix]['dims'])
            except KeyError:
                ndims = 3
            if ndims == 3 and self.nz > 1:
                data_shape = self.default_shape_3D
            elif ndims == 2 or self.nz == 1:
                data_shape = self.default_shape_2D
            elif self.cs:
                data_shape = None  # handle this inside read_mds
            else:
                raise ValueError("Can't determine shape "
                                 "of variable %s" % prefix)
            vardata = read_mds(basename, iternum, endian=self.endian,
                               dtype=self.default_dtype,
                               shape=data_shape, llc=self.llc,
                               llc_method=self.llc_method,
                               extra_metadata=extra_metadata, chunks=chunks)

        for vname, data in vardata.items():
            # we now have to revert to the original prefix once the file is
            # read
            if fname_base != prefix:
                vname = prefix
            try:
                metadata = (self._all_grid_variables[vname]
                            if vname in self._all_grid_variables
                            else self._all_data_variables[vname])
            except KeyError:
                if self._ignore_unknown_vars:
                    # we didn't find any metadata, so we just skip this var
                    continue
                else:
                    raise KeyError("Couldn't find metadata for variable %s "
                                   "and `ignore_unknown_vars`==False."
                                   % vname)

            # maybe slice and squeeze the data
            if 'slice' in metadata:
                # if we are slicing, we can assume it is safe to convert dask
                # array to numpy
                data = np.asarray(data)
                sl = metadata['slice']
                # need to promote the variable to higher dimensionality in
                # order to handle certain 2D model outputs
                if len(sl) == 3 and data.ndim == 2:
                    data.shape = (1,) + data.shape
                data = np.atleast_1d(data[sl])

            if 'transform' in metadata:
                # transform is a function to be called on the data
                data = metadata['transform'](data)

            # make sure we copy these things
            dims = list(metadata['dims'])
            attrs = dict(metadata['attrs'])

            # Some 2D output squeezes one of the dimensions out (e.g. hFacC).
            # How should we handle this? Can either eliminate one of the dims
            # or add an extra axis to the data. Let's try the former, on the
            # grounds that it is simpler for the user.
            if ((len(dims) == 3 and data.ndim == 2) or
                ((self.llc or self.cs) and
                 (len(dims) == 3 and data.ndim == 3))):
                # Deleting the first dimension (z) assumes that 2D data
                # always corresponds to x,y horizontal data. Is this really
                # true? The answer appears to be yes: 2D (x|y,z) data retains
                # the missing dimension as an axis of length 1.
                # Also handles https://github.com/xgcm/xmitgcm/issues/140
                # (special case for 2d llc diags)
                dims = dims[1:]
            elif len(dims) == 1 and data.ndim > 1:
                # this is for certain profile data like RC, PHrefC, etc.
                # for some reason, dask arrays don't work here
                # ok to promote to numpy array because data is always 1D
                data = np.atleast_1d(np.asarray(data).squeeze())

            if self.llc:
                dims, data = _reshape_for_llc(dims, data)
            if self.cs:
                dims, data = _reshape_for_cs(dims, data)

            # need to add an extra dimension at the beginning if we have a
            # time variable
            if iternum is not None:
                dims = [self.time_dim_name] + dims
                # newshape = (1,) + data.shape
                # data = data.reshape(newshape)
                data = data[None]

            yield vname, dims, data, attrs

    def get_variables(self):
        return self._variables

    def get_attrs(self):
        return self._attributes

    def get_dimensions(self):
        return self._dimensions

    def close(self):
        for var in list(self._variables):
            del self._variables[var]


def _guess_model_nz(data_dir):
    try:
        rc_meta = parse_meta_file(os.path.join(data_dir, 'RC.meta'))
        if len(rc_meta['dimList']) == 2:
            nz = 1
        else:
            nz = rc_meta['dimList'][2][2]
    except IOError:
        raise IOError("Couldn't find RC.meta file to infer nz.")
    return nz


def _guess_model_horiz_dims(data_dir, is_llc=False, is_cs=False):
    try:
        xc_meta = parse_meta_file(os.path.join(data_dir, 'XC.meta'))
        nx = int(xc_meta['dimList'][0][0])
        ny = int(xc_meta['dimList'][1][0])
    except IOError:
        raise IOError("Couldn't find XC.meta file to infer nx and ny.")
    if is_llc:
        nface = LLC_NUM_FACES
        ny //= nface
    elif is_cs:
        nface = CS_NUM_FACES
        if ny > nx:
            ny //= nface
        else:
            nx //= nface
    else:
        nface = None
    return nface, ny, nx


def _guess_layers(data_dir):
    """Return a dict matching layers suffixes to dimension length."""
    layers_files = listdir_startsandendswith(data_dir, 'layers', '.meta')
    all_layers = {}
    for fname in layers_files:
        # make sure to exclude filenames such as
        # "layers_surfflux.01.0000000001.meta"
        if not re.search(r'\.\d{10}\.', fname):
            # should turn "foo/bar/layers1RHO.meta" into "1RHO"
            layers_suf = os.path.splitext(os.path.basename(fname))[0][6:]
            meta = parse_meta_file(os.path.join(data_dir, fname))
            Nlayers = meta['dimList'][2][2]
            all_layers[layers_suf] = Nlayers
    return all_layers
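
# A minimal sketch of the llc inference in `_guess_model_horiz_dims` above,
# with made-up global llc90 numbers (not executed by the library):
#
#   nx, ny_raw = 90, 1170          # raw shape reported by XC.meta for llc90
#   ny = ny_raw // LLC_NUM_FACES   # 1170 // 13 -> 90 points per face
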
def _get_all_grid_variables(geometry, grid_dir=None, layers={}):
    """Put all the relevant grid metadata into one big dictionary."""
    possible_hcoords = {'cartesian': horizontal_coordinates_cartesian,
                        'llc': horizontal_coordinates_llc,
                        'cs': horizontal_coordinates_cs,
                        'curvilinear': horizontal_coordinates_curvcart,
                        'sphericalpolar': horizontal_coordinates_spherical}
    hcoords = possible_hcoords[geometry]

    # look for extra variables, if they exist in grid_dir
    extravars = (_get_extra_grid_variables(grid_dir)
                 if grid_dir is not None else {})

    allvars = [hcoords, vertical_coordinates, horizontal_grid_variables,
               vertical_grid_variables, volume_grid_variables,
               mask_variables, extravars]

    # tortured logic to add layers grid variables
    layersvars = [_make_layers_variables(layer_name)
                  for layer_name in layers]
    allvars += layersvars

    metadata = _concat_dicts(allvars)
    return metadata


def _get_extra_grid_variables(grid_dir):
    """Scan a directory for extra grid files and return the variable
    information for each matching prefix."""
    extra_grid = {}

    fnames = dict([[val['filename'], key]
                   for key, val in extra_grid_variables.items()
                   if 'filename' in val])

    all_datafiles = listdir_endswith(grid_dir, '.data')
    for f in all_datafiles:
        prefix = os.path.split(f[:-5])[-1]
        # Only consider what we find that matches extra_grid_variables
        if prefix in extra_grid_variables:
            extra_grid[prefix] = extra_grid_variables[prefix]
        elif prefix in fnames:
            extra_grid[fnames[prefix]] = extra_grid_variables[fnames[prefix]]

    return extra_grid


def _make_layers_variables(layer_name):
    """Translate metadata template to actual variable metadata."""
    from .variables import layers_grid_variables
    lvars = OrderedDict()
    layer_num = layer_name[0]
    # should always be int
    assert isinstance(int(layer_num), int)
    layer_id = 'l' + layer_num
    for key, vals in layers_grid_variables.items():
        # replace the name template with the actual name
        # e.g. layer_NAME_bounds -> layer_1RHO_bounds
        varname = key.replace('NAME', layer_name)
        metadata = _recursively_replace(vals, 'NAME', layer_name)
        # now fix dimension
        metadata['dims'] = [metadata['dims'][0].replace('l', layer_id)]
        lvars[varname] = metadata
    return lvars


def _recursively_replace(item, search, replace):
    """Recursively search and replace all strings in dictionary values."""
    if isinstance(item, dict):
        return {key: _recursively_replace(item[key], search, replace)
                for key in item}
    try:
        return item.replace(search, replace)
    except AttributeError:
        # probably no such method
        return item


def _get_all_data_variables(data_dir, grid_dir, layers, extra_variables):
    """Put all the relevant data metadata into one big dictionary."""
    allvars = [state_variables]
    allvars.append(package_state_variables)
    if extra_variables is not None:
        allvars.append(extra_variables)
    # add others from available_diagnostics.log
    # search in the data dir
    fnameD = os.path.join(data_dir, 'available_diagnostics.log')
    # and in the grid dir
    fnameG = os.path.join(grid_dir, 'available_diagnostics.log')

    # first look in the data dir
    if os.path.exists(fnameD):
        diag_file = fnameD
    # then in the grid dir
    elif os.path.exists(fnameG):
        diag_file = fnameG
    else:
        warnings.warn("Couldn't find available_diagnostics.log "
                      "in %s or %s. Using default version." %
                      (data_dir, grid_dir))
        from .default_diagnostics import diagnostics
        diag_file = StringIO(diagnostics)

    available_diags = parse_available_diagnostics(diag_file, layers)
    allvars.append(available_diags)
    metadata = _concat_dicts(allvars)

    # Now add the suffix '-T' to every diagnostic. This is a somewhat hacky
    # way to increase the coverage of possible output filenames.
    # But it doesn't work in python3!!!
    extra_metadata = OrderedDict()
    for name, val in metadata.items():
        newname = name + '-T'
        extra_metadata[newname] = val
    metadata = _concat_dicts([metadata, extra_metadata])

    # now fill in aliases
    for alias, original in aliases.items():
        metadata[alias] = metadata[original]

    return metadata


def _concat_dicts(list_of_dicts):
    result = OrderedDict()
    for eachdict in list_of_dicts:
        for k, v in eachdict.items():
            result[k] = v
    return result


def _get_all_iternums(data_dir, file_prefixes=None,
                      file_format='*.??????????.data'):
    """Scan a directory for all iteration number suffixes."""
    iternums = set()
    all_datafiles = listdir_fnmatch(data_dir, file_format)
    istart = file_format.find('?') - len(file_format)
    iend = file_format.rfind('?') - len(file_format) + 1
    for f in all_datafiles:
        iternum = int(f[istart:iend])
        prefix = os.path.split(f[:istart-1])[-1]
        if file_prefixes is None:
            iternums.add(iternum)
        else:
            if prefix in file_prefixes:
                iternums.add(iternum)
    iterlist = sorted(iternums)
    return iterlist


def _is_pickup_prefix(prefix):
    if len(prefix) >= 6:
        if prefix[:6] == 'pickup':
            return True
    return False


def _get_all_matching_prefixes(data_dir, iternum, file_prefixes=None,
                               ignore_pickup=True):
    """Scan a directory and return all file prefixes matching a certain
    iteration number."""
    if iternum is None:
        return []
    prefixes = set()
    all_datafiles = listdir_endswith(data_dir, '.%010d.data' % iternum)
    for f in all_datafiles:
        iternum = int(f[-15:-5])
        prefix = os.path.split(f[:-16])[-1]
        if file_prefixes is None:
            if not (ignore_pickup and _is_pickup_prefix(prefix)):
                prefixes.add(prefix)
        else:
            if prefix in file_prefixes:
                prefixes.add(prefix)
    return list(prefixes)


def _iternum_to_datetime_variable(iternum, delta_t, ref_date,
                                  calendar, time_dim_name='time'):
    # create time array
    timedata = np.atleast_1d(iternum)*delta_t
    time_attrs = {'standard_name': 'time', 'long_name': 'Time', 'axis': 'T'}
    if ref_date is not None:
        time_attrs['units'] = 'seconds since %s' % ref_date
    else:
        time_attrs['units'] = 'seconds'
    if calendar is not None:
        time_attrs['calendar'] = calendar
    timevar = xr.Variable((time_dim_name,), timedata, time_attrs)
    return timevar


def _reshape_for_llc(dims, data):
    """Take dims and data and return modified / reshaped dims and data for
    llc geometry."""
    # the data should already come shaped correctly for llc,
    # but the dims are not yet correct

    # the only dimensions that get expanded into faces
    expand_dims = ['j', 'j_g']
    for dim in expand_dims:
        if dim in dims:
            # add face dimension to dims
            jdim = dims.index(dim)
            dims.insert(jdim, FACE_DIMNAME)
    assert data.ndim == len(dims), '%r %r' % (data.shape, dims)
    return dims, data


def _reshape_for_cs(dims, data):
    """Take dims and data and return modified / reshaped dims and data for
    cs geometry."""
    # the data should already come shaped correctly for cs,
    # but the dims are not yet correct

    # the only dimensions that get expanded into faces
    expand_dims = ['j', 'j_g']
    for dim in expand_dims:
        if dim in dims:
            # add face dimension to dims
            jdim = dims.index(dim)
            dims.insert(jdim+1, FACE_DIMNAME)
    assert data.ndim == len(dims), '%r %r' % (data.shape, dims)
    return dims, data
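
# A short sketch of the dims expansion in `_reshape_for_llc` above
# (illustrative shapes only): a 3D llc variable arrives here with the face
# axis already split out, e.g. shape (nz, 13, nx, nx), and the 'face'
# dimension name is inserted to match:
#
#   dims, data = _reshape_for_llc(['k', 'j', 'i'],
#                                 np.zeros((50, 13, 90, 90)))
#   # dims is now ['k', 'face', 'j', 'i']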