"""GigaAnalysis - Data Set Management - :mod:`gigaanalysis.dset`
-------------------------------------------------------------------
This module has functions to save nested dictionaries with :class:`Data`
objects as the values. It provides the functionality to save and read from
HDF5 files using `h5py <https://www.h5py.org/>`_ and also .csv files. It
also can create and use :class:`pandas.DataFrame` to store and display
associated meta data.
"""
from .data import *
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import h5py # For interacting with HDF5 files
[docs]def check_set(data_set, meta_df=None, higher_key=()):
"""Checks the data_set and metadata data frame is the correct from.
This goes through the nested dictionaries and checks that the values
contained are either :class:`dict` or :class:`gigaanalysis.data.Data`
objects. If objects other than these are found errors are thrown. The
metadata dictionary ``meta_df`` is checked that every
:class:`Data` has a row that is describing it in the ``meta_df``.
Parameters
----------
data_set : dict of {str: dict or Data}
A dictionary containing either nested dictionaries or
:class:`gigaanalysis.data.Data` objects.
meta_df : pandas.DataFrame
Metadata held in a :class:`pandas.DataFrame` where the indexes are a
the keys of ``data_set`` and the columns provide information
about the :class:`Data` objects. For nested
dictionaries hierarchical indexing is used
(:class:`pandas.MultiIndex`).
higher_key : tuple, optional
Tuple with keys in used for the regression to start in a nested
dictionary.
Returns
-------
count : int
The number of layers of the data_set.
"""
if not isinstance(data_set, dict):
raise TypeError(
f"data_set was not a dict but instead a {type(data_set)}.")
for key, val in data_set.items():
new_key = (*higher_key, key)
if isinstance(val, dict):
count = 1 + check_set(val, meta_df, new_key)
elif not isinstance(val, Data):
raise TypeError(
f"The dictioaries contain objects which are "
f"not dictionaires or Data objects. The object in "
f"key:{new_key}, was a {type(val)}.")
elif meta_df is not None:
count = 1
if isinstance(new_key, tuple) and len(new_key) == 1:
new_key = new_key[0]
if new_key not in meta_df.index:
raise ValueError(
f"The meta DataFrame given did not have items "
f"which where in the data_set. The key missing "
f"was: {new_key}.")
else:
count = 1
return count
def __label_hdf5_as_ga_set(file, location):
"""Add a attribute to a hdf5 group called "ga_data_set" and create
it if it doesn't exist.
Parameters
----------
file : h5py.File
A file object that references the `.hdf5` file where the data is
being saved.
location : str
The location of the group to consider in the `.hdf5` file.
"""
if location not in file:
file.create_group(location)
file[location].attrs['ga_data_set'] = True
def __set_attrs_from_df(dset, data, meta_df, key):
"""Sets the attributes of a :class:`h5py.Dataset` to hold the metadata.
Used by :func:`set_to_hdf5`.
Parameters
----------
dset : h5py.Dataset
The dataset to set the attributes of.
data : gigaanalysis.data.Data
The data that is being saved to the dataset.
meta_df : pandas.DataFrame or None
Where the meta data is stored
key : tuple
The key that refers to the row in the matadata table
"""
if meta_df is not None:
if isinstance(key, tuple) and len(key) == 1:
key = key[0]
attrs_to_set = dict(meta_df.loc[key].items())
else:
attrs_to_set = dict()
# set some values that should be in every metadata table
attrs_to_set.update({
'size':len(data),
'min_x':data.min_x() if len(data) != 0 else np.nan,
'max_x':data.max_x() if len(data) != 0 else np.nan,
})
for prop, val in attrs_to_set.items():
if not pd.isnull(val):
dset.attrs[prop] = val
def __hdf5_set(file, data_set, meta_df, higher_key=(),
location="/"):
"""Goes though a data_set and save the values to a :class:`h5py.File`.
This is called in :func:`set_to_hdf5`.
Parameters
----------
file : h5py.File
A file object that references the HDF5 file where the data is
being saved.
data_set : dict of {str: dict or Data}
A dictionary containing either nested dictionaries or
:class:`gigaanalysis.data.Data` objects.
meta_df : pandas.DataFrame
Metadata held in a :class:`pandas.DataFrame` where the indexes are
the keys of the `data_set` dict.
higher_key : tuple, optional
Tuple with keys in used for the regression to start in a nested
dictionary.
location : str, optional
The location of the hdf5 Group to save the data to.
"""
for key, val in data_set.items():
# remove non allowed characters ' ' and '/'
# and sort keys and locations
key = str(key).replace(' ', '').replace('/', '')
new_key = (*higher_key, key)
new_loc = f"{location}{'/'.join(new_key)}"
if isinstance(val, dict): # if dict in dict call self
__label_hdf5_as_ga_set(file, new_loc)
__hdf5_set(file, val, meta_df, new_key, location)
elif not isinstance(val, Data):
raise TypeError(
f"The dictioaries contain objects which are "
f"not dictionaires or Data objects. The object in "
f"key:{new_key}, was a {type(val)}.")
else:
file.create_dataset(new_loc, data=val.values)
__set_attrs_from_df(
file[new_loc], val, meta_df, new_key)
[docs]def set_to_hdf5(data_set, file_name, meta_df=None,
location="/", overwrite=False, info_attr=None):
"""This saves a data set to a HDF5 file.
This saves a data set of made of nested :class:`dict` of
:class:`gigaanalysis.data.Data` to a HDF5 file, using
:class:`h5py.File`. This can also take a :class:`pandas.DataFrame`
containing the associated meta_data.
Parameters
----------
data_set : dict of {str: dict or Data}
A dictionary containing either nested dictionaries or
:class:`gigaanalysis.data.Data` objects.
file_name : str
The file name to save the HDF5 file with
meta_df : pandas.DataFrame, optional
Metadata held in a :class:`pandas.DataFrame` where the indexes are a
the keys of the `data_set` dict and the columns provide information
about the :class:`Data` objects. For nested
dictionaries hierarchical indexing is used
(:class:`pandas.MultiIndex`).
location : str, optional
The location of the HDF5 group to save the data to. The default is
the root group.
overwrite : bool, optional
If the function should overwrite existing HDF5 file. The default is
to not overwrite.
info_attr : str, optional
If a string is given this is set as an HDF5 attribute to group. This
can hold a description of data if required.
"""
if location != "/": # In case the user forgets the "/" at the end
if location[-1] != "/":
location += "/"
if not isinstance(data_set, dict):
raise TypeError(
f"data_set needs to be a dict but is a {type(data_set)}")
if meta_df is not None:
if not isinstance(meta_df, pd.DataFrame):
raise TypeError(
f"meta_table needs to be a pandas.DataFrame but is "
f"a {type(meta_df)}")
check_set(data_set, meta_df)
read_write = 'w' if overwrite else 'a'
with h5py.File(file_name, read_write) as file:
__label_hdf5_as_ga_set(file, location)
__hdf5_set(file, data_set, meta_df, location=location)
if isinstance(info_attr, str):
file[location].attrs['info'] = info_attr
def __print_hdf5_group(group):
"""A recursive function to call used by :func:`print_hdf5`.
Parameters
----------
group : h5py.Group
The HDF5 group to print.
"""
for val in group.values():
if isinstance(val, h5py.Group):
print(
f"{val.name} - Group\n"
f" {list(val.attrs.items())}")
__print_hdf5_group(val)
else:
print(
f"{val.name} - Data Set\n"
f" {list(val.attrs.items())}")
[docs]def print_hdf5(file_name):
"""Prints the names and attributes of the contents of a HDF5 file.
Parameters
----------
file_name : str
The name of the HDF5 file to read.
"""
with h5py.File(file_name, 'r') as file:
__print_hdf5_group(file['/'])
def __read_hdf5_group(group, data_set, meta_df):
"""Build up a data set and metadata table recursively from a HDF5 file.
Parameters
----------
group : h5py.Group
The group to read in the HDF5 file.
data_set : dict
The dictionary to save the :class:`gigaanalysis.data.Data` in.
meta_df : pandas.DataFrame
A metadata table to fill with the attributes of the
:class:`h5py.Dataset` in the group.
Returns
-------
data_set : dict
The dictionary after it is populated.
meta_df : pandas.DataFrame
A metadata table after it is populated.
"""
try:
assert group.attrs['ga_data_set']
except:
raise ValueError(
f"The groups given to extract gigaanalyis data from "
f"did not have the tag ga_data_set in the location "
f"{group.name}.")
for val in group.values():
this_key = val.name.split('/')[-1]
if isinstance(val, h5py.Group):
data_set[this_key] = {}
data_set[this_key], meta_df = __read_hdf5_group(
val, data_set[this_key], meta_df)
else:
data_set[this_key] = Data(val[:])
new_row = pd.DataFrame(
[dict(val.attrs.items())],
index=[val.name])
meta_df = meta_df.append(new_row)
return data_set, meta_df
def __count_layer(dict_to_check, count=0):
"""Counts the layers of :class:`dict` in a nested dictionary.
This only counts the depth first item and assumes the rest is the same.
Parameters
----------
dict_to_check : dict
The dictionary to check how many layers of dictionaries are inside.
count : int
The count which is used recursively.
"""
count +=1
for val in dict_to_check.values():
if isinstance(val, dict):
count = __count_layer(val, count)
break
return count
def __reindex_meta(meta_df, layers):
"""Swaps the indexes in a table from HDF5 locations to multiindex tupels.
Parameters
----------
meta_df : pandas.DataFrame
The metadata table to reindex.
layers : int
The number of layers in the data set to use as an index.
Returns
-------
meta_df : pandas.DataFrame
The metadata table with the new indexes.
"""
meta_df = meta_df.reset_index()
capture = '/([^/]*)'*layers + '$'
key_list = [f'key{x+1}' for x in range(layers)]
meta_df[key_list] = meta_df['index'].str.extract(capture)
meta_df = meta_df.drop('index', axis=1)
meta_df = meta_df.set_index(key_list)
return meta_df
[docs]def set_from_hdf5(file_name, location='/'):
"""Reads a HDF5 file and returns a dataset and a metadata table.
This reads a HDF5 file using :class:`h5py.File`, and produces a dataset
comprising of a nested :class:`dict` which contains
:class:`gigaanalysis.data.Data` objects. The dataset is accompanied by a
metadata table in the form of a :class:`pandas.DataFrame` with the
indexes are the same as the keys of the dictionaries.
Parameters
----------
file_name : str
The name of the HDF5 file to read.
location : str, optional
The location of the group in the HDF5 file which contains the
dataset to be read. The default is the root group.
Returns
-------
data_set : dict of {str: dict or Data}
A dictionary containing either nested dictionaries or
:class:`gigaanalysis.data.Data` objects.
meta_df : pandas.DataFrame, optional
Metadata held in a :class:`pandas.DataFrame` where the indexes are a
the keys of ``data_set`` and the columns provide information
about the :class:`Data` objects. For nested
dictionaries hierarchical indexing is used
(:class:`pandas.MultiIndex`).
"""
data_set = {}
meta_df = pd.DataFrame(columns=[
'size', 'min_x', 'max_x'])
with h5py.File(file_name, 'r') as file:
data_set, meta_df = __read_hdf5_group(
file[location], data_set, meta_df)
meta_df = __reindex_meta(
meta_df, __count_layer(data_set))
return data_set, meta_df
[docs]def array_to_hdf5(data, file_name, location, attributes=None,
overwrite=False):
"""Saves a numpy array to a HDF5 file.
This is for saving a plane :class:`numpy.ndarray` to a HDF5 using
:class:`h5py.File`. This is meant to work in the same style as
:func:`set_to_hdf5`. It also can save a set of attributes in the form of
a dictionary.
Parameters
----------
data : numpy.ndarray
The data to save to the file in a numpy array.
file_name : str
The name of the HDF5 file to save the data to.
location : str
The location of the :class:`h5py.Dataset`, which is a string with
the groups and the data set name separated by "/".
attributes : dict of {str: val}, optional
A dictionary of meta data to attach to the data set. The keys of the
dictionary need to be `str`. Default is None and attaches no
attributes to the data set.
overwrite : bool, optional
If default of `False` the existing file is not overwritten and is
instead added to. This will throw an error if trying to save to a
location of an already existing dataset.
"""
# Start with checking location specifier
if not isinstance(location, np.str):
raise TypeError(
f"location needs to be a string but was a "
f"{type(location)}")
location = location.replace(" ", "")
if location[-1] == "/":
raise ValueError(
"The location and the data set name need to spesified "
"no data set name was given after the last '/'")
if location[0] != "/": # Same if they spesify the root group
location = "/" +location
# Check the data is a correct kind of np.array
if not isinstance(data, np.ndarray):
raise TypeError(
f"data needs to be a numpy array but is a {type(data)}")
if data.dtype == 'O':
raise TypeError(
f"The array contained python object type values but "
f"these cannot be saved to HDF5 files.")
# Check the attributes are the correct type
if attributes is not None:
if not isinstance(attributes, dict):
raise TypeError(
f"attributes needs to be a dict but was type "
f"{type(attributes)}")
if not all(isinstance(key, np.str) for key in attributes.keys()):
raise TypeError(
f"The keys for the attributes need to be all strings")
# Parse the location into groups and dset names
locs = location.split('/')
dset_name = locs[-1]
if len(locs) == 2:
group_name = None
else:
group_name = "/".join(locs[:-1])
# Open File
read_write = 'w' if overwrite else 'a'
with h5py.File(file_name, read_write) as file:
if group_name is None:
file.create_dataset(dset_name, data=data)
dset = file[dset_name]
elif group_name in file:
file[group_name].create_dataset(dset_name, data=data)
dset = file[group_name][dset_name]
else:
file.create_group(group_name)
file[group_name].create_dataset(dset_name, data=data)
dset = file[group_name][dset_name]
if attributes is not None:
for prop, val in attributes.items():
dset.attrs[prop] = val
[docs]def array_from_hdf5(file_name, location):
"""This reads a dataset in a HDF5 file to a numpy array.
This function is to read the data saved using the :func:`array_to_hdf5`.
It reads the data and the attributes using :class:`h5py.File` and
returns the result.
Parameters
----------
file_name : str
The name of the HDF5 file to be read.
location : str
The location of the dataset with the groups and dataset name
separated by "/".
Returns
-------
data : numpy.ndarray
A numpy array containing the data in the data set.
attributes: dict
A dictionary containing the attributes of the data set that was
read. If there was no attributes then the dictionary will be empty.
"""
if not isinstance(location, np.str):
raise TypeError(
f"location need to be a string but was a {type(location)}")
if location[0] != "/":
location = "/" + location
with h5py.File(file_name, 'r') as file:
if location not in file:
raise ValueError(
f"This is not a valid location of a data set.")
if not isinstance(file[location], h5py.Dataset):
raise ValueError(
f"The spesified location is not a data set")
data = file[location][:]
attributes = dict(file[location].attrs.items())
return data, attributes
[docs]def sort_dset(dataset, apply_key=None, sort_key=None, check_data=True):
"""This sorts and formats the keys in a dataset.
This is useful after loading a dataset from a HDF5 file, as keys that
were floats will have been set to strings and then loaded by the leading
digit. This function can apply a function to each key and then sort them.
Parameters
----------
dataset : recursive dict of Data
This is the dataset to sort which are nested dictionaries of Data
objects.
apply_key : function or list of function, optional
This is a function that will be applied to the keys to reformat them
before the are reordered. If a list is given then each function will
be applied on each layer of the dataset in turn. If `None` then no
function is applied. The default is `None`.
sort_key : function or list of function, optional
This is the key that is passed to `sorted` to sort the dataset
based on its keys. If a list is given then each function will be
applied to each layer of the dataset. If `None` then no key is passed
and the default sorting behaviour is used. If the string 'pass' is
given then no sorting is applied. The default is `None`.
check_data : bool, optional
Whether to check if the objects in the dict are :class:`Data`
objects. The default is True.
Returns
-------
dataset : recursive dict of Data
The dataset after the functions have been applied to the keys and the
keys and then they have been sorted.
"""
if isinstance(dataset, dict): # deal with dataset
pass
elif check_data and not isinstance(dataset, Data):
raise TypeError(
f"The dataset contains values other than gigaanalyis.Data "
f"objects. Contained {type(dataset)}. To turn off this check "
f"set check_data to False.")
else:
return dataset # For recursion
if isinstance(apply_key, list): # if apply_key a list
if len(apply_key) == 0:
raise ValueError(
f"The apply_key list was not as deep as the nested dataset.")
apply_list = apply_key[1:]
apply_key = apply_key[0]
else:
apply_list = None
if apply_key is None: # check apply_key
apply_key = lambda x: x
if apply_list is None:
apply_list = apply_key
elif callable(apply_key):
if apply_list is None:
apply_list=apply_key
else:
raise TypeError(
f"apply_key needs to be callable or a list of callable "
f"functions but was {type(apply_key)}")
if isinstance(sort_key, list): # if sort_ley a list
if len(sort_key) == 0:
raise ValueError(
f"The sort_key list was not as deep as the nested dataset.")
sort_list = sort_key[1:]
sort_key = sort_key[0]
else:
sort_list = None
if sort_key is None: # check sort_key
pass
elif sort_key == "pass":
sort_key = lambda x: 0
if sort_list is None:
sort_list = sort_key
elif callable(sort_key):
if sort_list is None:
sort_list = sort_key
else:
raise TypeError(
f"sort_key needs to be callable or a list of callable "
f"functions but was {type(sort_key)}")
# edit key and preform recursion on included data
dataset = {
apply_key(key):sort_dset(dat, apply_list, sort_list, check_data) \
for key, dat in dataset.items()}
# Sort by key and return
sorted_keys = sorted(dataset.keys(), key=sort_key)
dataset = {key:dataset[key] for key in sorted_keys}
return dataset