HDF5 datastore in place - need to patch in to the node now (7673a2fd) · Commits · aflab / astrogeology / Autocnet

autocnet/fileio/hdf.py

deleted 100644 → 0

+0 −38

Original line number	Diff line number	Diff line
		import h5py as h5
		import numpy as np

		class HDFDataSet(object):
		    """
		    Read / Write an HDF5 dataset using h5py.

		    The file is opened lazily: nothing touches the disk until the ``data``
		    property is first accessed.
		    """

		    # TODO: The default path is hard coded to one user's scratch space; it is
		    # kept only so that existing no-argument callers keep working.
		    def __init__(self, filename='/scratch/jlaura/newrun.h5'):
		        """
		        Parameters
		        ----------
		        filename : str
		                   Path of the HDF5 file to open on first access.
		        """
		        self.filename = filename
		        # Cache of the root-level member names, filled by getgroups().
		        self.groups = None

		    @property
		    def data(self):
		        """
		        The h5py file handle, opened on first access and then reused.
		        """
		        if not hasattr(self, '_data'):
		            self._data = h5.File(self.filename)
		        return self._data

		    def getgroups(self):
		        """
		        Get all of the first order neighbors to the root node.

		        Returns
		        -------
		        groups : list
		                 A list of the keys of the file.
		        """
		        if self.groups is None:
		            # Materialize the keys so the cached value is a real list rather
		            # than a view tied to the open file handle.
		            self.groups = list(self.data.keys())
		        return self.groups

		    def getattributes(self):
		        """
		        Print the (name, value) attribute pairs of every root-level member.
		        """
		        for name in self.getgroups():
		            # Single-argument call form prints identically on Python 2 and 3.
		            print(list(self.data[name].attrs.items()))

autocnet/fileio/io_hdf.py

0 → 100644

+78 −0

Original line number	Diff line number	Diff line
		import h5py as h5
		import numpy as np
		import pandas as pd


		class HDFDataset(h5.File):
		    """
		    Read / Write an HDF5 dataset using h5py. If HDF5 is compiled with
		    parallel support, this class will support parallel I/O of all supported
		    types as well as Pandas dataframes.
		    """

		    def __init__(self, filename, mode='a'):
		        """
		        Parameters
		        ----------
		        filename : str
		                   Path of the HDF5 file.

		        mode : str
		               File mode forwarded to ``h5py.File``; defaults to 'a'.
		        """
		        super(HDFDataset, self).__init__(filename, mode)

		    def __del__(self):
		        # Close the file when the object is garbage collected.
		        self.close()

		    @staticmethod
		    def df_to_sarray(df):
		        """
		        Convert a pandas DataFrame object to a numpy structured array.

		        Every column becomes one field, named after the column and typed
		        after the column's dtype. Values are copied column by column so that
		        a frame with mixed dtypes is never upcast to a single common dtype
		        on the way through. The DataFrame index is not part of the result;
		        call ``df.reset_index()`` first if it has to be preserved.

		        From: http://stackoverflow.com/questions/30773073/save-pandas-dataframe-using-h5py-for-interoperabilty-with-other-hdf5-readers

		        Parameters
		        ----------
		        df : dataframe
		             the data frame to convert

		        Returns
		        -------
		        z : ndarray
		            a numpy structured array representation of df
		        """
		        cols = df.columns
		        types = [(name, df.iloc[:, i].dtype.type)
		                 for i, name in enumerate(cols)]
		        z = np.zeros(len(df), dtype=np.dtype(types))
		        for i, name in enumerate(z.dtype.names):
		            # Positional access keeps the field/column pairing explicit.
		            z[name] = df.iloc[:, i].values
		        return z

		    @staticmethod
		    def sarray_to_df(sarray, index_column='index'):
		        """
		        Convert from a structured array back to a Pandas Dataframe

		        Parameters
		        ----------
		        sarray : array
		                 numpy structured array

		        index_column : str or None
		                       Name of the field to use as the DataFrame index. That
		                       field is left out of the data columns. Pass None to
		                       keep every field as a column and use the default
		                       index. The field must exist in ``sarray``.

		        Returns
		        -------
		         : dataframe
		           A pandas dataframe
		        """
		        names = list(sarray.dtype.names)
		        if index_column is None:
		            index = None
		            columns = names
		        else:
		            index = sarray[index_column]
		            # Drop the field that was actually used as the index, not a
		            # hard coded 'index', so it is not duplicated as a column.
		            columns = [name for name in names if name != index_column]

		        return pd.DataFrame(data=sarray, index=index, columns=columns)
		No newline at end of file

autocnet/fileio/tests/test_io_hdf.py

0 → 100644

+15 −0

Original line number	Diff line number	Diff line
		import unittest

		import numpy as np
		import pandas as pd

		from .. import io_hdf


		class TestHDF(unittest.TestCase):
		    """
		    Exercise the DataFrame <-> structured array helpers on HDFDataset.

		    Both helpers are static methods, so no HDF5 file is opened here.
		    """

		    def test_df_sarray(self):
		        # One integer and one float column, in a fixed column order.
		        df = pd.DataFrame({'a': np.arange(5, dtype=np.int64),
		                           'b': np.linspace(0.0, 1.0, 5)},
		                          columns=['a', 'b'])

		        sarray = io_hdf.HDFDataset.df_to_sarray(df)

		        self.assertEqual(sarray.shape, (5,))
		        self.assertEqual(sarray.dtype.names, ('a', 'b'))
		        np.testing.assert_array_equal(sarray['a'], df['a'].values)
		        np.testing.assert_array_almost_equal(sarray['b'], df['b'].values)

		    def test_sarray_df(self):
		        sarray = np.zeros(5, dtype=[('index', np.int64),
		                                    ('a', np.int64),
		                                    ('b', np.float64)])
		        sarray['index'] = np.arange(10, 15)
		        sarray['a'] = np.arange(5)
		        sarray['b'] = np.linspace(0.0, 1.0, 5)

		        df = io_hdf.HDFDataset.sarray_to_df(sarray)

		        # The 'index' field becomes the index and is not kept as a column.
		        self.assertEqual(list(df.columns), ['a', 'b'])
		        np.testing.assert_array_equal(df.index.values, sarray['index'])
		        np.testing.assert_array_equal(df['a'].values, sarray['a'])
		        np.testing.assert_array_almost_equal(df['b'].values, sarray['b'])

requirements.txt

+2 −1

Original line number	Diff line number	Diff line
		@@ -10,3 +10,4 @@ pandas
		scikit-image
		sqlalchemy
		dill
		h5py
		No newline at end of file