Commit 4540712b authored by Kristin Berry's avatar Kristin Berry
Browse files

Merge pull request #69 from jlaura/hdf

Adds HDF io Module
parents 49f7e50b 26658a9e
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -27,13 +27,13 @@ install:
  - conda info -a

  # Create a virtual env and install dependencies
  - conda create -y -q -n test-env python=$TRAVIS_PYTHON_VERSION nose numpy pillow scipy pandas networkx scikit-image sqlalchemy numexpr dill
  - conda create -y -q -n test-env python=$TRAVIS_PYTHON_VERSION nose numpy pillow scipy pandas networkx scikit-image sqlalchemy numexpr dill cython
  # Activate the env
  - source activate test-env

  # Install the non-conda packages if required; duplicates of entries in requirements.txt are ignored
  - conda install -c https://conda.anaconda.org/jlaura opencv3=3.0.0
  - conda install -c https://conda.anaconda.org/anaconda gdal
  - conda install -c https://conda.anaconda.org/jlaura h5py gdal
  - conda install -c osgeo proj4
  - conda upgrade numpy
  - pip install -r requirements.txt
+1 −0
Original line number Diff line number Diff line
@@ -10,6 +10,7 @@ Development Team
* Jeannie Backer <jwbacker@usgs.gov>
* Dyer Lytle <dmlytle@usgs.gov>
* Kelvin Rodriguez <krodriguez@usgs.gov>
* Adam Paquette <acpaquette@usgs.gov>

Contributors
------------

autocnet/fileio/hdf.py

deleted100644 → 0
+0 −38
Original line number Diff line number Diff line
import h5py as h5
import numpy as np

class HDFDataSet(object):
    """
    Read / Write an HDF5 dataset using h5py.

    The underlying file is opened lazily, the first time ``data`` is
    accessed, and the handle is cached on the instance.

    Parameters
    ----------
    filename : str
               Path to the HDF5 file.
    """

    # TODO: the default path is machine specific and should be removed; it
    # is kept only so that existing no-argument callers keep working.
    def __init__(self, filename='/scratch/jlaura/newrun.h5'):
        self.filename = filename
        # Cache of the root level keys, populated by getgroups().
        self.groups = None

    @property
    def data(self):
        """
        The open h5py file handle, created on first access.
        """
        if not hasattr(self, '_data'):
            self._data = h5.File(self.filename)
        return self._data

    def getgroups(self):
        """
        Get all of the first order neighbors to the root node.

        Returns
        -------
        groups : list
            A unicode list of the keys of the file.
        """
        if self.groups is None:
            self.groups = self.data.keys()
        return self.groups

    def getattributes(self):
        """
        Print the attribute (key, value) pairs of every first order
        neighbor of the root node.
        """
        for k in self.getgroups():
            # Single-argument call form works as both the Python 2 print
            # statement and the Python 3 print function.
            print(self.data[k].attrs.items())
+84 −0
Original line number Diff line number Diff line
import h5py as h5
import numpy as np
import pandas as pd


# Module-wide defaults: a compression filter name and its numeric setting.
# NOTE(review): neither constant is referenced in the visible part of this
# module -- presumably they are meant for h5py's ``compression`` and
# ``compression_opts`` dataset arguments; confirm against callers.
DEFAULT_COMPRESSION = 'gzip'
DEFAULT_COMPRESSION_VALUE = 8  # 0 - 9


class HDFDataset(h5.File):
    """
    Read / Write an HDF5 dataset using h5py.  If HDF5 is compiled with
    parallel support, this class will support parallel I/O of all supported
    types as well as Pandas dataframes.

    Parameters
    ----------
    filename : str
               Path to the HDF5 file

    mode : str
           File mode handed to h5py.File.  Default: 'a'
    """

    def __init__(self, filename, mode='a'):
        super(HDFDataset, self).__init__(filename, mode)

    def __del__(self):
        # Close the file handle when the object is garbage collected.
        self.close()

    @staticmethod
    def df_to_sarray(df):
        """
        Convert a pandas DataFrame object to a numpy structured array
        with one field per column.  The index of the dataframe is not
        included in the result.

        From: http://stackoverflow.com/questions/30773073/save-pandas-dataframe-using-h5py-for-interoperabilty-with-other-hdf5-readers

        Parameters
        ----------
        df : dataframe
             the data frame to convert

        Returns
        -------
        z : ndarray
            a numpy structured array representation of df
        """
        # Each field takes the scalar type of the matching column.
        dtype = np.dtype([(name, df[name].dtype.type) for name in df.columns])
        z = np.zeros(len(df), dtype)
        for name in dtype.names:
            # Copy column by column: going through ``df.values`` would first
            # upcast a mixed-dtype frame to one common dtype, which can lose
            # precision (e.g. large int64 values routed through float64).
            z[name] = df[name].values
        return z

    @staticmethod
    def sarray_to_df(sarray, index_column='index'):
        """
        Convert from a structured array back to a Pandas Dataframe

        Parameters
        ----------
        sarray : array
                 numpy structured array

        index_column : str
                       The name of the field to use as the dataframe
                       index.  That field is not kept as a data column.
                       If None, all fields become columns and a default
                       index is used.  Default: 'index'

        Returns
        -------
         : dataframe
           A pandas dataframe
        """
        names = list(sarray.dtype.names)
        if index_column is not None:
            index = sarray[index_column]
            # Drop the field that was actually requested as the index, not
            # a hard coded 'index', so custom index columns are honoured.
            columns = [name for name in names if name != index_column]
        else:
            index = None
            columns = names

        # ``columns`` restricts which fields of sarray end up in the frame.
        return pd.DataFrame(data=sarray, index=index, columns=columns)
+3 −3
Original line number Diff line number Diff line
@@ -207,12 +207,12 @@ class TestWriter(unittest.TestCase):
                    SPHEROID["Moon_2000_IAU_IAG",1737400,0]],
                PRIMEM["Reference_Meridian",0],
                UNIT["Degree",0.017453292519943295]],
            PROJECTION["Mercator_1SP"],
            PROJECTION["Mercator_2SP"],
            PARAMETER["central_meridian",180],
            PARAMETER["false_easting",0],
            PARAMETER["false_northing",0],
            UNIT["Meter",1],
            PARAMETER["latitude_of_origin",0.0]]"""
            PARAMETER["standard_parallel_1",0],
            UNIT["Meter",1]]"""
        dataset = io_gdal.GeoDataset('test.tif')
        test_srs = dataset.spatial_reference.__str__()
        self.assertEqual(test_srs.split(), expected_srs.split())
Loading