Commit b95a55b3 authored by jlaura, committed by GitHub

Merge pull request #17 from acpaquette/pysat_io

PySAT io to PL io
parents a6b9d17b 8d4d0077
+6 −6
@@ -35,7 +35,7 @@ install:
  - conda config --add channels conda-forge
  - conda config --add channels jlaura
  - conda install -c conda-forge gdal h5py
-  - conda install pandas sqlalchemy pyyaml networkx affine protobuf
+  - conda install pandas sqlalchemy pyyaml networkx affine protobuf scipy
  - pip install pvl

  # Development installation
+1 −1
@@ -54,7 +54,7 @@ install:
    - cmd: conda config --add channels conda-forge
    - cmd: conda config --add channels jlaura
    - cmd: conda install --yes -c conda-forge gdal h5py
-    - cmd: conda install --yes pandas sqlalchemy pyyaml networkx affine
+    - cmd: conda install --yes pandas sqlalchemy pyyaml networkx affine scipy
    - cmd: conda install --yes -c jlaura protobuf pvl

    # Development installation

plio/io/io_ccam_pds.py

0 → 100644
+201 −0
# This code is used to read individual ChemCam files
# Header data is stored as attributes of the data frame
# White space is stripped from the column names
import os

import numpy as np
import pandas as pd
import scipy.io as io

from plio.utils.utils import lookup
from plio.utils.utils import file_search


def CCAM_CSV(input_data, ave=True):
    try:
        df = pd.read_csv(input_data, header=14, engine='c')
        cols = list(df.columns.values)
        df.columns = [i.strip().replace('# ', '') for i in cols]  # strip whitespace from column names
        df.set_index(['wave'], inplace=True)  # use wavelengths as indices
        # read the file header and put information into the dataframe as new columns
        metadata = pd.read_csv(input_data, sep='=', nrows=14, comment=',', engine='c', index_col=0, header=None)
    except:
        try:  # handle files with an extra header row containing temperature
            df = pd.read_csv(input_data, header=15, engine='c')
            cols = list(df.columns.values)
            df.columns = [i.strip().replace('# ', '') for i in cols]  # strip whitespace from column names
            df.set_index(['wave'], inplace=True)  # use wavelengths as indices
            # read the file header and put information into the dataframe as new columns
            metadata = pd.read_csv(input_data, sep='=', nrows=15, comment=',', engine='c', index_col=0, header=None)
        except:  # handle files with an extra header row containing temperature and target name
            df = pd.read_csv(input_data, header=16, engine='c')
            cols = list(df.columns.values)
            df.columns = [i.strip().replace('# ', '') for i in cols]  # strip whitespace from column names
            df.set_index(['wave'], inplace=True)  # use wavelengths as indices
            # read the file header and put information into the dataframe as new columns
            metadata = pd.read_csv(input_data, sep='=', nrows=16, comment=',', engine='c', index_col=0, header=None)

    if ave:
        df = pd.DataFrame(df['mean'])
    else:
        df = df.drop(['mean', 'median'], axis=1)
    df.index = [['wvl'] * len(df.index),
                df.index.values.round(4)]  # create multiindex so spectra can be easily extracted with a single key
    df = df.T  # transpose so that each spectrum is a row

    # remove extraneous stuff from the metadata indices
    metadata.index = [i.strip().strip('# ').replace(' FLOAT', '').lower() for i in metadata.index.values]
    metadata = metadata.T

    # extract info from the file name
    fname = os.path.basename(input_data)
    metadata['sclock'] = fname[4:13]
    metadata['seqid'] = fname[25:34].upper()
    metadata['Pversion'] = fname[34:36]

    # duplicate the metadata for each row in the df
    if not ave:
        metadata = metadata.append([metadata] * (len(df.index) - 1), ignore_index=True)
    metadata.index = df.index  # make the indices match
    metadata.columns = [['meta'] * len(metadata.columns), metadata.columns.values]  # make the columns into multiindex
    df = pd.concat([metadata, df], axis=1)  # combine the spectra with the metadata
    return df


def CCAM_SAV(input_data, ave=True):
    # read the IDL .SAV file

    data = io.readsav(input_data, python_dict=True)

    # put the spectra into data frames and combine them
    df_UV = pd.DataFrame(data['uv'], index=data['defuv'])
    df_VIS = pd.DataFrame(data['vis'], index=data['defvis'])
    df_VNIR = pd.DataFrame(data['vnir'], index=data['defvnir'])
    df_spect = pd.concat([df_UV, df_VIS, df_VNIR])
    df_spect.columns = ['shot' + str(i + 1) for i in
                        df_spect.columns]  # add 1 to the columns so they correspond to shot number

    df_aUV = pd.DataFrame(data['auv'], index=data['defuv'], columns=['average'])
    df_aVIS = pd.DataFrame(data['avis'], index=data['defvis'], columns=['average'])
    df_aVNIR = pd.DataFrame(data['avnir'], index=data['defvnir'], columns=['average'])
    df_ave = pd.concat([df_aUV, df_aVIS, df_aVNIR])

    df_mUV = pd.DataFrame(data['muv'], index=data['defuv'], columns=['median'])
    df_mVIS = pd.DataFrame(data['mvis'], index=data['defvis'], columns=['median'])
    df_mVNIR = pd.DataFrame(data['mvnir'], index=data['defvnir'], columns=['median'])
    df_med = pd.concat([df_mUV, df_mVIS, df_mVNIR])

    df = pd.concat([df_spect, df_ave, df_med], axis=1)
    # create multiindex to access wavelength values
    # also, round the wavelength values to a more reasonable level of precision
    df.index = [['wvl'] * len(df.index), df.index.values.round(4)]
    # transpose so that spectra are rows rather than columns
    df = df.T

    # extract metadata from the file name and add it to the data frame
    # use the multiindex label "meta" for all metadata

    fname = os.path.basename(input_data)

    # for some reason, some ChemCam files have the 'darkname' key, others call it 'darkspec'
    # this try-except pair converts to 'darkname' when needed
    try:
        data['darkname']
    except:
        data['darkname'] = data['darkspec']

    metadata = [fname,
                fname[4:13],
                fname[25:34].upper(),
                fname[34:36],
                data['continuumvismin'],
                data['continuumvnirmin'],
                data['continuumuvmin'],
                data['continuumvnirend'],
                data['distt'],
                data['darkname'],
                data['nshots'],
                data['dnoiseiter'],
                data['dnoisesig'],
                data['matchedfilter']]
    metadata = np.tile(metadata, (len(df.index), 1))
    metadata_cols = list(zip(['meta'] * len(df.index), ['file',
                                                        'sclock',
                                                        'seqid',
                                                        'Pversion',
                                                        'continuumvismin',
                                                        'continuumvnirmin',
                                                        'continuumuvmin',
                                                        'continuumvnirend',
                                                        'distt',
                                                        'dark',
                                                        'nshots',
                                                        'dnoiseiter',
                                                        'dnoisesig',
                                                        'matchedfilter']))
    metadata = pd.DataFrame(metadata, columns=pd.MultiIndex.from_tuples(metadata_cols), index=df.index)

    df = pd.concat([metadata, df], axis=1)
    if ave:
        df = df.loc['average']
        df = df.to_frame().T

    return df


def ccam_batch(directory, searchstring='*.csv', to_csv=None, lookupfile=None, ave=True, progressbar=None):
    # Determine if the file is a .csv or .SAV
    if '.sav' in searchstring.lower():
        is_sav = True
    else:
        is_sav = False
    filelist = file_search(directory, searchstring)
    basenames = np.zeros_like(filelist)
    sclocks = np.zeros_like(filelist)
    P_version = np.zeros_like(filelist, dtype='int')

    # Extract the sclock and version for each file and ensure that only one
    # file per sclock is being read, and that it is the one with the highest version number
    for i, name in enumerate(filelist):
        basenames[i] = os.path.basename(name)
        sclocks[i] = basenames[i][4:13]  # extract the sclock
        P_version[i] = basenames[i][-5:-4]  # extract the version

    sclocks_unique = np.unique(sclocks)  # find unique sclocks
    filelist_new = np.array([], dtype='str')
    for i in sclocks_unique:
        match = (sclocks == i)  # find all instances with matching sclocks
        maxP = P_version[match] == max(P_version[match])  # find the highest version among these files
        filelist_new = np.append(filelist_new, filelist[match][maxP])  # keep only the file with the highest version

    filelist = filelist_new
    # Should add a progress bar for importing large numbers of files
    dt = []

    for i, file in enumerate(filelist):
        print(file)
        if is_sav:
            tmp = CCAM_SAV(file, ave=ave)
        else:
            tmp = CCAM_CSV(file, ave=ave)
        if i == 0:
            combined = tmp
        else:
            # This ensures that rounding errors are not causing mismatches in columns
            cols1 = list(combined['wvl'].columns)
            cols2 = list(tmp['wvl'].columns)
            if set(cols1) == set(cols2):
                combined = pd.concat([combined, tmp])
            else:
                print("Wavelengths don't match!")

    combined.loc[:, ('meta', 'sclock')] = pd.to_numeric(combined.loc[:, ('meta', 'sclock')])

    if lookupfile is not None:

        combined = lookup(combined, lookupfile=lookupfile.replace('[','').replace(']','').replace("'",'').replace(' ','').split(','))
    if to_csv is not None:
        combined.to_csv(to_csv)
    return combined
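
A minimal usage sketch of this module (the file name, directory, and output path below are hypothetical placeholders, not part of this commit): CCAM_CSV and CCAM_SAV read a single released ChemCam product, while ccam_batch gathers every matching file under a directory, keeps only the highest-version file per sclock, and returns one combined DataFrame.

    from plio.io.io_ccam_pds import CCAM_CSV, ccam_batch

    # read one CSV product, keeping only the mean spectrum (hypothetical file name)
    single = CCAM_CSV('CL5_398645626CCS_F0050104CCAM02303P3.csv', ave=True)

    # read every *.csv under a directory, keep all shots, and write the result out
    combined = ccam_batch('/data/chemcam/', searchstring='*.csv',
                          ave=False, to_csv='chemcam_combined.csv')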

plio/io/io_edr.py

0 → 100644
+80 −0
import os

import numpy as np
import pandas as pd


def EDR(input_file):
    f = open(input_file, 'rb')  # read as bytes so python won't complain about the binary part of the file

    # read lines of the header until reaching the end of the libs table (collecting other metadata along the way)
    end_of_libs_table = False
    while end_of_libs_table is False:
        line = str(f.readline(), 'utf-8').replace('\r', '').replace('\n',
                                                                    '')  # convert the current line to a string and get rid of newline characters
        line = line.split('=')  # split the line on equals sign if present
        # look for the name of the value we want, if the current line has it, then set the value
        if 'RECORD_BYTES' in line[0]:
            rbytes = int(line[1])
        if 'LABEL_RECORDS' in line[0]:
            lrecs = int(line[1])
        if 'SPACECRAFT_CLOCK_START_COUNT' in line[0]:
            sclock = int(line[1].replace('"', '').split('.')[0])
        if 'SEQUENCE_ID' in line[0]:
            seqID = line[1].replace('"', '')
        if 'INSTRUMENT_FOCUS_DISTANCE' in line[0]:
            focus_dist = int(line[1])

        if 'INSTRUMENT_TEMPERATURE' in line[0]:
            instrument_temps = line[1] \
                               + str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '') \
                               + str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '') \
                               + str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '')
            instrument_temps = [float(i) for i in
                                instrument_temps.replace('<degC>', '').replace('(', '').replace(')', '').replace(' ',
                                                                                                                 '').split(
                                    ',')]
            instrument_temps_name = str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '')
            instrument_temps_name = instrument_temps_name.split('=')[1] \
                                    + str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '') \
                                    + str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '') \
                                    + str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '') \
                                    + str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '')
            instrument_temps_name = instrument_temps_name.replace(' ', '').replace('(', '').replace(')', '').replace(
                '"', '').split(',')
            f.readline()
            pass
        try:
            if 'CCAM_LIBS_DATA_CONTAINER' in line[1]:
                nshots = int(str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '').split('=')[1])
                start_byte = int(str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '').split('=')[1])
            if 'END_OBJECT' in line[0] and 'CCAM_LIBS_TABLE' in line[1]:
                end_of_libs_table = True
        except:
            pass

    f.close()
    header_skip = lrecs * rbytes  # calculate the number of header bytes to skip to get to the real data

    with open(input_file, "rb") as f:
        f.seek(header_skip + start_byte - 1, 0)
        spectra = []
        while len(spectra) < nshots:
            spectrum = []
            while len(spectrum) < 6444:
                spectrum.append(int.from_bytes(f.read(2), byteorder='big', signed=False))
            spectra.append(spectrum)
    spectra = np.array(spectra, dtype='int')
    cols = np.array(list(range(spectra.shape[1]))) + 1
    cols = [('channel', i) for i in cols]
    inds = np.array(list(range(spectra.shape[0]))) + 1
    sp = pd.DataFrame(spectra, columns=pd.MultiIndex.from_tuples(cols), index=inds)
    sp[('meta', 'EDR_file')] = os.path.basename(input_file)
    sp[('meta', 'Spacecraft_Clock')] = sclock
    sp[('meta', 'Shot')] = sp.index
    sp[('meta', 'SeqID')] = seqID
    sp[('meta', 'Focus_Distance')] = focus_dist
    for ind, name in enumerate(instrument_temps_name):
        sp[('meta', name + '_temp')] = instrument_temps[ind]
    sp.to_csv('test.csv')
    return sp
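
A minimal usage sketch (the EDR file name is a hypothetical placeholder): EDR parses the PDS label for the record size, spacecraft clock, and table location, then reads nshots spectra of 6444 big-endian unsigned 16-bit channels into a DataFrame with ('channel', n) and ('meta', ...) columns. Note that, as written, it also writes a copy to test.csv in the working directory.

    from plio.io.io_edr import EDR

    # hypothetical EDR product name
    shots = EDR('cl5_398645626edr_f0050104ccam02303m1.dat')
    print(shots[('meta', 'Spacecraft_Clock')].iloc[0])  # metadata pulled from the label
    print(shots['channel'].shape)                       # (nshots, 6444) raw DN values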

plio/io/io_jsc.py

0 → 100644
+202 −0
import os

import numpy as np
import pandas as pd
from pandas.core.common import array_equivalent

from plio.utils.utils import file_search


# This function reads the lookup tables used to expand metadata from the file names
# This is separated from parsing the filenames so that for large lists of files the
# lookup tables don't need to be read over and over
#
# Info in the tables is stored in a dict of dataframes so that only one variable
# (the dict) needs to be passed between functions
def read_refdata(LUT_files):
    ID_info = pd.read_csv(LUT_files['ID'], index_col=0)
    spectrometer_info = pd.read_csv(LUT_files['spect'], index_col=0)
    # spectrometer_info.reset_index(inplace=True)
    laser_info = pd.read_csv(LUT_files['laser'], index_col=0)
    # laser_info.reset_index(inplace=True)
    exp_info = pd.read_csv(LUT_files['exp'], index_col=0)
    # exp_info.reset_index(inplace=True)
    sample_info = pd.read_csv(LUT_files['sample'], index_col=0)
    # sample_info.reset_index(inplace=True)
    refdata = {'spect': spectrometer_info, 'laser': laser_info, 'exp': exp_info, 'sample': sample_info, 'ID': ID_info}
    return refdata


# This function parses the file names to record metadata related to the observation
def jsc_filename_parse(filename, refdata):
    filename = os.path.basename(filename)  # strip the path off of the file name
    filename = filename.split('_')  # split the file name on underscores
    libs_ID = filename[0]
    laserID = filename[4][0]
    expID = filename[5]
    spectID = filename[6]

    try:
        sampleID = refdata['ID'].loc[libs_ID].values[0]
        file_info = pd.DataFrame(refdata['sample'].loc[sampleID])
        if file_info.columns.shape[0] < file_info.index.shape[0]:
            file_info = file_info.T
        if file_info.index.shape[0] > 1:
            print('More than one matching row for ' + sampleID + '!')
            tempID = 'Unknown'
            file_info = pd.DataFrame(refdata['sample'].loc[tempID])
            if file_info.columns.shape[0] < file_info.index.shape[0]:
                file_info = file_info.T


    except:
        sampleID = 'Unknown'
        file_info = pd.DataFrame(refdata['sample'].loc[sampleID])
        if file_info.columns.shape[0] < file_info.index.shape[0]:
            file_info = file_info.T

    file_info['Sample ID'] = sampleID
    file_info['LIBS ID'] = libs_ID
    file_info.reset_index(level=0, inplace=True, drop=True)
    file_info['loc'] = int(filename[1])
    file_info['lab'] = filename[2]
    file_info['gas'] = filename[3][0]
    file_info['pressure'] = float(filename[3][1:])

    if laserID in refdata['laser'].index:
        laser_info = pd.DataFrame(refdata['laser'].loc[laserID]).T
        laser_info.index.name = 'Laser Identifier'
        laser_info.reset_index(level=0, inplace=True)
        file_info = pd.concat([file_info, laser_info], axis=1)

    file_info['laser_power'] = float(filename[4][1:])
    if expID in refdata['exp'].index:
        exp_info = pd.DataFrame(refdata['exp'].loc[expID]).T
        exp_info.index.name = 'Exp Identifier'
        exp_info.reset_index(level=0, inplace=True)
        file_info = pd.concat([file_info, exp_info], axis=1)

    file_info['spectrometer'] = spectID
    if spectID in refdata['spect'].index:
        temp = refdata['spect'].loc[spectID]
        temp = [temp[2], temp[4:]]
        spect_info = pd.DataFrame(refdata['spect'].loc[spectID]).T
        spect_info.index.name = 'Spectrometer Identifier'
        spect_info.reset_index(level=0, inplace=True)
        file_info = pd.concat([file_info, spect_info], axis=1)

    return file_info


def JSC(input_files, refdata):
    try:
        # read the first file
        data = pd.read_csv(input_files[0], skiprows=14, sep='\t', engine='c')
        data = data.rename(columns={data.columns[0]: 'time1', data.columns[1]: 'time2'})
        metadata = pd.concat([jsc_filename_parse(input_files[0], refdata)] * len(data.index))
        metadata.drop('spectrometer', axis=1, inplace=True)

        # read the next files and merge them with the first
        for file in input_files[1:]:
            datatemp = pd.read_csv(file, skiprows=14, sep='\t', engine='c')
            datatemp = datatemp.rename(columns={datatemp.columns[0]: 'time1', datatemp.columns[1]: 'time2'})
            data = data.merge(datatemp)

        time = data[['time1', 'time2']]  # split the two time columns from the data frame
        data.drop(['time1', 'time2'], axis=1, inplace=True)  # trim the data frame so it is just the spectra

        # make a multiindex for each wavelength column so they can be easily isolated from metadata later
        data.columns = [['wvl'] * len(data.columns), np.array(data.columns.values, dtype='float').round(4)]

        metadata.index = data.index
        metadata = pd.concat([metadata, time], axis=1)
        compcols = ['SiO2', 'TiO2', 'Al2O3', 'Cr2O3', 'Fe2O3T', 'MnO', 'MgO', 'CaO', 'Na2O', 'K2O', 'P2O5',
                    'SO3 LOI Residue', 'Total', 'Total Includes', '%LOI', 'FeO',
                    'Fe2O3', 'SO3 Actual', 'Fe(3+)/Fe(Total)', 'Rb (ug/g)', 'Sr (ug/g)', 'Y (ug/g)', 'Zr (ug/g)',
                    'V (ug/g)', 'Ni (ug/g)', 'Cr (ug/g)',
                    'Nb (ug/g)', 'Ga (ug/g)', 'Cu (ug/g)', 'Zn (ug/g)', 'Co (ug/g)', 'Ba (ug/g)', 'La (ug/g)',
                    'Ce (ug/g)', 'U (ug/g)', 'Th (ug/g)', 'Sc (ug/g)',
                    'Pb (ug/g)', 'Ge (ug/g)', 'As (ug/g)', 'Cl (ug/g)']
        compdata = metadata[compcols]
        metadata.drop(compcols, axis=1, inplace=True)
        metadata.columns = [['meta'] * len(metadata.columns), metadata.columns.values]
        compdata.columns = [['comp'] * len(compdata.columns), compdata.columns.values]
        data = pd.concat([data, metadata, compdata], axis=1)

        data[('meta', 'Scan #')] = data.index
        data.set_index(('meta', 'time2'), drop=False, inplace=True)

        return data
    except:
        print('Problem reading: ' + input_files[0])
        print('Moving to Problem_Files')
        os.rename(input_files[0],
                  r"Problem_Files\\" + os.path.basename(input_files[0]))
        return None


def jsc_batch(directory, LUT_files, searchstring='*.txt', to_csv=None):
    # Read in the lookup tables to expand filename metadata
    refdata = read_refdata(LUT_files)
    # get the list of files that match the search string in the given directory
    filelist = file_search(directory, searchstring)
    spectIDs = []  # create an empty list to hold the spectrometer IDs
    libsIDs = []
    timestamps = []
    locs = []
    for file in filelist:
        filesplit = os.path.basename(file).split('_')
        spectIDs.append(filesplit[6])  # get the spectrometer IDs for each file in the list
        libsIDs.append(filesplit[0])
        timestamps.append(filesplit[-1].split('.')[0])
        locs.append(filesplit[1])
    spectIDs_unique = np.unique(spectIDs)  # get the unique spectrometer IDs
    libsIDs_unique = np.unique(libsIDs)
    dfs = []  # create an empty list to hold the data frames for each spectrometer

    # loop through each LIBS ID
    alldata = []
    for ID in libsIDs_unique:
        print('Working on : ' + str(ID))
        sublist = filelist[np.in1d(libsIDs, ID)]
        locs = []
        for file in sublist:
            locs.append(os.path.basename(file).split('_')[1])
        locs_unique = np.unique(locs)
        # loop through each location for that libs ID
        for loc in locs_unique:
            print(loc)
            sub_sublist = sublist[np.in1d(locs, loc)]  # get the files for that LIBSID and location
            data = JSC(sub_sublist, refdata)
            alldata.append(data)
            pass

    combined = pd.concat(alldata)
    if to_csv is not None:
        print('Writing combined data to: ' + to_csv)
        combined.to_csv(to_csv)
    return combined


# got this function from stack overflow: http://stackoverflow.com/questions/14984119/python-pandas-remove-duplicate-columns
# it's slow but doesn't crash python like combined.T.drop_duplicates().T does in some cases with very large sets of data
def duplicate_columns(frame):
    groups = frame.columns.to_series().groupby(frame.dtypes).groups
    dups = []

    for t, v in groups.items():

        cs = frame[v].columns
        vs = frame[v]
        lcs = len(cs)

        for i in range(lcs):
            ia = vs.iloc[:, i].values
            for j in range(i + 1, lcs):
                ja = vs.iloc[:, j].values
                if array_equivalent(ia, ja):
                    dups.append(cs[i])
                    break

    return dups
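
A minimal usage sketch (all paths below are hypothetical placeholders): LUT_files maps the keys that read_refdata expects ('ID', 'spect', 'laser', 'exp', 'sample') to the lookup-table CSVs, and jsc_batch walks a directory of tab-delimited spectra, expands the filename metadata, and concatenates everything into one DataFrame; duplicate_columns can then be used to find repeated columns in the result.

    from plio.io.io_jsc import jsc_batch, duplicate_columns

    # hypothetical lookup-table paths -- point these at local copies of the JSC tables
    LUT_files = {'ID': 'ID_lookup.csv',
                 'spect': 'spectrometer_lookup.csv',
                 'laser': 'laser_lookup.csv',
                 'exp': 'experiment_lookup.csv',
                 'sample': 'sample_lookup.csv'}

    combined = jsc_batch('/data/jsc_libs/', LUT_files,
                         searchstring='*.txt', to_csv='jsc_combined.csv')

    # drop any exactly duplicated columns left over from merging the lookup tables
    combined = combined.drop(duplicate_columns(combined), axis=1)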