Commit 6c6fbf93 authored by Ryan Anderson's avatar Ryan Anderson
Browse files

Adding Ryan's LIBS stuff to autocnet

These are some file i/o and preprocessing functions for LIBS data types.
parent c3fdf9f1
Loading
Loading
Loading
Loading
+127 −0
Original line number Diff line number Diff line
# This code is used to read individual ChemCam CCS .csv files
# Header data is stored as attributes of the data frame
# White space is stripped from the column names
import os
import numpy as np
import pandas as pd
import scipy
from pysat.fileio.header_parser import header_parser
from pysat.fileio.utils import file_search

def CCS(input_data):
    """Read one ChemCam CCS .csv file into a data frame.

    The table that starts at line 15 of the file (``header=14``) is read with
    its first column as the row index, then transposed so that the original
    row labels become columns. Those labels are converted to floats, rounded
    to 5 decimals and stored under a two-level column index whose first level
    is ``'wvl'``.

    Metadata is added as extra columns (inefficient, but it makes it much
    easier to concatenate data from multiple files):

    * ``sclock``, ``seqid`` and ``Pversion`` are sliced out of the file name.
    * Every key/value pair parsed from the first 14 lines of the file becomes
      a column; a ``'_float'`` marker in a key is removed from the column name.

    Parameters
    ----------
    input_data : str
        Path to the CCS .csv file.

    Returns
    -------
    pandas.DataFrame
        The transposed table with the metadata columns appended.
    """
    # DataFrame.from_csv no longer exists in current pandas; read_csv with
    # index_col=0 is the documented replacement. Date parsing of the index is
    # deliberately not requested because the index is converted with float()
    # below.
    df = pd.read_csv(input_data, header=14, index_col=0)
    df.rename(columns=lambda x: x.strip(), inplace=True)  # strip whitespace from column names

    # transpose the data frame so the original row labels become the columns
    df = df.transpose()
    df.columns = pd.MultiIndex.from_tuples(
        [('wvl', round(float(x), 5)) for x in df.columns])

    # extract info from the file name (fixed character positions)
    fname = os.path.basename(input_data)
    df['sclock'] = fname[4:13]
    df['sclock'] = pd.to_numeric(df['sclock'])
    df['seqid'] = fname[25:34].upper()
    df['Pversion'] = fname[34:36]

    # read the file header: only the text before the first comma of each of
    # the first 14 lines is handed to header_parser
    # (presumably header_parser returns a dict of parsed key/value pairs,
    # since its result is passed to dict.update -- confirm against its source)
    header = {}
    with open(input_data, 'r') as f:
        for i, row in enumerate(f):
            if i >= 14:
                break  # no need to read the rest of the file
            header.update(header_parser(row.split(',')[0], '='))

    for label, data in header.items():
        df[label.replace('_float', '')] = data

    return df
        
def CCS_SAV(input_data):
    """Read one ChemCam CCS IDL .SAV file into a data frame.

    The arrays stored under ``uv``/``vis``/``vnir`` (and their ``a``- and
    ``m``-prefixed counterparts, used here as the 'ave' and 'median' columns)
    are stacked into one table whose rows are labelled ``('wvl', wavelength)``
    with the wavelengths taken from ``defuv``/``defvis``/``defvnir`` rounded
    to 5 decimals. The table is then transposed so there is one row per shot
    plus the 'ave' and 'median' rows.

    Every remaining entry of the .SAV dictionary, together with ``sclock``,
    ``seqid`` and ``Pversion`` sliced from the file name, is added as a
    column. ``label_info`` is discarded (PDS label info is not currently
    used).

    Parameters
    ----------
    input_data : str
        Path to the .SAV file.

    Returns
    -------
    pandas.DataFrame
    """
    # "import scipy" alone does not guarantee that the scipy.io submodule is
    # loaded, so import the reader explicitly.
    from scipy.io import readsav

    d = readsav(input_data, python_dict=True)

    # combine the three spectrometers
    spectra = np.vstack([d['uv'], d['vis'], d['vnir']])
    aspectra = np.array([np.hstack([d['auv'], d['avis'], d['avnir']])]).T
    mspectra = np.array([np.hstack([d['muv'], d['mvis'], d['mvnir']])]).T
    wvls = [('wvl', round(x, 5))
            for x in np.hstack([d['defuv'], d['defvis'], d['defvnir']])]

    # remove the above elements from the dict so that only metadata is left
    for key in ('uv', 'vis', 'vnir',
                'auv', 'avis', 'avnir',
                'muv', 'mvis', 'mvnir',
                'defuv', 'defvis', 'defvnir'):
        del d[key]

    # define column names: shot1..shotN followed by the two summary spectra
    shots = ['shot' + str(i) for i in range(1, d['nshots'] + 1)]
    shots.extend(['ave', 'median'])
    df = pd.DataFrame(np.hstack([spectra, aspectra, mspectra]),
                      columns=shots,
                      index=pd.MultiIndex.from_tuples(wvls))
    df = df.transpose()

    del d['label_info']  # not currently using PDS label info

    # extract info from the file name (fixed character positions)
    fname = os.path.basename(input_data)
    d['sclock'] = fname[4:13]
    d['seqid'] = fname[25:34].upper()
    d['Pversion'] = fname[34:36]

    for label, data in d.items():
        # isinstance also covers bytes subclasses such as numpy.bytes_
        if isinstance(data, bytes):
            data = data.decode()
        df[label] = data

    df['sclock'] = pd.to_numeric(df['sclock'])

    return df

def ccs_batch(directory, searchstring='*CCS*.csv', is_sav=False):
    """Read every matching CCS file in a directory and stack the results.

    Parameters
    ----------
    directory : str
        Directory passed to ``file_search``.
    searchstring : str
        Pattern passed to ``file_search``. If it contains ``'SAV'`` the files
        are read with ``CCS_SAV``.
    is_sav : bool
        Force the .SAV reader even when ``searchstring`` does not contain
        ``'SAV'``. (Previously this argument was silently overwritten.)

    Returns
    -------
    pandas.DataFrame or None
        The concatenation of all files whose float-typed columns match those
        of the data accumulated so far; files that do not match are skipped
        with a printed message. ``None`` when no file matched the search.
    """
    is_sav = is_sav or 'SAV' in searchstring
    read_one = CCS_SAV if is_sav else CCS

    combined = None
    for path in file_search(directory, searchstring):
        tmp = read_one(path)

        if combined is None:
            # first file: nothing to compare against yet
            combined = tmp
            continue

        # only stack files that share the same set of float columns
        cols1 = list(combined.columns[combined.dtypes == 'float'])
        cols2 = list(tmp.columns[tmp.dtypes == 'float'])
        if set(cols1) == set(cols2):
            combined = pd.concat([combined, tmp])
        else:
            print("Wavelengths don't match!")
    return combined
    
        
 No newline at end of file
+36 −0
Original line number Diff line number Diff line
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 30 08:58:07 2015

@author: rbanderson
This is a simple function to read in CSV data.
If setindex is specified, then it uses the column of the CSV with the 
specified name as the row index of the data frame
"""
import pandas as pd
def CSV(filename, sep=',', setindex=None):
    """Read a CSV file, grouping numeric-named columns under ``'wvl'``.

    Each column whose name can be converted to a float is treated as a
    spectral channel and relabelled ``('wvl', <name rounded to 5 decimals>)``.
    All other columns are appended after the spectral ones under their
    original names.

    Parameters
    ----------
    filename : str
        Path of the file to read.
    sep : str
        Field separator passed to ``pandas.read_csv``.
    setindex : str or None
        If given, the column with this name becomes the row index.

    Returns
    -------
    pandas.DataFrame
    """
    print('Reading '+filename)
    df = pd.read_csv(filename, sep=sep)

    wvl_positions = []
    wvl_labels = []
    other_positions = []
    for i, x in enumerate(df.columns):
        try:
            wvl = round(float(x), 5)
        except (TypeError, ValueError):
            # not a number: this is a metadata column
            other_positions.append(i)
        else:
            wvl_labels.append(('wvl', wvl))
            wvl_positions.append(i)

    # The collected values are column *positions*, so select with iloc;
    # df[[0, 1, ...]] would look them up as column labels instead.
    # copy() so the assignments below act on an independent frame.
    df_spectra = df.iloc[:, wvl_positions].copy()
    df_data = df.iloc[:, other_positions]
    df_spectra.columns = pd.MultiIndex.from_tuples(wvl_labels)
    for x in df_data.columns:
        df_spectra[x] = df_data[x]
    df = df_spectra

    if setindex:
        df = df.set_index([setindex])

    return df
 No newline at end of file
+27 −0
Original line number Diff line number Diff line

import pandas as pd
from pysat.fileio.header_parser import header_parser

def EDR(input_data):
    """Read one EDR text file into a data frame with one row per shot.

    Lines 3-28 of the file (0-based indices 2..27) are passed to
    ``header_parser`` with ``':'`` as the separator and collected as
    metadata. The number of whitespace-separated fields on the line at index
    29 determines how many ``shotN`` column names are generated. The data are
    then read with ``pandas.read_csv`` (skipping the first 29 lines),
    transposed, and each metadata entry is added as a column.

    Parameters
    ----------
    input_data : str
        Path to the EDR file.

    Returns
    -------
    pandas.DataFrame
    """
    header = {}
    with open(input_data, 'r') as f:
        for lineno, line in enumerate(f):
            if 2 <= lineno < 28:
                # read the header values into a dict
                header.update(header_parser(line, ':'))
            elif lineno == 29:
                # one column name per field on the first data line
                nfields = len(line.split())
                shots = ['shot' + str(n) for n in range(1, nfields + 1)]

    table = pd.read_csv(input_data, sep='    ', skiprows=29, names=shots)
    df = table.transpose()

    # insert the header metadata as columns
    for label, data in header.items():
        df[label] = data
    return df
          
          
                    
+109 −0
Original line number Diff line number Diff line
import datetime
import os
import re

import numpy as np
import pandas as pd

from pysat.spectral.spectra import Spectra
from pysat.fileio.header_parser import header_parser
from pysat.fileio.utils import file_search

#This function reads the lookup tables used to expand metadata from the file names
#This is separated from parsing the filenames so that for large lists of files the 
#lookup tables don't need to be read over and over
#
#Info in the tables is stored in a dict of dataframes so that only one variable 
#(the dict) needs to be passed between functions
def read_refdata(LUT_files):
    """Load the four lookup tables into a dict of data frames.

    Parameters
    ----------
    LUT_files : dict
        Maps ``'spect'``, ``'laser'``, ``'exp'`` and ``'sample'`` to the CSV
        file holding that table. The first column of each file is used as the
        row index.

    Returns
    -------
    dict
        The same four keys, each mapped to the corresponding data frame.
    """
    table_names = ('spect', 'laser', 'exp', 'sample')
    return {name: pd.read_csv(LUT_files[name], index_col=0)
            for name in table_names}

#This function parses the file names to record metadata related to the observation
def jsc_filename_parse(filename, refdata):
    """Expand the underscore-separated fields of a JSC file name into metadata.

    The base name is split on ``'_'`` and read positionally:

    * field 0: LIBS ID, looked up in ``refdata['sample']`` (falls back to the
      row labelled ``'Unknown'`` when the ID is not in the table index)
    * field 1: ``loc`` (int)
    * field 2: ``lab``
    * field 3: first character is ``gas``, the remainder is ``pressure``
    * field 4: first character is the laser ID (looked up in
      ``refdata['laser']`` when present), the remainder is ``laser_power``
    * field 5: experiment ID (looked up in ``refdata['exp']`` when present)

    Spectrometer metadata (field 6) is not expanded here.

    Parameters
    ----------
    filename : str
        File name or path; any directory part is ignored.
    refdata : dict
        Lookup tables keyed by ``'sample'``, ``'laser'`` and ``'exp'``, each a
        data frame indexed by the corresponding identifier.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with the metadata as columns.

    Raises
    ------
    KeyError
        If the LIBS ID is unknown and the sample table has no ``'Unknown'``
        row.
    """
    parts = os.path.basename(filename).split('_')  # strip the path, split on underscores
    libs_ID = parts[0]
    laserID = parts[4][0]
    expID = parts[5]

    # Membership must be tested against the index: "x in DataFrame" checks the
    # column names, which would send every known sample to the 'Unknown' row.
    sample_table = refdata['sample']
    sample_key = libs_ID if libs_ID in sample_table.index else 'Unknown'
    file_info = _jsc_lookup_row(sample_table, sample_key, 'LIBS ID')

    file_info['loc'] = int(parts[1])
    file_info['lab'] = parts[2]
    file_info['gas'] = parts[3][0]
    file_info['pressure'] = float(parts[3][1:])

    if laserID in refdata['laser'].index:
        laser_info = _jsc_lookup_row(refdata['laser'], laserID, 'Laser Identifier')
        file_info = pd.concat([file_info, laser_info], axis=1)

    file_info['laser_power'] = float(parts[4][1:])

    if expID in refdata['exp'].index:
        exp_info = _jsc_lookup_row(refdata['exp'], expID, 'Exp Identifier')
        file_info = pd.concat([file_info, exp_info], axis=1)

    return file_info


def _jsc_lookup_row(table, key, index_name):
    """Return row *key* of *table* as a one-row frame, with the key itself
    exposed as a leading column named *index_name*."""
    row = pd.DataFrame(table.loc[key]).T
    row.index.name = index_name
    row.reset_index(level=0, inplace=True)
    return row
    

def JSC(input_file, refdata):
    """Read one JSC spectra file and attach its file-name metadata to every row.

    The tab-separated table that follows the first 14 lines is read, its
    first two columns are renamed ``time1`` and ``time2``, and the single-row
    metadata frame produced by ``jsc_filename_parse`` is repeated once per
    data row and placed to the left of the data.

    Parameters
    ----------
    input_file : str
        Path to the JSC file.
    refdata : dict
        Lookup tables, passed through to ``jsc_filename_parse``.

    Returns
    -------
    pandas.DataFrame
    """
    spectra = pd.read_csv(input_file, skiprows=14, sep='\t')
    new_names = {spectra.columns[0]: 'time1', spectra.columns[1]: 'time2'}
    spectra = spectra.rename(columns=new_names)

    # one copy of the file-level metadata per data row, aligned on the data index
    file_info = jsc_filename_parse(input_file, refdata)
    repeated = pd.concat([file_info for _ in range(len(spectra.index))])
    repeated.index = spectra.index

    return pd.concat([repeated, spectra], axis=1)
   
        


def jsc_batch(directory, LUT_files, searchstring='*.txt'):
    """Read all matching JSC files in a directory into one data frame.

    Files are grouped by the spectrometer ID found in field 6 of their
    underscore-separated base name. The files of each group are read with
    ``JSC`` and concatenated; the per-spectrometer frames are then merged
    one after another with ``DataFrame.merge``.

    Parameters
    ----------
    directory : str
        Directory passed to ``file_search``.
    LUT_files : dict
        Lookup-table file paths, passed to ``read_refdata``.
    searchstring : str
        Pattern passed to ``file_search``.

    Returns
    -------
    pandas.DataFrame
    """
    # read the lookup tables once so they are not re-read for every file
    refdata = read_refdata(LUT_files)
    # NOTE(review): the boolean-mask indexing below assumes file_search
    # returns a numpy array -- confirm against its implementation
    filelist = file_search(directory, searchstring)

    # spectrometer ID of each file, in file order
    spectIDs = [os.path.basename(path).split('_')[6] for path in filelist]

    # one concatenated frame per unique spectrometer ID
    dfs = []
    for spect in np.unique(spectIDs):
        sublist = filelist[np.in1d(spectIDs, spect)]
        dfs.append(pd.concat([JSC(path, refdata) for path in sublist]))

    # combine the per-spectrometer frames into a single frame
    combined = dfs[0]
    for frame in dfs[1:]:
        combined = combined.merge(frame)

    return combined
                    

autocnet/fileio/io_libs.py

deleted100644 → 0
+0 −39
Original line number Diff line number Diff line
import numpy as np
import pandas as pd

from autocnet.spectral.spectra import Spectra

class LIBS(object):
    """Reader for a tab-separated LIBS text file.

    Lines are interpreted by position (0-based):

    * 0 and 13: ignored
    * 1: the text after the first ``':'`` is stored as ``self.Date``
    * 2-12: ``key: value`` pairs stored as attributes, with whitespace in the
      key replaced by underscores
    * 14: the wavelength axis
    * 15 onward: one spectrum per line -- time, spectrum ID, then intensities

    The spectra end up in ``self.spectra`` (a ``Spectra`` wrapping a data
    frame indexed by wavelength with one column per spectrum ID), or ``None``
    if the file contains no spectrum lines.
    """

    def __init__(self, input_data):
        self.spectra = None
        with open(input_data, 'r') as f:
            # Could easily add regex to the parsing to be more robust reading.
            for i, l in enumerate(f):
                if i == 14:
                    wavelengths = np.fromstring(l, sep=' ')
                elif i > 14:
                    sl = l.split('\t')
                    sid = sl[1]
                    # Build a real list first: on Python 3, np.asarray(map(...))
                    # would wrap the map object instead of its values.
                    rawsp = np.asarray([float(v) for v in sl[2:]])
                    # Compare with None explicitly rather than relying on the
                    # truthiness of the Spectra object.
                    if self.spectra is None:
                        df = pd.DataFrame(rawsp, columns=[sid],
                                          index=wavelengths)
                        self.spectra = Spectra(df)
                    else:
                        self.spectra.df[sid] = rawsp
                elif i == 0 or i == 13:
                    pass
                elif i == 1:
                    date = ' '.join(l.rstrip().split(':')[1:])
                    # date = datetime.datetime(date)  # Format needs to be specified
                    setattr(self, 'Date', date)
                else:
                    # split on the first ':' only so values that themselves
                    # contain a colon are kept intact
                    key, v = l.split(':', 1)
                    k = '_'.join(key.split())
                    setattr(self, k, v.rstrip())
Loading