Loading .travis.yml +6 −6 Original line number Diff line number Diff line Loading @@ -35,7 +35,7 @@ install: - conda config --add channels conda-forge - conda config --add channels jlaura - conda install -c conda-forge gdal h5py - conda install pandas sqlalchemy pyyaml networkx affine protobuf - conda install pandas sqlalchemy pyyaml networkx affine protobuf scipy - pip install pvl # Development installation Loading appveyor.yml +1 −1 Original line number Diff line number Diff line Loading @@ -54,7 +54,7 @@ install: - cmd: conda config --add channels conda-forge - cmd: conda config --add channels jlaura - cmd: conda install --yes -c conda-forge gdal h5py - cmd: conda install --yes pandas sqlalchemy pyyaml networkx affine - cmd: conda install --yes pandas sqlalchemy pyyaml networkx affine scipy - cmd: conda install --yes -c jlaura protobuf pvl # Development installation Loading plio/io/io_ccam_pds.py 0 → 100644 +201 −0 Original line number Diff line number Diff line # This code is used to read individual ChemCam files # Header data is stored as attributes of the data frame # White space is stripped from the column names import os import numpy as np import pandas as pd import scipy.io as io from plio.utils.utils import lookup from plio.utils.utils import file_search def CCAM_CSV(input_data, ave=True): try: df = pd.read_csv(input_data, header=14, engine='c') cols = list(df.columns.values) df.columns = [i.strip().replace('# ', '') for i in cols] # strip whitespace from column names df.set_index(['wave'], inplace=True) # use wavelengths as indices # read the file header and put information into the dataframe as new columns metadata = pd.read_csv(input_data, sep='=', nrows=14, comment=',', engine='c', index_col=0, header=None) except: try: # handle files with an extra header row containing temperature df = pd.read_csv(input_data, header=15, engine='c') cols = list(df.columns.values) df.columns = [i.strip().replace('# ', '') for i in cols] # strip whitespace from column 
names df.set_index(['wave'], inplace=True) # use wavelengths as indices # read the file header and put information into the dataframe as new columns metadata = pd.read_csv(input_data, sep='=', nrows=15, comment=',', engine='c', index_col=0, header=None) except: # handle files with an extra header row containing temperature and target name df = pd.read_csv(input_data, header=16, engine='c') cols = list(df.columns.values) df.columns = [i.strip().replace('# ', '') for i in cols] # strip whitespace from column names df.set_index(['wave'], inplace=True) # use wavelengths as indices # read the file header and put information into the dataframe as new columns metadata = pd.read_csv(input_data, sep='=', nrows=16, comment=',', engine='c', index_col=0, header=None) if ave: df = pd.DataFrame(df['mean']) else: df = df.drop(['mean', 'median'], axis=1) df.index = [['wvl'] * len(df.index), df.index.values.round(4)] # create multiindex so spectra can be easily extracted with a single key df = df.T # transpose so that each spectrum is a row # remove extraneous stuff from the metadataindices metadata.index = [i.strip().strip('# ').replace(' FLOAT', '').lower() for i in metadata.index.values] metadata = metadata.T # extract info from the file name fname = os.path.basename(input_data) metadata['sclock'] = fname[4:13] metadata['seqid'] = fname[25:34].upper() metadata['Pversion'] = fname[34:36] # duplicate the metadata for each row in the df if not ave: metadata = metadata.append([metadata] * (len(df.index) - 1), ignore_index=True) metadata.index = df.index # make the indices match metadata.columns = [['meta'] * len(metadata.columns), metadata.columns.values] # make the columns into multiindex df = pd.concat([metadata, df], axis=1) # combine the spectra with the metadata return df def CCAM_SAV(input_data, ave=True): # read the IDL .SAV file data = io.readsav(input_data, python_dict=True) # put the spectra into data frames and combine them df_UV = pd.DataFrame(data['uv'], 
index=data['defuv']) df_VIS = pd.DataFrame(data['vis'], index=data['defvis']) df_VNIR = pd.DataFrame(data['vnir'], index=data['defvnir']) df_spect = pd.concat([df_UV, df_VIS, df_VNIR]) df_spect.columns = ['shot' + str(i + 1) for i in df_spect.columns] # add 1 to the columns so they correspond to shot number df_aUV = pd.DataFrame(data['auv'], index=data['defuv'], columns=['average']) df_aVIS = pd.DataFrame(data['avis'], index=data['defvis'], columns=['average']) df_aVNIR = pd.DataFrame(data['avnir'], index=data['defvnir'], columns=['average']) df_ave = pd.concat([df_aUV, df_aVIS, df_aVNIR]) df_mUV = pd.DataFrame(data['muv'], index=data['defuv'], columns=['median']) df_mVIS = pd.DataFrame(data['mvis'], index=data['defvis'], columns=['median']) df_mVNIR = pd.DataFrame(data['mvnir'], index=data['defvnir'], columns=['median']) df_med = pd.concat([df_mUV, df_mVIS, df_mVNIR]) df = pd.concat([df_spect, df_ave, df_med], axis=1) # create multiindex to access wavelength values # also, round the wavlength values to a more reasonable level of precision df.index = [['wvl'] * len(df.index), df.index.values.round(4)] # transpose so that spectra are rows rather than columns df = df.T # extract metadata from the file name and add it to the data frame # use the multiindex label "meta" for all metadata fname = os.path.basename(input_data) # for some reason, some ChemCam files have the 'darkname' key, others call it 'darkspect' # this try-except pair converts to 'darkname' when needed try: data['darkname'] except: data['darkname'] = data['darkspec'] metadata = [fname, fname[4:13], fname[25:34].upper(), fname[34:36], data['continuumvismin'], data['continuumvnirmin'], data['continuumuvmin'], data['continuumvnirend'], data['distt'], data['darkname'], data['nshots'], data['dnoiseiter'], data['dnoisesig'], data['matchedfilter']] metadata = np.tile(metadata, (len(df.index), 1)) metadata_cols = list(zip(['meta'] * len(df.index), ['file', 'sclock', 'seqid', 'Pversion', 'continuumvismin', 
'continuumvnirmin', 'continuumuvmin', 'continuumvnirend', 'distt', 'dark', 'nshots', 'dnoiseiter', 'dnoisesig', 'matchedfilter'])) metadata = pd.DataFrame(metadata, columns=pd.MultiIndex.from_tuples(metadata_cols), index=df.index) df = pd.concat([metadata, df], axis=1) if ave == True: df = df.loc['average'] df = df.to_frame().T else: pass return df def ccam_batch(directory, searchstring='*.csv', to_csv=None, lookupfile=None, ave=True, progressbar=None): # Determine if the file is a .csv or .SAV if '.sav' in searchstring.lower(): is_sav = True else: is_sav = False filelist = file_search(directory, searchstring) basenames = np.zeros_like(filelist) sclocks = np.zeros_like(filelist) P_version = np.zeros_like(filelist, dtype='int') # Extract the sclock and version for each file and ensure that only one # file per sclock is being read, and that it is the one with the highest version number for i, name in enumerate(filelist): basenames[i] = os.path.basename(name) sclocks[i] = basenames[i][4:13] # extract the sclock P_version[i] = basenames[i][-5:-4] # extract the version sclocks_unique = np.unique(sclocks) # find unique sclocks filelist_new = np.array([], dtype='str') for i in sclocks_unique: match = (sclocks == i) # find all instances with matching sclocks maxP = P_version[match] == max(P_version[match]) # find the highest version among these files filelist_new = np.append(filelist_new, filelist[match][maxP]) # keep only the file with thei highest version filelist = filelist_new # Should add a progress bar for importing large numbers of files dt = [] for i, file in enumerate(filelist): print(file) if is_sav: tmp = CCAM_SAV(file, ave=ave) else: tmp = CCAM_CSV(file, ave=ave) if i == 0: combined = tmp else: # This ensures that rounding errors are not causing mismatches in columns cols1 = list(combined['wvl'].columns) cols2 = list(tmp['wvl'].columns) if set(cols1) == set(cols2): combined = pd.concat([combined, tmp]) else: print("Wavelengths don't match!") combined.loc[:, 
('meta', 'sclock')] = pd.to_numeric(combined.loc[:, ('meta', 'sclock')]) if lookupfile is not None: combined = lookup(combined, lookupfile=lookupfile.replace('[','').replace(']','').replace("'",'').replace(' ','').split(',')) if to_csv is not None: combined.to_csv(to_csv) return combined plio/io/io_edr.py 0 → 100644 +80 −0 Original line number Diff line number Diff line import os import numpy as np import pandas as pd def EDR(input_file): f = open(input_file, 'rb') # read as bytes so python won't complain about the binary part of the file # read lines of the header until reaching the end of the libs table (collecting other metadata along the way) end_of_libs_table = False while end_of_libs_table is False: line = str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '') # convert the current line to a string and get rid of newline characters line = line.split('=') # split the line on equals sign if present # look for the name of the value we want, if the current line has it, then set the value if 'RECORD_BYTES' in line[0]: rbytes = int(line[1]) if 'LABEL_RECORDS' in line[0]: lrecs = int(line[1]) if 'SPACECRAFT_CLOCK_START_COUNT' in line[0]: sclock = int(line[1].replace('"', '').split('.')[0]) if 'SEQUENCE_ID' in line[0]: seqID = line[1].replace('"', '') if 'INSTRUMENT_FOCUS_DISTANCE' in line[0]: focus_dist = int(line[1]) if 'INSTRUMENT_TEMPERATURE' in line[0]: instrument_temps = line[1] \ + str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '') \ + str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '') \ + str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '') instrument_temps = [float(i) for i in instrument_temps.replace('<degC>', '').replace('(', '').replace(')', '').replace(' ', '').split( ',')] instrument_temps_name = str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '') instrument_temps_name = instrument_temps_name.split('=')[1] \ + str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '') \ + str(f.readline(), 
'utf-8').replace('\r', '').replace('\n', '') \ + str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '') \ + str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '') instrument_temps_name = instrument_temps_name.replace(' ', '').replace('(', '').replace(')', '').replace( '"', '').split(',') f.readline() pass try: if 'CCAM_LIBS_DATA_CONTAINER' in line[1]: nshots = int(str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '').split('=')[1]) start_byte = int(str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '').split('=')[1]) if 'END_OBJECT' in line[0] and 'CCAM_LIBS_TABLE' in line[1]: end_of_libs_table = True except: pass f.close() header_skip = lrecs * rbytes # calculate the number of header bytes to skip to get to the real data with open(input_file, "rb") as f: f.seek(header_skip + start_byte - 1, 0) spectra = [] while spectra.__len__() < nshots: spectrum = [] while spectrum.__len__() < 6444: spectrum.append(int.from_bytes(f.read(2), byteorder='big', signed=False)) spectra.append(spectrum) spectra = np.array(spectra, dtype='int') cols = np.array(list(range(spectra.shape[1]))) + 1 cols = [('channel', i) for i in cols] inds = np.array(list(range(spectra.shape[0]))) + 1 sp = pd.DataFrame(spectra, columns=pd.MultiIndex.from_tuples(cols), index=inds) sp[('meta', 'EDR_file')] = os.path.basename(input_file) sp[('meta', 'Spacecraft_Clock')] = sclock sp[('meta', 'Shot')] = sp.index sp[('meta', 'SeqID')] = seqID sp[('meta', 'Focus_Distance')] = focus_dist for ind, name in enumerate(instrument_temps_name): sp[('meta', name + '_temp')] = instrument_temps[ind] sp.to_csv('test.csv') return sp plio/io/io_jsc.py 0 → 100644 +202 −0 Original line number Diff line number Diff line import os import numpy as np import pandas as pd from pandas.core.common import array_equivalent from plio.utils.utils import file_search # This function reads the lookup tables used to expand metadata from the file names # This is separated from parsing the filenames so that for 
# large lists of files the
# lookup tables don't need to be read over and over
#
# Info in the tables is stored in a dict of dataframes so that only one variable
# (the dict) needs to be passed between functions
def read_refdata(LUT_files):
    """Read the lookup tables used to expand metadata encoded in file names.

    Parameters
    ----------
    LUT_files : dict
        Maps the keys 'ID', 'spect', 'laser', 'exp' and 'sample' to CSV
        paths. The first column of every table is used as its index.

    Returns
    -------
    dict
        The same five keys, each mapped to the table as a DataFrame.
    """
    table_keys = ('spect', 'laser', 'exp', 'sample', 'ID')
    return {key: pd.read_csv(LUT_files[key], index_col=0) for key in table_keys}


def _jsc_sample_row(sample_table, sample_id):
    """Return the sample-table entry for *sample_id* as a frame with one row per match."""
    info = pd.DataFrame(sample_table.loc[sample_id])
    # A unique match comes back as a Series, i.e. a single tall column; flip it
    # so the sample becomes a row.
    if info.columns.shape[0] < info.index.shape[0]:
        info = info.T
    return info


def _jsc_lookup_row(table, key, index_name):
    """Return the row *key* of *table* as a one-row frame with the key in column *index_name*."""
    row = pd.DataFrame(table.loc[key]).T
    row.index.name = index_name
    row.reset_index(level=0, inplace=True)
    return row


# This function parses the file names to record metadata related to the observation
def jsc_filename_parse(filename, refdata):
    """Build a one-row metadata frame from a JSC LIBS file name.

    The base name is split on underscores and read positionally:
    ``<LIBS ID>_<loc>_<lab>_<gas><pressure>_<laser><power>_<exp>_<spect>_...``.

    Parameters
    ----------
    filename : str
        Path or base name of the data file.
    refdata : dict
        Lookup tables as returned by ``read_refdata``.

    Returns
    -------
    pandas.DataFrame
        One row holding the sample information plus the parsed fields and any
        matching laser / experiment / spectrometer table rows.

    Notes
    -----
    When the LIBS ID or sample ID cannot be resolved, or resolves to more than
    one sample row, the sample table's 'Unknown' row is used instead.
    """
    parts = os.path.basename(filename).split('_')
    libs_ID = parts[0]
    laserID = parts[4][0]
    expID = parts[5]
    spectID = parts[6]

    try:
        sampleID = refdata['ID'].loc[libs_ID].values[0]
        file_info = _jsc_sample_row(refdata['sample'], sampleID)
        if file_info.index.shape[0] > 1:
            print('More than one matching row for ' + str(sampleID) + '!')
            file_info = _jsc_sample_row(refdata['sample'], 'Unknown')
    except (KeyError, IndexError, TypeError, ValueError):
        # Unresolvable IDs are expected for some files; fall back to the
        # placeholder sample instead of failing the whole batch.
        sampleID = 'Unknown'
        file_info = _jsc_sample_row(refdata['sample'], sampleID)

    file_info['Sample ID'] = sampleID
    file_info['LIBS ID'] = libs_ID
    file_info.reset_index(level=0, inplace=True, drop=True)
    file_info['loc'] = int(parts[1])
    file_info['lab'] = parts[2]
    file_info['gas'] = parts[3][0]
    file_info['pressure'] = float(parts[3][1:])

    if laserID in refdata['laser'].index:
        laser_info = _jsc_lookup_row(refdata['laser'], laserID, 'Laser Identifier')
        file_info = pd.concat([file_info, laser_info], axis=1)

    file_info['laser_power'] = float(parts[4][1:])
    if expID in refdata['exp'].index:
        exp_info = _jsc_lookup_row(refdata['exp'], expID, 'Exp Identifier')
        file_info = pd.concat([file_info, exp_info], axis=1)

    file_info['spectrometer'] = spectID
    if spectID in refdata['spect'].index:
        spect_info = _jsc_lookup_row(refdata['spect'], spectID, 'Spectrometer Identifier')
        file_info = pd.concat([file_info, spect_info], axis=1)

    return file_info


def _jsc_read_spectra(path):
    """Read one tab-separated JSC file, naming its first two columns time1/time2."""
    frame = pd.read_csv(path, skiprows=14, sep='\t', engine='c')
    return frame.rename(columns={frame.columns[0]: 'time1', frame.columns[1]: 'time2'})


def JSC(input_files, refdata):
    """Read and merge the JSC LIBS files belonging to one observation.

    Parameters
    ----------
    input_files : sequence of str
        Files to combine (e.g. one per spectrometer). Metadata is parsed from
        the name of the first file only.
    refdata : dict
        Lookup tables as returned by ``read_refdata``.

    Returns
    -------
    pandas.DataFrame or None
        Frame with MultiIndex columns under 'wvl', 'meta' and 'comp', indexed
        by ('meta', 'time2'). ``None`` when anything fails; in that case the
        input files that exist are moved into a ``Problem_Files`` directory
        under the current working directory.
    """
    try:
        # read the first file
        data = _jsc_read_spectra(input_files[0])
        metadata = pd.concat([jsc_filename_parse(input_files[0], refdata)] * len(data.index))
        metadata.drop('spectrometer', axis=1, inplace=True)

        # read the next files and merge them with the first
        # (merge joins on every shared column name, which includes the time columns)
        for path in input_files[1:]:
            data = data.merge(_jsc_read_spectra(path))

        time = data[['time1', 'time2']]  # split the two time columns from the data frame
        data.drop(['time1', 'time2'], axis=1, inplace=True)  # trim the data frame so it is just the spectra

        # make a multiindex for each wavelength column so they can be easily isolated from metadata later
        data.columns = [['wvl'] * len(data.columns),
                        np.array(data.columns.values, dtype='float').round(4)]

        metadata.index = data.index
        metadata = pd.concat([metadata, time], axis=1)
        compcols = ['SiO2', 'TiO2', 'Al2O3', 'Cr2O3', 'Fe2O3T', 'MnO', 'MgO', 'CaO', 'Na2O', 'K2O', 'P2O5',
                    'SO3 LOI Residue', 'Total', 'Total Includes', '%LOI', 'FeO',
                    'Fe2O3', 'SO3 Actual', 'Fe(3+)/Fe(Total)', 'Rb (ug/g)', 'Sr (ug/g)', 'Y (ug/g)', 'Zr (ug/g)',
                    'V (ug/g)', 'Ni (ug/g)', 'Cr (ug/g)',
                    'Nb (ug/g)', 'Ga (ug/g)', 'Cu (ug/g)', 'Zn (ug/g)', 'Co (ug/g)', 'Ba (ug/g)', 'La (ug/g)',
                    'Ce (ug/g)', 'U (ug/g)', 'Th (ug/g)', 'Sc (ug/g)',
                    'Pb (ug/g)', 'Ge (ug/g)', 'As (ug/g)', 'Cl (ug/g)']
        # copy so relabelling the columns below does not touch a view of metadata
        compdata = metadata[compcols].copy()
        metadata.drop(compcols, axis=1, inplace=True)
        metadata.columns = [['meta'] * len(metadata.columns), metadata.columns.values]
        compdata.columns = [['comp'] * len(compdata.columns), compdata.columns.values]
        data = pd.concat([data, metadata, compdata], axis=1)

        data[('meta', 'Scan #')] = data.index
        data.set_index(('meta', 'time2'), drop=False, inplace=True)

        return data
    except Exception:
        # Best effort: a bad observation must not abort a whole batch. Report
        # it, set the offending files aside and let the caller carry on.
        for path in input_files:
            path = str(path)
            print('Problem reading:' + path)
            if not os.path.isfile(path):
                continue
            print('Moving to Problem_Files')
            try:
                os.makedirs('Problem_Files', exist_ok=True)
                os.rename(path, os.path.join('Problem_Files', os.path.basename(path)))
            except OSError as err:
                print('Could not move ' + path + ': ' + str(err))
        return None


def jsc_batch(directory, LUT_files, searchstring='*.txt', to_csv=None):
    """Read every JSC LIBS file in a directory into one combined frame.

    Files are grouped by LIBS ID and then by location (the first two
    underscore-separated fields of the base name); each group is read with
    ``JSC``.

    Parameters
    ----------
    directory : str
        Directory handed to ``file_search``.
    LUT_files : dict
        Lookup-table paths, see ``read_refdata``.
    searchstring : str
        Pattern handed to ``file_search``.
    to_csv : str or None
        When given, the combined frame is also written to this path.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all groups that could be read.
    """
    # Read in the lookup tables to expand filename metadata
    refdata = read_refdata(LUT_files)
    # get the list of files that match the search string in the given directory
    filelist = np.asarray(file_search(directory, searchstring))
    libsIDs = np.array([os.path.basename(path).split('_')[0] for path in filelist])

    alldata = []
    # loop through each LIBS ID
    for ID in np.unique(libsIDs):
        print('Working on : ' + str(ID))
        sublist = filelist[libsIDs == ID]
        locs = np.array([os.path.basename(path).split('_')[1] for path in sublist])
        # loop through each location for that libs ID
        for loc in np.unique(locs):
            print(loc)
            # JSC returns None for groups it could not read; pd.concat skips those
            alldata.append(JSC(sublist[locs == loc], refdata))

    combined = pd.concat(alldata)
    if to_csv is not None:
        print('Writing combined data to: ' + to_csv)
        combined.to_csv(to_csv)
    return combined


# got this function from stack overflow: http://stackoverflow.com/questions/14984119/python-pandas-remove-duplicate-columns
# it's slow but doesn't crash python like combined.T.drop_duplicates().T does in some cases with very large sets of data
def duplicate_columns(frame):
    """Return the labels of columns that are repeated later in *frame*.

    Only columns of the same dtype are compared. A column's label is reported
    once if any later column of that dtype holds equal values (NaNs in the
    same positions count as equal). Labels are listed dtype by dtype, in
    order of each dtype's first appearance.
    """
    positions_by_dtype = {}
    for position, dtype in enumerate(frame.dtypes):
        positions_by_dtype.setdefault(dtype, []).append(position)

    dups = []
    for positions in positions_by_dtype.values():
        for n, i in enumerate(positions):
            column = frame.iloc[:, i]
            for j in positions[n + 1:]:
                if column.equals(frame.iloc[:, j]):
                    dups.append(frame.columns[i])
                    break
    return dups
.travis.yml +6 −6 Original line number Diff line number Diff line Loading @@ -35,7 +35,7 @@ install: - conda config --add channels conda-forge - conda config --add channels jlaura - conda install -c conda-forge gdal h5py - conda install pandas sqlalchemy pyyaml networkx affine protobuf - conda install pandas sqlalchemy pyyaml networkx affine protobuf scipy - pip install pvl # Development installation Loading
appveyor.yml +1 −1 Original line number Diff line number Diff line Loading @@ -54,7 +54,7 @@ install: - cmd: conda config --add channels conda-forge - cmd: conda config --add channels jlaura - cmd: conda install --yes -c conda-forge gdal h5py - cmd: conda install --yes pandas sqlalchemy pyyaml networkx affine - cmd: conda install --yes pandas sqlalchemy pyyaml networkx affine scipy - cmd: conda install --yes -c jlaura protobuf pvl # Development installation Loading
# plio/io/io_ccam_pds.py 0 → 100644 +201 −0 Original line number Diff line number Diff line
# This code is used to read individual ChemCam files
# Header data is stored as attributes of the data frame
# White space is stripped from the column names
import os

import numpy as np
import pandas as pd
import scipy.io as io

from plio.utils.utils import lookup
from plio.utils.utils import file_search


def _read_ccam_csv(input_data, n_header):
    """Read a ChemCam CSV assuming *n_header* header rows; return (spectra, metadata)."""
    df = pd.read_csv(input_data, header=n_header, engine='c')
    # strip whitespace from column names
    df.columns = [name.strip().replace('# ', '') for name in df.columns.values]
    df.set_index(['wave'], inplace=True)  # use wavelengths as indices
    # read the file header; "key = value" rows become a frame indexed by key
    metadata = pd.read_csv(input_data, sep='=', nrows=n_header, comment=',',
                           engine='c', index_col=0, header=None)
    return df, metadata


def CCAM_CSV(input_data, ave=True):
    """Read one ChemCam CSV file into a frame with one spectrum per row.

    Parameters
    ----------
    input_data : str
        Path of the CSV file. The spacecraft clock, sequence ID and product
        version are sliced out of fixed positions in its base name.
    ave : bool
        True keeps only the 'mean' spectrum; False keeps the individual
        spectra and drops the 'mean' and 'median' columns.

    Returns
    -------
    pandas.DataFrame
        Columns are a MultiIndex: ('meta', <header field>) followed by
        ('wvl', <wavelength rounded to 4 decimals>).
    """
    # Files carry 14 header rows, or 15/16 when extra rows (temperature,
    # target name) are present. Try each layout; only the last failure is raised.
    header_lengths = (14, 15, 16)
    for n_header in header_lengths:
        try:
            df, metadata = _read_ccam_csv(input_data, n_header)
            break
        except Exception:
            if n_header == header_lengths[-1]:
                raise

    if ave:
        df = pd.DataFrame(df['mean'])
    else:
        df = df.drop(['mean', 'median'], axis=1)

    # create multiindex so spectra can be easily extracted with a single key
    df.index = pd.MultiIndex.from_arrays([['wvl'] * len(df.index), df.index.values.round(4)])
    df = df.T  # transpose so that each spectrum is a row

    # remove extraneous stuff from the metadata indices
    metadata.index = [i.strip().strip('# ').replace(' FLOAT', '').lower() for i in metadata.index.values]
    metadata = metadata.T

    # extract info from the file name
    fname = os.path.basename(input_data)
    metadata['sclock'] = fname[4:13]
    metadata['seqid'] = fname[25:34].upper()
    metadata['Pversion'] = fname[34:36]

    # duplicate the metadata for each row in the df
    # (pd.concat replaces DataFrame.append, which pandas 2.0 removed)
    if not ave:
        metadata = pd.concat([metadata] * len(df.index), ignore_index=True)
    metadata.index = df.index  # make the indices match
    # make the columns into multiindex
    metadata.columns = pd.MultiIndex.from_arrays([['meta'] * len(metadata.columns), metadata.columns.values])

    return pd.concat([metadata, df], axis=1)  # combine the spectra with the metadata


def CCAM_SAV(input_data, ave=True):
    """Read one ChemCam IDL .SAV file into a frame with one spectrum per row.

    Parameters
    ----------
    input_data : str
        Path of the .SAV file. The spacecraft clock, sequence ID and product
        version are sliced out of fixed positions in its base name.
    ave : bool
        True returns only the 'average' row; False returns one row per shot
        plus the 'average' and 'median' rows.

    Returns
    -------
    pandas.DataFrame
        Columns are a MultiIndex: ('meta', <field>) followed by
        ('wvl', <wavelength rounded to 4 decimals>).
    """
    # read the IDL .SAV file
    data = io.readsav(input_data, python_dict=True)
    bands = ('uv', 'vis', 'vnir')

    def stack_bands(prefix, columns=None):
        # one frame per spectrometer, stacked along the wavelength axis
        return pd.concat([pd.DataFrame(data[prefix + band], index=data['def' + band], columns=columns)
                          for band in bands])

    # put the spectra into data frames and combine them
    df_spect = stack_bands('')
    # add 1 to the columns so they correspond to shot number
    df_spect.columns = ['shot' + str(i + 1) for i in df_spect.columns]
    df_ave = stack_bands('a', columns=['average'])
    df_med = stack_bands('m', columns=['median'])
    df = pd.concat([df_spect, df_ave, df_med], axis=1)

    # create multiindex to access wavelength values
    # also, round the wavelength values to a more reasonable level of precision
    df.index = pd.MultiIndex.from_arrays([['wvl'] * len(df.index), df.index.values.round(4)])
    # transpose so that spectra are rows rather than columns
    df = df.T

    # extract metadata from the file name and add it to the data frame
    # use the multiindex label "meta" for all metadata
    fname = os.path.basename(input_data)

    # some ChemCam files have the 'darkname' key, others call it 'darkspec'
    dark = data['darkname'] if 'darkname' in data else data['darkspec']

    names = ['file', 'sclock', 'seqid', 'Pversion', 'continuumvismin', 'continuumvnirmin',
             'continuumuvmin', 'continuumvnirend', 'distt', 'dark', 'nshots', 'dnoiseiter',
             'dnoisesig', 'matchedfilter']
    values = [fname,
              fname[4:13],
              fname[25:34].upper(),
              fname[34:36],
              data['continuumvismin'],
              data['continuumvnirmin'],
              data['continuumuvmin'],
              data['continuumvnirend'],
              data['distt'],
              dark,
              data['nshots'],
              data['dnoiseiter'],
              data['dnoisesig'],
              data['matchedfilter']]
    values = np.tile(values, (len(df.index), 1))
    # One ('meta', name) label per metadata field. Sizing the 'meta' level by
    # the number of rows (as was done before) drops labels whenever the frame
    # has fewer rows than there are fields.
    metadata_cols = pd.MultiIndex.from_tuples([('meta', name) for name in names])
    metadata = pd.DataFrame(values, columns=metadata_cols, index=df.index)

    df = pd.concat([metadata, df], axis=1)
    if ave:
        df = df.loc['average'].to_frame().T

    return df


def ccam_batch(directory, searchstring='*.csv', to_csv=None, lookupfile=None, ave=True, progressbar=None):
    """Read every matching ChemCam file in a directory into one frame.

    Parameters
    ----------
    directory : str
        Directory handed to ``file_search``.
    searchstring : str
        Pattern handed to ``file_search``. If it contains '.sav' (any case)
        files are read with ``CCAM_SAV``, otherwise with ``CCAM_CSV``.
    to_csv : str or None
        When given, the combined frame is also written to this path.
    lookupfile : str, sequence or None
        Lookup file(s) passed on to ``lookup``. A string is treated as a
        stringified list: brackets, single quotes and spaces are removed and
        the rest is split on commas.
    ave : bool
        Forwarded to the file reader.
    progressbar : object
        Accepted for interface compatibility; not used.

    Returns
    -------
    pandas.DataFrame
        All spectra, with ('meta', 'sclock') converted to numeric.

    Raises
    ------
    ValueError
        If no file matches *searchstring*.
    """
    # Determine if the file is a .csv or .SAV
    is_sav = '.sav' in searchstring.lower()

    filelist = np.asarray(file_search(directory, searchstring))
    if filelist.size == 0:
        raise ValueError('No files matching ' + repr(searchstring) + ' found in ' + repr(directory))

    # Extract the sclock and version for each file and ensure that only one
    # file per sclock is being read, and that it is the one with the highest version number
    basenames = [os.path.basename(name) for name in filelist]
    sclocks = np.array([name[4:13] for name in basenames])
    versions = np.array([int(name[-5:-4]) for name in basenames], dtype='int')
    selected = []
    for sclock in np.unique(sclocks):
        match = sclocks == sclock
        newest = versions[match] == versions[match].max()
        selected.extend(filelist[match][newest])

    frames = []
    reference_wvl = None
    for path in selected:
        print(path)
        tmp = CCAM_SAV(path, ave=ave) if is_sav else CCAM_CSV(path, ave=ave)
        if reference_wvl is None:
            reference_wvl = set(tmp['wvl'].columns)
            frames.append(tmp)
        elif set(tmp['wvl'].columns) == reference_wvl:
            # the rounded wavelengths agree with the first file, so rows line up
            frames.append(tmp)
        else:
            print("Wavelengths don't match!")

    # a single concat at the end avoids re-copying the accumulated frame per file
    combined = pd.concat(frames)
    combined[('meta', 'sclock')] = pd.to_numeric(combined[('meta', 'sclock')])

    if lookupfile is not None:
        if isinstance(lookupfile, str):
            for char in ('[', ']', "'", ' '):
                lookupfile = lookupfile.replace(char, '')
            lookupfile = lookupfile.split(',')
        else:
            lookupfile = list(lookupfile)
        combined = lookup(combined, lookupfile=lookupfile)
    if to_csv is not None:
        combined.to_csv(to_csv)
    return combined
# plio/io/io_edr.py 0 → 100644 +80 −0 Original line number Diff line number Diff line
import os

import numpy as np
import pandas as pd


def _edr_label_line(f):
    """Return the next label line decoded as UTF-8 without CR/LF; raise ValueError at end of file."""
    raw = f.readline()
    if not raw:
        raise ValueError('Reached end of file before END_OBJECT = CCAM_LIBS_TABLE')
    return str(raw, 'utf-8').replace('\r', '').replace('\n', '')


def _edr_strip(text, tokens):
    """Return *text* with every string in *tokens* removed."""
    for token in tokens:
        text = text.replace(token, '')
    return text


def EDR(input_file, n_channels=6444):
    """Read the LIBS spectra and selected label fields from a ChemCam EDR file.

    The text label is scanned line by line up to
    ``END_OBJECT = CCAM_LIBS_TABLE``; the spectra are then read as unsigned
    big-endian 16-bit integers starting at
    ``LABEL_RECORDS * RECORD_BYTES + start_byte - 1``.

    Parameters
    ----------
    input_file : str
        Path of the EDR file.
    n_channels : int
        Number of 16-bit values per shot (default 6444).

    Returns
    -------
    pandas.DataFrame
        One row per shot, indexed from 1. Columns are a MultiIndex:
        ('channel', 1..n_channels) followed by ('meta', ...) columns for the
        file name, spacecraft clock, shot number, sequence ID, focus distance
        and one '<name>_temp' column per instrument temperature.

    Raises
    ------
    ValueError
        If the file ends before the end of the LIBS table, if a needed label
        field was not found, or if the file holds fewer data bytes than the
        label announces.
    """
    header = {}
    # read as bytes so python won't complain about the binary part of the file
    with open(input_file, 'rb') as f:
        # read lines of the header until reaching the end of the libs table
        # (collecting other metadata along the way)
        while True:
            line = _edr_label_line(f).split('=')  # split the line on equals sign if present
            key = line[0]

            # look for the name of the value we want, if the current line has it, then set the value
            if 'RECORD_BYTES' in key:
                header['rbytes'] = int(line[1])
            if 'LABEL_RECORDS' in key:
                header['lrecs'] = int(line[1])
            if 'SPACECRAFT_CLOCK_START_COUNT' in key:
                header['sclock'] = int(line[1].replace('"', '').split('.')[0])
            if 'SEQUENCE_ID' in key:
                header['seqID'] = line[1].replace('"', '')
            if 'INSTRUMENT_FOCUS_DISTANCE' in key:
                header['focus_dist'] = int(line[1])
            if 'INSTRUMENT_TEMPERATURE' in key:
                # the temperature list continues over the next three lines
                temps = line[1] + ''.join(_edr_label_line(f) for _ in range(3))
                temps = _edr_strip(temps, ('<degC>', '(', ')', ' '))
                header['temps'] = [float(value) for value in temps.split(',')]
                # the matching name list follows: its own line plus four continuation lines
                names = _edr_label_line(f).split('=')[1]
                names += ''.join(_edr_label_line(f) for _ in range(4))
                header['temp_names'] = _edr_strip(names, (' ', '(', ')', '"')).split(',')
                f.readline()  # one further label line is skipped after the name list

            try:
                if 'CCAM_LIBS_DATA_CONTAINER' in line[1]:
                    # the next two lines give the number of shots and the start byte
                    header['nshots'] = int(_edr_label_line(f).split('=')[1])
                    header['start_byte'] = int(_edr_label_line(f).split('=')[1])
                if 'END_OBJECT' in key and 'CCAM_LIBS_TABLE' in line[1]:
                    break
            except (IndexError, ValueError):
                # Lines without '=' or with a non-numeric value are expected
                # here and are ignored; hitting end of file is reported by the
                # next read at the top of the loop.
                pass

        required = ('rbytes', 'lrecs', 'sclock', 'seqID', 'focus_dist',
                    'temps', 'temp_names', 'nshots', 'start_byte')
        missing = [name for name in required if name not in header]
        if missing:
            raise ValueError('Label fields not found in ' + str(input_file) + ': ' + ', '.join(missing))

        nshots = header['nshots']
        # number of header bytes to skip to get to the real data
        header_skip = header['lrecs'] * header['rbytes']
        f.seek(header_skip + header['start_byte'] - 1, 0)
        n_bytes = 2 * nshots * n_channels
        raw = f.read(n_bytes)

    if len(raw) < n_bytes:
        raise ValueError('Expected ' + str(n_bytes) + ' bytes of spectra in ' + str(input_file)
                         + ' but found only ' + str(len(raw)))

    # unsigned big-endian 16-bit values, one row per shot
    spectra = np.frombuffer(raw, dtype='>u2').reshape(nshots, n_channels).astype('int')
    cols = pd.MultiIndex.from_tuples([('channel', i) for i in range(1, n_channels + 1)])
    inds = np.arange(1, nshots + 1)
    sp = pd.DataFrame(spectra, columns=cols, index=inds)
    sp[('meta', 'EDR_file')] = os.path.basename(input_file)
    sp[('meta', 'Spacecraft_Clock')] = header['sclock']
    sp[('meta', 'Shot')] = sp.index
    sp[('meta', 'SeqID')] = header['seqID']
    sp[('meta', 'Focus_Distance')] = header['focus_dist']
    for ind, name in enumerate(header['temp_names']):
        sp[('meta', name + '_temp')] = header['temps'][ind]

    return sp
# plio/io/io_jsc.py
"""Read JSC LIBS spectra text files and expand the metadata in their names.

File names are split on underscores; the pieces are looked up in a set of
CSV lookup tables (see :func:`read_refdata`) to build per-spectrum metadata.
"""
import os

import numpy as np
import pandas as pd

try:  # private pandas helper whose home moved between pandas releases
    from pandas.core.dtypes.missing import array_equivalent
except ImportError:  # older pandas
    from pandas.core.common import array_equivalent

from plio.utils.utils import file_search

# Keys of the lookup-table dict handled by read_refdata().
_LUT_KEYS = ('spect', 'laser', 'exp', 'sample', 'ID')

# Metadata columns that are split off into the 'comp' column group by JSC().
_COMP_COLS = ['SiO2', 'TiO2', 'Al2O3', 'Cr2O3', 'Fe2O3T', 'MnO', 'MgO', 'CaO', 'Na2O', 'K2O', 'P2O5',
              'SO3 LOI Residue', 'Total', 'Total Includes', '%LOI', 'FeO',
              'Fe2O3', 'SO3 Actual', 'Fe(3+)/Fe(Total)', 'Rb (ug/g)', 'Sr (ug/g)', 'Y (ug/g)', 'Zr (ug/g)',
              'V (ug/g)', 'Ni (ug/g)', 'Cr (ug/g)',
              'Nb (ug/g)', 'Ga (ug/g)', 'Cu (ug/g)', 'Zn (ug/g)', 'Co (ug/g)', 'Ba (ug/g)', 'La (ug/g)',
              'Ce (ug/g)', 'U (ug/g)', 'Th (ug/g)', 'Sc (ug/g)',
              'Pb (ug/g)', 'Ge (ug/g)', 'As (ug/g)', 'Cl (ug/g)']

# Directory that JSC() moves unreadable input files into.
_PROBLEM_DIR = 'Problem_Files'


# This function reads the lookup tables used to expand metadata from the file names
# This is separated from parsing the filenames so that for large lists of files the
# lookup tables don't need to be read over and over
#
# Info in the tables is stored in a dict of dataframes so that only one variable
# (the dict) needs to be passed between functions
def read_refdata(LUT_files):
    """Load the lookup tables used to expand file-name metadata.

    Parameters
    ----------
    LUT_files : dict
        Maps each of the keys ``'ID'``, ``'spect'``, ``'laser'``, ``'exp'``
        and ``'sample'`` to the path of a CSV file. The first column of
        every file becomes the index of the resulting table.

    Returns
    -------
    dict
        The same five keys, each mapped to a ``pandas.DataFrame``.

    Raises
    ------
    KeyError
        If one of the five keys is missing from ``LUT_files``.
    """
    return {key: pd.read_csv(LUT_files[key], index_col=0) for key in _LUT_KEYS}


def _as_row_frame(record):
    """Wrap a ``.loc`` result in a DataFrame laid out with one row per record."""
    frame = pd.DataFrame(record)
    # A single match comes back as a Series, which DataFrame() stores as one
    # column; transpose so that the fields become columns.
    if frame.columns.shape[0] < frame.index.shape[0]:
        frame = frame.T
    return frame


def _lookup_row(table, key, index_name):
    """Return the ``table`` row for ``key`` as one row, keeping the key as column ``index_name``."""
    row = pd.DataFrame(table.loc[key]).T
    row.index.name = index_name
    row.reset_index(level=0, inplace=True)
    return row


# This function parses the file names to record metadata related to the observation
def jsc_filename_parse(filename, refdata):
    """Build a one-row metadata frame from a JSC file name.

    The base name is split on underscores. Piece 0 is the LIBS ID, piece 1
    the location, piece 2 the lab, piece 3 the gas letter followed by the
    pressure, piece 4 the laser letter followed by the laser power, piece 5
    the experiment ID and piece 6 the spectrometer ID.

    Parameters
    ----------
    filename : str
        Path or base name of the file.
    refdata : dict
        Lookup tables as returned by :func:`read_refdata`. The ``'sample'``
        table must contain a row labelled ``'Unknown'``; it is used whenever
        the sample cannot be resolved.

    Returns
    -------
    pandas.DataFrame
        Sample information plus the values parsed from the name and, when
        the identifiers are present in the lookup tables, the matching
        laser, experiment and spectrometer rows.

    Raises
    ------
    IndexError
        If the name has fewer than seven underscore-separated pieces.
    ValueError
        If the location, pressure or laser power is not numeric.
    """
    parts = os.path.basename(filename).split('_')

    libs_ID = parts[0]
    laserID = parts[4][0]
    expID = parts[5]
    spectID = parts[6]

    try:
        sampleID = refdata['ID'].loc[libs_ID].values[0]
        file_info = _as_row_frame(refdata['sample'].loc[sampleID])
        if file_info.index.shape[0] > 1:
            # Ambiguous sample: fall back to the 'Unknown' row but keep the
            # resolved sample ID in the output.
            print('More than one matching row for ' + sampleID + '!')
            file_info = _as_row_frame(refdata['sample'].loc['Unknown'])
    except Exception:
        # Best effort: any failed lookup is recorded as an unknown sample.
        sampleID = 'Unknown'
        file_info = _as_row_frame(refdata['sample'].loc[sampleID])

    file_info['Sample ID'] = sampleID
    file_info['LIBS ID'] = libs_ID
    file_info.reset_index(level=0, inplace=True, drop=True)
    file_info['loc'] = int(parts[1])
    file_info['lab'] = parts[2]
    file_info['gas'] = parts[3][0]
    file_info['pressure'] = float(parts[3][1:])

    if laserID in refdata['laser'].index:
        laser_info = _lookup_row(refdata['laser'], laserID, 'Laser Identifier')
        file_info = pd.concat([file_info, laser_info], axis=1)

    file_info['laser_power'] = float(parts[4][1:])

    if expID in refdata['exp'].index:
        exp_info = _lookup_row(refdata['exp'], expID, 'Exp Identifier')
        file_info = pd.concat([file_info, exp_info], axis=1)

    # JSC() drops this column unconditionally, so it is always created.
    file_info['spectrometer'] = spectID

    if spectID in refdata['spect'].index:
        spect_info = _lookup_row(refdata['spect'], spectID, 'Spectrometer Identifier')
        file_info = pd.concat([file_info, spect_info], axis=1)

    return file_info


def _read_jsc_table(path):
    """Read one tab-separated spectra file, naming its first two columns ``time1``/``time2``."""
    table = pd.read_csv(path, skiprows=14, sep='\t', engine='c')
    return table.rename(columns={table.columns[0]: 'time1', table.columns[1]: 'time2'})


def JSC(input_files, refdata):
    """Read a group of JSC files and merge them into one frame.

    Every file is read with 14 rows skipped, and the files are merged on
    their shared columns. The remaining column names are converted to
    floats rounded to four decimals and placed under ``'wvl'``; file-name
    metadata goes under ``'meta'`` and the composition columns under
    ``'comp'``. The result is indexed by ``('meta', 'time2')``.

    Parameters
    ----------
    input_files : sequence of str
        Paths of the files to combine. The metadata is parsed from the
        first file name only.
    refdata : dict
        Lookup tables as returned by :func:`read_refdata`.

    Returns
    -------
    pandas.DataFrame or None
        The combined frame, or ``None`` when anything goes wrong. In that
        case every file of the group is moved into ``Problem_Files``.
    """
    try:
        # read the first file
        data = _read_jsc_table(input_files[0])
        metadata = pd.concat([jsc_filename_parse(input_files[0], refdata)] * len(data.index))
        metadata.drop('spectrometer', axis=1, inplace=True)

        # read the next files and merge them with the first
        for path in input_files[1:]:
            data = data.merge(_read_jsc_table(path))

        time = data[['time1', 'time2']]  # split the two time columns from the data frame
        data.drop(['time1', 'time2'], axis=1, inplace=True)  # keep just the spectra

        # make a multiindex for each wavelength column so they can be easily
        # isolated from metadata later
        data.columns = [['wvl'] * len(data.columns),
                        np.array(data.columns.values, dtype='float').round(4)]

        metadata.index = data.index
        metadata = pd.concat([metadata, time], axis=1)

        compdata = metadata[_COMP_COLS]
        metadata.drop(_COMP_COLS, axis=1, inplace=True)
        metadata.columns = [['meta'] * len(metadata.columns), metadata.columns.values]
        compdata.columns = [['comp'] * len(compdata.columns), compdata.columns.values]

        data = pd.concat([data, metadata, compdata], axis=1)
        data[('meta', 'Scan #')] = data.index
        data.set_index(('meta', 'time2'), drop=False, inplace=True)
        return data
    except Exception as err:
        # The failure cannot be pinned on one file of the group (it may come
        # from the merge or the metadata lookup), so the whole group is set
        # aside for inspection.
        print('Error: ' + repr(err))
        os.makedirs(_PROBLEM_DIR, exist_ok=True)
        for input_file in input_files:
            print('Problem reading:' + input_file)
            print('Moving to Problem_Files')
            os.rename(input_file,
                      os.path.join(_PROBLEM_DIR, os.path.basename(input_file)))
        return None


def jsc_batch(directory, LUT_files, searchstring='*.txt', to_csv=None):
    """Read every matching JSC file below ``directory`` into one frame.

    Files are grouped by LIBS ID (first underscore-separated piece of the
    base name) and then by location (second piece); each group is read with
    :func:`JSC` and the results are concatenated.

    Parameters
    ----------
    directory : str
        Directory handed to ``file_search``.
    LUT_files : dict
        Lookup-table paths, see :func:`read_refdata`.
    searchstring : str, optional
        File-name pattern handed to ``file_search``.
    to_csv : str, optional
        When given, the combined frame is also written to this CSV path.

    Returns
    -------
    pandas.DataFrame
        The concatenation of all groups that could be read.

    Raises
    ------
    ValueError
        If no group could be read (no matching files, or every group
        failed).
    """
    # Read in the lookup tables to expand filename metadata
    refdata = read_refdata(LUT_files)

    # Boolean-mask indexing below needs an array, not a plain list.
    filelist = np.asarray(file_search(directory, searchstring))
    libs_ids = np.array([os.path.basename(path).split('_')[0] for path in filelist])

    alldata = []
    # loop through each LIBS ID
    for ID in np.unique(libs_ids):
        print('Working on : ' + str(ID))
        sublist = filelist[libs_ids == ID]
        locs = np.array([os.path.basename(path).split('_')[1] for path in sublist])

        # loop through each location for that libs ID
        for loc in np.unique(locs):
            print(loc)
            data = JSC(sublist[locs == loc], refdata)
            if data is not None:  # JSC returns None for groups it could not read
                alldata.append(data)

    if not alldata:
        raise ValueError('No JSC data could be read from ' + str(directory))

    combined = pd.concat(alldata)
    if to_csv is not None:
        print('Writing combined data to: ' + to_csv)
        combined.to_csv(to_csv)
    return combined


# got this function from stack overflow: http://stackoverflow.com/questions/14984119/python-pandas-remove-duplicate-columns
# it's slow but doesn't crash python like combined.T.drop_duplicates().T does in some cases with very large sets of data
def duplicate_columns(frame):
    """List the labels of columns that are repeated later in ``frame``.

    Columns are only compared with other columns of the same dtype. A label
    is reported when some later column in its dtype group holds equivalent
    values, so of a set of identical columns the last one is never listed.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    list
        Column labels that have an equivalent later column.
    """
    groups = frame.columns.to_series().groupby(frame.dtypes).groups
    dups = []
    for labels in groups.values():
        same_type = frame[labels]
        names = same_type.columns
        count = len(names)
        for i in range(count):
            left = same_type.iloc[:, i].values
            for j in range(i + 1, count):
                if array_equivalent(left, same_type.iloc[:, j].values):
                    dups.append(names[i])
                    break  # one later match is enough to flag column i
    return dups