Commit b95a55b3 authored by jlaura, committed by GitHub

Merge pull request #17 from acpaquette/pysat_io

PySAT io to PL io
parents a6b9d17b 8d4d0077
+6 −6
@@ -35,7 +35,7 @@ install:
  - conda config --add channels conda-forge
  - conda config --add channels jlaura
  - conda install -c conda-forge gdal h5py
-  - conda install pandas sqlalchemy pyyaml networkx affine protobuf
+  - conda install pandas sqlalchemy pyyaml networkx affine protobuf scipy
  - pip install pvl

  # Development installation
+1 −1
@@ -54,7 +54,7 @@ install:
    - cmd: conda config --add channels conda-forge
    - cmd: conda config --add channels jlaura
    - cmd: conda install --yes -c conda-forge gdal h5py
-    - cmd: conda install --yes pandas sqlalchemy pyyaml networkx affine
+    - cmd: conda install --yes pandas sqlalchemy pyyaml networkx affine scipy
    - cmd: conda install --yes -c jlaura protobuf pvl

    # Development installation

plio/io/io_ccam_pds.py

0 → 100644
+201 −0
# This code is used to read individual ChemCam files
# Header data is stored as attributes of the data frame
# White space is stripped from the column names
import os

import numpy as np
import pandas as pd
import scipy.io as io

from plio.utils.utils import lookup
from plio.utils.utils import file_search


def CCAM_CSV(input_data, ave=True):
    try:
        df = pd.read_csv(input_data, header=14, engine='c')
        cols = list(df.columns.values)
        df.columns = [i.strip().replace('# ', '') for i in cols]  # strip whitespace from column names
        df.set_index(['wave'], inplace=True)  # use wavelengths as indices
        # read the file header and put information into the dataframe as new columns
        metadata = pd.read_csv(input_data, sep='=', nrows=14, comment=',', engine='c', index_col=0, header=None)
    except:
        try:  # handle files with an extra header row containing temperature
            df = pd.read_csv(input_data, header=15, engine='c')
            cols = list(df.columns.values)
            df.columns = [i.strip().replace('# ', '') for i in cols]  # strip whitespace from column names
            df.set_index(['wave'], inplace=True)  # use wavelengths as indices
            # read the file header and put information into the dataframe as new columns
            metadata = pd.read_csv(input_data, sep='=', nrows=15, comment=',', engine='c', index_col=0, header=None)
        except:  # handle files with an extra header row containing temperature and target name
            df = pd.read_csv(input_data, header=16, engine='c')
            cols = list(df.columns.values)
            df.columns = [i.strip().replace('# ', '') for i in cols]  # strip whitespace from column names
            df.set_index(['wave'], inplace=True)  # use wavelengths as indices
            # read the file header and put information into the dataframe as new columns
            metadata = pd.read_csv(input_data, sep='=', nrows=16, comment=',', engine='c', index_col=0, header=None)

    if ave:
        df = pd.DataFrame(df['mean'])
    else:
        df = df.drop(['mean', 'median'], axis=1)
    df.index = [['wvl'] * len(df.index),
                df.index.values.round(4)]  # create multiindex so spectra can be easily extracted with a single key
    df = df.T  # transpose so that each spectrum is a row

    # remove extraneous stuff from the metadata indices
    metadata.index = [i.strip().strip('# ').replace(' FLOAT', '').lower() for i in metadata.index.values]
    metadata = metadata.T

    # extract info from the file name
    fname = os.path.basename(input_data)
    metadata['sclock'] = fname[4:13]
    metadata['seqid'] = fname[25:34].upper()
    metadata['Pversion'] = fname[34:36]

    # duplicate the metadata for each row in the df
    if not ave:
        metadata = metadata.append([metadata] * (len(df.index) - 1), ignore_index=True)
    metadata.index = df.index  # make the indices match
    metadata.columns = [['meta'] * len(metadata.columns), metadata.columns.values]  # make the columns into multiindex
    df = pd.concat([metadata, df], axis=1)  # combine the spectra with the metadata
    return df


def CCAM_SAV(input_data, ave=True):
    # read the IDL .SAV file

    data = io.readsav(input_data, python_dict=True)

    # put the spectra into data frames and combine them
    df_UV = pd.DataFrame(data['uv'], index=data['defuv'])
    df_VIS = pd.DataFrame(data['vis'], index=data['defvis'])
    df_VNIR = pd.DataFrame(data['vnir'], index=data['defvnir'])
    df_spect = pd.concat([df_UV, df_VIS, df_VNIR])
    df_spect.columns = ['shot' + str(i + 1) for i in
                        df_spect.columns]  # add 1 to the columns so they correspond to shot number

    df_aUV = pd.DataFrame(data['auv'], index=data['defuv'], columns=['average'])
    df_aVIS = pd.DataFrame(data['avis'], index=data['defvis'], columns=['average'])
    df_aVNIR = pd.DataFrame(data['avnir'], index=data['defvnir'], columns=['average'])
    df_ave = pd.concat([df_aUV, df_aVIS, df_aVNIR])

    df_mUV = pd.DataFrame(data['muv'], index=data['defuv'], columns=['median'])
    df_mVIS = pd.DataFrame(data['mvis'], index=data['defvis'], columns=['median'])
    df_mVNIR = pd.DataFrame(data['mvnir'], index=data['defvnir'], columns=['median'])
    df_med = pd.concat([df_mUV, df_mVIS, df_mVNIR])

    df = pd.concat([df_spect, df_ave, df_med], axis=1)
    # create multiindex to access wavelength values
    # also, round the wavelength values to a more reasonable level of precision
    df.index = [['wvl'] * len(df.index), df.index.values.round(4)]
    # transpose so that spectra are rows rather than columns
    df = df.T

    # extract metadata from the file name and add it to the data frame
    # use the multiindex label "meta" for all metadata

    fname = os.path.basename(input_data)

    # for some reason, some ChemCam files have the 'darkname' key, others call it 'darkspec'
    # this try-except pair converts to 'darkname' when needed
    try:
        data['darkname']
    except:
        data['darkname'] = data['darkspec']

    metadata = [fname,
                fname[4:13],
                fname[25:34].upper(),
                fname[34:36],
                data['continuumvismin'],
                data['continuumvnirmin'],
                data['continuumuvmin'],
                data['continuumvnirend'],
                data['distt'],
                data['darkname'],
                data['nshots'],
                data['dnoiseiter'],
                data['dnoisesig'],
                data['matchedfilter']]
    metadata = np.tile(metadata, (len(df.index), 1))
    metadata_cols = list(zip(['meta'] * len(df.index), ['file',
                                                        'sclock',
                                                        'seqid',
                                                        'Pversion',
                                                        'continuumvismin',
                                                        'continuumvnirmin',
                                                        'continuumuvmin',
                                                        'continuumvnirend',
                                                        'distt',
                                                        'dark',
                                                        'nshots',
                                                        'dnoiseiter',
                                                        'dnoisesig',
                                                        'matchedfilter']))
    metadata = pd.DataFrame(metadata, columns=pd.MultiIndex.from_tuples(metadata_cols), index=df.index)

    df = pd.concat([metadata, df], axis=1)
    if ave:
        df = df.loc['average']
        df = df.to_frame().T

    return df


def ccam_batch(directory, searchstring='*.csv', to_csv=None, lookupfile=None, ave=True, progressbar=None):
    # Determine if the file is a .csv or .SAV
    if '.sav' in searchstring.lower():
        is_sav = True
    else:
        is_sav = False
    filelist = file_search(directory, searchstring)
    basenames = np.zeros_like(filelist)
    sclocks = np.zeros_like(filelist)
    P_version = np.zeros_like(filelist, dtype='int')

    # Extract the sclock and version for each file and ensure that only one
    # file per sclock is being read, and that it is the one with the highest version number
    for i, name in enumerate(filelist):
        basenames[i] = os.path.basename(name)
        sclocks[i] = basenames[i][4:13]  # extract the sclock
        P_version[i] = basenames[i][-5:-4]  # extract the version

    sclocks_unique = np.unique(sclocks)  # find unique sclocks
    filelist_new = np.array([], dtype='str')
    for i in sclocks_unique:
        match = (sclocks == i)  # find all instances with matching sclocks
        maxP = P_version[match] == max(P_version[match])  # find the highest version among these files
        filelist_new = np.append(filelist_new, filelist[match][maxP])  # keep only the file with the highest version

    filelist = filelist_new
    # Should add a progress bar for importing large numbers of files
    dt = []

    for i, file in enumerate(filelist):
        print(file)
        if is_sav:
            tmp = CCAM_SAV(file, ave=ave)
        else:
            tmp = CCAM_CSV(file, ave=ave)
        if i == 0:
            combined = tmp
        else:
            # This ensures that rounding errors are not causing mismatches in columns
            cols1 = list(combined['wvl'].columns)
            cols2 = list(tmp['wvl'].columns)
            if set(cols1) == set(cols2):
                combined = pd.concat([combined, tmp])
            else:
                print("Wavelengths don't match!")

    combined.loc[:, ('meta', 'sclock')] = pd.to_numeric(combined.loc[:, ('meta', 'sclock')])

    if lookupfile is not None:

        combined = lookup(combined, lookupfile=lookupfile.replace('[','').replace(']','').replace("'",'').replace(' ','').split(','))
    if to_csv is not None:
        combined.to_csv(to_csv)
    return combined
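
A minimal usage sketch of this module (the file name, directory, and output path below are hypothetical placeholders, not part of this commit): CCAM_CSV and CCAM_SAV read a single released ChemCam product, while ccam_batch gathers every matching file under a directory, keeps only the highest-version file per sclock, and returns one combined DataFrame.

    from plio.io.io_ccam_pds import CCAM_CSV, ccam_batch

    # read one CSV product, keeping only the mean spectrum (hypothetical file name)
    single = CCAM_CSV('CL5_398645626CCS_F0050104CCAM02303P3.csv', ave=True)

    # read every *.csv under a directory, keep all shots, and write the result out
    combined = ccam_batch('/data/chemcam/', searchstring='*.csv',
                          ave=False, to_csv='chemcam_combined.csv')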

plio/io/io_edr.py

0 → 100644
+80 −0
import os

import numpy as np
import pandas as pd


def EDR(input_file):
    f = open(input_file, 'rb')  # read as bytes so python won't complain about the binary part of the file

    # read lines of the header until reaching the end of the libs table (collecting other metadata along the way)
    end_of_libs_table = False
    while end_of_libs_table is False:
        line = str(f.readline(), 'utf-8').replace('\r', '').replace('\n',
                                                                    '')  # convert the current line to a string and get rid of newline characters
        line = line.split('=')  # split the line on equals sign if present
        # look for the name of the value we want, if the current line has it, then set the value
        if 'RECORD_BYTES' in line[0]:
            rbytes = int(line[1])
        if 'LABEL_RECORDS' in line[0]:
            lrecs = int(line[1])
        if 'SPACECRAFT_CLOCK_START_COUNT' in line[0]:
            sclock = int(line[1].replace('"', '').split('.')[0])
        if 'SEQUENCE_ID' in line[0]:
            seqID = line[1].replace('"', '')
        if 'INSTRUMENT_FOCUS_DISTANCE' in line[0]:
            focus_dist = int(line[1])

        if 'INSTRUMENT_TEMPERATURE' in line[0]:
            instrument_temps = line[1] \
                               + str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '') \
                               + str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '') \
                               + str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '')
            instrument_temps = [float(i) for i in
                                instrument_temps.replace('<degC>', '').replace('(', '').replace(')', '').replace(' ',
                                                                                                                 '').split(
                                    ',')]
            instrument_temps_name = str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '')
            instrument_temps_name = instrument_temps_name.split('=')[1] \
                                    + str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '') \
                                    + str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '') \
                                    + str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '') \
                                    + str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '')
            instrument_temps_name = instrument_temps_name.replace(' ', '').replace('(', '').replace(')', '').replace(
                '"', '').split(',')
            f.readline()
            pass
        try:
            if 'CCAM_LIBS_DATA_CONTAINER' in line[1]:
                nshots = int(str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '').split('=')[1])
                start_byte = int(str(f.readline(), 'utf-8').replace('\r', '').replace('\n', '').split('=')[1])
            if 'END_OBJECT' in line[0] and 'CCAM_LIBS_TABLE' in line[1]:
                end_of_libs_table = True
        except:
            pass

    f.close()
    header_skip = lrecs * rbytes  # calculate the number of header bytes to skip to get to the real data

    with open(input_file, "rb") as f:
        f.seek(header_skip + start_byte - 1, 0)
        spectra = []
        while len(spectra) < nshots:
            spectrum = []
            while len(spectrum) < 6444:
                spectrum.append(int.from_bytes(f.read(2), byteorder='big', signed=False))
            spectra.append(spectrum)
    spectra = np.array(spectra, dtype='int')
    cols = np.array(list(range(spectra.shape[1]))) + 1
    cols = [('channel', i) for i in cols]
    inds = np.array(list(range(spectra.shape[0]))) + 1
    sp = pd.DataFrame(spectra, columns=pd.MultiIndex.from_tuples(cols), index=inds)
    sp[('meta', 'EDR_file')] = os.path.basename(input_file)
    sp[('meta', 'Spacecraft_Clock')] = sclock
    sp[('meta', 'Shot')] = sp.index
    sp[('meta', 'SeqID')] = seqID
    sp[('meta', 'Focus_Distance')] = focus_dist
    for ind, name in enumerate(instrument_temps_name):
        sp[('meta', name + '_temp')] = instrument_temps[ind]
    sp.to_csv('test.csv')
    return sp
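
A minimal usage sketch (the EDR file name is a hypothetical placeholder): EDR parses the PDS label for the record size, spacecraft clock, and table location, then reads nshots spectra of 6444 big-endian unsigned 16-bit channels into a DataFrame with ('channel', n) and ('meta', ...) columns. Note that, as written, it also writes a copy to test.csv in the working directory.

    from plio.io.io_edr import EDR

    # hypothetical EDR product name
    shots = EDR('cl5_398645626edr_f0050104ccam02303m1.dat')
    print(shots[('meta', 'Spacecraft_Clock')].iloc[0])  # metadata pulled from the label
    print(shots['channel'].shape)                       # (nshots, 6444) raw DN values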

plio/io/io_jsc.py

0 → 100644
+202 −0
import os

import numpy as np
import pandas as pd
from pandas.core.common import array_equivalent

from plio.utils.utils import file_search


# This function reads the lookup tables used to expand metadata from the file names
# This is separated from parsing the filenames so that for large lists of files the
# lookup tables don't need to be read over and over
#
# Info in the tables is stored in a dict of dataframes so that only one variable
# (the dict) needs to be passed between functions
def read_refdata(LUT_files):
    ID_info = pd.read_csv(LUT_files['ID'], index_col=0)
    spectrometer_info = pd.read_csv(LUT_files['spect'], index_col=0)
    # spectrometer_info.reset_index(inplace=True)
    laser_info = pd.read_csv(LUT_files['laser'], index_col=0)
    # laser_info.reset_index(inplace=True)
    exp_info = pd.read_csv(LUT_files['exp'], index_col=0)
    # exp_info.reset_index(inplace=True)
    sample_info = pd.read_csv(LUT_files['sample'], index_col=0)
    # sample_info.reset_index(inplace=True)
    refdata = {'spect': spectrometer_info, 'laser': laser_info, 'exp': exp_info, 'sample': sample_info, 'ID': ID_info}
    return refdata


# This function parses the file names to record metadata related to the observation
def jsc_filename_parse(filename, refdata):
    filename = os.path.basename(filename)  # strip the path off of the file name
    filename = filename.split('_')  # split the file name on underscores
    libs_ID = filename[0]
    laserID = filename[4][0]
    expID = filename[5]
    spectID = filename[6]

    try:
        sampleID = refdata['ID'].loc[libs_ID].values[0]
        file_info = pd.DataFrame(refdata['sample'].loc[sampleID])
        if file_info.columns.shape[0] < file_info.index.shape[0]:
            file_info = file_info.T
        if file_info.index.shape[0] > 1:
            print('More than one matching row for ' + sampleID + '!')
            tempID = 'Unknown'
            file_info = pd.DataFrame(refdata['sample'].loc[tempID])
            if file_info.columns.shape[0] < file_info.index.shape[0]:
                file_info = file_info.T


    except:
        sampleID = 'Unknown'
        file_info = pd.DataFrame(refdata['sample'].loc[sampleID])
        if file_info.columns.shape[0] < file_info.index.shape[0]:
            file_info = file_info.T

    file_info['Sample ID'] = sampleID
    file_info['LIBS ID'] = libs_ID
    file_info.reset_index(level=0, inplace=True, drop=True)
    file_info['loc'] = int(filename[1])
    file_info['lab'] = filename[2]
    file_info['gas'] = filename[3][0]
    file_info['pressure'] = float(filename[3][1:])

    if laserID in refdata['laser'].index:
        laser_info = pd.DataFrame(refdata['laser'].loc[laserID]).T
        laser_info.index.name = 'Laser Identifier'
        laser_info.reset_index(level=0, inplace=True)
        file_info = pd.concat([file_info, laser_info], axis=1)

    file_info['laser_power'] = float(filename[4][1:])
    if expID in refdata['exp'].index:
        exp_info = pd.DataFrame(refdata['exp'].loc[expID]).T
        exp_info.index.name = 'Exp Identifier'
        exp_info.reset_index(level=0, inplace=True)
        file_info = pd.concat([file_info, exp_info], axis=1)

    file_info['spectrometer'] = spectID
    if spectID in refdata['spect'].index:
        temp = refdata['spect'].loc[spectID]
        temp = [temp[2], temp[4:]]
        spect_info = pd.DataFrame(refdata['spect'].loc[spectID]).T
        spect_info.index.name = 'Spectrometer Identifier'
        spect_info.reset_index(level=0, inplace=True)
        file_info = pd.concat([file_info, spect_info], axis=1)

    return file_info


def JSC(input_files, refdata):
    try:
        # read the first file
        data = pd.read_csv(input_files[0], skiprows=14, sep='\t', engine='c')
        data = data.rename(columns={data.columns[0]: 'time1', data.columns[1]: 'time2'})
        metadata = pd.concat([jsc_filename_parse(input_files[0], refdata)] * len(data.index))
        metadata.drop('spectrometer', axis=1, inplace=True)

        # read the next files and merge them with the first
        for file in input_files[1:]:
            datatemp = pd.read_csv(file, skiprows=14, sep='\t', engine='c')
            datatemp = datatemp.rename(columns={datatemp.columns[0]: 'time1', datatemp.columns[1]: 'time2'})
            data = data.merge(datatemp)

        time = data[['time1', 'time2']]  # split the two time columns from the data frame
        data.drop(['time1', 'time2'], axis=1, inplace=True)  # trim the data frame so it is just the spectra

        # make a multiindex for each wavelength column so they can be easily isolated from metadata later
        data.columns = [['wvl'] * len(data.columns), np.array(data.columns.values, dtype='float').round(4)]

        metadata.index = data.index
        metadata = pd.concat([metadata, time], axis=1)
        compcols = ['SiO2', 'TiO2', 'Al2O3', 'Cr2O3', 'Fe2O3T', 'MnO', 'MgO', 'CaO', 'Na2O', 'K2O', 'P2O5',
                    'SO3 LOI Residue', 'Total', 'Total Includes', '%LOI', 'FeO',
                    'Fe2O3', 'SO3 Actual', 'Fe(3+)/Fe(Total)', 'Rb (ug/g)', 'Sr (ug/g)', 'Y (ug/g)', 'Zr (ug/g)',
                    'V (ug/g)', 'Ni (ug/g)', 'Cr (ug/g)',
                    'Nb (ug/g)', 'Ga (ug/g)', 'Cu (ug/g)', 'Zn (ug/g)', 'Co (ug/g)', 'Ba (ug/g)', 'La (ug/g)',
                    'Ce (ug/g)', 'U (ug/g)', 'Th (ug/g)', 'Sc (ug/g)',
                    'Pb (ug/g)', 'Ge (ug/g)', 'As (ug/g)', 'Cl (ug/g)']
        compdata = metadata[compcols]
        metadata.drop(compcols, axis=1, inplace=True)
        metadata.columns = [['meta'] * len(metadata.columns), metadata.columns.values]
        compdata.columns = [['comp'] * len(compdata.columns), compdata.columns.values]
        data = pd.concat([data, metadata, compdata], axis=1)

        data[('meta', 'Scan #')] = data.index
        data.set_index(('meta', 'time2'), drop=False, inplace=True)

        return data
    except:
        print('Problem reading: ' + input_files[0])
        print('Moving to Problem_Files')
        os.rename(input_files[0],
                  r"Problem_Files\\" + os.path.basename(input_files[0]))
        return None


def jsc_batch(directory, LUT_files, searchstring='*.txt', to_csv=None):
    # Read in the lookup tables to expand filename metadata
    refdata = read_refdata(LUT_files)
    # get the list of files that match the search string in the given directory
    filelist = file_search(directory, searchstring)
    spectIDs = []  # create an empty list to hold the spectrometer IDs
    libsIDs = []
    timestamps = []
    locs = []
    for file in filelist:
        filesplit = os.path.basename(file).split('_')
        spectIDs.append(filesplit[6])  # get the spectrometer IDs for each file in the list
        libsIDs.append(filesplit[0])
        timestamps.append(filesplit[-1].split('.')[0])
        locs.append(filesplit[1])
    spectIDs_unique = np.unique(spectIDs)  # get the unique spectrometer IDs
    libsIDs_unique = np.unique(libsIDs)
    dfs = []  # create an empty list to hold the data frames for each spectrometer

    # loop through each LIBS ID
    alldata = []
    for ID in libsIDs_unique:
        print('Working on : ' + str(ID))
        sublist = filelist[np.in1d(libsIDs, ID)]
        locs = []
        for file in sublist:
            locs.append(os.path.basename(file).split('_')[1])
        locs_unique = np.unique(locs)
        # loop through each location for that libs ID
        for loc in locs_unique:
            print(loc)
            sub_sublist = sublist[np.in1d(locs, loc)]  # get the files for that LIBSID and location
            data = JSC(sub_sublist, refdata)
            alldata.append(data)
            pass

    combined = pd.concat(alldata)
    if to_csv is not None:
        print('Writing combined data to: ' + to_csv)
        combined.to_csv(to_csv)
    return combined


# got this function from stack overflow: http://stackoverflow.com/questions/14984119/python-pandas-remove-duplicate-columns
# it's slow but doesn't crash python like combined.T.drop_duplicates().T does in some cases with very large sets of data
def duplicate_columns(frame):
    groups = frame.columns.to_series().groupby(frame.dtypes).groups
    dups = []

    for t, v in groups.items():

        cs = frame[v].columns
        vs = frame[v]
        lcs = len(cs)

        for i in range(lcs):
            ia = vs.iloc[:, i].values
            for j in range(i + 1, lcs):
                ja = vs.iloc[:, j].values
                if array_equivalent(ia, ja):
                    dups.append(cs[i])
                    break

    return dups
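
A minimal usage sketch (all paths below are hypothetical placeholders): LUT_files maps the keys that read_refdata expects ('ID', 'spect', 'laser', 'exp', 'sample') to the lookup-table CSVs, and jsc_batch walks a directory of tab-delimited spectra, expands the filename metadata, and concatenates everything into one DataFrame; duplicate_columns can then be used to find repeated columns in the result.

    from plio.io.io_jsc import jsc_batch, duplicate_columns

    # hypothetical lookup-table paths -- point these at local copies of the JSC tables
    LUT_files = {'ID': 'ID_lookup.csv',
                 'spect': 'spectrometer_lookup.csv',
                 'laser': 'laser_lookup.csv',
                 'exp': 'experiment_lookup.csv',
                 'sample': 'sample_lookup.csv'}

    combined = jsc_batch('/data/jsc_libs/', LUT_files,
                         searchstring='*.txt', to_csv='jsc_combined.csv')

    # drop any exactly duplicated columns left over from merging the lookup tables
    combined = combined.drop(duplicate_columns(combined), axis=1)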