Initial addition of ipf reader. (c0ee7950) · Commits · aflab / astrogeology / Plio

plio/io/io_bae.py

+135 −10

Original line number	Diff line number	Diff line
		@@ -52,6 +52,118 @@ def socetset_keywords_to_json(keywords, ell=None):
		parse(ell)
		return json.dumps(stream)

		def read_ipfs(input_data_list):
		    """
		    Read one or more socet ipf files into a single pandas data frame

		    Column names are taken from the header (third line) of the first file
		    only.  A header entry of the form ``name(N)`` is expanded into the N
		    columns ``name0`` ... ``name{N-1}``.  The rows returned by ``read_ipf``
		    for every file are stacked in the order the files are listed.

		    Parameters
		    ----------
		    input_data_list : list
		                      list of paths to the set of input data files

		    Returns
		    -------
		    df : pd.DataFrame
		         containing the ipf data with appropriate column names; columns
		         that parse as numbers are converted, all others are left as read

		    Raises
		    ------
		    IndexError
		        If ``input_data_list`` is empty
		    """
		    # atleast_1d keeps a single-column header iterable; genfromtxt returns a
		    # 0-d array in that case
		    default_columns = np.atleast_1d(
		        np.genfromtxt(input_data_list[0], skip_header=2, dtype='unicode',
		                      max_rows=1, delimiter=','))

		    columns = []
		    for column in default_columns:
		        if '(' in column and ')' in column:
		            # 'name(N)' -> name0, name1, ..., name{N-1}
		            column_name, suffix = column.split('(')
		            num = int(suffix.split(')')[0])
		            columns.extend('{}{}'.format(column_name, column_num)
		                           for column_num in range(num))
		        else:
		            columns.append(column)

		    # Stack the point rows of every file, preserving file order
		    d_total = []
		    for input_file in input_data_list:
		        d_total.extend(read_ipf(input_file))

		    df = pd.DataFrame(d_total, columns=columns)

		    def _soft_to_numeric(series):
		        # Same outcome as the deprecated errors='ignore': a column that
		        # cannot be parsed as numeric (e.g. the point id) is returned as is
		        try:
		            return pd.to_numeric(series)
		        except (ValueError, TypeError):
		            return series

		    # Soft conversion of numeric types to numerics, allows str in first col for point_id
		    df = df.apply(_soft_to_numeric)

		    return df

		def read_ipf(input_data):
		    """
		    Read a single socet ipf file into a numpy array

		    The first three lines of the file are treated as a header; the second
		    line holds the number of points.  Every whitespace-separated token
		    after the header is gathered in file order and the tokens are grouped
		    into rows of 12 values, one row per point.

		    Parameters
		    ----------
		    input_data : str
		                 path to the input data file

		    Returns
		    -------
		    d : np.ndarray
		        2D array of strings with shape (number of points, 12); conversion
		        to numeric types is left to the caller

		    Raises
		    ------
		    ValueError
		        If the file has no point count line, the point count is not an
		        integer, or the number of data tokens is not a multiple of 12
		    AssertionError
		        If the number of rows read differs from the header point count
		    """
		    # Number of values that make up one point record
		    n_fields = 12

		    # Read the file once, inside a context manager so the handle is closed
		    with open(input_data, 'r') as f:
		        lines = f.readlines()

		    if len(lines) < 2:
		        raise ValueError('{} is missing the point count header line.'.format(input_data))
		    cnt = int(lines[1])

		    # TODO: Add unicode conversion

		    # A point record spans several lines with differing token counts, so
		    # flatten all tokens first and only then cut them into fixed-width rows
		    tokens = [token for line in lines[3:] for token in line.split()]
		    d = np.array(tokens, dtype=str).reshape(-1, n_fields)

		    # Validate the read data with the header point count
		    assert cnt == len(d), 'Dataframe length {} does not match point length {}.'.format(len(d), cnt)

		    return d

		def read_gpf(input_data):
		"""
		Read a socet gpf file into a pandas data frame
		@@ -76,22 +188,35 @@ def read_gpf(input_data):
		col = l
		break

		default_columns = np.genfromtxt(input_data, skip_header=2, dtype='unicode',
		max_rows = 1, delimiter = ',')

		columns = []

		for column in default_columns:

		if '(' in column and ')' in column:
		column_name ,suffix = column.split('(')
		num = int(suffix.split(')')[0])

		for column_num in range(int(num)):
		new_column = '{}{}'.format(column_name, column_num)
		columns.append(new_column);

		else:
		columns.append(column)

		# Mixed types requires read as unicode - let pandas soft convert
		d = np.genfromtxt(input_data, skip_header=3, dtype='unicode')
		d = d.reshape(-1, 12)

		#TODO: cols should be used to dynamically generate the column names

		df = pd.DataFrame(d, columns=['point_id', 'stat', 'known',
		'lat_Y_North', 'long_X_East','ht',
		'sigma0', 'sigma1', 'sigma2',
		'res0', 'res1', 'res2'])
		df = pd.DataFrame(d, columns=columns)

		# Soft conversion of numeric types to numerics, allows str in first col for point_id
		df = df.apply(pd.to_numeric, errors='ignore')

		# Validate the read data with the header point count
		assert int(cnt) == len(df)
		assert int(cnt) == len(df), 'Dataframe length {} does not match point length {}.'.format(int(cnt), len(df))

		return df