Loading etl/assets/commons/__init__.py +10 −0 Original line number Diff line number Diff line Loading @@ -83,6 +83,16 @@ def load_config_file(config_file_path: str, return config def moving_average(x: np.array , window_size: int) -> np.array: """ Compute the moving average of a 1D array :param x: the 1D array to compute the moving average on :param window_size: the size of the moving average window :return: the moving average of the array """ return np.convolve(x, np.ones(window_size), 'valid') / window_size def get_moldata(species_names: list, logger: logging.Logger, path: Union[str, None] = None, Loading etl/assets/data/radex_data.csv 0 → 100644 +73 −0 Original line number Diff line number Diff line rot_trans,log_col_dens,log_density,temperature,line1,line2,line3 2-1,15.4,3,10,0.78,0.17,0.01 2-1,15.4,3,30,1.8,0.35,0.018 2-1,15.4,4,10,3.37,0.78,0.04 2-1,15.4,4,30,7.96,1.28,0.07 2-1,15.4,5,10,5.64,2.43,0.34 2-1,15.4,5,30,12.5,2.98,0.46 2-1,15.4,6,10,6.11,4.11,1.41 2-1,15.4,6,30,8.49,3.77,1.34 2-1,15.4,7,10,5.96,4.62,2.12 2-1,15.4,7,30,5.68,3.63,2.06 2-1,15.4,8,10,5.84,4.73,2.30 2-1,15.4,8,30,4.13,3.63,2.24 2-1,14,3,10,0.069,0.0042,0.0001 2-1,14,3,30,0.16,0.0091,0.00026 2-1,14,4,10,0.47,0.028,0.00083 2-1,14,4,30,0.85,0.046,0.0019 2-1,14,5,10,0.91,0.085,0.0076 2-1,14,5,30,0.86,0.09,0.014 2-1,14,6,10,0.75,0.20,0.05 2-1,14,6,30,0.46,0.14,0.05 2-1,14,7,10,0.59,0.28,0.09 2-1,14,7,30,0.28,0.15,0.08 2-1,14,8,10,0.52,0.31,0.1 2-1,14,8,30,0.19,0.15,0.09 5-4,15.4,3,10,1.1e-3,2.3e-2 5-4,15.4,3,30,1.3e-2,0.26, 5-4,15.4,4,10,1.7e-2,0.32, 5-4,15.4,4,30,0.22,2.33, 5-4,15.4,5,10,0.25,2.14, 5-4,15.4,5,30,2.72,10.44, 5-4,15.4,6,10,1.43,3.85, 5-4,15.4,6,30,6.79,16.5, 5-4,15.4,7,10,2.29,3.97, 5-4,15.4,7,30,7.76,13.87, 5-4,15.4,8,10,2.50,3.88, 5-4,15.4,8,30,8.07,11.0, 5-4,14,3,10,3.6e-5,8.2e-4, 5-4,14,3,30,4.5e-4,1.0e-2, 5-4,14,4,10,4.1e-4,0.010, 5-4,14,4,30,4.7e-3,0.12, 5-4,14,5,10,5.4e-3,0.13, 5-4,14,5,30,5.2e-2,0.92, 5-4,14,6,10,4.8e-2,0.32, 5-4,14,6,30,0.24,1.30, 5-4,14,7,10,0.11,0.31, 5-4,14,7,30,0.36,0.87 5-4,14,8,10,0.13,0.29, 5-4,14,8,30,0.39,0.60, 7-6,15.4,3,10,1.4e-5,3.3e-4, 7-6,15.4,3,30,1.4e-3,2.6e-2, 7-6,15.4,4,10,2.0e-4,5.7e-3, 7-6,15.4,4,30,2.0e-2,3.8e-1, 7-6,15.4,5,10,4.8e-3,0.1, 7-6,15.4,5,30,0.47,4.1, 7-6,15.4,6,10,8.6e-2,0.52, 7-6,15.4,6,30,3.54,11.37, 7-6,15.4,7,10,0.26,0.67, 7-6,15.4,7,30,5.46,10.82, 7-6,15.4,8,10,0.32,0.66, 7-6,15.4,8,30,5.97,8.53, 7-6,14,3,10,5.3e-7,1.1e-5, 7-6,14,3,30,5.4e-5,8.6e-4, 7-6,14,4,10,6.2e-6,1.6e-4, 7-6,14,4,30,6.2e-4,1.2e-2, 7-6,14,5,10,1.2e-4,3.3e-3, 7-6,14,5,30,1.1e-2,0.19, 7-6,14,6,10,2.5e-3,2.2e-2, 7-6,14,6,30,0.11,0.74, 7-6,14,7,10,9.7e-3,3.0e-2, 7-6,14,7,30,0.24,0.62, 7-6,14,8,10,1.3e-2,2.9e-2, 7-6,14,8,30,0.27,0.44, No newline at end of file etl/prs/prs_analytical_representations.py +43 −27 Original line number Diff line number Diff line Loading @@ -2,17 +2,19 @@ import numpy as np import os import matplotlib.pyplot as plt import pickle from scipy.interpolate import splrep, BSpline, RegularGridInterpolator from scipy.optimize import curve_fit from assets.commons import (get_data, load_config_file, setup_logger, validate_parameter) import yaml from typing import Union, List def approx(x, x_shift, a, b, exponent, scale=1): return (np.abs(np.arctanh(2 * (x - x_shift) / scale) + a)) ** exponent + b def approx(x, x_shift, b, exponent, scale): arg = 2 * (x - x_shift) / scale clean_arg = np.where(np.abs(arg) > 0.99999, np.sign(arg) * 0.99999, arg) return (np.arctanh(clean_arg) + b) ** exponent def main(ratios_to_fit: Union[List[str], None] = None): Loading @@ -25,19 +27,12 @@ def main(ratios_to_fit: Union[List[str], None] = None): logger.info(f'Using {config["run_type"]} to fit data...') logger.info(f'Data contains {data.shape[0]} rows') bw = { '87-86': 0.1, '88-87': 0.1, '88-86': 0.1, '257-256': 0.4, '381-380': 0.2} x0 = { '87-86': ((0.55, 2.3, 3.75, 0.9),), '88-87': ((0.59, 2.3, 3.9, 1.05, 0.96),), '88-86': ((0.55, 2.5, 4.3, 0.83, 1.08),), '257-256': ((5, 2.5, 2.45, 0.75, 8), (5, 2.5, 3.33, 0.75, -8)), '381-380': ((2.04, 2.5, 3.1, 0.6, 2.1), (2.18, 2.5, 3.7, 0.79, -2.4)), '87-86': ((0.55, 6.75, 0.9, 1),), '88-87': ((0.59, 6.9, 1.05, 0.96),), '88-86': ((0.55, 6.3, 0.83, 1.08),), '257-256': ((5, 4.45, 0.75, 8), (4.5, 8.5, 0.79, -7)), '381-380': ((2.04, 5.1, 0.6, 2.1), (2.18, 5.7, 0.79, -2.4)), } min_ratio = { '87-86': 0.08, Loading @@ -51,31 +46,52 @@ def main(ratios_to_fit: Union[List[str], None] = None): '88-86': 1.07, '257-256': 7.5, '381-380': 2.9} density_limits = { '87-86': ((5e4, 2e7),), '88-87': ((1e5, 3e7),), '88-86': ((1e5, 3e7),), '257-256': ((2e3, 7.5e4), (7.5e4, 4e6)), '381-380': ((1e4, 1.9e5), (1.9e5, 1e7)) } best_fit_params = {} for ratio_string in _ratio_to_fit: _column_to_fit = f'ratio_{ratio_string}' data.sort_values(by=_column_to_fit, inplace=True) plt.clf() plt.scatter(data['avg_nh2'], data[_column_to_fit], marker='+', alpha=0.1) x_reg = np.linspace(min_ratio[ratio_string], max_ratio[ratio_string], 100) x_reg = np.linspace(min_ratio[ratio_string], max_ratio[ratio_string], 1000) kde_max = pickle.load( open(f'prs/output/run_type/{config["run_type"]}/kde_smoothed_{ratio_string}_max_locus.pickle', 'rb')) data_clean = data[[_column_to_fit, 'avg_nh2']].loc[data[_column_to_fit] <= max_ratio[ratio_string]].copy() kde = pickle.load( open(f'prs/output/run_type/{config["run_type"]}/trained_model/ratio_density_kde_{ratio_string}.pickle', 'rb')) rgi = RegularGridInterpolator((kde['y'] * bw[ratio_string], kde['x'] * 0.2), kde['values'].reshape(200, 200).T) density = rgi(data_clean[[_column_to_fit, 'avg_nh2']]) data_clean = data_clean.loc[(density > density.mean() - density.std())] plt.scatter(data_clean['avg_nh2'], data_clean[_column_to_fit], marker='+', alpha=0.1, color='red') plt.plot(kde_max[1], kde_max[0], color='red') plt.semilogy() plt.xlabel('log(<n(H2)>)') plt.ylabel('Ratio') for components in x0[ratio_string]: plt.plot(approx(x_reg, *components),x_reg , color='cyan') best_fit_params[ratio_string] = [] for guesses, limits in zip(x0[ratio_string], density_limits[ratio_string]): clean_mask = (~np.isnan(kde_max[0]) & ~np.isnan(kde_max[1]) & (kde_max[0] >= limits[0]) & (kde_max[0] <= limits[1])) best_fit, par_cov = curve_fit(approx, kde_max[1][clean_mask], np.log10(kde_max[0][clean_mask]), p0=guesses, bounds=((0.1, 2, 0.3, -9), (7, 10, 1.5, 9))) approx_density = 10 ** approx(x_reg, *best_fit) print(best_fit) best_fit_params[ratio_string].append({param: float(element) for element, param in zip(best_fit, ['a', 'b', 'nu', 's'])}) approx_density = np.where((approx_density >= limits[0]) & (approx_density <= limits[1]), approx_density, np.nan) plt.plot(x_reg, approx_density) plt.savefig(os.path.join('..', 'publications', '6373bb408e4040043398e495', 'referee', f'analytical_expressions_comparison_{ratio_string}.png')) with (open(os.path.join('prs', 'output', 'run_type', 'constant_abundance_p15_q05', 'best_fit_params.yml'), 'w') as outfile): yaml.dump(best_fit_params, outfile) if __name__ == '__main__': Loading etl/prs/prs_density_inference.py +153 −11 Original line number Diff line number Diff line Loading @@ -11,13 +11,14 @@ from sklearn.model_selection import GridSearchCV from assets.commons import (load_config_file, validate_parameter, setup_logger, get_postprocessed_data) get_postprocessed_data, moving_average) from assets.constants import line_ratio_mapping from multiprocessing import Pool from typing import Tuple, Union, List from scipy.integrate import cumtrapz from scipy.stats import truncnorm from scipy.interpolate import RegularGridInterpolator from scipy.interpolate import RegularGridInterpolator, InterpolatedUnivariateSpline from functools import reduce Loading @@ -35,6 +36,7 @@ def train_kde_model(ratio: List[str], best_bandwidth: Union[None, dict], model_root_folder: Union[str, None] = None, ratio_limits: Union[None, dict] = None, plot_radex_data: Union[None, bool] = False, rt_adjustment_factor: Union[int, float] = 2) -> Tuple[str, KernelDensity]: """ Create the KDE from the modelled datapoints. Loading @@ -48,6 +50,7 @@ def train_kde_model(ratio: List[str], :param rt_adjustment_factor: a scaling factor to apply to the specified bandwidth for the computation of the KDE for the sparser RT-only data :param ratio_limits: fixed plotting limits :param plot_radex_data: whether to plot the radex data :return: the string representing the ratio modelled and the KDE itself """ _model_root_folder = validate_parameter( Loading @@ -62,11 +65,13 @@ def train_kde_model(ratio: List[str], grid = scaled_grid.copy() grid[0] = scaled_grid[0] * 0.2 grid[1] = scaled_grid[1] * best_bandwidth[ratio_string] radex_data = get_radex_data() if plot_radex_data else None plot_kde_ratio_nh2(grid=grid, values_on_grid=z.reshape(points_per_axis, points_per_axis), ratio_string=ratio_string, model_root_folder=_model_root_folder, training_data=training_data, additional_data=radex_data, suffix_outfile='_ml', ratio_limits=ratio_limits[ratio_string]) Loading @@ -83,6 +88,7 @@ def train_kde_model(ratio: List[str], ratio_string=ratio_string, model_root_folder=_model_root_folder, training_data=training_data[training_data['source'] == 'RT'], additional_data=radex_data, ratio_limits=ratio_limits[ratio_string]) with open( os.path.join(_model_root_folder, 'trained_model', f'ratio_density_kde_{ratio_string}.pickle'), 'wb' Loading Loading @@ -140,6 +146,7 @@ def plot_kde_ratio_nh2(grid: np.array, ratio_string: str, model_root_folder: str, training_data: pd.DataFrame, additional_data: pd.DataFrame = None, suffix_outfile: str = None, ratio_limits: Union[None, list] = None): """ Loading @@ -153,24 +160,152 @@ def plot_kde_ratio_nh2(grid: np.array, :param training_data: The DataFrame containing the training data. :param suffix_outfile: Optional. The suffix to append to the output file name. Defaults to an empty string if None. :param ratio_limits: Optional. The limits for the ratio axis. Defaults to None, which auto-scales the axis. :param additional_data: Optional. The DataFrame containing the additional data to plot. :return: None. Saves the plot as a PNG file in the specified folder. """ plt.rcParams.update({'font.size': 20}) _suffix_outfile = validate_parameter(suffix_outfile, default='') plt.clf() plt.figure(figsize=(8, 6)) plt.scatter(training_data['avg_nh2'], training_data[f'ratio_{ratio_string}'], marker='+', alpha=0.1, facecolor='grey') plt.contour(10 ** grid[0], grid[1], values_on_grid, levels=np.arange(0.05, 0.95, 0.15)) smoothed_max_locus_density, smoothed_max_locus_ratio = plot_kde_results(grid=grid, ratio_string=ratio_string, training_data=training_data, values_on_grid=values_on_grid) if (additional_data is not None) and (ratio_string in additional_data.columns): plt.scatter(10 ** additional_data['log_density'], additional_data[ratio_string], marker='*', alpha=1, facecolor='red', s=100) plt.semilogx() plt.xlabel(r'<$n$(H$_2$)> [cm$^{-3}$]') plt.ylabel(f'Ratio {line_ratio_mapping[ratio_string]}') plt.ylim(ratio_limits) plt.tight_layout() plt.legend(loc='best') plt.savefig(os.path.join( model_root_folder, 'figures', f'ratio_vs_avg_density_los_kde_{ratio_string}{_suffix_outfile}.png')) with open(os.path.join(model_root_folder, f'kde_smoothed_{ratio_string}_max_locus.pickle'), 'wb') as outfile: pickle.dump([smoothed_max_locus_density, smoothed_max_locus_ratio], outfile) def plot_kde_results(grid: np.array, ratio_string: str, training_data: pd.DataFrame, values_on_grid: np.array, plot_colours: Union[None, Tuple[str, str]] = None, plot_training_data: bool = True, colour_scatter: Union[None, str] = None) -> Tuple[np.array, np.array]: _plot_colours = validate_parameter(plot_colours, default=('gold', 'cornflowerblue')) _colour_scatter = validate_parameter(colour_scatter, default='grey') if plot_training_data: plt.scatter(training_data['avg_nh2'], training_data[f'ratio_{ratio_string}'], marker='+', alpha=0.1, facecolor=_colour_scatter) values_on_grid_norm = values_on_grid.copy() / np.max(values_on_grid, axis=1, keepdims=True) density_plot_values = 10 ** grid[0][:, 0] for hpd_threshold, colour, label in zip((0.9, 0.5), _plot_colours, ('90% HPD', '50% HPD')): densities, limits_hpd = get_hpd_relation(grid, values_on_grid_norm, hpd_threshold=hpd_threshold) density_plot_values = 10 ** moving_average(densities, 20) ratio_lower_limits = moving_average(limits_hpd[:, 0], 20) ratio_upper_limits = moving_average(limits_hpd[:, 1], 20) plt.fill_between(density_plot_values, ratio_lower_limits, ratio_upper_limits, alpha=0.3, facecolor=colour, label=label) plt.plot(density_plot_values, ratio_lower_limits, color=colour) plt.plot(density_plot_values, ratio_upper_limits, color=colour) maxdens, maxratio = get_max_locus(grid, values_on_grid) smoothed_max_locus_density = 10 ** moving_average(maxdens, 20) smoothed_max_locus_ratio = moving_average(maxratio, 20) mask_condition = (smoothed_max_locus_density >= density_plot_values[0]) & \ (smoothed_max_locus_density <= density_plot_values[-1]) smoothed_max_locus_density = np.where(mask_condition, smoothed_max_locus_density, np.nan) smoothed_max_locus_ratio = np.where(mask_condition, smoothed_max_locus_ratio, np.nan) plt.plot(smoothed_max_locus_density, smoothed_max_locus_ratio, label='Max PDF', color=_plot_colours[1]) return smoothed_max_locus_density, smoothed_max_locus_ratio def get_max_locus(grid, values_on_grid): """ Compute the maximum of the KDE for each density value. Parameters ---------- grid : tuple The grid of values used for the KDE. values_on_grid : array_like The computed KDE values on the grid. Returns ------- max_density : array_like The maximum density values. max_ratio : array_like The corresponding maximum ratio values. """ # Find the maximum ratio value for each density value max_ratio = grid[1][0, np.argmax(values_on_grid, axis=1)] # Replace missing solutions with NaN max_ratio = np.where(max_ratio > 0, max_ratio, np.nan) max_density = grid[0][:, 0] # Apply mask from missing ratio values max_density = np.where(max_ratio > 0, max_density, np.nan) return max_density, max_ratio def get_hpd_relation(grid: Union[tuple, list], normalized_kde_values: np.array, hpd_threshold: float) -> tuple: """ Calculate the highest posterior density (HPD) interval for a given KDE probability density. Parameters ---------- grid : tuple A tuple containing two arrays: the density grid and the ratio grid. normalized_kde_values : np.array The normalized KDE values on the grid. Normalization is performed by dividing by the maximum value for each density. hpd_threshold : float The probability mass threshold for the HPD interval. Returns ------- densities : np.array Array of density values corresponding to the HPD limits. limits_hpd : np.array Array of tuples, each containing the lower and upper bounds of the HPD interval. """ hpd_limits = [] density_values = [] for index, density in enumerate(grid[0][:, 0]): spline = InterpolatedUnivariateSpline(grid[1][0, :], normalized_kde_values[index, :] - (1 - hpd_threshold)) roots = spline.roots() if len(roots) > 1: hpd_limits.append((roots[0], roots[-1])) density_values.append(density) elif len(roots) == 1: if (normalized_kde_values[index, :] - (1 - hpd_threshold))[0] > 0: hpd_limits.append((np.min(grid[1][0, :]), roots[0])) else: hpd_limits.append((roots[0], np.max(grid[1][0, :]))) density_values.append(density) return np.array(density_values), np.array(hpd_limits) def get_radex_data(): df_radex = pd.read_csv(os.path.join('assets', 'data', 'radex_data.csv')) df_radex['87-86'] = df_radex['line2'] / df_radex['line1'] df_radex['88-87'] = df_radex['line3'] / df_radex['line2'] df_radex['88-86'] = df_radex['line3'] / df_radex['line1'] df_radex['257-256'] = df_radex['line2'] / df_radex['line1'] df_radex['381-380'] = df_radex['line2'] / df_radex['line1'] df_radex.loc[df_radex['rot_trans'] == '2-1', '257-256'] = pd.NA df_radex.loc[df_radex['rot_trans'] == '2-1', '381-380'] = pd.NA df_radex.loc[df_radex['rot_trans'] == '5-4', '87-86'] = pd.NA df_radex.loc[df_radex['rot_trans'] == '5-4', '88-86'] = pd.NA df_radex.loc[df_radex['rot_trans'] == '5-4', '381-380'] = pd.NA df_radex.loc[df_radex['rot_trans'] == '7-6', '87-86'] = pd.NA df_radex.loc[df_radex['rot_trans'] == '7-6', '88-86'] = pd.NA df_radex.loc[df_radex['rot_trans'] == '7-6', '257-256'] = pd.NA return df_radex def get_results(x_array: np.array, Loading Loading @@ -323,7 +458,8 @@ def recompute_and_save_kdes(data: pd.DataFrame, pickle_models_dict_filename: Union[None, str] = None, model_root_folder: Union[None, str] = None, best_bandwidths: Union[None, dict] = None, ratio_limits: Union[None, dict] = None): ratio_limits: Union[None, dict] = None, plot_radex_data: Union[None, bool] = False) -> Tuple[dict, KernelDensity]: """ Retrieve the dictionary of the KDE models, either by computing it or unpickling it from previous runs :param points_per_axis: number of points for the KDE grid evaluation Loading @@ -335,6 +471,8 @@ def recompute_and_save_kdes(data: pd.DataFrame, defaults to fiducial model (constant_abundance_p15_q05) :param best_bandwidths: kernel bandwidths to use for each ratio :param ratio_limits: fixed plotting limits :param plot_radex_data: whether to plot the radex data :return: the dictionary of the KDE models and the kernel density model """ _model_root_folder = validate_parameter( model_root_folder, Loading @@ -352,7 +490,7 @@ def recompute_and_save_kdes(data: pd.DataFrame, } _best_bandwidths = validate_parameter(best_bandwidths, default=default_bandwidths) parallel_args = product(line_pairs, [data], [points_per_axis], [_best_bandwidths], [_model_root_folder], [ratio_limits]) [_best_bandwidths], [_model_root_folder], [ratio_limits], [plot_radex_data]) with Pool(nthreads) as pool: results = pool.starmap(train_kde_model, parallel_args) Loading Loading @@ -474,7 +612,8 @@ def main(measured_integrated_intensity_dict: dict, limit_rows: Union[None, int] = None, use_model_for_inference: Union[None, str] = None, best_bandwidths: Union[None, dict] = None, ratio_limits: Union[None, dict] = None): ratio_limits: Union[None, dict] = None, plot_radex_data: Union[None, bool] = True) -> None: _use_model_for_inference = validate_parameter( use_model_for_inference, default='constant_abundance_p15_q05' Loading @@ -491,7 +630,8 @@ def main(measured_integrated_intensity_dict: dict, points_per_axis=points_per_axis, model_root_folder=_model_root_folder, best_bandwidths=best_bandwidths, ratio_limits=ratio_limits) ratio_limits=ratio_limits, plot_radex_data=plot_radex_data) x_grid = np.linspace(np.log10(0.7 * np.nanmin(data['avg_nh2'])), np.log10(1.3 * np.nanmax(data['avg_nh2'])), points_per_axis) Loading @@ -508,7 +648,8 @@ def main(measured_integrated_intensity_dict: dict, line_ids = validate_line_ids(line_pairs, ratio_string) if np.isnan(measured_integrated_intensity_dict[line_ids[0]][source_name]) or \ np.isnan(measured_integrated_intensity_dict[line_ids[1]][source_name]): logger.warning(f'The ratio {ratio_string} is not available for source {source_name}. Proceeding with the remaining ratios...') logger.warning( f'The ratio {ratio_string} is not available for source {source_name}. Proceeding with the remaining ratios...') else: measured_integrated_intensity_coupled[ratio_string] = [ measured_integrated_intensity_dict[line_ids[0]][source_name], Loading Loading @@ -650,4 +791,5 @@ if __name__ == '__main__': limit_rows=limit_rows, use_model_for_inference=use_model_for_inference, best_bandwidths=external_input['best_bandwidths'], ratio_limits=external_input['ratio_limits']) ratio_limits=external_input['ratio_limits'], plot_radex_data=True) etl/prs/prs_make_comparison_figures.py +84 −66 File changed.Preview size limit exceeded, changes collapsed. Show changes Loading
etl/assets/commons/__init__.py +10 −0 Original line number Diff line number Diff line Loading @@ -83,6 +83,16 @@ def load_config_file(config_file_path: str, return config def moving_average(x: np.array , window_size: int) -> np.array: """ Compute the moving average of a 1D array :param x: the 1D array to compute the moving average on :param window_size: the size of the moving average window :return: the moving average of the array """ return np.convolve(x, np.ones(window_size), 'valid') / window_size def get_moldata(species_names: list, logger: logging.Logger, path: Union[str, None] = None, Loading
etl/assets/data/radex_data.csv 0 → 100644 +73 −0 Original line number Diff line number Diff line rot_trans,log_col_dens,log_density,temperature,line1,line2,line3 2-1,15.4,3,10,0.78,0.17,0.01 2-1,15.4,3,30,1.8,0.35,0.018 2-1,15.4,4,10,3.37,0.78,0.04 2-1,15.4,4,30,7.96,1.28,0.07 2-1,15.4,5,10,5.64,2.43,0.34 2-1,15.4,5,30,12.5,2.98,0.46 2-1,15.4,6,10,6.11,4.11,1.41 2-1,15.4,6,30,8.49,3.77,1.34 2-1,15.4,7,10,5.96,4.62,2.12 2-1,15.4,7,30,5.68,3.63,2.06 2-1,15.4,8,10,5.84,4.73,2.30 2-1,15.4,8,30,4.13,3.63,2.24 2-1,14,3,10,0.069,0.0042,0.0001 2-1,14,3,30,0.16,0.0091,0.00026 2-1,14,4,10,0.47,0.028,0.00083 2-1,14,4,30,0.85,0.046,0.0019 2-1,14,5,10,0.91,0.085,0.0076 2-1,14,5,30,0.86,0.09,0.014 2-1,14,6,10,0.75,0.20,0.05 2-1,14,6,30,0.46,0.14,0.05 2-1,14,7,10,0.59,0.28,0.09 2-1,14,7,30,0.28,0.15,0.08 2-1,14,8,10,0.52,0.31,0.1 2-1,14,8,30,0.19,0.15,0.09 5-4,15.4,3,10,1.1e-3,2.3e-2 5-4,15.4,3,30,1.3e-2,0.26, 5-4,15.4,4,10,1.7e-2,0.32, 5-4,15.4,4,30,0.22,2.33, 5-4,15.4,5,10,0.25,2.14, 5-4,15.4,5,30,2.72,10.44, 5-4,15.4,6,10,1.43,3.85, 5-4,15.4,6,30,6.79,16.5, 5-4,15.4,7,10,2.29,3.97, 5-4,15.4,7,30,7.76,13.87, 5-4,15.4,8,10,2.50,3.88, 5-4,15.4,8,30,8.07,11.0, 5-4,14,3,10,3.6e-5,8.2e-4, 5-4,14,3,30,4.5e-4,1.0e-2, 5-4,14,4,10,4.1e-4,0.010, 5-4,14,4,30,4.7e-3,0.12, 5-4,14,5,10,5.4e-3,0.13, 5-4,14,5,30,5.2e-2,0.92, 5-4,14,6,10,4.8e-2,0.32, 5-4,14,6,30,0.24,1.30, 5-4,14,7,10,0.11,0.31, 5-4,14,7,30,0.36,0.87 5-4,14,8,10,0.13,0.29, 5-4,14,8,30,0.39,0.60, 7-6,15.4,3,10,1.4e-5,3.3e-4, 7-6,15.4,3,30,1.4e-3,2.6e-2, 7-6,15.4,4,10,2.0e-4,5.7e-3, 7-6,15.4,4,30,2.0e-2,3.8e-1, 7-6,15.4,5,10,4.8e-3,0.1, 7-6,15.4,5,30,0.47,4.1, 7-6,15.4,6,10,8.6e-2,0.52, 7-6,15.4,6,30,3.54,11.37, 7-6,15.4,7,10,0.26,0.67, 7-6,15.4,7,30,5.46,10.82, 7-6,15.4,8,10,0.32,0.66, 7-6,15.4,8,30,5.97,8.53, 7-6,14,3,10,5.3e-7,1.1e-5, 7-6,14,3,30,5.4e-5,8.6e-4, 7-6,14,4,10,6.2e-6,1.6e-4, 7-6,14,4,30,6.2e-4,1.2e-2, 7-6,14,5,10,1.2e-4,3.3e-3, 7-6,14,5,30,1.1e-2,0.19, 7-6,14,6,10,2.5e-3,2.2e-2, 7-6,14,6,30,0.11,0.74, 7-6,14,7,10,9.7e-3,3.0e-2, 7-6,14,7,30,0.24,0.62, 7-6,14,8,10,1.3e-2,2.9e-2, 7-6,14,8,30,0.27,0.44, No newline at end of file
etl/prs/prs_analytical_representations.py +43 −27 Original line number Diff line number Diff line Loading @@ -2,17 +2,19 @@ import numpy as np import os import matplotlib.pyplot as plt import pickle from scipy.interpolate import splrep, BSpline, RegularGridInterpolator from scipy.optimize import curve_fit from assets.commons import (get_data, load_config_file, setup_logger, validate_parameter) import yaml from typing import Union, List def approx(x, x_shift, a, b, exponent, scale=1): return (np.abs(np.arctanh(2 * (x - x_shift) / scale) + a)) ** exponent + b def approx(x, x_shift, b, exponent, scale): arg = 2 * (x - x_shift) / scale clean_arg = np.where(np.abs(arg) > 0.99999, np.sign(arg) * 0.99999, arg) return (np.arctanh(clean_arg) + b) ** exponent def main(ratios_to_fit: Union[List[str], None] = None): Loading @@ -25,19 +27,12 @@ def main(ratios_to_fit: Union[List[str], None] = None): logger.info(f'Using {config["run_type"]} to fit data...') logger.info(f'Data contains {data.shape[0]} rows') bw = { '87-86': 0.1, '88-87': 0.1, '88-86': 0.1, '257-256': 0.4, '381-380': 0.2} x0 = { '87-86': ((0.55, 2.3, 3.75, 0.9),), '88-87': ((0.59, 2.3, 3.9, 1.05, 0.96),), '88-86': ((0.55, 2.5, 4.3, 0.83, 1.08),), '257-256': ((5, 2.5, 2.45, 0.75, 8), (5, 2.5, 3.33, 0.75, -8)), '381-380': ((2.04, 2.5, 3.1, 0.6, 2.1), (2.18, 2.5, 3.7, 0.79, -2.4)), '87-86': ((0.55, 6.75, 0.9, 1),), '88-87': ((0.59, 6.9, 1.05, 0.96),), '88-86': ((0.55, 6.3, 0.83, 1.08),), '257-256': ((5, 4.45, 0.75, 8), (4.5, 8.5, 0.79, -7)), '381-380': ((2.04, 5.1, 0.6, 2.1), (2.18, 5.7, 0.79, -2.4)), } min_ratio = { '87-86': 0.08, Loading @@ -51,31 +46,52 @@ def main(ratios_to_fit: Union[List[str], None] = None): '88-86': 1.07, '257-256': 7.5, '381-380': 2.9} density_limits = { '87-86': ((5e4, 2e7),), '88-87': ((1e5, 3e7),), '88-86': ((1e5, 3e7),), '257-256': ((2e3, 7.5e4), (7.5e4, 4e6)), '381-380': ((1e4, 1.9e5), (1.9e5, 1e7)) } best_fit_params = {} for ratio_string in _ratio_to_fit: _column_to_fit = f'ratio_{ratio_string}' data.sort_values(by=_column_to_fit, inplace=True) plt.clf() plt.scatter(data['avg_nh2'], data[_column_to_fit], marker='+', alpha=0.1) x_reg = np.linspace(min_ratio[ratio_string], max_ratio[ratio_string], 100) x_reg = np.linspace(min_ratio[ratio_string], max_ratio[ratio_string], 1000) kde_max = pickle.load( open(f'prs/output/run_type/{config["run_type"]}/kde_smoothed_{ratio_string}_max_locus.pickle', 'rb')) data_clean = data[[_column_to_fit, 'avg_nh2']].loc[data[_column_to_fit] <= max_ratio[ratio_string]].copy() kde = pickle.load( open(f'prs/output/run_type/{config["run_type"]}/trained_model/ratio_density_kde_{ratio_string}.pickle', 'rb')) rgi = RegularGridInterpolator((kde['y'] * bw[ratio_string], kde['x'] * 0.2), kde['values'].reshape(200, 200).T) density = rgi(data_clean[[_column_to_fit, 'avg_nh2']]) data_clean = data_clean.loc[(density > density.mean() - density.std())] plt.scatter(data_clean['avg_nh2'], data_clean[_column_to_fit], marker='+', alpha=0.1, color='red') plt.plot(kde_max[1], kde_max[0], color='red') plt.semilogy() plt.xlabel('log(<n(H2)>)') plt.ylabel('Ratio') for components in x0[ratio_string]: plt.plot(approx(x_reg, *components),x_reg , color='cyan') best_fit_params[ratio_string] = [] for guesses, limits in zip(x0[ratio_string], density_limits[ratio_string]): clean_mask = (~np.isnan(kde_max[0]) & ~np.isnan(kde_max[1]) & (kde_max[0] >= limits[0]) & (kde_max[0] <= limits[1])) best_fit, par_cov = curve_fit(approx, kde_max[1][clean_mask], np.log10(kde_max[0][clean_mask]), p0=guesses, bounds=((0.1, 2, 0.3, -9), (7, 10, 1.5, 9))) approx_density = 10 ** approx(x_reg, *best_fit) print(best_fit) best_fit_params[ratio_string].append({param: float(element) for element, param in zip(best_fit, ['a', 'b', 'nu', 's'])}) approx_density = np.where((approx_density >= limits[0]) & (approx_density <= limits[1]), approx_density, np.nan) plt.plot(x_reg, approx_density) plt.savefig(os.path.join('..', 'publications', '6373bb408e4040043398e495', 'referee', f'analytical_expressions_comparison_{ratio_string}.png')) with (open(os.path.join('prs', 'output', 'run_type', 'constant_abundance_p15_q05', 'best_fit_params.yml'), 'w') as outfile): yaml.dump(best_fit_params, outfile) if __name__ == '__main__': Loading
etl/prs/prs_density_inference.py +153 −11 Original line number Diff line number Diff line Loading @@ -11,13 +11,14 @@ from sklearn.model_selection import GridSearchCV from assets.commons import (load_config_file, validate_parameter, setup_logger, get_postprocessed_data) get_postprocessed_data, moving_average) from assets.constants import line_ratio_mapping from multiprocessing import Pool from typing import Tuple, Union, List from scipy.integrate import cumtrapz from scipy.stats import truncnorm from scipy.interpolate import RegularGridInterpolator from scipy.interpolate import RegularGridInterpolator, InterpolatedUnivariateSpline from functools import reduce Loading @@ -35,6 +36,7 @@ def train_kde_model(ratio: List[str], best_bandwidth: Union[None, dict], model_root_folder: Union[str, None] = None, ratio_limits: Union[None, dict] = None, plot_radex_data: Union[None, bool] = False, rt_adjustment_factor: Union[int, float] = 2) -> Tuple[str, KernelDensity]: """ Create the KDE from the modelled datapoints. Loading @@ -48,6 +50,7 @@ def train_kde_model(ratio: List[str], :param rt_adjustment_factor: a scaling factor to apply to the specified bandwidth for the computation of the KDE for the sparser RT-only data :param ratio_limits: fixed plotting limits :param plot_radex_data: whether to plot the radex data :return: the string representing the ratio modelled and the KDE itself """ _model_root_folder = validate_parameter( Loading @@ -62,11 +65,13 @@ def train_kde_model(ratio: List[str], grid = scaled_grid.copy() grid[0] = scaled_grid[0] * 0.2 grid[1] = scaled_grid[1] * best_bandwidth[ratio_string] radex_data = get_radex_data() if plot_radex_data else None plot_kde_ratio_nh2(grid=grid, values_on_grid=z.reshape(points_per_axis, points_per_axis), ratio_string=ratio_string, model_root_folder=_model_root_folder, training_data=training_data, additional_data=radex_data, suffix_outfile='_ml', ratio_limits=ratio_limits[ratio_string]) Loading @@ -83,6 +88,7 @@ def train_kde_model(ratio: List[str], ratio_string=ratio_string, model_root_folder=_model_root_folder, training_data=training_data[training_data['source'] == 'RT'], additional_data=radex_data, ratio_limits=ratio_limits[ratio_string]) with open( os.path.join(_model_root_folder, 'trained_model', f'ratio_density_kde_{ratio_string}.pickle'), 'wb' Loading Loading @@ -140,6 +146,7 @@ def plot_kde_ratio_nh2(grid: np.array, ratio_string: str, model_root_folder: str, training_data: pd.DataFrame, additional_data: pd.DataFrame = None, suffix_outfile: str = None, ratio_limits: Union[None, list] = None): """ Loading @@ -153,24 +160,152 @@ def plot_kde_ratio_nh2(grid: np.array, :param training_data: The DataFrame containing the training data. :param suffix_outfile: Optional. The suffix to append to the output file name. Defaults to an empty string if None. :param ratio_limits: Optional. The limits for the ratio axis. Defaults to None, which auto-scales the axis. :param additional_data: Optional. The DataFrame containing the additional data to plot. :return: None. Saves the plot as a PNG file in the specified folder. """ plt.rcParams.update({'font.size': 20}) _suffix_outfile = validate_parameter(suffix_outfile, default='') plt.clf() plt.figure(figsize=(8, 6)) plt.scatter(training_data['avg_nh2'], training_data[f'ratio_{ratio_string}'], marker='+', alpha=0.1, facecolor='grey') plt.contour(10 ** grid[0], grid[1], values_on_grid, levels=np.arange(0.05, 0.95, 0.15)) smoothed_max_locus_density, smoothed_max_locus_ratio = plot_kde_results(grid=grid, ratio_string=ratio_string, training_data=training_data, values_on_grid=values_on_grid) if (additional_data is not None) and (ratio_string in additional_data.columns): plt.scatter(10 ** additional_data['log_density'], additional_data[ratio_string], marker='*', alpha=1, facecolor='red', s=100) plt.semilogx() plt.xlabel(r'<$n$(H$_2$)> [cm$^{-3}$]') plt.ylabel(f'Ratio {line_ratio_mapping[ratio_string]}') plt.ylim(ratio_limits) plt.tight_layout() plt.legend(loc='best') plt.savefig(os.path.join( model_root_folder, 'figures', f'ratio_vs_avg_density_los_kde_{ratio_string}{_suffix_outfile}.png')) with open(os.path.join(model_root_folder, f'kde_smoothed_{ratio_string}_max_locus.pickle'), 'wb') as outfile: pickle.dump([smoothed_max_locus_density, smoothed_max_locus_ratio], outfile) def plot_kde_results(grid: np.array, ratio_string: str, training_data: pd.DataFrame, values_on_grid: np.array, plot_colours: Union[None, Tuple[str, str]] = None, plot_training_data: bool = True, colour_scatter: Union[None, str] = None) -> Tuple[np.array, np.array]: _plot_colours = validate_parameter(plot_colours, default=('gold', 'cornflowerblue')) _colour_scatter = validate_parameter(colour_scatter, default='grey') if plot_training_data: plt.scatter(training_data['avg_nh2'], training_data[f'ratio_{ratio_string}'], marker='+', alpha=0.1, facecolor=_colour_scatter) values_on_grid_norm = values_on_grid.copy() / np.max(values_on_grid, axis=1, keepdims=True) density_plot_values = 10 ** grid[0][:, 0] for hpd_threshold, colour, label in zip((0.9, 0.5), _plot_colours, ('90% HPD', '50% HPD')): densities, limits_hpd = get_hpd_relation(grid, values_on_grid_norm, hpd_threshold=hpd_threshold) density_plot_values = 10 ** moving_average(densities, 20) ratio_lower_limits = moving_average(limits_hpd[:, 0], 20) ratio_upper_limits = moving_average(limits_hpd[:, 1], 20) plt.fill_between(density_plot_values, ratio_lower_limits, ratio_upper_limits, alpha=0.3, facecolor=colour, label=label) plt.plot(density_plot_values, ratio_lower_limits, color=colour) plt.plot(density_plot_values, ratio_upper_limits, color=colour) maxdens, maxratio = get_max_locus(grid, values_on_grid) smoothed_max_locus_density = 10 ** moving_average(maxdens, 20) smoothed_max_locus_ratio = moving_average(maxratio, 20) mask_condition = (smoothed_max_locus_density >= density_plot_values[0]) & \ (smoothed_max_locus_density <= density_plot_values[-1]) smoothed_max_locus_density = np.where(mask_condition, smoothed_max_locus_density, np.nan) smoothed_max_locus_ratio = np.where(mask_condition, smoothed_max_locus_ratio, np.nan) plt.plot(smoothed_max_locus_density, smoothed_max_locus_ratio, label='Max PDF', color=_plot_colours[1]) return smoothed_max_locus_density, smoothed_max_locus_ratio def get_max_locus(grid, values_on_grid): """ Compute the maximum of the KDE for each density value. Parameters ---------- grid : tuple The grid of values used for the KDE. values_on_grid : array_like The computed KDE values on the grid. Returns ------- max_density : array_like The maximum density values. max_ratio : array_like The corresponding maximum ratio values. """ # Find the maximum ratio value for each density value max_ratio = grid[1][0, np.argmax(values_on_grid, axis=1)] # Replace missing solutions with NaN max_ratio = np.where(max_ratio > 0, max_ratio, np.nan) max_density = grid[0][:, 0] # Apply mask from missing ratio values max_density = np.where(max_ratio > 0, max_density, np.nan) return max_density, max_ratio def get_hpd_relation(grid: Union[tuple, list], normalized_kde_values: np.array, hpd_threshold: float) -> tuple: """ Calculate the highest posterior density (HPD) interval for a given KDE probability density. Parameters ---------- grid : tuple A tuple containing two arrays: the density grid and the ratio grid. normalized_kde_values : np.array The normalized KDE values on the grid. Normalization is performed by dividing by the maximum value for each density. hpd_threshold : float The probability mass threshold for the HPD interval. Returns ------- densities : np.array Array of density values corresponding to the HPD limits. limits_hpd : np.array Array of tuples, each containing the lower and upper bounds of the HPD interval. """ hpd_limits = [] density_values = [] for index, density in enumerate(grid[0][:, 0]): spline = InterpolatedUnivariateSpline(grid[1][0, :], normalized_kde_values[index, :] - (1 - hpd_threshold)) roots = spline.roots() if len(roots) > 1: hpd_limits.append((roots[0], roots[-1])) density_values.append(density) elif len(roots) == 1: if (normalized_kde_values[index, :] - (1 - hpd_threshold))[0] > 0: hpd_limits.append((np.min(grid[1][0, :]), roots[0])) else: hpd_limits.append((roots[0], np.max(grid[1][0, :]))) density_values.append(density) return np.array(density_values), np.array(hpd_limits) def get_radex_data(): df_radex = pd.read_csv(os.path.join('assets', 'data', 'radex_data.csv')) df_radex['87-86'] = df_radex['line2'] / df_radex['line1'] df_radex['88-87'] = df_radex['line3'] / df_radex['line2'] df_radex['88-86'] = df_radex['line3'] / df_radex['line1'] df_radex['257-256'] = df_radex['line2'] / df_radex['line1'] df_radex['381-380'] = df_radex['line2'] / df_radex['line1'] df_radex.loc[df_radex['rot_trans'] == '2-1', '257-256'] = pd.NA df_radex.loc[df_radex['rot_trans'] == '2-1', '381-380'] = pd.NA df_radex.loc[df_radex['rot_trans'] == '5-4', '87-86'] = pd.NA df_radex.loc[df_radex['rot_trans'] == '5-4', '88-86'] = pd.NA df_radex.loc[df_radex['rot_trans'] == '5-4', '381-380'] = pd.NA df_radex.loc[df_radex['rot_trans'] == '7-6', '87-86'] = pd.NA df_radex.loc[df_radex['rot_trans'] == '7-6', '88-86'] = pd.NA df_radex.loc[df_radex['rot_trans'] == '7-6', '257-256'] = pd.NA return df_radex def get_results(x_array: np.array, Loading Loading @@ -323,7 +458,8 @@ def recompute_and_save_kdes(data: pd.DataFrame, pickle_models_dict_filename: Union[None, str] = None, model_root_folder: Union[None, str] = None, best_bandwidths: Union[None, dict] = None, ratio_limits: Union[None, dict] = None): ratio_limits: Union[None, dict] = None, plot_radex_data: Union[None, bool] = False) -> Tuple[dict, KernelDensity]: """ Retrieve the dictionary of the KDE models, either by computing it or unpickling it from previous runs :param points_per_axis: number of points for the KDE grid evaluation Loading @@ -335,6 +471,8 @@ def recompute_and_save_kdes(data: pd.DataFrame, defaults to fiducial model (constant_abundance_p15_q05) :param best_bandwidths: kernel bandwidths to use for each ratio :param ratio_limits: fixed plotting limits :param plot_radex_data: whether to plot the radex data :return: the dictionary of the KDE models and the kernel density model """ _model_root_folder = validate_parameter( model_root_folder, Loading @@ -352,7 +490,7 @@ def recompute_and_save_kdes(data: pd.DataFrame, } _best_bandwidths = validate_parameter(best_bandwidths, default=default_bandwidths) parallel_args = product(line_pairs, [data], [points_per_axis], [_best_bandwidths], [_model_root_folder], [ratio_limits]) [_best_bandwidths], [_model_root_folder], [ratio_limits], [plot_radex_data]) with Pool(nthreads) as pool: results = pool.starmap(train_kde_model, parallel_args) Loading Loading @@ -474,7 +612,8 @@ def main(measured_integrated_intensity_dict: dict, limit_rows: Union[None, int] = None, use_model_for_inference: Union[None, str] = None, best_bandwidths: Union[None, dict] = None, ratio_limits: Union[None, dict] = None): ratio_limits: Union[None, dict] = None, plot_radex_data: Union[None, bool] = True) -> None: _use_model_for_inference = validate_parameter( use_model_for_inference, default='constant_abundance_p15_q05' Loading @@ -491,7 +630,8 @@ def main(measured_integrated_intensity_dict: dict, points_per_axis=points_per_axis, model_root_folder=_model_root_folder, best_bandwidths=best_bandwidths, ratio_limits=ratio_limits) ratio_limits=ratio_limits, plot_radex_data=plot_radex_data) x_grid = np.linspace(np.log10(0.7 * np.nanmin(data['avg_nh2'])), np.log10(1.3 * np.nanmax(data['avg_nh2'])), points_per_axis) Loading @@ -508,7 +648,8 @@ def main(measured_integrated_intensity_dict: dict, line_ids = validate_line_ids(line_pairs, ratio_string) if np.isnan(measured_integrated_intensity_dict[line_ids[0]][source_name]) or \ np.isnan(measured_integrated_intensity_dict[line_ids[1]][source_name]): logger.warning(f'The ratio {ratio_string} is not available for source {source_name}. Proceeding with the remaining ratios...') logger.warning( f'The ratio {ratio_string} is not available for source {source_name}. Proceeding with the remaining ratios...') else: measured_integrated_intensity_coupled[ratio_string] = [ measured_integrated_intensity_dict[line_ids[0]][source_name], Loading Loading @@ -650,4 +791,5 @@ if __name__ == '__main__': limit_rows=limit_rows, use_model_for_inference=use_model_for_inference, best_bandwidths=external_input['best_bandwidths'], ratio_limits=external_input['ratio_limits']) ratio_limits=external_input['ratio_limits'], plot_radex_data=True)
etl/prs/prs_make_comparison_figures.py +84 −66 File changed.Preview size limit exceeded, changes collapsed. Show changes