Source code for beetroots.inversion.results.results_optim_map
import copy
import os
from typing import List, Optional, Tuple, Union
import numpy as np
import pandas as pd
from beetroots.inversion.plots.plots_estimator import PlotsEstimator
from beetroots.inversion.results.abstract_results import ResultsExtractor
from beetroots.inversion.results.utils.bayes_pval_plots import ResultsBayesPvalues
from beetroots.inversion.results.utils.clppd import ResultsCLPPD
from beetroots.inversion.results.utils.kernel import ResultsKernels
from beetroots.inversion.results.utils.lowest_obj_estimator import (
ResultsLowestObjective,
)
from beetroots.inversion.results.utils.objective import ResultsObjective
from beetroots.modelling.posterior import Posterior
from beetroots.space_transform.abstract_transform import Scaler
[docs]
class ResultsExtractorOptimMAP(ResultsExtractor):
r"""extractor of inference results for the data of the optimization runs that that was saved."""
__slots__ = (
"path_data_csv_out_optim_map",
"path_img",
"path_raw",
"N_MCMC",
"T_MC",
"T_BI",
"freq_save",
"max_workers",
)
def __init__(
self,
path_data_csv_out_optim_map: str,
path_img: str,
path_raw: str,
N_MCMC: int,
T_MC: int,
T_BI: int,
freq_save: int,
max_workers: int,
):
r"""
Parameters
----------
path_data_csv_out_optim_map : str
path to the csv file in which the performance of estimators is to be saved
path_img : str
path to the folder in which images are to be saved
path_raw : str
path to the raw ``.hdf5`` files
N_MCMC : int
number of optimization procedures to run per posterior distribution
T_MC : int
total size of each optimization procedure
T_BI : int
duration of the `Burn-in` phase
freq_save : int
frequency of saved iterates, 1 means that all iterates were saved (used to show correct optimization procedure sizes in chain plots)
max_workers : int
maximum number of workers that can be used for results extraction
"""
self.path_data_csv_out_optim_map = path_data_csv_out_optim_map
r"""str: path to the csv file in which the performance of estimators is to be saved"""
self.path_img = path_img
r"""str: path to the folder in which images are to be saved"""
self.path_raw = path_raw
r"""str: path to the raw ``.hdf5`` files"""
self.N_MCMC = N_MCMC
r"""int: number of optimization procedures to run per posterior distribution"""
self.T_MC = T_MC
r"""int: total size of each optimization procedure"""
self.T_BI = T_BI
r"""int: duration of the `Burn-in` phase"""
self.freq_save = freq_save
r"""int: frequency of saved iterates, 1 means that all iterates were saved (used to show correct optimization procedure sizes in chain plots)"""
self.max_workers = max_workers
r"""int: maximum number of workers that can be used for results extraction"""
[docs]
@classmethod
def read_estimator(
cls,
path_data_csv_out_optim_map: str,
model_name: str,
) -> Tuple[np.ndarray, pd.DataFrame]:
r"""reads the value of an already estimated MAP from a csv file.
Parameters
----------
path_data_csv_out_optim_map : str
path to the csv file containing an already estimated MAP
model_name : str
name of the model, used to identify the posterior distribution
Returns
-------
np.ndarray
MAP estimator
pd.DataFrame
original DataFrame read from the csv file
"""
path_file = f"{path_data_csv_out_optim_map}/"
path_file += f"estimation_Theta_{model_name}_MAP_optim_map.csv"
assert os.path.isfile(path_file), f"no MAP at {path_file}"
df_results_map = pd.read_csv(path_file)
df_results_map = df_results_map.sort_values(["n", "d"])
N = df_results_map["n"].max() + 1
D = df_results_map["d"].max() + 1
Theta_MAP = np.zeros((N, D))
for d in range(D):
Theta_MAP[:, d] = df_results_map.loc[
df_results_map["d"] == d, "Theta_MAP_optim_map"
].values
return Theta_MAP, df_results_map
[docs]
def main(
self,
posterior: Posterior,
model_name: str,
scaler: Scaler,
#
list_idx_sampling: List[int],
list_fixed_values: Union[List[Optional[float]], np.ndarray],
#
estimator_plot: Optional[PlotsEstimator],
Theta_true_scaled: Optional[np.ndarray] = None,
):
r"""performs the data extraction, in this order:
* step 1 : clppd
* step 2 : kernel analysis
* step 3 : objective evolution
* step 4 : MAP estimator from samples
* step 5 : model checking with Bayesian p-value
Parameters
----------
posterior : Posterior
probability distribution. The goal of the optimization procedure was to find its mode, i.e., the minimum of its negative log pdf.
model_name : str
name of the model, used to identify the posterior distribution
scaler : Scaler
contains the transformation of the Theta values from their natural space to their scaled space (in which the sampling happens) and its inverse
list_idx_sampling : List[int]
indices of the physical parameters that were sampled (the other ones were fixed)
list_fixed_values : List[float]
list of used values for the parameters fixed during the sampling
estimator_plot : PlotsEstimator
object used to plot the estimator figures
Theta_true_scaled : Optional[np.ndarray], optional
true value for the inferred physical parameter :math:`\Theta` (only possible for toy cases), by default None
"""
list_mcmc_folders = [
f"{x[0]}/mc_chains.hdf5"
for x in os.walk(f"{self.path_raw}/{model_name}")
if "optim_MAP_" in x[0]
]
list_mcmc_folders.sort()
N = posterior.N * 1
D_sampling = posterior.D * 1
D = len(list_fixed_values)
L = posterior.L
list_fixed_values_scaled = list(
posterior.likelihood.forward_map.arr_fixed_values
)
list_fixed_values_scaled = [v if v != 0 else None for v in list_fixed_values]
chain_type = "optim_map"
if estimator_plot is not None:
map_shaper = copy.copy(estimator_plot.map_shaper)
else:
map_shaper = None
# clppd
ResultsCLPPD(
model_name=model_name,
chain_type=chain_type,
path_img=self.path_img,
path_data_csv_out=self.path_data_csv_out_optim_map,
N_MCMC=self.N_MCMC,
N=N,
L=L,
).main(list_mcmc_folders, map_shaper)
# kernel analysis
ResultsKernels(
model_name,
chain_type,
self.path_img * 1,
self.N_MCMC * 1,
self.T_MC * 1,
self.freq_save * 1,
).main(list_mcmc_folders)
# objective evolution
if Theta_true_scaled is not None:
forward_map_evals = posterior.likelihood.evaluate_all_forward_map(
Theta_true_scaled,
compute_derivatives=False,
compute_derivatives_2nd_order=False,
)
nll_utils = posterior.likelihood.evaluate_all_nll_utils(
forward_map_evals,
compute_derivatives=False,
compute_derivatives_2nd_order=False,
)
objective_true = posterior.neglog_pdf(
Theta_true_scaled,
idx_pix=np.arange(N),
forward_map_evals=forward_map_evals,
nll_utils=nll_utils,
chromatic_gibbs=False,
)
assert isinstance(objective_true, float)
else:
objective_true = None
idx_lowest_obj, lowest_obj = ResultsObjective(
model_name,
chain_type,
self.path_img * 1,
self.N_MCMC * 1,
self.T_MC * 1,
self.T_BI * 1,
self.freq_save * 1,
N=N,
D=D,
L=L,
).main(list_mcmc_folders, objective_true)
# MAP estimator from samples
if Theta_true_scaled is not None:
Theta_true_scaled_full = np.zeros((N, D))
Theta_true_scaled_full[:, list_idx_sampling] += Theta_true_scaled
for d in range(D):
if list_fixed_values_scaled[d] is not None:
val_d = copy.copy(list_fixed_values_scaled[d])
assert isinstance(val_d, float)
Theta_true_scaled_full[:, d] += val_d
else:
Theta_true_scaled_full = None
if isinstance(list_fixed_values, list):
list_fixed_values_arr = np.array(list_fixed_values)
else:
list_fixed_values_arr = list_fixed_values * 1
ResultsLowestObjective(
model_name,
chain_type,
self.path_img,
self.path_data_csv_out_optim_map,
self.N_MCMC,
self.T_MC,
self.freq_save,
).main(
list_mcmc_folders,
lowest_obj,
idx_lowest_obj,
scaler,
Theta_true_scaled_full, # (N, D_sampling)
list_idx_sampling, # (D_sampling,)
list_fixed_values_arr, # (D,)
estimator_plot,
)
ResultsBayesPvalues(
model_name=model_name,
chain_type=chain_type,
path_img=self.path_img,
path_data_csv_out=self.path_data_csv_out_optim_map,
N_MCMC=self.N_MCMC,
N=N,
D_sampling=D_sampling,
plot_ESS=False,
).main(
list_idx_sampling=list_idx_sampling,
map_shaper=map_shaper if N > 1 else None,
)