Source code for beetroots.inversion.results.results_optim_map

import copy
import os
from typing import List, Optional, Tuple, Union

import numpy as np
import pandas as pd

from beetroots.inversion.plots.plots_estimator import PlotsEstimator
from beetroots.inversion.results.abstract_results import ResultsExtractor
from beetroots.inversion.results.utils.bayes_pval_plots import ResultsBayesPvalues
from beetroots.inversion.results.utils.clppd import ResultsCLPPD
from beetroots.inversion.results.utils.kernel import ResultsKernels
from beetroots.inversion.results.utils.lowest_obj_estimator import (
    ResultsLowestObjective,
)
from beetroots.inversion.results.utils.objective import ResultsObjective
from beetroots.modelling.posterior import Posterior
from beetroots.space_transform.abstract_transform import Scaler



[docs]
class ResultsExtractorOptimMAP(ResultsExtractor):
    r"""extractor of inference results for the data of the optimization runs that that was saved."""

    __slots__ = (
        "path_data_csv_out_optim_map",
        "path_img",
        "path_raw",
        "N_MCMC",
        "T_MC",
        "T_BI",
        "freq_save",
        "max_workers",
    )

    def __init__(
        self,
        path_data_csv_out_optim_map: str,
        path_img: str,
        path_raw: str,
        N_MCMC: int,
        T_MC: int,
        T_BI: int,
        freq_save: int,
        max_workers: int,
    ):
        r"""

        Parameters
        ----------
        path_data_csv_out_optim_map : str
            path to the csv file in which the performance of estimators is to be saved
        path_img : str
            path to the folder in which images are to be saved
        path_raw : str
            path to the raw ``.hdf5`` files
        N_MCMC : int
            number of optimization procedures to run per posterior distribution
        T_MC : int
            total size of each optimization procedure
        T_BI : int
            duration of the `Burn-in` phase
        freq_save : int
            frequency of saved iterates, 1 means that all iterates were saved (used to show correct optimization procedure sizes in chain plots)
        max_workers : int
            maximum number of workers that can be used for results extraction
        """
        self.path_data_csv_out_optim_map = path_data_csv_out_optim_map
        r"""str: path to the csv file in which the performance of estimators is to be saved"""
        self.path_img = path_img
        r"""str: path to the folder in which images are to be saved"""
        self.path_raw = path_raw
        r"""str: path to the raw ``.hdf5`` files"""

        self.N_MCMC = N_MCMC
        r"""int: number of optimization procedures to run per posterior distribution"""
        self.T_MC = T_MC
        r"""int: total size of each optimization procedure"""
        self.T_BI = T_BI
        r"""int: duration of the `Burn-in` phase"""
        self.freq_save = freq_save
        r"""int: frequency of saved iterates, 1 means that all iterates were saved (used to show correct optimization procedure sizes in chain plots)"""

        self.max_workers = max_workers
        r"""int: maximum number of workers that can be used for results extraction"""


[docs]
    @classmethod
    def read_estimator(
        cls,
        path_data_csv_out_optim_map: str,
        model_name: str,
    ) -> Tuple[np.ndarray, pd.DataFrame]:
        r"""reads the value of an already estimated MAP from a csv file.

        Parameters
        ----------
        path_data_csv_out_optim_map : str
            path to the csv file containing an already estimated MAP
        model_name : str
            name of the model, used to identify the posterior distribution

        Returns
        -------
        np.ndarray
            MAP estimator
        pd.DataFrame
            original DataFrame read from the csv file
        """
        path_file = f"{path_data_csv_out_optim_map}/"
        path_file += f"estimation_Theta_{model_name}_MAP_optim_map.csv"
        assert os.path.isfile(path_file), f"no MAP at {path_file}"

        df_results_map = pd.read_csv(path_file)
        df_results_map = df_results_map.sort_values(["n", "d"])

        N = df_results_map["n"].max() + 1
        D = df_results_map["d"].max() + 1

        Theta_MAP = np.zeros((N, D))
        for d in range(D):
            Theta_MAP[:, d] = df_results_map.loc[
                df_results_map["d"] == d, "Theta_MAP_optim_map"
            ].values

        return Theta_MAP, df_results_map



[docs]
    def main(
        self,
        posterior: Posterior,
        model_name: str,
        scaler: Scaler,
        #
        list_idx_sampling: List[int],
        list_fixed_values: Union[List[Optional[float]], np.ndarray],
        #
        estimator_plot: Optional[PlotsEstimator],
        Theta_true_scaled: Optional[np.ndarray] = None,
    ):
        r"""performs the data extraction, in this order:

        * step 1 : clppd
        * step 2 : kernel analysis
        * step 3 : objective evolution
        * step 4 : MAP estimator from samples
        * step 5 : model checking with Bayesian p-value

        Parameters
        ----------
        posterior : Posterior
            probability distribution. The goal of the optimization procedure was to find its mode, i.e., the minimum of its negative log pdf.
        model_name : str
            name of the model, used to identify the posterior distribution
        scaler : Scaler
            contains the transformation of the Theta values from their natural space to their scaled space (in which the sampling happens) and its inverse
        list_idx_sampling : List[int]
            indices of the physical parameters that were sampled (the other ones were fixed)
        list_fixed_values : List[float]
            list of used values for the parameters fixed during the sampling
        estimator_plot : PlotsEstimator
            object used to plot the estimator figures
        Theta_true_scaled : Optional[np.ndarray], optional
            true value for the inferred physical parameter :math:`\Theta` (only possible for toy cases), by default None
        """
        list_mcmc_folders = [
            f"{x[0]}/mc_chains.hdf5"
            for x in os.walk(f"{self.path_raw}/{model_name}")
            if "optim_MAP_" in x[0]
        ]
        list_mcmc_folders.sort()

        N = posterior.N * 1
        D_sampling = posterior.D * 1
        D = len(list_fixed_values)
        L = posterior.L

        list_fixed_values_scaled = list(
            posterior.likelihood.forward_map.arr_fixed_values
        )
        list_fixed_values_scaled = [v if v != 0 else None for v in list_fixed_values]

        chain_type = "optim_map"

        if estimator_plot is not None:
            map_shaper = copy.copy(estimator_plot.map_shaper)
        else:
            map_shaper = None

        # clppd
        ResultsCLPPD(
            model_name=model_name,
            chain_type=chain_type,
            path_img=self.path_img,
            path_data_csv_out=self.path_data_csv_out_optim_map,
            N_MCMC=self.N_MCMC,
            N=N,
            L=L,
        ).main(list_mcmc_folders, map_shaper)

        # kernel analysis
        ResultsKernels(
            model_name,
            chain_type,
            self.path_img * 1,
            self.N_MCMC * 1,
            self.T_MC * 1,
            self.freq_save * 1,
        ).main(list_mcmc_folders)

        # objective evolution
        if Theta_true_scaled is not None:
            forward_map_evals = posterior.likelihood.evaluate_all_forward_map(
                Theta_true_scaled,
                compute_derivatives=False,
                compute_derivatives_2nd_order=False,
            )
            nll_utils = posterior.likelihood.evaluate_all_nll_utils(
                forward_map_evals,
                compute_derivatives=False,
                compute_derivatives_2nd_order=False,
            )
            objective_true = posterior.neglog_pdf(
                Theta_true_scaled,
                idx_pix=np.arange(N),
                forward_map_evals=forward_map_evals,
                nll_utils=nll_utils,
                chromatic_gibbs=False,
            )
            assert isinstance(objective_true, float)
        else:
            objective_true = None

        idx_lowest_obj, lowest_obj = ResultsObjective(
            model_name,
            chain_type,
            self.path_img * 1,
            self.N_MCMC * 1,
            self.T_MC * 1,
            self.T_BI * 1,
            self.freq_save * 1,
            N=N,
            D=D,
            L=L,
        ).main(list_mcmc_folders, objective_true)

        # MAP estimator from samples
        if Theta_true_scaled is not None:
            Theta_true_scaled_full = np.zeros((N, D))
            Theta_true_scaled_full[:, list_idx_sampling] += Theta_true_scaled

            for d in range(D):
                if list_fixed_values_scaled[d] is not None:
                    val_d = copy.copy(list_fixed_values_scaled[d])
                    assert isinstance(val_d, float)
                    Theta_true_scaled_full[:, d] += val_d
        else:
            Theta_true_scaled_full = None

        if isinstance(list_fixed_values, list):
            list_fixed_values_arr = np.array(list_fixed_values)
        else:
            list_fixed_values_arr = list_fixed_values * 1

        ResultsLowestObjective(
            model_name,
            chain_type,
            self.path_img,
            self.path_data_csv_out_optim_map,
            self.N_MCMC,
            self.T_MC,
            self.freq_save,
        ).main(
            list_mcmc_folders,
            lowest_obj,
            idx_lowest_obj,
            scaler,
            Theta_true_scaled_full,  # (N, D_sampling)
            list_idx_sampling,  # (D_sampling,)
            list_fixed_values_arr,  # (D,)
            estimator_plot,
        )

        ResultsBayesPvalues(
            model_name=model_name,
            chain_type=chain_type,
            path_img=self.path_img,
            path_data_csv_out=self.path_data_csv_out_optim_map,
            N_MCMC=self.N_MCMC,
            N=N,
            D_sampling=D_sampling,
            plot_ESS=False,
        ).main(
            list_idx_sampling=list_idx_sampling,
            map_shaper=map_shaper if N > 1 else None,
        )