Source code for beetroots.simulations.astro.observation.abstract_real_data

import warnings
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd

from beetroots.inversion.plots.map_shaper import MapShaper
from beetroots.simulations.astro.observation.abstract_observation import (
    SimulationObservation,
)



[docs]
class SimulationRealData(SimulationObservation):
    r"""abstract class that reads the observation data for real observations"""


[docs]
    def setup_observation(
        self,
        data_int_path: str,
        data_err_path: str,
        save_obs: bool = True,
    ) -> Tuple[
        pd.DataFrame,
        np.ndarray,
        np.ndarray,
        np.ndarray,
        np.ndarray,
        np.ndarray,
        np.ndarray,
    ]:
        r"""reads the observation data for real observations

        The two input tables are loaded, checked for consistency, sorted by
        their ``(X, Y)`` index and split into the lines used for inference
        (``self.list_lines_fit``) and the remaining lines, kept for validation.
        As side effects, the method sets ``self.list_lines_valid``, ``self.N``,
        ``self.Theta_true_scaled`` and ``self.map_shaper``.

        Parameters
        ----------
        data_int_path : str
            path to the ``.pkl`` or ``.csv`` file that contains the observation maps. The table must be indexed by ``(X, Y)``; for a ``.csv`` file, the index is rebuilt from the ``X`` and ``Y`` columns.
        data_err_path : str
            path to the ``.pkl`` or ``.csv`` file that contains the maps of additive noise standard deviation, with the same index and columns as the observation maps
        save_obs : bool, optional
            whether to write the observations, standard deviations and censor thresholds used for inference as pickle files in ``self.path_data_csv_in``, by default True

        Returns
        -------
        df_int_fit : pd.DataFrame
            DataFrame containing the observations used for inference
        y_fit : np.ndarray of shape (N, L)
            observations to be used for inference
        sigma_a_fit : np.ndarray of shape (N, L)
            additive noise standard deviations associated with the observations used for inference
        omega_fit : np.ndarray of shape (N, L)
            censor threshold associated with the observations used for inference
        y_valid : np.ndarray of shape (N, L_valid)
            observations that are not to be used for inference
        sigma_a_valid : np.ndarray of shape (N, L_valid)
            additive noise standard deviations associated with the observations not used for inference
        omega_valid : np.ndarray of shape (N, L_valid)
            censor threshold associated with the observations not used for inference

        Raises
        ------
        AssertionError
            if a path has an unsupported extension, if a table is not indexed by ``(X, Y)``, if the two tables do not share the same shape and columns, or if a line of ``self.list_lines_fit`` is missing from the tables
        """

        def _read_table(path: str, bad_extension_msg: str) -> pd.DataFrame:
            # loads one input table from a pickle or a csv file
            assert path.endswith((".pkl", ".csv")), bad_extension_msg
            if path.endswith(".pkl"):
                # NOTE: unpickling executes arbitrary code, only load trusted files
                return pd.read_pickle(path)

            table = pd.read_csv(path)
            # ``read_csv`` yields a default integer index: rebuild the (X, Y)
            # index from the columns so that csv inputs pass the checks below
            if "X" in table.columns and "Y" in table.columns:
                table = table.set_index(["X", "Y"])
            return table

        def _cap_absurd_values(table: pd.DataFrame, columns: List[str]) -> None:
            # replaces in place every entry that is not ``<= 1e3`` with inf.
            # A NaN entry fails the comparison, and is thus also set to inf.
            for col in columns:
                table[col] = table[col].where(table[col] <= 1e3, np.inf)

        # * read observations
        # intensities
        df_int = _read_table(
            data_int_path,
            f"The intensity file must be either a csv or a pickle (extension pkl) file. Current: {data_int_path}",
        )

        # standard deviations of additive noise
        df_err = _read_table(
            data_err_path,
            f"The standard deviation file must be either a csv or a pickle (extension `.pkl`) file. Current: {data_err_path}",
        )

        if data_int_path.endswith(".csv") or data_err_path.endswith(".csv"):
            warnings.warn(
                "For faster loading, consider using the `.pkl` format instead `.csv`."
            )

        assert list(df_int.index.names) == [
            "X",
            "Y",
        ], f"The intensity table must be indexed by (X, Y). Current: {list(df_int.index.names)}"
        assert list(df_err.index.names) == [
            "X",
            "Y",
        ], f"The standard deviation table must be indexed by (X, Y). Current: {list(df_err.index.names)}"
        assert len(df_int) == len(
            df_err
        ), "The intensity and standard deviation tables must have the same number of rows."

        df_int = df_int.sort_index()
        df_err = df_err.sort_index()

        # the pixel counter is (re)generated for both tables as soon as the
        # standard deviation table does not provide it
        if "idx" not in list(df_err.columns):
            df_int["idx"] = np.arange(len(df_int))
            df_err["idx"] = np.arange(len(df_err))

        assert (
            df_int.shape == df_err.shape
        ), f"Shape mismatch between intensities {df_int.shape} and standard deviations {df_err.shape}."
        assert list(df_int.columns) == list(
            df_err.columns
        ), "The intensity and standard deviation tables must have the same columns, in the same order."

        for line in self.list_lines_fit:
            assert line in list(
                df_int.columns
            ), f"Line {line} is to be used for inference but is missing from the observation tables."

        self.list_lines_valid = list(
            set(list(df_int.columns)) - set(self.list_lines_fit)
        )
        r"""List[str]: list of observed lines not used for inference, that can be used for validation"""
        self.list_lines_valid.remove("idx")
        self.list_lines_valid.sort()

        # explicit copies: these tables are modified below, and must not be
        # treated by pandas as slices of ``df_int`` / ``df_err``.
        # The "idx" column is kept last, which the censor step relies on.
        df_int_fit = df_int.loc[:, self.list_lines_fit + ["idx"]].copy()
        df_err_fit = df_err.loc[:, self.list_lines_fit + ["idx"]].copy()

        df_int_valid = df_int.loc[:, self.list_lines_valid + ["idx"]].copy()
        df_err_valid = df_err.loc[:, self.list_lines_valid + ["idx"]].copy()

        # eliminate absurd values
        _cap_absurd_values(df_int_fit, self.list_lines_fit)
        _cap_absurd_values(df_err_fit, self.list_lines_fit)
        _cap_absurd_values(df_int_valid, self.list_lines_valid)
        _cap_absurd_values(df_err_valid, self.list_lines_valid)

        self.N = len(df_int_fit)
        r"""int: number of pixels / components in the observation"""

        # * correct values (censoring / errors / etc.)
        # every column but the last one ("idx") is overwritten with a
        # negligible threshold, i.e., no observation is censored
        df_censor_fit = df_err_fit.copy()
        df_censor_fit.iloc[:, :-1] = 1e-60  # no censorship

        df_censor_valid = df_err_valid.copy()
        df_censor_valid.iloc[:, :-1] = 1e-60  # no censorship

        # ``np.nan_to_num`` also maps the inf entries introduced above to the
        # largest finite float.
        # NOTE(review): since NaN entries were already set to inf by the
        # capping step, the ``nan=`` fills below look unreachable for line
        # columns -- confirm whether NaN observations should rather end up
        # with y = 1e-15 and sigma_a = 1e3.
        y_fit = np.nan_to_num(df_int_fit.drop(columns="idx").values, nan=1e-15)
        sigma_a_fit = np.nan_to_num(df_err_fit.drop(columns="idx").values, nan=1e3)
        omega_fit = df_censor_fit.drop(columns="idx").values

        y_valid = np.nan_to_num(df_int_valid.drop(columns="idx").values, nan=1e-15)
        sigma_a_valid = np.nan_to_num(
            df_err_valid.drop(columns="idx").values,
            nan=1e3,
        )
        omega_valid = df_censor_valid.drop(columns="idx").values

        self.Theta_true_scaled = None
        r"""Optional[np.ndarray]: true values of the physical parameters. Always ``None`` for real applications."""
        self.map_shaper = MapShaper(df_int)
        r"""MapShaper: defines the transformation from vectors to 2D maps"""

        # * save observation
        if save_obs:
            df_int_fit.to_pickle(
                f"{self.path_data_csv_in}/observation_maps.pkl",
            )
            df_err_fit.to_pickle(f"{self.path_data_csv_in}/additive_std.pkl")
            df_censor_fit.to_pickle(
                f"{self.path_data_csv_in}/censor_threshold.pkl",
            )

        return (
            df_int_fit,
            y_fit,
            sigma_a_fit,
            omega_fit,
            y_valid,
            sigma_a_valid,
            omega_valid,
        )