diff --git a/config/validation_2018_try2.yaml b/config/validation_2018_try2.yaml index 17850bf..b93e926 100644 --- a/config/validation_2018_try2.yaml +++ b/config/validation_2018_try2.yaml @@ -77,6 +77,25 @@ analysis_base: &model_base names: ['$u_{Lres}$', '$u_{kappa}$'] min: [0.0, 0.0] max: [1.0, 1.0] + physical_posterior_transform: + unit_names: ['u1', 'u2'] + unit_labels: ['$u_{Lres}$', '$u_{kappa}$'] + components: + - name: 'Lres' + axis_scale: 'log' + steps: + - kind: 'multiply' + value: 1.5707963267948966 + - kind: 'tan' + - kind: 'power' + value: 3.0 + - name: 'kappa_sc' + axis_scale: 'linear' + steps: + - kind: 'multiply' + value: 0.3 + - kind: 'add' + value: 0.3 alpha: 0.2 validation_indices: [38, 39] # empty doesnt work, so I put the out of range @@ -143,4 +162,4 @@ analyses: closure: <<: *default_closure_parameters <<: *model_base - plot_panel_shapes: [[3, 3], [3, 3], [3, 3], [3, 3], [3, 3], [3, 3]] # Maybe fix? \ No newline at end of file + plot_panel_shapes: [[3, 3], [3, 3], [3, 3], [3, 3], [3, 3], [3, 3]] # Maybe fix? diff --git a/src/bayesian/data_IO.py b/src/bayesian/data_IO.py index 808917a..1e836ab 100644 --- a/src/bayesian/data_IO.py +++ b/src/bayesian/data_IO.py @@ -164,6 +164,16 @@ def _recursive_defaultdict(): return defaultdict(_recursive_defaultdict) +def _should_skip_table_filename(filename: str) -> bool: + """Return True for filesystem sidecars that should never be parsed as tables.""" + return filename.endswith(":Zone.Identifier") + + +def _is_table_file(filename: str, prefix: str) -> bool: + """Return True when a filename matches the expected table naming convention.""" + return filename.startswith(prefix) and filename.endswith(".dat") and not _should_skip_table_filename(filename) + + def _validate_and_flatten_array(array: np.ndarray, name: str) -> np.ndarray: """ Validate array is 1D or flattenable to 1D. @@ -1095,8 +1105,12 @@ def _lookup_systematic_config(file_obs_label): logger.info("No systematic correlations found in config") for filename in os.listdir(data_dir): + if _should_skip_table_filename(filename): + logger.debug(f"Skipping sidecar file in Data directory: {filename}") + continue + # Skip files that don't match the expected Data file pattern - if not filename.startswith("Data__"): + if not _is_table_file(filename, "Data__"): logger.debug(f"Skipping non-data file in Data directory: {filename}") continue @@ -1196,6 +1210,14 @@ def _lookup_systematic_config(file_obs_label): design_points_to_exclude = analysis_config.get("design_points_to_exclude", []) design_dir = os.path.join(table_dir, "Design") for filename in os.listdir(design_dir): + if _should_skip_table_filename(filename): + logger.debug(f"Skipping sidecar file in Design directory: {filename}") + continue + + if not _is_table_file(filename, "Design__"): + logger.debug(f"Skipping non-design file in Design directory: {filename}") + continue + if _filename_to_labels(filename)[1] == parameterization: # Explanation of variables: # - design_point_parameters: The parameters of the design points, with one per design point. @@ -1224,6 +1246,14 @@ def _lookup_systematic_config(file_obs_label): # Read predictions and uncertainty prediction_dir = os.path.join(table_dir, "Prediction") for filename in os.listdir(prediction_dir): + if _should_skip_table_filename(filename): + logger.debug(f"Skipping sidecar file in Prediction directory: {filename}") + continue + + if not _is_table_file(filename, "Prediction__"): + logger.debug(f"Skipping non-prediction file in Prediction directory: {filename}") + continue + if "values" in filename and parameterization in filename: if _accept_observable(analysis_config, filename): filename_prediction_values = filename @@ -1798,6 +1828,9 @@ def _accept_observable(analysis_config, filename): :param str filename: filename of table for the considered observable """ + if _should_skip_table_filename(filename) or not filename.endswith(".dat"): + return False + observable_label, _ = _filename_to_labels(filename) sqrts, _, _, _, _, centrality = observable_label_to_keys(observable_label) diff --git a/src/bayesian/mc_sampling/__init__.py b/src/bayesian/mc_sampling/__init__.py index c619a38..6fe1b38 100644 --- a/src/bayesian/mc_sampling/__init__.py +++ b/src/bayesian/mc_sampling/__init__.py @@ -19,6 +19,7 @@ from bayesian.mc_sampling.base import ( # noqa: F401 MCConfig, + compute_chain_diagnostics, credible_interval, map_parameters, run_mcmc, diff --git a/src/bayesian/mc_sampling/base.py b/src/bayesian/mc_sampling/base.py index 665f562..5f46016 100644 --- a/src/bayesian/mc_sampling/base.py +++ b/src/bayesian/mc_sampling/base.py @@ -256,6 +256,80 @@ def map_parameters(posterior: npt.NDArray[np.float64], method: str = "quantile") raise ValueError(msg) +def compute_chain_diagnostics( + chain: npt.NDArray[np.float64], + parameter_min: npt.NDArray[np.float64], + parameter_max: npt.NDArray[np.float64], + *, + boundary_fraction: float = 0.005, + late_fraction: float = 0.2, +) -> dict[str, npt.NDArray[np.float64] | float]: + """Compute walker-level QA diagnostics for a chain. + + These diagnostics are lightweight summary metadata intended for saved run + artifacts and regression tests, not just ad hoc notebook inspection. + """ + chain = np.asarray(chain, dtype=np.float64) + parameter_min = np.asarray(parameter_min, dtype=np.float64) + parameter_max = np.asarray(parameter_max, dtype=np.float64) + + if chain.ndim != 3: + msg = f"Expected chain with shape (n_steps, n_walkers, n_dim), got {chain.shape}" + raise ValueError(msg) + + n_dim = chain.shape[2] + if parameter_min.shape != (n_dim,) or parameter_max.shape != (n_dim,): + msg = ( + f"Expected parameter bounds with shape ({n_dim},), got " + f"{parameter_min.shape} and {parameter_max.shape}" + ) + raise ValueError(msg) + if not 0.0 <= boundary_fraction <= 1.0: + msg = f"boundary_fraction must be in [0, 1], got {boundary_fraction}" + raise ValueError(msg) + if not 0.0 <= late_fraction <= 1.0: + msg = f"late_fraction must be in [0, 1], got {late_fraction}" + raise ValueError(msg) + + n_steps, n_walkers, _ = chain.shape + late_start = int((1.0 - late_fraction) * n_steps) + widths = parameter_max - parameter_min + lower_threshold = parameter_min + boundary_fraction * widths + upper_threshold = parameter_max - boundary_fraction * widths + + near_lower = chain <= lower_threshold[np.newaxis, np.newaxis, :] + near_upper = chain >= upper_threshold[np.newaxis, np.newaxis, :] + + longest_repeat_run = np.zeros(n_walkers, dtype=np.int64) + for walker_idx in range(n_walkers): + equal_steps = np.all( + np.isclose(chain[1:, walker_idx, :], chain[:-1, walker_idx, :], atol=0.0, rtol=0.0), + axis=1, + ) + best = 0 + current = 0 + for repeated in equal_steps: + if repeated: + current = current + 1 if current else 2 + best = max(best, current) + else: + current = 0 + longest_repeat_run[walker_idx] = best + + diagnostics: dict[str, npt.NDArray[np.float64] | float] = { + "boundary_fraction": float(boundary_fraction), + "late_fraction": float(late_fraction), + "near_lower_counts": near_lower.sum(axis=0).astype(np.int64), + "near_upper_counts": near_upper.sum(axis=0).astype(np.int64), + "late_near_lower_counts": near_lower[late_start:, :, :].sum(axis=0).astype(np.int64), + "late_near_upper_counts": near_upper[late_start:, :, :].sum(axis=0).astype(np.int64), + "max_per_walker": chain.max(axis=0), + "min_per_walker": chain.min(axis=0), + "longest_repeat_run": longest_repeat_run, + } + return diagnostics + + def _validate_sampler(name: str, module: ModuleType) -> None: """Validate that a sampler module follows the expected interface.""" for attr in ["run_sampling", "SamplerSettings"]: diff --git a/src/bayesian/mc_sampling/emcee.py b/src/bayesian/mc_sampling/emcee.py index 3aa27ed..1d1bf8f 100644 --- a/src/bayesian/mc_sampling/emcee.py +++ b/src/bayesian/mc_sampling/emcee.py @@ -27,6 +27,15 @@ _register_name = "emcee" +def _apply_random_seed(sampler: emcee.EnsembleSampler, seed: int | None) -> None: + """Seed emcee's internal RNG so runs are reproducible beyond initial positions.""" + if seed is None: + return + + rng = np.random.RandomState(seed) + sampler.random_state = rng.get_state() + + @attrs.define class SamplerSettings: """Settings for the emcee affine-invariant ensemble sampler. @@ -48,6 +57,7 @@ class SamplerSettings: n_sampling_steps: int = attrs.field() n_logging_steps: int = attrs.field() settings: dict[str, Any] = attrs.field() + random_seed: int | None = attrs.field(default=None) @classmethod def from_config(cls, config: dict[str, Any]) -> SamplerSettings: @@ -57,6 +67,7 @@ def from_config(cls, config: dict[str, Any]) -> SamplerSettings: n_burn_steps=config["n_burn_steps"], n_sampling_steps=config["n_sampling_steps"], n_logging_steps=config["n_logging_steps"], + random_seed=config.get("random_seed"), settings=config, ) @@ -110,8 +121,9 @@ def run_sampling( kwargs={"set_to_infinite_outside_bounds": True}, pool=pool, ) + _apply_random_seed(sampler, sampler_settings.random_seed) - rng = np.random.default_rng() + rng = np.random.default_rng(sampler_settings.random_seed) random_pos = rng.uniform(parameter_min, parameter_max, (sampler_settings.n_walkers, parameter_ndim)) # First half of burn-in from random positions @@ -143,6 +155,12 @@ def run_sampling( "chain": sampler.get_chain(), "acceptance_fraction": sampler.acceptance_fraction, "log_prob": sampler.get_log_prob(), + "random_seed": sampler_settings.random_seed if sampler_settings.random_seed is not None else -1, + "chain_diagnostics": mc_sampling_base.compute_chain_diagnostics( + sampler.get_chain(), + np.asarray(parameter_min, dtype=np.float64), + np.asarray(parameter_max, dtype=np.float64), + ), } try: output_dict["autocorrelation_time"] = sampler.get_autocorr_time() diff --git a/src/bayesian/plot_mcmc.py b/src/bayesian/plot_mcmc.py index 17e6fb6..3229b97 100644 --- a/src/bayesian/plot_mcmc.py +++ b/src/bayesian/plot_mcmc.py @@ -18,7 +18,7 @@ sns.set_context("paper", rc={"font.size": 18, "axes.titlesize": 18, "axes.labelsize": 18}) -from bayesian import data_IO, emulation, plot_utils +from bayesian import data_IO, emulation, plot_posterior_transform, plot_utils from bayesian import mc_sampling as mcmc logger = logging.getLogger(__name__) @@ -55,10 +55,11 @@ def plot(config: mcmc.MCConfig): logger.info(f"Chain is of size: {os.path.getsize(config.mcmc_outputfile) / (1024 * 1024):.1f} MB") # MCMC plots - _plot_acceptance_fraction(results["acceptance_fraction"], plot_dir, config) + _plot_acceptance_fraction(results["acceptance_fraction"], plot_dir) _plot_log_posterior(results["log_prob"], plot_dir, config) _plot_autocorrelation_time(results, plot_dir, config) _plot_posterior_pairplot(chain, plot_dir, config) + plot_posterior_transform.plot(chain, config, random_seed=results.get("random_seed")) # Posterior vs. Design observables design = data_IO.design_array_from_h5(config.output_dir, filename=config.observables_filename) @@ -68,7 +69,7 @@ def plot(config: mcmc.MCConfig): # --------------------------------------------------------------- -def _plot_acceptance_fraction(acceptance_fraction, plot_dir, config): +def _plot_acceptance_fraction(acceptance_fraction, plot_dir): """ Plot histogram of acceptance_fraction for each walker. @@ -78,7 +79,7 @@ def _plot_acceptance_fraction(acceptance_fraction, plot_dir, config): :param 1darray acceptance_fraction: fraction of steps accepted for each walker -- shape (n_walkers,) """ plt.figure(figsize=(10, 6)) - plt.plot(np.arange(config.n_walkers), acceptance_fraction, marker="o", color=sns.xkcd_rgb["denim blue"]) + plt.plot(np.arange(len(acceptance_fraction)), acceptance_fraction, marker="o", color=sns.xkcd_rgb["denim blue"]) plt.ylim(0, 1) plt.xlabel("Walker Index") plt.ylabel("Acceptance Fraction") diff --git a/src/bayesian/plot_posterior_transform.py b/src/bayesian/plot_posterior_transform.py new file mode 100644 index 0000000..2ed3c2b --- /dev/null +++ b/src/bayesian/plot_posterior_transform.py @@ -0,0 +1,351 @@ +"""Plot posterior samples in unit and transformed physical parameter spaces.""" + +from __future__ import annotations + +import logging +import re +from pathlib import Path +from typing import TYPE_CHECKING, Any + +import numpy as np +import numpy.typing as npt +from matplotlib import pyplot as plt + +if TYPE_CHECKING: + from bayesian.mc_sampling.base import MCConfig + +logger = logging.getLogger(__name__) + + +def plot(chain: npt.NDArray[np.float64], config: MCConfig, random_seed: int | None = None) -> None: + """Plot posterior samples in unit and physical parameter spaces if configured.""" + transform_config = _transform_config(config) + if not transform_config: + logger.info("No physical posterior transform configured. Skipping transformed posterior plots.") + return + + samples_unit = chain.reshape((chain.shape[0] * chain.shape[1], chain.shape[2])) + samples_physical = transform_samples(samples_unit, transform_config, direction="forward") + + plot_dir = Path(config.output_dir) / "plot_transform" + plot_dir.mkdir(parents=True, exist_ok=True) + + np.savez( + Path(config.output_dir) / "posterior_unit_and_physical.npz", + posterior_unit=samples_unit, + posterior_physical=samples_physical, + ) + _write_summary( + samples_unit=samples_unit, + samples_physical=samples_physical, + unit_file_names=_unit_file_names(config, transform_config), + unit_plot_labels=_unit_plot_labels(config, transform_config), + component_names=_component_names(transform_config), + output_path=Path(config.output_dir) / "posterior_transform_summary.txt", + ) + + plot_seed = _normalize_random_seed(random_seed) + unit_file_names = _unit_file_names(config, transform_config) + unit_plot_labels = _unit_plot_labels(config, transform_config) + physical_names = _component_names(transform_config) + axis_scales = _axis_scales(transform_config) + + for i, (file_name, label) in enumerate(zip(unit_file_names, unit_plot_labels, strict=True)): + _save_1d_hist( + values=samples_unit[:, i], + output_path=plot_dir / f"posterior_unit_{file_name}", + xlabel=label, + title=f"Posterior in Unit Space: {label}", + axis_scale="linear", + ) + + if samples_unit.shape[1] == 2: + _save_2d_scatter( + x=samples_unit[:, 0], + y=samples_unit[:, 1], + output_path=plot_dir / f"posterior_unit_{unit_file_names[0]}_{unit_file_names[1]}_2d", + xlabel=unit_plot_labels[0], + ylabel=unit_plot_labels[1], + title="Posterior in Unit Space (2D)", + x_scale="linear", + y_scale="linear", + random_seed=plot_seed, + ) + + for i, (name, axis_scale) in enumerate(zip(physical_names, axis_scales, strict=True)): + _save_1d_hist( + values=samples_physical[:, i], + output_path=plot_dir / f"posterior_physical_{name}", + xlabel=name, + title=f"Transformed Posterior: {name}", + axis_scale=axis_scale, + ) + if axis_scale != "linear": + _save_1d_hist( + values=samples_physical[:, i], + output_path=plot_dir / f"posterior_physical_{name}_linear", + xlabel=name, + title=f"Transformed Posterior: {name} (linear axis)", + axis_scale="linear", + ) + + if samples_physical.shape[1] == 2: + output_stem = f"posterior_physical_{physical_names[0]}_{physical_names[1]}_2d" + _save_2d_scatter( + x=samples_physical[:, 0], + y=samples_physical[:, 1], + output_path=plot_dir / output_stem, + xlabel=physical_names[0], + ylabel=physical_names[1], + title="Transformed Posterior in Physical Space (2D)", + x_scale=axis_scales[0], + y_scale=axis_scales[1], + random_seed=plot_seed, + ) + if any(scale != "linear" for scale in axis_scales): + _save_2d_scatter( + x=samples_physical[:, 0], + y=samples_physical[:, 1], + output_path=plot_dir / f"{output_stem}_linear", + xlabel=physical_names[0], + ylabel=physical_names[1], + title="Transformed Posterior in Physical Space (2D, linear axes)", + x_scale="linear", + y_scale="linear", + random_seed=plot_seed, + ) + + +def transform_samples( + samples: npt.NDArray[np.float64], transform_config: dict[str, Any], direction: str = "forward" +) -> npt.NDArray[np.float64]: + """Transform posterior samples component-wise using declarative step lists.""" + samples = np.asarray(samples, dtype=np.float64) + if samples.ndim != 2: + msg = f"Expected samples to have shape (n_samples, n_parameters), got {samples.shape}" + raise ValueError(msg) + + components = transform_config.get("components", []) + if samples.shape[1] != len(components): + msg = f"Transform config has {len(components)} components, but samples have {samples.shape[1]} columns" + raise ValueError(msg) + + transformed_components: list[npt.NDArray[np.float64]] = [] + step_key = "steps" if direction == "forward" else "inverse_steps" + for i, component in enumerate(components): + if step_key not in component: + msg = f"Transform component '{component.get('name', i)}' is missing '{step_key}'" + raise ValueError(msg) + values = samples[:, i].copy() + for step in component[step_key]: + values = _apply_step(values, step) + transformed_components.append(values) + + return np.column_stack(transformed_components) + + +def _apply_step(values: npt.NDArray[np.float64], step: dict[str, Any]) -> npt.NDArray[np.float64]: + kind = step["kind"] + if kind == "add": + return values + float(step["value"]) + if kind == "subtract": + return values - float(step["value"]) + if kind == "multiply": + return values * float(step["value"]) + if kind == "divide": + return values / float(step["value"]) + if kind == "power": + return np.power(values, float(step["value"])) + if kind == "tan": + return np.tan(values) + if kind == "arctan": + return np.arctan(values) + if kind == "exp": + return np.exp(values) + if kind == "log": + return np.log(values) + msg = f"Unknown transform step kind: {kind}" + raise ValueError(msg) + + +def _transform_config(config: MCConfig) -> dict[str, Any] | None: + param_cfg = config.analysis_config["parameterization"][config.parameterization] + transform_config = param_cfg.get("physical_posterior_transform") + if transform_config is None: + return None + return transform_config + + +def _component_names(transform_config: dict[str, Any]) -> list[str]: + return [_slugify_component_name(component["name"]) for component in transform_config["components"]] + + +def _component_plot_labels(transform_config: dict[str, Any]) -> list[str]: + return [str(component["name"]) for component in transform_config["components"]] + + +def _unit_plot_labels(config: MCConfig, transform_config: dict[str, Any]) -> list[str]: + unit_labels = transform_config.get("unit_labels") + if unit_labels is not None: + return [str(label) for label in unit_labels] + return [str(name) for name in config.analysis_config["parameterization"][config.parameterization]["names"]] + + +def _unit_file_names(config: MCConfig, transform_config: dict[str, Any]) -> list[str]: + unit_names = transform_config.get("unit_names") + if unit_names is not None: + return [_slugify_component_name(name) for name in unit_names] + n_dim = len(config.analysis_config["parameterization"][config.parameterization]["names"]) + return [f"u{i + 1}" for i in range(n_dim)] + + +def _axis_scales(transform_config: dict[str, Any]) -> list[str]: + return [str(component.get("axis_scale", "linear")) for component in transform_config["components"]] + + +def _save_1d_hist( + values: npt.NDArray[np.float64], output_path: Path, xlabel: str, title: str, axis_scale: str = "linear" +) -> None: + values = _values_for_axis_scale(values, axis_scale, output_path.name) + fig, ax = plt.subplots(figsize=(6, 4)) + ax.hist(values, bins=80, density=True, alpha=0.85, color="#1f77b4") + ax.set_xlabel(xlabel) + ax.set_ylabel("Density") + ax.set_title(title) + if axis_scale != "linear": + ax.set_xscale(axis_scale) + fig.tight_layout() + fig.savefig(output_path.with_suffix(".pdf")) + fig.savefig(output_path.with_suffix(".png"), dpi=160) + plt.close(fig) + + +def _save_2d_scatter( + x: npt.NDArray[np.float64], + y: npt.NDArray[np.float64], + output_path: Path, + xlabel: str, + ylabel: str, + title: str, + x_scale: str = "linear", + y_scale: str = "linear", + n_plot_samples: int = 50000, + random_seed: int = 12345, +) -> None: + x, y = _paired_values_for_axis_scale(x, y, x_scale, y_scale, output_path.name) + n_available = x.shape[0] + n_to_plot = min(n_available, n_plot_samples) + rng = np.random.default_rng(random_seed) + selected = rng.choice(n_available, size=n_to_plot, replace=False) + + fig, ax = plt.subplots(figsize=(6, 5)) + ax.scatter( + x[selected], + y[selected], + s=4, + alpha=0.18, + linewidths=0.0, + color="#1f77b4", + rasterized=True, + ) + ax.set_xlabel(xlabel) + ax.set_ylabel(ylabel) + ax.set_title(title) + if x_scale != "linear": + ax.set_xscale(x_scale) + if y_scale != "linear": + ax.set_yscale(y_scale) + ax.text( + 0.98, + 0.02, + f"{n_to_plot:,} samples shown", + transform=ax.transAxes, + ha="right", + va="bottom", + fontsize=9, + bbox={"facecolor": "white", "alpha": 0.8, "edgecolor": "0.8"}, + ) + fig.tight_layout() + fig.savefig(output_path.with_suffix(".pdf")) + fig.savefig(output_path.with_suffix(".png"), dpi=160) + plt.close(fig) + + +def _write_summary( + samples_unit: npt.NDArray[np.float64], + samples_physical: npt.NDArray[np.float64], + unit_file_names: list[str], + unit_plot_labels: list[str], + component_names: list[str], + output_path: Path, +) -> None: + with output_path.open("w", encoding="utf-8") as stream: + stream.write("Posterior transform summary\n") + stream.write("===========================\n\n") + for i, (name, label) in enumerate(zip(unit_file_names, unit_plot_labels, strict=True)): + quantiles = np.quantile(samples_unit[:, i], [0.16, 0.5, 0.84]) + stream.write(f"unit {name} ({label}) q16/q50/q84: {quantiles.tolist()}\n") + stream.write("\n") + for i, name in enumerate(component_names): + quantiles = np.quantile(samples_physical[:, i], [0.05, 0.16, 0.5, 0.84, 0.95, 0.99]) + stream.write(f"physical {name} q05/q16/q50/q84/q95/q99: {quantiles.tolist()}\n") + + +def _slugify_component_name(name: str) -> str: + stripped = re.sub(r"[^0-9A-Za-z_]+", "_", str(name)).strip("_") + return stripped or "parameter" + + +def _normalize_random_seed(random_seed: int | np.integer[Any] | None) -> int: + if random_seed is None: + return 12345 + seed = int(random_seed) + if seed < 0: + return 12345 + return seed + + +def _values_for_axis_scale( + values: npt.NDArray[np.float64], axis_scale: str, context: str +) -> npt.NDArray[np.float64]: + values = np.asarray(values, dtype=np.float64) + mask = np.isfinite(values) + if axis_scale == "log": + mask &= values > 0 + filtered = values[mask] + if filtered.size == 0: + msg = f"No valid values remain for {context} after applying axis scale '{axis_scale}'" + raise ValueError(msg) + if filtered.size != values.size: + logger.warning("Dropped %d invalid samples for %s (%s axis)", values.size - filtered.size, context, axis_scale) + return filtered + + +def _paired_values_for_axis_scale( + x: npt.NDArray[np.float64], + y: npt.NDArray[np.float64], + x_scale: str, + y_scale: str, + context: str, +) -> tuple[npt.NDArray[np.float64], npt.NDArray[np.float64]]: + x = np.asarray(x, dtype=np.float64) + y = np.asarray(y, dtype=np.float64) + mask = np.isfinite(x) & np.isfinite(y) + if x_scale == "log": + mask &= x > 0 + if y_scale == "log": + mask &= y > 0 + filtered_x = x[mask] + filtered_y = y[mask] + if filtered_x.size == 0: + msg = f"No valid samples remain for {context} after applying axis scales '{x_scale}', '{y_scale}'" + raise ValueError(msg) + if filtered_x.size != x.size: + logger.warning( + "Dropped %d invalid paired samples for %s (%s/%s axes)", + x.size - filtered_x.size, + context, + x_scale, + y_scale, + ) + return filtered_x, filtered_y diff --git a/src/bayesian/plot_utils.py b/src/bayesian/plot_utils.py index f2b3014..12c24bd 100644 --- a/src/bayesian/plot_utils.py +++ b/src/bayesian/plot_utils.py @@ -8,6 +8,7 @@ from __future__ import annotations import logging +import math from pathlib import Path from typing import Any @@ -78,7 +79,7 @@ def plot_observable_panels( data = data_IO.data_dict_from_h5(config.output_dir, filename="observables.h5") # type: ignore[no-untyped-call] # Group observables into subplots, with shapes specified in config - plot_panel_shapes = config.raw_analysis_config["plot_panel_shapes"] + plot_panel_shapes = config.raw_analysis_config.get("plot_panel_shapes", _default_plot_panel_shapes(len(sorted_observable_list))) n_panels = sum(x[0] * x[1] for x in plot_panel_shapes) assert len(sorted_observable_list) <= n_panels, ( f"You specified {n_panels} panels, but have {len(sorted_observable_list)} observables" @@ -126,7 +127,7 @@ def plot_observable_panels( fontsize = 14.0 / plot_shape[0] markersize = 8.0 / plot_shape[0] if i_subplot == 0: - fig, axs = plt.subplots(plot_shape[0], plot_shape[1], constrained_layout=True) + fig, axs = plt.subplots(plot_shape[0], plot_shape[1], constrained_layout=True, squeeze=False) for ax in axs.flat: ax.tick_params(labelsize=fontsize) row = 0 @@ -208,6 +209,15 @@ def plot_observable_panels( plt.close(fig) +def _default_plot_panel_shapes(n_observables: int) -> list[list[int]]: + """Choose a reasonable single-panel grid when config does not specify one.""" + if n_observables <= 0: + return [[1, 1]] + n_cols = max(1, min(3, math.ceil(math.sqrt(n_observables)))) + n_rows = math.ceil(n_observables / n_cols) + return [[n_rows, n_cols]] + + def plot_histogram_1d( x_list: list[Any] | None = None, label_list: list[Any] | None = None, diff --git a/tests/test_data_IO.py b/tests/test_data_IO.py index 0cda84c..0f5b31e 100644 --- a/tests/test_data_IO.py +++ b/tests/test_data_IO.py @@ -89,3 +89,63 @@ def test_exclude_design_points(caplog: Any, design_points_to_exclude: list[int], assert values not in design_points_parameters assert values not in design_points_parameters_validation + + +def test_initialize_observables_ignores_sidecar_and_stray_files(tmp_path: Path) -> None: + table_dir = tmp_path / "tables" + data_dir = table_dir / "Data" + design_dir = table_dir / "Design" + prediction_dir = table_dir / "Prediction" + data_dir.mkdir(parents=True) + design_dir.mkdir() + prediction_dir.mkdir() + + (design_dir / "Design__test1.dat").write_text( + "# Version 1.0\n" + "# Design point indices (row index): 0 1\n" + "0.1 0.2\n" + "0.3 0.4\n" + ) + (design_dir / ".DS_Store").write_text("junk\n") + (design_dir / "Design__test1.dat:Zone.Identifier").write_text("junk\n") + + (data_dir / "Data__5020__PbPb__hadron__pt_ch_cms____0-5.dat").write_text( + "# Label xmin xmax y stat,low stat,high sys,low sys,high\n" + "10 20 0.5 0.01 0.01 0.02 0.02\n" + ) + (data_dir / "Thumbs.db").write_text("junk\n") + (data_dir / "Data__5020__PbPb__hadron__pt_ch_cms____0-5.dat:Zone.Identifier").write_text("junk\n") + + prediction_header = ( + "# Version 2.0\n" + "# design_point0 design_point1\n" + ) + (prediction_dir / "Prediction__test1__5020__PbPb__hadron__pt_ch_cms____0-5__values.dat").write_text( + prediction_header + "0.45 0.55\n" + ) + (prediction_dir / "Prediction__test1__5020__PbPb__hadron__pt_ch_cms____0-5__errors.dat").write_text( + prediction_header + "0.01 0.01\n" + ) + (prediction_dir / "backup.tmp").write_text("junk\n") + (prediction_dir / "Prediction__test1__5020__PbPb__hadron__pt_ch_cms____0-5__values.dat:Zone.Identifier").write_text( + "junk\n" + ) + + analysis_config = { + "sqrts_list": [5020], + "centrality_range": [[0, 10]], + "validation_indices": [2, 2], + "parameters": { + "emulators": { + "default_group": { + "observable_list": [{"observable": "5020__PbPb__hadron__pt_ch_cms"}], + } + } + }, + } + + observables = data_IO.initialize_observables_dict_from_tables(str(table_dir), analysis_config, "test1") + + assert "5020__PbPb__hadron__pt_ch_cms____0-5" in observables["Data"] + assert observables["Design"].shape == (2, 2) + assert observables["Prediction"]["5020__PbPb__hadron__pt_ch_cms____0-5"]["y"].shape == (1, 2) diff --git a/tests/test_mc_sampling.py b/tests/test_mc_sampling.py new file mode 100644 index 0000000..a28f859 --- /dev/null +++ b/tests/test_mc_sampling.py @@ -0,0 +1,103 @@ +"""Tests for MCMC QA diagnostics and sampler settings.""" + +import numpy as np + +from bayesian import mc_sampling as mcmc +from bayesian.mc_sampling import emcee as emcee_sampler + + +def test_compute_chain_diagnostics_flags_boundary_and_repeats() -> None: + chain = np.array( + [ + [[0.10, 0.20], [0.50, 0.50]], + [[0.20, 0.30], [0.50, 0.50]], + [[0.30, 0.40], [0.996, 0.50]], + [[0.40, 0.50], [0.998, 0.50]], + [[0.50, 0.60], [0.998, 0.50]], + ], + dtype=np.float64, + ) + + diagnostics = mcmc.compute_chain_diagnostics( + chain, + parameter_min=np.array([0.0, 0.0]), + parameter_max=np.array([1.0, 1.0]), + boundary_fraction=0.005, + late_fraction=0.4, + ) + + near_upper = diagnostics["near_upper_counts"] + late_near_upper = diagnostics["late_near_upper_counts"] + longest_repeat = diagnostics["longest_repeat_run"] + + assert near_upper.shape == (2, 2) + assert int(near_upper[1, 0]) == 3 + assert int(late_near_upper[1, 0]) == 2 + assert int(longest_repeat[1]) == 2 + assert int(longest_repeat[0]) == 0 + + +def test_compute_chain_diagnostics_validates_inputs() -> None: + chain = np.zeros((4, 2, 2), dtype=np.float64) + + try: + mcmc.compute_chain_diagnostics(chain, np.array([0.0]), np.array([1.0])) + except ValueError as exc: + assert "shape" in str(exc) + else: + raise AssertionError("Expected ValueError for mismatched parameter bound shapes") + + try: + mcmc.compute_chain_diagnostics(chain, np.array([0.0, 0.0]), np.array([1.0, 1.0]), boundary_fraction=1.5) + except ValueError as exc: + assert "boundary_fraction" in str(exc) + else: + raise AssertionError("Expected ValueError for invalid boundary_fraction") + + try: + mcmc.compute_chain_diagnostics(chain, np.array([0.0, 0.0]), np.array([1.0, 1.0]), late_fraction=-0.1) + except ValueError as exc: + assert "late_fraction" in str(exc) + else: + raise AssertionError("Expected ValueError for invalid late_fraction") + + +def test_emcee_sampler_settings_reads_random_seed() -> None: + settings = emcee_sampler.SamplerSettings.from_config( + { + "n_walkers": 10, + "n_burn_steps": 20, + "n_sampling_steps": 30, + "n_logging_steps": 5, + "random_seed": 12345, + } + ) + + assert settings.random_seed == 12345 + + +def test_emcee_seed_reproduces_short_chain() -> None: + def log_prob(x): + x = np.asarray(x) + return -0.5 * np.sum(x**2) + + seed = 12345 + initial = np.array( + [ + [0.10, -0.10], + [0.15, -0.05], + [-0.10, 0.10], + [-0.15, 0.05], + ], + dtype=np.float64, + ) + + sampler_a = emcee_sampler.LoggingEnsembleSampler(4, 2, log_prob) + sampler_b = emcee_sampler.LoggingEnsembleSampler(4, 2, log_prob) + emcee_sampler._apply_random_seed(sampler_a, seed) + emcee_sampler._apply_random_seed(sampler_b, seed) + + sampler_a.run_mcmc(initial, 8, n_logging_steps=1000) + sampler_b.run_mcmc(initial, 8, n_logging_steps=1000) + + np.testing.assert_allclose(sampler_a.get_chain(), sampler_b.get_chain()) diff --git a/tests/test_outliers_smoothing.py b/tests/test_outliers_smoothing.py index 83b89b5..2d520fe 100644 --- a/tests/test_outliers_smoothing.py +++ b/tests/test_outliers_smoothing.py @@ -5,7 +5,6 @@ from __future__ import annotations import logging -from pathlib import Path import numpy as np import pytest # noqa: F401 @@ -14,19 +13,15 @@ logger = logging.getLogger(__name__) -_data_dir = Path(__file__).parent / "test_data" - def test_smoothing() -> None: - # Setup: Load data - measured_data = np.loadtxt(_data_dir / "tables" / "Data" / "Data__5020__PbPb__hadron__pt_ch_cms____0-5.dat", ndmin=2) - # Calculate bin centers from data - x_min = measured_data[:, 0] - x_max = measured_data[:, 1] - bin_centers = x_min + (x_max - x_min) / 2. - # And load values and errors - values = np.loadtxt(_data_dir / "tables" / "Prediction" / "Prediction__exponential__5020__PbPb__hadron__pt_ch_cms____0-5__values.dat", ndmin=2) - y_err = np.loadtxt(_data_dir / "tables" / "Prediction" / "Prediction__exponential__5020__PbPb__hadron__pt_ch_cms____0-5__errors.dat", ndmin=2) + # Build a small synthetic observable with one obvious outlier. + bin_centers = np.linspace(20.0, 70.0, 6) + values = np.tile(np.arange(10.0, 16.0)[:, np.newaxis], (1, 10)) + values = values + np.linspace(0.0, 0.9, 10) + values[2, 5] = 40.0 + y_err = np.full_like(values, 0.2) + y_err[2, 5] = 8.0 # Identify outliers and smooth them output_values, output_y_err, outliers_that_cannot_be_removed = outliers_smoothing.find_and_smooth_outliers_standalone( @@ -45,3 +40,6 @@ def test_smoothing() -> None: assert not np.allclose(output_values, values) assert not np.allclose(output_y_err, y_err) + assert output_values.shape == values.shape + assert output_y_err.shape == y_err.shape + assert not outliers_that_cannot_be_removed diff --git a/tests/test_plot_posterior_transform.py b/tests/test_plot_posterior_transform.py new file mode 100644 index 0000000..742e203 --- /dev/null +++ b/tests/test_plot_posterior_transform.py @@ -0,0 +1,83 @@ +"""Tests for declarative posterior transform plotting helpers.""" + +import numpy as np + +from bayesian import plot_posterior_transform + + +def test_transform_samples_matches_lres_eloss_mapping() -> None: + samples = np.array([[0.25, 0.5], [0.75, 1.0]], dtype=np.float64) + transform_config = { + "components": [ + { + "name": "Lres", + "steps": [ + {"kind": "multiply", "value": np.pi / 2.0}, + {"kind": "tan"}, + {"kind": "power", "value": 3.0}, + ], + }, + { + "name": "kappa_sc", + "steps": [ + {"kind": "multiply", "value": 0.3}, + {"kind": "add", "value": 0.3}, + ], + }, + ] + } + + transformed = plot_posterior_transform.transform_samples(samples, transform_config) + + expected = np.column_stack( + [ + np.tan(np.pi / 2.0 * samples[:, 0]) ** 3, + 0.3 + 0.3 * samples[:, 1], + ] + ) + np.testing.assert_allclose(transformed, expected) + + +def test_transform_samples_supports_inverse_steps() -> None: + samples = np.array([[0.1, 0.2], [0.6, 0.9]], dtype=np.float64) + transform_config = { + "components": [ + { + "name": "a", + "steps": [{"kind": "multiply", "value": 2.0}, {"kind": "add", "value": 1.0}], + "inverse_steps": [{"kind": "subtract", "value": 1.0}, {"kind": "divide", "value": 2.0}], + }, + { + "name": "b", + "steps": [{"kind": "multiply", "value": 0.3}, {"kind": "add", "value": 0.3}], + "inverse_steps": [{"kind": "subtract", "value": 0.3}, {"kind": "divide", "value": 0.3}], + }, + ] + } + + physical = plot_posterior_transform.transform_samples(samples, transform_config, direction="forward") + recovered = plot_posterior_transform.transform_samples(physical, transform_config, direction="inverse") + + np.testing.assert_allclose(recovered, samples) + + +def test_normalize_random_seed_treats_negative_as_default() -> None: + assert plot_posterior_transform._normalize_random_seed(None) == 12345 + assert plot_posterior_transform._normalize_random_seed(-1) == 12345 + assert plot_posterior_transform._normalize_random_seed(7) == 7 + + +def test_values_for_log_axis_drop_invalid_entries() -> None: + values = np.array([0.0, 1.0, np.inf, 2.0, -3.0], dtype=np.float64) + filtered = plot_posterior_transform._values_for_axis_scale(values, "log", "unit-test") + np.testing.assert_allclose(filtered, np.array([1.0, 2.0])) + + +def test_paired_values_for_log_axis_drop_invalid_entries() -> None: + x = np.array([0.0, 1.0, 2.0, np.inf], dtype=np.float64) + y = np.array([1.0, np.nan, 3.0, 4.0], dtype=np.float64) + filtered_x, filtered_y = plot_posterior_transform._paired_values_for_axis_scale( + x, y, "log", "linear", "unit-test" + ) + np.testing.assert_allclose(filtered_x, np.array([2.0])) + np.testing.assert_allclose(filtered_y, np.array([3.0]))