Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 20 additions & 1 deletion config/validation_2018_try2.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,25 @@ analysis_base: &model_base
names: ['$u_{Lres}$', '$u_{kappa}$']
min: [0.0, 0.0]
max: [1.0, 1.0]
physical_posterior_transform:
unit_names: ['u1', 'u2']
unit_labels: ['$u_{Lres}$', '$u_{kappa}$']
components:
- name: 'Lres'
axis_scale: 'log'
steps:
- kind: 'multiply'
value: 1.5707963267948966
- kind: 'tan'
- kind: 'power'
value: 3.0
- name: 'kappa_sc'
axis_scale: 'linear'
steps:
- kind: 'multiply'
value: 0.3
- kind: 'add'
value: 0.3
alpha: 0.2
validation_indices: [38, 39] # empty doesnt work, so I put the out of range

Expand Down Expand Up @@ -143,4 +162,4 @@ analyses:
closure:
<<: *default_closure_parameters
<<: *model_base
plot_panel_shapes: [[3, 3], [3, 3], [3, 3], [3, 3], [3, 3], [3, 3]] # Maybe fix?
plot_panel_shapes: [[3, 3], [3, 3], [3, 3], [3, 3], [3, 3], [3, 3]] # Maybe fix?
35 changes: 34 additions & 1 deletion src/bayesian/data_IO.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,16 @@ def _recursive_defaultdict():
return defaultdict(_recursive_defaultdict)


def _should_skip_table_filename(filename: str) -> bool:
"""Return True for filesystem sidecars that should never be parsed as tables."""
return filename.endswith(":Zone.Identifier")


def _is_table_file(filename: str, prefix: str) -> bool:
"""Return True when a filename matches the expected table naming convention."""
return filename.startswith(prefix) and filename.endswith(".dat") and not _should_skip_table_filename(filename)


def _validate_and_flatten_array(array: np.ndarray, name: str) -> np.ndarray:
"""
Validate array is 1D or flattenable to 1D.
Expand Down Expand Up @@ -1095,8 +1105,12 @@ def _lookup_systematic_config(file_obs_label):
logger.info("No systematic correlations found in config")

for filename in os.listdir(data_dir):
if _should_skip_table_filename(filename):
logger.debug(f"Skipping sidecar file in Data directory: {filename}")
continue

# Skip files that don't match the expected Data file pattern
if not filename.startswith("Data__"):
if not _is_table_file(filename, "Data__"):
logger.debug(f"Skipping non-data file in Data directory: {filename}")
continue

Expand Down Expand Up @@ -1196,6 +1210,14 @@ def _lookup_systematic_config(file_obs_label):
design_points_to_exclude = analysis_config.get("design_points_to_exclude", [])
design_dir = os.path.join(table_dir, "Design")
for filename in os.listdir(design_dir):
if _should_skip_table_filename(filename):
logger.debug(f"Skipping sidecar file in Design directory: {filename}")
continue

if not _is_table_file(filename, "Design__"):
logger.debug(f"Skipping non-design file in Design directory: {filename}")
continue

if _filename_to_labels(filename)[1] == parameterization:
# Explanation of variables:
# - design_point_parameters: The parameters of the design points, with one per design point.
Expand Down Expand Up @@ -1224,6 +1246,14 @@ def _lookup_systematic_config(file_obs_label):
# Read predictions and uncertainty
prediction_dir = os.path.join(table_dir, "Prediction")
for filename in os.listdir(prediction_dir):
if _should_skip_table_filename(filename):
logger.debug(f"Skipping sidecar file in Prediction directory: {filename}")
continue

if not _is_table_file(filename, "Prediction__"):
logger.debug(f"Skipping non-prediction file in Prediction directory: {filename}")
continue

if "values" in filename and parameterization in filename:
if _accept_observable(analysis_config, filename):
filename_prediction_values = filename
Expand Down Expand Up @@ -1798,6 +1828,9 @@ def _accept_observable(analysis_config, filename):
:param str filename: filename of table for the considered observable
"""

if _should_skip_table_filename(filename) or not filename.endswith(".dat"):
return False

observable_label, _ = _filename_to_labels(filename)

sqrts, _, _, _, _, centrality = observable_label_to_keys(observable_label)
Expand Down
1 change: 1 addition & 0 deletions src/bayesian/mc_sampling/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

from bayesian.mc_sampling.base import ( # noqa: F401
MCConfig,
compute_chain_diagnostics,
credible_interval,
map_parameters,
run_mcmc,
Expand Down
74 changes: 74 additions & 0 deletions src/bayesian/mc_sampling/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,80 @@ def map_parameters(posterior: npt.NDArray[np.float64], method: str = "quantile")
raise ValueError(msg)


def compute_chain_diagnostics(
chain: npt.NDArray[np.float64],
parameter_min: npt.NDArray[np.float64],
parameter_max: npt.NDArray[np.float64],
*,
boundary_fraction: float = 0.005,
late_fraction: float = 0.2,
) -> dict[str, npt.NDArray[np.float64] | float]:
"""Compute walker-level QA diagnostics for a chain.

These diagnostics are lightweight summary metadata intended for saved run
artifacts and regression tests, not just ad hoc notebook inspection.
"""
chain = np.asarray(chain, dtype=np.float64)
parameter_min = np.asarray(parameter_min, dtype=np.float64)
parameter_max = np.asarray(parameter_max, dtype=np.float64)

if chain.ndim != 3:
msg = f"Expected chain with shape (n_steps, n_walkers, n_dim), got {chain.shape}"
raise ValueError(msg)

n_dim = chain.shape[2]
if parameter_min.shape != (n_dim,) or parameter_max.shape != (n_dim,):
msg = (
f"Expected parameter bounds with shape ({n_dim},), got "
f"{parameter_min.shape} and {parameter_max.shape}"
)
raise ValueError(msg)
if not 0.0 <= boundary_fraction <= 1.0:
msg = f"boundary_fraction must be in [0, 1], got {boundary_fraction}"
raise ValueError(msg)
if not 0.0 <= late_fraction <= 1.0:
msg = f"late_fraction must be in [0, 1], got {late_fraction}"
raise ValueError(msg)

n_steps, n_walkers, _ = chain.shape
late_start = int((1.0 - late_fraction) * n_steps)
widths = parameter_max - parameter_min
lower_threshold = parameter_min + boundary_fraction * widths
upper_threshold = parameter_max - boundary_fraction * widths

near_lower = chain <= lower_threshold[np.newaxis, np.newaxis, :]
near_upper = chain >= upper_threshold[np.newaxis, np.newaxis, :]

longest_repeat_run = np.zeros(n_walkers, dtype=np.int64)
for walker_idx in range(n_walkers):
equal_steps = np.all(
np.isclose(chain[1:, walker_idx, :], chain[:-1, walker_idx, :], atol=0.0, rtol=0.0),
axis=1,
)
best = 0
current = 0
for repeated in equal_steps:
if repeated:
current = current + 1 if current else 2
best = max(best, current)
else:
current = 0
longest_repeat_run[walker_idx] = best

diagnostics: dict[str, npt.NDArray[np.float64] | float] = {
"boundary_fraction": float(boundary_fraction),
"late_fraction": float(late_fraction),
"near_lower_counts": near_lower.sum(axis=0).astype(np.int64),
"near_upper_counts": near_upper.sum(axis=0).astype(np.int64),
"late_near_lower_counts": near_lower[late_start:, :, :].sum(axis=0).astype(np.int64),
"late_near_upper_counts": near_upper[late_start:, :, :].sum(axis=0).astype(np.int64),
"max_per_walker": chain.max(axis=0),
"min_per_walker": chain.min(axis=0),
"longest_repeat_run": longest_repeat_run,
}
return diagnostics


def _validate_sampler(name: str, module: ModuleType) -> None:
"""Validate that a sampler module follows the expected interface."""
for attr in ["run_sampling", "SamplerSettings"]:
Expand Down
20 changes: 19 additions & 1 deletion src/bayesian/mc_sampling/emcee.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,15 @@
_register_name = "emcee"


def _apply_random_seed(sampler: emcee.EnsembleSampler, seed: int | None) -> None:
"""Seed emcee's internal RNG so runs are reproducible beyond initial positions."""
if seed is None:
return

rng = np.random.RandomState(seed)
sampler.random_state = rng.get_state()


@attrs.define
class SamplerSettings:
"""Settings for the emcee affine-invariant ensemble sampler.
Expand All @@ -48,6 +57,7 @@ class SamplerSettings:
n_sampling_steps: int = attrs.field()
n_logging_steps: int = attrs.field()
settings: dict[str, Any] = attrs.field()
random_seed: int | None = attrs.field(default=None)

@classmethod
def from_config(cls, config: dict[str, Any]) -> SamplerSettings:
Expand All @@ -57,6 +67,7 @@ def from_config(cls, config: dict[str, Any]) -> SamplerSettings:
n_burn_steps=config["n_burn_steps"],
n_sampling_steps=config["n_sampling_steps"],
n_logging_steps=config["n_logging_steps"],
random_seed=config.get("random_seed"),
settings=config,
)

Expand Down Expand Up @@ -110,8 +121,9 @@ def run_sampling(
kwargs={"set_to_infinite_outside_bounds": True},
pool=pool,
)
_apply_random_seed(sampler, sampler_settings.random_seed)

rng = np.random.default_rng()
rng = np.random.default_rng(sampler_settings.random_seed)
random_pos = rng.uniform(parameter_min, parameter_max, (sampler_settings.n_walkers, parameter_ndim))

# First half of burn-in from random positions
Expand Down Expand Up @@ -143,6 +155,12 @@ def run_sampling(
"chain": sampler.get_chain(),
"acceptance_fraction": sampler.acceptance_fraction,
"log_prob": sampler.get_log_prob(),
"random_seed": sampler_settings.random_seed if sampler_settings.random_seed is not None else -1,
"chain_diagnostics": mc_sampling_base.compute_chain_diagnostics(
sampler.get_chain(),
np.asarray(parameter_min, dtype=np.float64),
np.asarray(parameter_max, dtype=np.float64),
),
}
try:
output_dict["autocorrelation_time"] = sampler.get_autocorr_time()
Expand Down
9 changes: 5 additions & 4 deletions src/bayesian/plot_mcmc.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

sns.set_context("paper", rc={"font.size": 18, "axes.titlesize": 18, "axes.labelsize": 18})

from bayesian import data_IO, emulation, plot_utils
from bayesian import data_IO, emulation, plot_posterior_transform, plot_utils
from bayesian import mc_sampling as mcmc

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -55,10 +55,11 @@ def plot(config: mcmc.MCConfig):
logger.info(f"Chain is of size: {os.path.getsize(config.mcmc_outputfile) / (1024 * 1024):.1f} MB")

# MCMC plots
_plot_acceptance_fraction(results["acceptance_fraction"], plot_dir, config)
_plot_acceptance_fraction(results["acceptance_fraction"], plot_dir)
_plot_log_posterior(results["log_prob"], plot_dir, config)
_plot_autocorrelation_time(results, plot_dir, config)
_plot_posterior_pairplot(chain, plot_dir, config)
plot_posterior_transform.plot(chain, config, random_seed=results.get("random_seed"))

# Posterior vs. Design observables
design = data_IO.design_array_from_h5(config.output_dir, filename=config.observables_filename)
Expand All @@ -68,7 +69,7 @@ def plot(config: mcmc.MCConfig):


# ---------------------------------------------------------------
def _plot_acceptance_fraction(acceptance_fraction, plot_dir, config):
def _plot_acceptance_fraction(acceptance_fraction, plot_dir):
"""
Plot histogram of acceptance_fraction for each walker.

Expand All @@ -78,7 +79,7 @@ def _plot_acceptance_fraction(acceptance_fraction, plot_dir, config):
:param 1darray acceptance_fraction: fraction of steps accepted for each walker -- shape (n_walkers,)
"""
plt.figure(figsize=(10, 6))
plt.plot(np.arange(config.n_walkers), acceptance_fraction, marker="o", color=sns.xkcd_rgb["denim blue"])
plt.plot(np.arange(len(acceptance_fraction)), acceptance_fraction, marker="o", color=sns.xkcd_rgb["denim blue"])
plt.ylim(0, 1)
plt.xlabel("Walker Index")
plt.ylabel("Acceptance Fraction")
Expand Down
Loading