Source code for snf_simulations.utils

"""Utility functions for spectrum interpolation and sampling."""

import numpy as np



[docs]
def linear_interpolate_with_errors(
    original_bins: np.ndarray,
    original_content: np.ndarray,
    original_errors: np.ndarray,
    new_bins: np.ndarray,
) -> tuple[np.ndarray, np.ndarray]:
    """Linearly interpolate histogram content and propagate errors onto new bins.

    Content is interpolated between the centres of the original bins and
    evaluated at the centres of the new bins. New bins that lie entirely
    outside the original range get zero content and zero error. New bins whose
    centre is at or beyond the first/last original centre take the content and
    error of that edge bin unchanged. For all other bins the error is
    ``sqrt(w_lower**2 * err_lower**2 + w_upper**2 * err_upper**2)``, where the
    weights are the linear-interpolation weights of the two neighbouring
    original bins.

    Args:
        original_bins: 1D array-like of original bin edges with length N+1.
        original_content: 1D array-like of original bin contents with length N.
        original_errors: 1D array-like of original bin errors with length N.
        new_bins: 1D array-like of new bin edges with length M+1.

    Returns:
        A tuple ``(new_content, new_errors)`` of arrays, each with length M.

    Raises:
        ValueError: If any input is not one-dimensional, if the lengths are
            inconsistent, if either set of edges has fewer than two values,
            or if either set of edges is not strictly increasing.

    """
    # Accept any array-like (lists, tuples, pandas Series, ...). Without this,
    # slicing and adding plain lists concatenates them instead of adding
    # element-wise, and label-aligned containers give silently wrong centres.
    original_bins = np.asarray(original_bins)
    original_content = np.asarray(original_content)
    original_errors = np.asarray(original_errors)
    new_bins = np.asarray(new_bins)

    if any(
        arr.ndim != 1
        for arr in (original_bins, original_content, original_errors, new_bins)
    ):
        msg = "all inputs must be 1D arrays"
        raise ValueError(msg)
    if len(original_bins) != len(original_content) + 1:
        msg = "original_bins must have length len(original_content) + 1"
        raise ValueError(msg)
    if len(original_errors) != len(original_content):
        msg = "original_errors must have the same length as original_content"
        raise ValueError(msg)
    if len(original_bins) < 2:  # noqa: PLR2004
        msg = "original_bins must have at least two values"
        raise ValueError(msg)
    if len(new_bins) < 2:  # noqa: PLR2004
        msg = "new_bins must have at least two values"
        raise ValueError(msg)
    if np.any(np.diff(original_bins) <= 0):
        msg = "original_bins must be strictly increasing"
        raise ValueError(msg)
    if np.any(np.diff(new_bins) <= 0):
        msg = "new_bins must be strictly increasing"
        raise ValueError(msg)

    # Interpolate new content values between bin centres. np.interp clamps to
    # the first/last content value outside the range of original centres.
    original_centres = (original_bins[:-1] + original_bins[1:]) / 2
    new_centres = (new_bins[:-1] + new_bins[1:]) / 2
    new_content = np.interp(new_centres, original_centres, original_content)

    # Any extrapolated bins outside the original range should be set to zero
    lower_edges = new_bins[:-1]
    upper_edges = new_bins[1:]
    lower_mask = upper_edges <= original_bins[0]
    upper_mask = lower_edges >= original_bins[-1]
    extrapolation_mask = lower_mask | upper_mask
    new_content[extrapolation_mask] = 0

    # Propagate errors.
    # This code has been vectorised for speed. Bins are split into three categories:
    # - Interior bins, which have errors interpolated from the closest original bins.
    # - Extrapolated bins, which are set to zero error.
    # - Overlapping bins, which might be partially inside the original range.
    #   These are set to the error of the nearest edge bin.
    new_errors = np.zeros_like(new_centres)
    valid_mask = ~extrapolation_mask
    if np.any(valid_mask):
        valid_indices = np.flatnonzero(valid_mask)
        valid_centres = new_centres[valid_mask]

        # Handle overlapping bins near the original boundaries by keeping the
        # nearest edge-bin error constant. np.isclose also catches centres that
        # differ from an edge centre only by floating-point round-off.
        left_mask = (valid_centres <= original_centres[0]) | np.isclose(
            valid_centres, original_centres[0]
        )
        right_mask = (valid_centres >= original_centres[-1]) | np.isclose(
            valid_centres, original_centres[-1]
        )
        new_errors[valid_indices[left_mask]] = original_errors[0]
        new_errors[valid_indices[right_mask]] = original_errors[-1]

        # For interior bins, use linear interpolation of errors from the adjacent
        # original bins.
        interior_mask = ~(left_mask | right_mask)
        if np.any(interior_mask):
            interior_indices = valid_indices[interior_mask]
            interior_centres = valid_centres[interior_mask]

            # Find the two closest original bin centres.
            # Using np.searchsorted finds the "insertion point" for the new centre,
            # i.e. the index of where it would go to keep the array sorted.
            # So if the original_centres are [1, 2, 3] and centre is 2.5, idx will be 2
            # as it would fit between 2 (index 1) and 3 (index 2).
            # Therefore the surrounding bins are at idx-1 and idx.
            # Interior centres lie strictly between the first and last original
            # centre, so idx is always in [1, N-1] and idx-1 is never negative.
            upper_idx = np.searchsorted(original_centres, interior_centres, side="left")
            lower_idx = upper_idx - 1

            # Calculate new error by propagating errors from the two surrounding bins,
            # weighted by distance to the new centre.
            c_lower = original_centres[lower_idx]
            c_upper = original_centres[upper_idx]
            err_lower = original_errors[lower_idx]
            err_upper = original_errors[upper_idx]

            weight_upper = (interior_centres - c_lower) / (c_upper - c_lower)
            weight_lower = 1.0 - weight_upper
            new_errors[interior_indices] = np.sqrt(
                weight_lower**2 * err_lower**2 + weight_upper**2 * err_upper**2
            )

    return new_content, new_errors




[docs]
def sample_histogram(
    bin_edges: np.ndarray,
    bin_contents: np.ndarray,
    n_samples: int = 100,
    seed: int | None = None,
) -> np.ndarray:
    """Sample x values from histogram bins, similar to ROOT TH1::GetRandom.

    Args:
        bin_edges: 1D array of bin edges with length N+1.
        bin_contents: 1D array of bin contents with length N.
        n_samples: Number of samples to draw.
        seed: Seed for reproducible random sampling.

    Returns:
        Array of sampled x values.

    """
    if bin_edges.ndim != 1 or bin_contents.ndim != 1:
        msg = "bin_edges and bin_contents must be 1D arrays"
        raise ValueError(msg)
    if len(bin_edges) != len(bin_contents) + 1:
        msg = "bin_edges must have length len(bin_contents) + 1"
        raise ValueError(msg)
    widths = np.diff(bin_edges)
    if np.any(widths <= 0):
        msg = "bin_edges must be strictly increasing"
        raise ValueError(msg)
    if np.any(bin_contents < 0):
        msg = "bin_contents must be non-negative"
        raise ValueError(msg)

    # Match ROOT TH1::GetRandom behaviour: bin selection probability is
    # proportional to bin content, then sample uniformly within the selected bin.
    weights = bin_contents
    total_weight = np.sum(weights)
    if total_weight <= 0:  # Avoid division by zero errors
        msg = "Histogram has zero total area; cannot sample"
        raise ValueError(msg)
    probabilities = weights / total_weight

    # Use numpy's random choice to select X bins according to their probabilities,
    # for the requested number of samples.
    rng = np.random.default_rng(seed)
    sampled_indices = rng.choice(len(bin_contents), size=n_samples, p=probabilities)

    # Finally, for each bin take a uniform sample between the upper and lower edges.
    # This gives a continuous distribution of sampled x values from within the bins.
    lower = bin_edges[sampled_indices]
    upper = bin_edges[sampled_indices + 1]
    return rng.uniform(lower, upper)