Skip to content

Mixins

RebinMixin

hypergrid.mixin.rebin_mixin.RebinMixin

Project the histogram onto a new grid by mapping each bin's centroid.

Source code in hypergrid\mixin\rebin_mixin.py
class RebinMixin:
    """Project the histogram onto a new grid by mapping each bin's centroid."""

    def rebin_to(self, target_edges, *, as_storage=None):
        """
        Reproject mass onto target_edges by mapping each source bin's centre
        to the nearest target bin.

        Parameters
        ----------
        target_edges : list of ndarray
            One increasing edge array per dimension.
        as_storage : storage backend, optional
            If provided, also accumulate results into this backend.
            Useful when rebinning into an existing grid in-place.

        Returns
        -------
        dict  {tuple_index: float}
            Mass in target-grid coordinates.
        """
        source_edges = self.get_edges()

        def _locate(d, value):
            # Target bin containing `value` along dimension d; values that
            # fall outside the target range are clamped into the edge bins.
            pos = np.searchsorted(target_edges[d], value, side="right") - 1
            return int(np.clip(pos, 0, len(target_edges[d]) - 2))

        projected = {}
        for src_idx, mass in self.get_mass().items():
            dest = tuple(
                _locate(d, (source_edges[d][src_idx[d]] + source_edges[d][src_idx[d] + 1]) / 2.0)
                for d in range(self.dim)
            )
            projected[dest] = projected.get(dest, 0.0) + mass

        if as_storage is not None:
            for dest, mass in projected.items():
                as_storage.add(dest, mass)

        return projected
rebin_to(target_edges, *, as_storage=None)

Reproject mass onto target_edges by mapping each source bin's centre to the nearest target bin.

Parameters:

Name Type Description Default
target_edges list of ndarray
required
as_storage storage backend

If provided, also accumulate results into this backend. Useful when rebinning into an existing grid in-place.

None

Returns:

Type Description
dict {tuple_index: float}

Mass in target-grid coordinates.

Source code in hypergrid\mixin\rebin_mixin.py
def rebin_to(self, target_edges, *, as_storage=None):
    """
    Reproject mass onto target_edges by mapping each source bin's centre
    to the nearest target bin.

    Parameters
    ----------
    target_edges : list of ndarray
        One increasing edge array per dimension.
    as_storage : storage backend, optional
        If provided, also accumulate results into this backend.
        Useful when rebinning into an existing grid in-place.

    Returns
    -------
    dict  {tuple_index: float}
        Mass in target-grid coordinates.
    """
    src = self.get_edges()
    projected = {}

    for src_idx, mass in self.get_mass().items():
        dest_bins = []
        for d in range(self.dim):
            # Centre of the source bin along this dimension.
            centre = (src[d][src_idx[d]] + src[d][src_idx[d] + 1]) / 2.0
            # Nearest target bin; out-of-range centres clamp to the edge bins.
            pos = np.searchsorted(target_edges[d], centre, side="right") - 1
            dest_bins.append(int(np.clip(pos, 0, len(target_edges[d]) - 2)))
        dest = tuple(dest_bins)
        projected[dest] = projected.get(dest, 0.0) + mass

    if as_storage is not None:
        for dest, mass in projected.items():
            as_storage.add(dest, mass)

    return projected

ComparisonMixin

hypergrid.mixin.comparison_mixin.ComparisonMixin

Statistical divergence metrics between two hypergrids.

Source code in hypergrid\mixin\comparison_mixin.py
class ComparisonMixin:
    """Statistical divergence metrics between two hypergrids."""

    def compare(self, other, method="js", align="union", normalize=True):
        """
        Compute a scalar distance between self and other.

        Parameters
        ----------
        other : hypergrid
        method : {"l1", "kl", "js", "wasserstein"}
            l1          — total variation distance
            kl          — Kullback-Leibler divergence (asymmetric)
            js          — Jensen-Shannon divergence (symmetric, bounded in [0,1])
            wasserstein — Earth Mover's Distance via linear programming
        align : {"union", "self"}
            union — merge both edge sets before comparing (recommended)
            self  — project other onto self's edges
        normalize : bool
            Compare probability densities (True) or raw counts (False).

        Returns
        -------
        float

        Raises
        ------
        ValueError
            If dimensionalities differ or `method`/`align` is unknown.
        RuntimeError
            If the Wasserstein linear program fails to solve (e.g. raw counts
            with unequal total mass make the transport problem infeasible).
        """
        if self.dim != other.dim:
            raise ValueError(f"Dimension mismatch: self has {self.dim}D, other has {other.dim}D.")

        # Wasserstein works on bin centres directly and needs no alignment.
        if method == "wasserstein":
            return self._wasserstein(other, normalize)

        h1, h2 = self._align_histograms(other, align)

        if normalize:
            h1 = _normalize(h1)
            h2 = _normalize(h2)

        if method == "l1":
            return _l1(h1, h2)
        elif method == "kl":
            return _kl(h1, h2)
        elif method == "js":
            return _js(h1, h2)
        else:
            raise ValueError(f"Unknown method: {method!r}. Choose from 'l1', 'kl', 'js', 'wasserstein'.")

    # ------------------------------------------------------------------
    # Alignment
    # ------------------------------------------------------------------

    def _align_histograms(self, other, align):
        """Return (h1, h2) mass dicts expressed on a common bin grid."""
        if align == "self":
            return self.get_mass(), other.rebin_to(self.get_edges())
        elif align == "union":
            edges = self._union_edges(other)
            return self.rebin_to(edges), other.rebin_to(edges)
        else:
            raise ValueError("align must be 'self' or 'union'.")

    def _union_edges(self, other):
        """Per-dimension sorted union of both grids' edge arrays."""
        return [
            np.sort(np.unique(np.concatenate([self.get_edges()[d], other.get_edges()[d]])))
            for d in range(self.dim)
        ]

    # ------------------------------------------------------------------
    # Wasserstein via LP
    # ------------------------------------------------------------------

    def _wasserstein(self, other, normalize=True):
        """
        Earth Mover's Distance between the two binned distributions, solved
        as a dense transportation LP over bin centres.

        Notes
        -----
        Builds dense (n+m) x (n*m) equality constraints, so memory grows
        roughly as O(n*m*(n+m)); intended for modest bin counts.
        """
        coords1, w1 = self._to_bin_centers()
        coords2, w2 = other._to_bin_centers()

        if len(coords1) == 0 or len(coords2) == 0:
            return 0.0

        if normalize:
            w1 = w1 / w1.sum()
            w2 = w2 / w2.sum()

        C = cdist(coords1, coords2)
        n, m = len(w1), len(w2)

        # Vectorised constraint matrices (avoids Python loop over n*m).
        # Row constraints: each source bin must ship its full supply w1[i].
        rows = np.eye(n).repeat(m, axis=1)   # (n, n*m)
        # Column constraints: each target bin must receive its full demand w2[j].
        cols = np.tile(np.eye(m), (1, n))     # (m, n*m)

        res = linprog(
            C.flatten(),
            A_eq=np.vstack([rows, cols]),
            b_eq=np.concatenate([w1, w2]),
            bounds=(0, None),
            method="highs",
        )
        # Bug fix: res.fun is None when the solver fails (e.g. infeasible
        # constraints for raw counts with unequal totals); the old code then
        # crashed with an opaque TypeError in float(). Fail loudly instead.
        if not res.success:
            raise RuntimeError(f"Wasserstein LP did not converge: {res.message}")
        return float(res.fun)

    def _to_bin_centers(self):
        """Return (coords, weights): one bin-centre row per occupied bin."""
        edges = self.get_edges()
        coords, weights = [], []
        for idx, count in self.get_mass().items():
            center = [(edges[d][idx[d]] + edges[d][idx[d] + 1]) / 2.0 for d in range(self.dim)]
            coords.append(center)
            weights.append(count)
        return np.array(coords, dtype=float), np.array(weights, dtype=float)
compare(other, method='js', align='union', normalize=True)

Compute a scalar distance between self and other.

Parameters:

Name Type Description Default
other hypergrid
required
method (l1, kl, js, wasserstein)

l1 — total variation distance kl — Kullback-Leibler divergence (asymmetric) js — Jensen-Shannon divergence (symmetric, bounded in [0,1]) wasserstein — Earth Mover's Distance via linear programming

"l1"
align (union, self)

union — merge both edge sets before comparing (recommended) self — project other onto self's edges

"union"
normalize bool

Compare probability densities (True) or raw counts (False).

True

Returns:

Type Description
float
Source code in hypergrid\mixin\comparison_mixin.py
def compare(self, other, method="js", align="union", normalize=True):
    """
    Compute a scalar distance between self and other.

    Parameters
    ----------
    other : hypergrid
        Grid to compare against; must share self's dimensionality.
    method : {"l1", "kl", "js", "wasserstein"}
        l1          — total variation distance
        kl          — Kullback-Leibler divergence (asymmetric)
        js          — Jensen-Shannon divergence (symmetric, bounded in [0,1])
        wasserstein — Earth Mover's Distance via linear programming
    align : {"union", "self"}
        union — merge both edge sets before comparing (recommended)
        self  — project other onto self's edges
    normalize : bool
        Compare probability densities (True) or raw counts (False).

    Returns
    -------
    float
    """
    # Grids of different dimensionality cannot be aligned bin-for-bin.
    if self.dim != other.dim:
        raise ValueError(f"Dimension mismatch: self has {self.dim}D, other has {other.dim}D.")

    # Wasserstein works on bin centres directly and needs no alignment step.
    if method == "wasserstein":
        return self._wasserstein(other, normalize)

    h1, h2 = self._align_histograms(other, align)

    if normalize:
        h1, h2 = _normalize(h1), _normalize(h2)

    if method == "l1":
        return _l1(h1, h2)
    if method == "kl":
        return _kl(h1, h2)
    if method == "js":
        return _js(h1, h2)
    raise ValueError(f"Unknown method: {method!r}. Choose from 'l1', 'kl', 'js', 'wasserstein'.")

EmbeddingMixin

hypergrid.mixin.embedding_mixin.EmbeddingMixin

Convert hypergrid mass to a flat, index-ordered probability vector.

Source code in hypergrid\mixin\embedding_mixin.py
class EmbeddingMixin:
    """Convert hypergrid mass to a flat, index-ordered probability vector."""

    def to_vector(self):
        """
        Flatten the histogram into a 1D probability vector ordered by
        ravel_multi_index, so two grids with the same edges produce
        directly comparable vectors.

        Returns
        -------
        ndarray, shape (prod(shape),)  — sums to ~1.
        """
        size = int(np.prod(self.shape))
        flat = np.zeros(size)
        for bin_idx, mass in self.get_mass().items():
            flat[int(np.ravel_multi_index(bin_idx, self.shape))] = mass
        # Epsilon keeps the division defined for an all-empty histogram.
        return flat / (flat.sum() + 1e-12)
to_vector()

Flatten the histogram into a 1D probability vector ordered by ravel_multi_index, so two grids with the same edges produce directly comparable vectors.

Returns:

Type Description
ndarray, shape (prod(shape),) — sums to ~1.
Source code in hypergrid\mixin\embedding_mixin.py
def to_vector(self):
    """
    Flatten the histogram into a 1D probability vector ordered by
    ravel_multi_index, so two grids with the same edges produce
    directly comparable vectors.

    Returns
    -------
    ndarray, shape (prod(shape),)  — sums to ~1.
    """
    flat = np.zeros(int(np.prod(self.shape)))
    for bin_idx, mass in self.get_mass().items():
        flat[int(np.ravel_multi_index(bin_idx, self.shape))] = mass
    # Epsilon keeps the division defined for an all-empty histogram.
    return flat / (flat.sum() + 1e-12)

VisualizationMixin

hypergrid.mixin.visualization_mixin.VisualizationMixin

Plotting methods for single-grid inspection and pairwise comparison.

Source code in hypergrid\mixin\visualization_mixin.py
class VisualizationMixin:
    """Plotting methods for single-grid inspection and pairwise comparison."""

    # ------------------------------------------------------------------
    # Single-grid plots
    # ------------------------------------------------------------------

    def plot_top_bins(self, k=20, ax=None):
        """Horizontal bar chart of the `k` heaviest bins, labelled by index tuple."""
        items = sorted(self.get_mass().items(), key=lambda x: -x[1])[:k]
        labels = [str(idx) for idx, _ in items]
        values = [v for _, v in items]

        # Only manage figure lifecycle (layout/show) when we own the axes.
        standalone = ax is None
        ax = ax or plt.gca()
        # Reversed so the heaviest bin ends up at the top of the chart.
        ax.barh(labels[::-1], values[::-1])
        ax.set_title(f"Top {k} bins by mass")
        if standalone:
            plt.tight_layout()
            plt.show()

    def plot_marginal(self, dim, ax=None, label=None):
        """Plot the normalized marginal distribution along dimension `dim`."""
        edges = self.get_edges()[dim]
        hist = np.zeros(len(edges) - 1)
        # Sum out every other dimension to obtain the 1D marginal.
        for idx, v in self.get_mass().items():
            hist[idx[dim]] += v
        # Epsilon keeps the division defined for an empty histogram.
        hist /= hist.sum() + 1e-12

        standalone = ax is None
        ax = ax or plt.gca()
        ax.stairs(hist, edges, fill=True, alpha=0.7, label=label)
        ax.set_title(f"Marginal (dim {dim})")
        ax.set_xlabel(f"Dimension {dim}")
        ax.set_ylabel("Probability")
        if label:
            ax.legend()
        if standalone:
            plt.tight_layout()
            plt.show()

    def plot_all_marginals(self):
        """One marginal subplot per dimension, laid out side by side."""
        fig, axes = plt.subplots(1, self.dim, figsize=(5 * self.dim, 4))
        # subplots() returns a bare Axes (not an array) when dim == 1.
        axes = [axes] if self.dim == 1 else list(axes)
        for d, ax in enumerate(axes):
            self.plot_marginal(d, ax=ax)
        plt.tight_layout()
        plt.show()

    def plot_joint(self, dim_x, dim_y, ax=None):
        """Heatmap of the normalized joint distribution of dimensions (dim_x, dim_y)."""
        ex, ey = self.get_edges()[dim_x], self.get_edges()[dim_y]
        grid = np.zeros((len(ex) - 1, len(ey) - 1))
        # Sum out all other dimensions into the 2D joint grid.
        for idx, v in self.get_mass().items():
            grid[idx[dim_x], idx[dim_y]] += v
        grid /= grid.sum() + 1e-12

        standalone = ax is None
        ax = ax or plt.gca()
        # Transposed so dim_x runs along the horizontal axis.
        im = ax.imshow(grid.T, origin="lower", aspect="auto")
        plt.colorbar(im, ax=ax, label="Probability")
        ax.set_title(f"Joint ({dim_x}, {dim_y})")
        ax.set_xlabel(f"Dim {dim_x}")
        ax.set_ylabel(f"Dim {dim_y}")
        if standalone:
            plt.tight_layout()
            plt.show()

    # ------------------------------------------------------------------
    # Sampling
    # ------------------------------------------------------------------

    def sample(self, n_samples=2000, rng=None):
        """
        Draw samples from the histogram by treating each bin as a uniform
        distribution over its volume.

        Parameters
        ----------
        n_samples : int
        rng : numpy.random.Generator, optional

        Returns
        -------
        ndarray, shape (n_samples, dim)
        """
        # default_rng accepts None, a seed, or an existing Generator.
        rng = np.random.default_rng(rng)
        mass = self.get_mass()
        if not mass:
            raise ValueError("Histogram is empty; call fit() or update() first.")

        bins = list(mass.keys())
        weights = np.array(list(mass.values()), dtype=float)
        weights /= weights.sum()

        # First pick bins proportional to their mass, then jitter uniformly
        # inside each chosen bin's hyper-rectangle.
        chosen = rng.choice(len(bins), size=n_samples, p=weights)
        edges = self.get_edges()

        samples = np.empty((n_samples, self.dim))
        for i, c in enumerate(chosen):
            idx = bins[c]
            for d in range(self.dim):
                lo, hi = edges[d][idx[d]], edges[d][idx[d] + 1]
                samples[i, d] = rng.uniform(lo, hi)
        return samples

    # ------------------------------------------------------------------
    # UMAP
    # ------------------------------------------------------------------

    def plot_umap(self, n_samples=2000, ax=None, **umap_kwargs):
        """UMAP projection of a single hypergrid."""
        # Lazy import: umap-learn is an optional, heavyweight dependency.
        import umap as _umap
        emb = _umap.UMAP(**umap_kwargs).fit_transform(self.sample(n_samples))

        standalone = ax is None
        ax = ax or plt.gca()
        ax.scatter(emb[:, 0], emb[:, 1], s=5, alpha=0.6)
        ax.set_title("UMAP projection")
        if standalone:
            plt.tight_layout()
            plt.show()

    def compare_umap(self, other, n_samples=2000, **umap_kwargs):
        """UMAP projection with both grids overlaid in different colours."""
        # Lazy import: umap-learn is an optional, heavyweight dependency.
        import umap as _umap
        X1, X2 = self.sample(n_samples), other.sample(n_samples)
        labels = np.array([0] * len(X1) + [1] * len(X2))
        # Fit a single embedding on the pooled samples so both grids share
        # the same projection, then split the scatter by origin label.
        emb = _umap.UMAP(**umap_kwargs).fit_transform(np.vstack([X1, X2]))

        fig, ax = plt.subplots()
        ax.scatter(emb[labels == 0, 0], emb[labels == 0, 1], s=5, alpha=0.6, label="self")
        ax.scatter(emb[labels == 1, 0], emb[labels == 1, 1], s=5, alpha=0.6, label="other")
        ax.legend()
        ax.set_title("UMAP comparison")
        plt.tight_layout()
        plt.show()

    # ------------------------------------------------------------------
    # Comparison plots
    # ------------------------------------------------------------------

    def compare_marginal(self, other, dim, ax=None, rebin=True):
        """
        Overlay marginal distributions for dimension `dim`.

        Parameters
        ----------
        other : hypergrid
        dim : int
        ax : matplotlib Axes, optional
        rebin : bool
            If True (default), project `other` onto self's edges before
            comparing. If False, each grid is plotted on its own native edges.
        """
        standalone = ax is None
        ax = ax or plt.gca()

        if rebin:
            # Common grid: project other's mass onto self's edges first.
            edges = self.get_edges()[dim]
            other_proj = other.rebin_to(self.get_edges())

            h1 = np.zeros(len(edges) - 1)
            h2 = np.zeros(len(edges) - 1)
            for idx, v in self.get_mass().items():
                h1[idx[dim]] += v
            for idx, v in other_proj.items():
                h2[idx[dim]] += v

            h1 /= h1.sum() + 1e-12
            h2 /= h2.sum() + 1e-12

            ax.stairs(h1, edges, fill=True, alpha=0.5, label="self")
            ax.stairs(h2, edges, fill=True, alpha=0.5, label="other")
        else:
            # Native grids: each marginal keeps its own edge array.
            e1 = self.get_edges()[dim]
            e2 = other.get_edges()[dim]

            h1 = np.zeros(len(e1) - 1)
            h2 = np.zeros(len(e2) - 1)
            for idx, v in self.get_mass().items():
                h1[idx[dim]] += v
            for idx, v in other.get_mass().items():
                h2[idx[dim]] += v

            h1 /= h1.sum() + 1e-12
            h2 /= h2.sum() + 1e-12

            ax.stairs(h1, e1, fill=True, alpha=0.5, label="self")
            ax.stairs(h2, e2, fill=True, alpha=0.5, label="other")

        ax.legend()
        ax.set_title(f"Marginal comparison (dim {dim})")
        ax.set_xlabel(f"Dimension {dim}")
        ax.set_ylabel("Probability")
        if standalone:
            plt.tight_layout()
            plt.show()

    # ------------------------------------------------------------------
    # Drift (AdaptiveHypergrid)
    # ------------------------------------------------------------------

    def plot_drift(self):
        """Line plot of the overflow fraction recorded at each rebin event.

        Only meaningful for grids that record `_drift_history` — presumably
        populated by AdaptiveHypergrid during processing (not visible here).
        """
        if not hasattr(self, "_drift_history") or not self._drift_history:
            raise ValueError("No drift history available. Use AdaptiveHypergrid and process data first.")
        plt.plot(self._drift_history, marker="o")
        plt.title("Overflow fraction at each rebin event")
        plt.xlabel("Rebin index")
        plt.ylabel("Overflow fraction")
        plt.tight_layout()
        plt.show()
sample(n_samples=2000, rng=None)

Draw samples from the histogram by treating each bin as a uniform distribution over its volume.

Parameters:

Name Type Description Default
n_samples int
2000
rng Generator
None

Returns:

Type Description
ndarray, shape (n_samples, dim)
Source code in hypergrid\mixin\visualization_mixin.py
def sample(self, n_samples=2000, rng=None):
    """
    Draw samples from the histogram by treating each bin as a uniform
    distribution over its volume.

    Parameters
    ----------
    n_samples : int
        Number of points to draw.
    rng : numpy.random.Generator or seed, optional

    Returns
    -------
    ndarray, shape (n_samples, dim)
    """
    rng = np.random.default_rng(rng)
    mass = self.get_mass()
    if not mass:
        raise ValueError("Histogram is empty; call fit() or update() first.")

    occupied = list(mass.keys())
    probs = np.array(list(mass.values()), dtype=float)
    probs /= probs.sum()

    # First pick bins proportionally to their mass, then jitter each sample
    # uniformly inside its chosen bin's hyper-rectangle.
    picks = rng.choice(len(occupied), size=n_samples, p=probs)
    edges = self.get_edges()

    out = np.empty((n_samples, self.dim))
    for row, pick in enumerate(picks):
        bin_idx = occupied[pick]
        for d in range(self.dim):
            lo = edges[d][bin_idx[d]]
            hi = edges[d][bin_idx[d] + 1]
            out[row, d] = rng.uniform(lo, hi)
    return out
plot_umap(n_samples=2000, ax=None, **umap_kwargs)

UMAP projection of a single hypergrid.

Source code in hypergrid\mixin\visualization_mixin.py
def plot_umap(self, n_samples=2000, ax=None, **umap_kwargs):
    """UMAP projection of a single hypergrid."""
    # Lazy import: umap-learn is an optional, heavyweight dependency.
    import umap as _umap
    # Embed histogram samples (not bin centres) so dense bins dominate the map.
    emb = _umap.UMAP(**umap_kwargs).fit_transform(self.sample(n_samples))

    # Only manage figure lifecycle (layout/show) when we own the axes.
    standalone = ax is None
    ax = ax or plt.gca()
    ax.scatter(emb[:, 0], emb[:, 1], s=5, alpha=0.6)
    ax.set_title("UMAP projection")
    if standalone:
        plt.tight_layout()
        plt.show()
compare_umap(other, n_samples=2000, **umap_kwargs)

UMAP projection with both grids overlaid in different colours.

Source code in hypergrid\mixin\visualization_mixin.py
def compare_umap(self, other, n_samples=2000, **umap_kwargs):
    """UMAP projection with both grids overlaid in different colours."""
    # Lazy import: umap-learn is an optional, heavyweight dependency.
    import umap as _umap
    X1, X2 = self.sample(n_samples), other.sample(n_samples)
    labels = np.array([0] * len(X1) + [1] * len(X2))
    # Fit one embedding on the pooled samples so both grids share the same
    # projection, then split the scatter by origin label.
    emb = _umap.UMAP(**umap_kwargs).fit_transform(np.vstack([X1, X2]))

    fig, ax = plt.subplots()
    ax.scatter(emb[labels == 0, 0], emb[labels == 0, 1], s=5, alpha=0.6, label="self")
    ax.scatter(emb[labels == 1, 0], emb[labels == 1, 1], s=5, alpha=0.6, label="other")
    ax.legend()
    ax.set_title("UMAP comparison")
    plt.tight_layout()
    plt.show()
compare_marginal(other, dim, ax=None, rebin=True)

Overlay marginal distributions for dimension dim.

Parameters:

Name Type Description Default
other hypergrid
required
dim int
required
ax matplotlib Axes
None
rebin bool

If True (default), project other onto self's edges before comparing. If False, each grid is plotted on its own native edges.

True
Source code in hypergrid\mixin\visualization_mixin.py
def compare_marginal(self, other, dim, ax=None, rebin=True):
    """
    Overlay marginal distributions for dimension `dim`.

    Parameters
    ----------
    other : hypergrid
    dim : int
    ax : matplotlib Axes, optional
    rebin : bool
        If True (default), project `other` onto self's edges before
        comparing. If False, each grid is plotted on its own native edges.
    """
    # Only manage figure lifecycle (layout/show) when we own the axes.
    standalone = ax is None
    ax = ax or plt.gca()

    if rebin:
        # Common grid: project other's mass onto self's edges first.
        edges = self.get_edges()[dim]
        other_proj = other.rebin_to(self.get_edges())

        h1 = np.zeros(len(edges) - 1)
        h2 = np.zeros(len(edges) - 1)
        # Sum out every other dimension to get each 1D marginal.
        for idx, v in self.get_mass().items():
            h1[idx[dim]] += v
        for idx, v in other_proj.items():
            h2[idx[dim]] += v

        # Epsilon keeps the division defined for empty histograms.
        h1 /= h1.sum() + 1e-12
        h2 /= h2.sum() + 1e-12

        ax.stairs(h1, edges, fill=True, alpha=0.5, label="self")
        ax.stairs(h2, edges, fill=True, alpha=0.5, label="other")
    else:
        # Native grids: each marginal keeps its own edge array.
        e1 = self.get_edges()[dim]
        e2 = other.get_edges()[dim]

        h1 = np.zeros(len(e1) - 1)
        h2 = np.zeros(len(e2) - 1)
        for idx, v in self.get_mass().items():
            h1[idx[dim]] += v
        for idx, v in other.get_mass().items():
            h2[idx[dim]] += v

        h1 /= h1.sum() + 1e-12
        h2 /= h2.sum() + 1e-12

        ax.stairs(h1, e1, fill=True, alpha=0.5, label="self")
        ax.stairs(h2, e2, fill=True, alpha=0.5, label="other")

    ax.legend()
    ax.set_title(f"Marginal comparison (dim {dim})")
    ax.set_xlabel(f"Dimension {dim}")
    ax.set_ylabel("Probability")
    if standalone:
        plt.tight_layout()
        plt.show()

StatsMixin

hypergrid.mixin.stats_mixin.StatsMixin

Summary statistics computed from the histogram's marginal distributions.

Source code in hypergrid\mixin\stats_mixin.py
class StatsMixin:
    """Summary statistics computed from the histogram's marginal distributions."""

    def describe(self, percentiles=None):
        """
        Summary statistics of each dimension computed from the binned histogram.

        Statistics are derived from the marginal distribution along each
        dimension (all other dimensions summed out). They reflect the binned
        representation, not the original raw data.

        Parameters
        ----------
        percentiles : list of float, optional
            Quantile positions to include, each in [0, 1].
            Default is [0.25, 0.50, 0.75].

        Returns
        -------
        pandas.DataFrame
            Rows: count, mean, std, skewness, kurtosis, min, <percentiles>, max.
            Columns: integer dimension indices 0, 1, …, dim-1.

        Notes
        -----
        Requires pandas (``pip install pandas``).

        - **count** — total mass (sum of all bin counts, equal across dimensions).
        - **mean / std** — probability-weighted mean and population std of bin centres.
        - **skewness** — third standardised central moment (0 for symmetric distributions).
        - **kurtosis** — fourth standardised central moment minus 3 (excess kurtosis;
          0 for a normal distribution).
        - **min / max** — lower / upper edge of the outermost non-empty bin.
        - **percentiles** — linearly interpolated within bins from the marginal CDF.
        """
        # Lazy import keeps pandas an optional dependency of the package.
        try:
            import pandas as pd
        except ImportError as exc:
            raise ImportError(
                "describe() requires pandas.  Install it with: pip install pandas"
            ) from exc

        if percentiles is None:
            percentiles = [0.25, 0.50, 0.75]
        percentiles = sorted(float(p) for p in percentiles)

        edges = self.get_edges()
        mass = self.get_mass()
        columns = {}

        for d in range(self.dim):
            e = np.asarray(edges[d])
            n_bins = len(e) - 1
            centers = (e[:-1] + e[1:]) / 2

            # Marginal along d: sum mass over all other dimensions.
            marg = np.zeros(n_bins)
            for idx, v in mass.items():
                marg[idx[d]] += v

            total = marg.sum()

            # Empty dimension: emit NaN stats but keep the row layout intact.
            if total == 0:
                pct_rows = {_pct_label(p): np.nan for p in percentiles}
                columns[d] = {
                    "count": 0.0, "mean": np.nan, "std": np.nan,
                    "skewness": np.nan, "kurtosis": np.nan,
                    "min": np.nan, **pct_rows, "max": np.nan,
                }
                continue

            # Probability weights over bin centres.
            w = marg / total
            mean = float(np.dot(w, centers))
            deviations = centers - mean
            variance = float(np.dot(w, deviations ** 2))
            std = float(np.sqrt(variance))

            # Standardised central moments; guard against zero-variance (single bin).
            if std > 0:
                skewness = float(np.dot(w, deviations ** 3) / std ** 3)
                kurtosis = float(np.dot(w, deviations ** 4) / std ** 4) - 3.0
            else:
                skewness = 0.0
                kurtosis = 0.0

            # Outermost occupied bins give the observable range.
            nonempty = np.where(marg > 0)[0]
            min_val = float(e[nonempty[0]])
            max_val = float(e[nonempty[-1] + 1])

            # Percentiles: linear interpolation within bins from the marginal CDF.
            cdf = np.cumsum(marg) / total
            pct_rows = {}
            for p in percentiles:
                i = int(np.searchsorted(cdf, p, side="left"))
                i = min(i, n_bins - 1)
                cdf_lo = cdf[i - 1] if i > 0 else 0.0
                cdf_hi = cdf[i]
                # t is the fractional position of p inside bin i's CDF span;
                # fall back to the bin midpoint when the span is degenerate.
                t = (p - cdf_lo) / (cdf_hi - cdf_lo) if cdf_hi > cdf_lo else 0.5
                pct_rows[_pct_label(p)] = float(e[i] + t * (e[i + 1] - e[i]))

            columns[d] = {
                "count": float(total),
                "mean": mean,
                "std": std,
                "skewness": skewness,
                "kurtosis": kurtosis,
                "min": min_val,
                **pct_rows,
                "max": max_val,
            }

        return pd.DataFrame(columns)
describe(percentiles=None)

Summary statistics of each dimension computed from the binned histogram.

Statistics are derived from the marginal distribution along each dimension (all other dimensions summed out). They reflect the binned representation, not the original raw data.

Parameters:

Name Type Description Default
percentiles list of float

Quantile positions to include, each in [0, 1]. Default is [0.25, 0.50, 0.75].

None

Returns:

Type Description
DataFrame

Rows: count, mean, std, skewness, kurtosis, min, &lt;percentiles&gt;, max. Columns: integer dimension indices 0, 1, …, dim-1.

Notes

Requires pandas (pip install pandas).

  • count — total mass (sum of all bin counts, equal across dimensions).
  • mean / std — probability-weighted mean and population std of bin centres.
  • skewness — third standardised central moment (0 for symmetric distributions).
  • kurtosis — fourth standardised central moment minus 3 (excess kurtosis; 0 for a normal distribution).
  • min / max — lower / upper edge of the outermost non-empty bin.
  • percentiles — linearly interpolated within bins from the marginal CDF.
Source code in hypergrid\mixin\stats_mixin.py
def describe(self, percentiles=None):
    """
    Summary statistics of each dimension computed from the binned histogram.

    Statistics are derived from the marginal distribution along each
    dimension (all other dimensions summed out). They reflect the binned
    representation, not the original raw data.

    Parameters
    ----------
    percentiles : list of float, optional
        Quantile positions to include, each in [0, 1].
        Default is [0.25, 0.50, 0.75].

    Returns
    -------
    pandas.DataFrame
        Rows: count, mean, std, skewness, kurtosis, min, <percentiles>, max.
        Columns: integer dimension indices 0, 1, …, dim-1.

    Notes
    -----
    Requires pandas (``pip install pandas``).

    - **count** — total mass (sum of all bin counts, equal across dimensions).
    - **mean / std** — probability-weighted mean and population std of bin centres.
    - **skewness** — third standardised central moment (0 for symmetric distributions).
    - **kurtosis** — fourth standardised central moment minus 3 (excess kurtosis;
      0 for a normal distribution).
    - **min / max** — lower / upper edge of the outermost non-empty bin.
    - **percentiles** — linearly interpolated within bins from the marginal CDF.
    """
    # Lazy import keeps pandas an optional dependency of the package.
    try:
        import pandas as pd
    except ImportError as exc:
        raise ImportError(
            "describe() requires pandas.  Install it with: pip install pandas"
        ) from exc

    if percentiles is None:
        percentiles = [0.25, 0.50, 0.75]
    percentiles = sorted(float(p) for p in percentiles)

    edges = self.get_edges()
    mass = self.get_mass()
    columns = {}

    for d in range(self.dim):
        e = np.asarray(edges[d])
        n_bins = len(e) - 1
        centers = (e[:-1] + e[1:]) / 2

        # Marginal along d: sum mass over all other dimensions.
        marg = np.zeros(n_bins)
        for idx, v in mass.items():
            marg[idx[d]] += v

        total = marg.sum()

        # Empty dimension: emit NaN stats but keep the row layout intact.
        if total == 0:
            pct_rows = {_pct_label(p): np.nan for p in percentiles}
            columns[d] = {
                "count": 0.0, "mean": np.nan, "std": np.nan,
                "skewness": np.nan, "kurtosis": np.nan,
                "min": np.nan, **pct_rows, "max": np.nan,
            }
            continue

        # Probability weights over bin centres.
        w = marg / total
        mean = float(np.dot(w, centers))
        deviations = centers - mean
        variance = float(np.dot(w, deviations ** 2))
        std = float(np.sqrt(variance))

        # Standardised central moments; guard against zero-variance (single bin).
        if std > 0:
            skewness = float(np.dot(w, deviations ** 3) / std ** 3)
            kurtosis = float(np.dot(w, deviations ** 4) / std ** 4) - 3.0
        else:
            skewness = 0.0
            kurtosis = 0.0

        # Outermost occupied bins give the observable range.
        nonempty = np.where(marg > 0)[0]
        min_val = float(e[nonempty[0]])
        max_val = float(e[nonempty[-1] + 1])

        # Percentiles: linear interpolation within bins from the marginal CDF.
        cdf = np.cumsum(marg) / total
        pct_rows = {}
        for p in percentiles:
            i = int(np.searchsorted(cdf, p, side="left"))
            i = min(i, n_bins - 1)
            cdf_lo = cdf[i - 1] if i > 0 else 0.0
            cdf_hi = cdf[i]
            # t is the fractional position of p inside bin i's CDF span;
            # fall back to the bin midpoint when the span is degenerate.
            t = (p - cdf_lo) / (cdf_hi - cdf_lo) if cdf_hi > cdf_lo else 0.5
            pct_rows[_pct_label(p)] = float(e[i] + t * (e[i + 1] - e[i]))

        columns[d] = {
            "count": float(total),
            "mean": mean,
            "std": std,
            "skewness": skewness,
            "kurtosis": kurtosis,
            "min": min_val,
            **pct_rows,
            "max": max_val,
        }

    return pd.DataFrame(columns)