from __future__ import annotations
import math
import sys
from collections import defaultdict
from collections.abc import Iterable, Sequence
from copy import deepcopy
from math import hypot
from typing import Callable
import numpy as np
import scipy.stats
from sortedcollections import ItemSortedDict
from sortedcontainers import SortedDict
from adaptive.learner.learner1D import Learner1D, _get_intervals
from adaptive.notebook_integration import ensure_holoviews
from adaptive.types import Int, Real
from adaptive.utils import assign_defaults, partial_function_from_dataframe
try:
import pandas
with_pandas = True
except ModuleNotFoundError:
with_pandas = False
Point = tuple[int, Real]
Points = list[Point]
__all__: list[str] = ["AverageLearner1D"]
class AverageLearner1D(Learner1D):
    """Learns and predicts a noisy function 'f:ℝ → ℝ'.

    Parameters
    ----------
    function : callable
        The function to learn. Must take a tuple of ``(seed, x)`` and
        return a real number.
    bounds : pair of reals
        The bounds of the interval on which to learn 'function'.
    loss_per_interval : callable, optional
        A function that returns the loss for a single interval of the domain.
        If not provided, then a default is used, which uses the scaled distance
        in the x-y plane as the loss. See the notes of `adaptive.Learner1D`
        for more details.
    delta : float, optional, default 0.2
        This parameter controls the resampling condition. A point is resampled
        if its uncertainty is larger than delta times the smallest neighboring
        interval.
        We strongly recommend ``0 < delta <= 1``.
    alpha : float (0 < alpha < 1), default 0.005
        The true value of the function at x is within the confidence interval
        ``[self.data[x] - self.error[x], self.data[x] + self.error[x]]`` with
        probability ``1-2*alpha``.
        We recommend to keep ``alpha=0.005``.
    neighbor_sampling : float (0 < neighbor_sampling <= 1), default 0.3
        Each new point is initially sampled at least a (neighbor_sampling*100)%
        of the average number of samples of its neighbors.
    min_samples : int (min_samples > 0), default 50
        Minimum number of samples at each point x. Each new point is initially
        sampled at least min_samples times.
    max_samples : int (min_samples < max_samples), default sys.maxsize
        Maximum number of samples at each point x.
    min_error : float (min_error >= 0), default 0
        Minimum size of the confidence intervals. The true value of the
        function at x is within the confidence interval [self.data[x] -
        self.error[x], self.data[x] + self.error[x]] with
        probability 1-2*alpha.
        If self.error[x] < min_error, then x will not be resampled
        anymore, i.e., the smallest confidence interval at x is
        [self.data[x] - min_error, self.data[x] + min_error].
    """

    def __init__(
        self,
        function: Callable[[tuple[int, Real]], Real],
        bounds: tuple[Real, Real],
        loss_per_interval: None
        | (Callable[[Sequence[Real], Sequence[Real]], float]) = None,
        delta: float = 0.2,
        alpha: float = 0.005,
        neighbor_sampling: float = 0.3,
        min_samples: int = 50,
        max_samples: int = sys.maxsize,
        min_error: float = 0,
    ):
        if not (0 < delta <= 1):
            raise ValueError("Learner requires 0 < delta <= 1.")
        if not (0 < alpha <= 1):
            raise ValueError("Learner requires 0 < alpha <= 1.")
        if not (0 < neighbor_sampling <= 1):
            raise ValueError("Learner requires 0 < neighbor_sampling <= 1.")
        # The docstring requires min_samples > 0; the previous check
        # (`min_samples < 0`) silently accepted 0 despite the message.
        if min_samples <= 0:
            raise ValueError("min_samples should be positive.")
        if min_samples > max_samples:
            raise ValueError("max_samples should be larger than min_samples.")
        super().__init__(function, bounds, loss_per_interval)  # type: ignore[arg-type]
        self.delta = delta
        self.alpha = alpha
        self.min_samples = min_samples
        self.min_error = min_error
        self.max_samples = max_samples
        self.neighbor_sampling = neighbor_sampling
        # Contains all samples f(x) for each
        # point x in the form {x0: {0: f_0(x0), 1: f_1(x0), ...}, ...}
        self._data_samples: SortedDict[float, dict[int, Real]] = SortedDict()
        # Contains the number of samples taken
        # at each point x in the form {x0: n0, x1: n1, ...}
        self._number_samples = SortedDict()
        # This set contains the points x that have less than min_samples
        # samples or less than a (neighbor_sampling*100)% of their neighbors
        self._undersampled_points: set[Real] = set()
        # Contains the error in the estimate of the
        # mean at each point x in the form {x0: error(x0), ...}
        self.error: dict[Real, float] = decreasing_dict()
        # Distance between two neighboring points in the
        # form {xi: ((xii-xi)^2 + (yii-yi)^2)^0.5, ...}
        self._distances: dict[Real, float] = decreasing_dict()
        # {xii: error[xii]/min(_distances[xi], _distances[xii], ...}
        self.rescaled_error: ItemSortedDict[Real, float] = decreasing_dict()
[docs] def new(self) -> AverageLearner1D:
"""Create a copy of `~adaptive.AverageLearner1D` without the data."""
return AverageLearner1D(
self.function,
self.bounds,
self.loss_per_interval, # type: ignore[arg-type]
self.delta,
self.alpha,
self.neighbor_sampling,
self.min_samples,
self.max_samples,
self.min_error,
)
@property
def nsamples(self) -> int:
"""Returns the total number of samples"""
return sum(self._number_samples.values())
@property
def min_samples_per_point(self) -> int:
if not self._number_samples:
return 0
return min(self._number_samples.values())
[docs] def to_numpy(self, mean: bool = False) -> np.ndarray:
if mean:
return super().to_numpy()
else:
return np.array(
[
(seed, x, *np.atleast_1d(y))
for x, seed_y in self._data_samples.items()
for seed, y in seed_y.items()
]
)
[docs] def to_dataframe( # type: ignore[override]
self,
mean: bool = False,
with_default_function_args: bool = True,
function_prefix: str = "function.",
seed_name: str = "seed",
x_name: str = "x",
y_name: str = "y",
) -> pandas.DataFrame:
"""Return the data as a `pandas.DataFrame`.
Parameters
----------
with_default_function_args : bool, optional
Include the ``learner.function``'s default arguments as a
column, by default True
function_prefix : str, optional
Prefix to the ``learner.function``'s default arguments' names,
by default "function."
seed_name : str, optional
Name of the ``seed`` parameter, by default "seed"
x_name : str, optional
Name of the ``x`` parameter, by default "x"
y_name : str, optional
Name of the output value, by default "y"
Returns
-------
pandas.DataFrame
Raises
------
ImportError
If `pandas` is not installed.
"""
if not with_pandas:
raise ImportError("pandas is not installed.")
if mean:
data: list[tuple[Real, Real]] = sorted(self.data.items())
columns = [x_name, y_name]
else:
data: list[tuple[int, Real, Real]] = [ # type: ignore[no-redef]
(seed, x, y)
for x, seed_y in sorted(self._data_samples.items())
for seed, y in sorted(seed_y.items())
]
columns = [seed_name, x_name, y_name]
df = pandas.DataFrame(data, columns=columns)
df.attrs["inputs"] = [seed_name, x_name]
df.attrs["output"] = y_name
if with_default_function_args:
assign_defaults(self.function, df, function_prefix)
return df
[docs] def load_dataframe( # type: ignore[override]
self,
df: pandas.DataFrame,
with_default_function_args: bool = True,
function_prefix: str = "function.",
seed_name: str = "seed",
x_name: str = "x",
y_name: str = "y",
):
"""Load data from a `pandas.DataFrame`.
If ``with_default_function_args`` is True, then ``learner.function``'s
default arguments are set (using `functools.partial`) from the values
in the `pandas.DataFrame`.
Parameters
----------
df : pandas.DataFrame
The data to load.
with_default_function_args : bool, optional
The ``with_default_function_args`` used in ``to_dataframe()``,
by default True
function_prefix : str, optional
The ``function_prefix`` used in ``to_dataframe``, by default "function."
seed_name : str, optional
The ``seed_name`` used in ``to_dataframe``, by default "seed"
x_name : str, optional
The ``x_name`` used in ``to_dataframe``, by default "x"
y_name : str, optional
The ``y_name`` used in ``to_dataframe``, by default "y"
"""
# Were using zip instead of df[[seed_name, x_name]].values because that will
# make the seeds into floats
seed_x = list(zip(df[seed_name].values.tolist(), df[x_name].values.tolist()))
self.tell_many(seed_x, df[y_name].values)
if with_default_function_args:
self.function = partial_function_from_dataframe(
self.function, df, function_prefix
)
[docs] def ask(self, n: int, tell_pending: bool = True) -> tuple[Points, list[float]]: # type: ignore[override]
"""Return 'n' points that are expected to maximally reduce the loss."""
# If some point is undersampled, resample it
if len(self._undersampled_points):
x = next(iter(self._undersampled_points))
points, loss_improvements = self._ask_for_more_samples(x, n)
# If less than 2 points were sampled, sample a new one
elif len(self.data) <= 1:
# TODO: if `n` is very large, we should suggest a few different points.
points, loss_improvements = self._ask_for_new_point(n)
# Else, check the resampling condition
else:
if len(self.rescaled_error):
# This is in case rescaled_error is empty (e.g. when sigma=0)
x, resc_error = self.rescaled_error.peekitem(0)
# Resampling condition
if resc_error > self.delta:
points, loss_improvements = self._ask_for_more_samples(x, n)
else:
points, loss_improvements = self._ask_for_new_point(n)
else:
points, loss_improvements = self._ask_for_new_point(n)
if tell_pending:
for p in points:
self.tell_pending(p)
return points, loss_improvements
def _ask_for_more_samples(self, x: Real, n: int) -> tuple[Points, list[float]]:
"""When asking for n points, the learner returns n times an existing point
to be resampled, since in general n << min_samples and this point will
need to be resampled many more times"""
n_existing = self._number_samples.get(x, 0)
points = [(seed + n_existing, x) for seed in range(n)]
xl, xr = self.neighbors_combined[x]
loss_left = self.losses_combined.get((xl, x), float("inf"))
loss_right = self.losses_combined.get((x, xr), float("inf"))
loss = (loss_left + loss_right) / 2
if math.isinf(loss):
loss_improvement = float("inf")
else:
loss_improvement = loss - loss * np.sqrt(n_existing) / np.sqrt(
n_existing + n
)
loss_improvements = [loss_improvement / n] * n
return points, loss_improvements
def _ask_for_new_point(self, n: int) -> tuple[Points, list[float]]:
"""When asking for n new points, the learner returns n times a single
new point, since in general n << min_samples and this point will need
to be resampled many more times"""
points, (loss_improvement,) = self._ask_points_without_adding(1)
seed_points = list(zip(range(n), n * points))
loss_improvements = [loss_improvement / n] * n
return seed_points, loss_improvements # type: ignore[return-value]
[docs] def tell_pending(self, seed_x: Point) -> None: # type: ignore[override]
_, x = seed_x
self.pending_points.add(seed_x)
if x not in self.data:
self._update_neighbors(x, self.neighbors_combined)
self._update_losses(x, real=False)
[docs] def tell(self, seed_x: Point, y: Real) -> None: # type: ignore[override]
seed, x = seed_x
if y is None:
raise TypeError(
"Y-value may not be None, use learner.tell_pending(x)"
"to indicate that this value is currently being calculated"
)
if x not in self.data:
self._update_data(x, y, "new")
self._update_data_structures(seed_x, y, "new")
elif seed not in self._data_samples[x]: # check if the seed is new
self._update_data(x, y, "resampled")
self._update_data_structures(seed_x, y, "resampled")
self.pending_points.discard(seed_x)
    def _update_rescaled_error_in_mean(self, x: Real, point_type: str) -> None:
        """Updates ``self.rescaled_error``.

        A point's rescaled error is its error divided by the smallest of
        the distances to its neighbors; `ask` resamples the point with the
        largest rescaled error when it exceeds ``delta``.

        Parameters
        ----------
        x : Real
            The point whose sample set (and hence neighborhood) changed.
        point_type : str
            Must be either "new" or "resampled".
        """
        # Update neighbors
        x_left, x_right = self.neighbors[x]
        dists = self._distances
        if x_left is None and x_right is None:
            # x is the only point: nothing to rescale against.
            return
        if x_left is None:
            # x is the leftmost point: only the right interval exists.
            d_left = dists[x]
        else:
            d_left = dists[x_left]
            if x_left in self.rescaled_error:
                # The (x_left, x) distance changed, so x_left's rescaled
                # error must be renormalized too.
                xll = self.neighbors[x_left][0]
                norm = dists[x_left] if xll is None else min(dists[xll], dists[x_left])
                self.rescaled_error[x_left] = self.error[x_left] / norm
        if x_right is None:
            # x is the rightmost point.
            # NOTE(review): this uses dists[x_left] (the left interval of
            # x_left) rather than dists[x] — confirm this asymmetry with
            # the left-edge branch above is intended.
            d_right = dists[x_left]
        else:
            d_right = dists[x]
            if x_right in self.rescaled_error:
                # Renormalize the right neighbor for the same reason.
                xrr = self.neighbors[x_right][1]
                norm = dists[x] if xrr is None else min(dists[x], dists[x_right])
                self.rescaled_error[x_right] = self.error[x_right] / norm
        # Update x
        if point_type == "resampled":
            # Brand-new points keep their infinite error (set by
            # _update_data_structures); only resampled points get a
            # finite rescaled error here.
            norm = min(d_left, d_right)
            self.rescaled_error[x] = self.error[x] / norm
def _update_data(self, x: Real, y: Real, point_type: str) -> None:
if point_type == "new":
self.data[x] = y
elif point_type == "resampled":
n = len(self._data_samples[x])
new_average = self.data[x] * n / (n + 1) + y / (n + 1)
self.data[x] = new_average
    def _update_data_structures(self, seed_x: Point, y: Real, point_type: str) -> None:
        """Bookkeeping after a sample arrives: updates per-seed samples,
        neighbor maps, scale, losses, sample counts, the undersampled set,
        errors, distances and rescaled errors.

        Parameters
        ----------
        seed_x : Point
            The ``(seed, x)`` pair of the sample.
        y : Real
            The sampled function value.
        point_type : str
            Must be either "new" or "resampled".
        """
        seed, x = seed_x
        if point_type == "new":
            self._data_samples[x] = {seed: y}
            # Out-of-bounds points are stored but excluded from the
            # neighbor/loss machinery.
            if not self.bounds[0] <= x <= self.bounds[1]:
                return
            self._update_neighbors(x, self.neighbors_combined)
            self._update_neighbors(x, self.neighbors)
            self._update_scale(x, y)
            self._update_losses(x, real=True)
            # If the scale has increased enough, recompute all losses.
            if self._scale[1] > self._recompute_losses_factor * self._oldscale[1]:
                for interval in reversed(self.losses):
                    self._update_interpolated_loss_in_interval(*interval)
                self._oldscale = deepcopy(self._scale)
            self._number_samples[x] = 1
            # A single sample gives no error estimate yet: mark the point
            # as undersampled with infinite (rescaled) error so it is
            # prioritized for resampling.
            self._undersampled_points.add(x)
            self.error[x] = np.inf
            self.rescaled_error[x] = np.inf
            self._update_distances(x)
            self._update_rescaled_error_in_mean(x, "new")
        elif point_type == "resampled":
            self._data_samples[x][seed] = y
            ns = self._number_samples
            ns[x] += 1
            n = ns[x]
            # A point leaves the undersampled set once it has min_samples
            # samples AND at least neighbor_sampling times the average
            # sample count of its (real) neighbors.
            if (x in self._undersampled_points) and (n >= self.min_samples):
                x_left, x_right = self.neighbors[x]
                if x_left is not None and x_right is not None:
                    nneighbor = (ns[x_left] + ns[x_right]) / 2
                elif x_left is not None:
                    nneighbor = ns[x_left]
                elif x_right is not None:
                    nneighbor = ns[x_right]
                else:
                    nneighbor = 0
                if n > self.neighbor_sampling * nneighbor:
                    self._undersampled_points.discard(x)
            # We compute the error in the estimation of the mean as
            # the std of the mean multiplied by a t-Student factor to ensure that
            # the mean value lies within the correct interval of confidence
            y_avg = self.data[x]
            ys = self._data_samples[x].values()
            self.error[x] = self._calc_error_in_mean(ys, y_avg, n)
            self._update_distances(x)
            self._update_rescaled_error_in_mean(x, "resampled")
            # Accurate enough (or at the sample cap): stop considering x
            # for resampling.
            if self.error[x] <= self.min_error or n >= self.max_samples:
                self.rescaled_error.pop(x, None)
            # We also need to update scale and losses
            self._update_scale(x, y)
            self._update_losses_resampling(x, real=True)
            # If the scale has increased enough, recompute all losses.
            # We only update the scale considering resampled points, since new
            # points are more likely to be outliers.
            if self._scale[1] > self._recompute_losses_factor * self._oldscale[1]:
                for interval in reversed(self.losses):
                    self._update_interpolated_loss_in_interval(*interval)
                self._oldscale = deepcopy(self._scale)
def _update_distances(self, x: Real) -> None:
x_left, x_right = self.neighbors[x]
y = self.data[x]
if x_left is not None:
self._distances[x_left] = hypot((x - x_left), (y - self.data[x_left]))
if x_right is not None:
self._distances[x] = hypot((x_right - x), (self.data[x_right] - y))
    def _update_losses_resampling(self, x: Real, real=True) -> None:
        """Update all losses that depend on x, whenever the new point is a re-sampled point."""
        # (x_left, x_right) are the "real" neighbors of 'x'.
        x_left, x_right = self._find_neighbors(x, self.neighbors)
        # (a, b) are the neighbors of the combined interpolated
        # and "real" intervals.
        a, b = self._find_neighbors(x, self.neighbors_combined)
        if real:
            # x has real data: recompute every interval whose loss
            # involves x (including nth-neighbor terms).
            for ival in _get_intervals(x, self.neighbors, self.nth_neighbors):
                self._update_interpolated_loss_in_interval(*ival)
        elif x_left is not None and x_right is not None:
            # 'x' happens to be in between two real points,
            # so we can interpolate the losses.
            dx = x_right - x_left
            loss = self.losses[x_left, x_right]
            self.losses_combined[a, x] = (x - a) * loss / dx
            self.losses_combined[x, b] = (b - x) * loss / dx
        # (no real point left of x) or (no real point right of a)
        left_loss_is_unknown = (x_left is None) or (not real and x_right is None)
        if (a is not None) and left_loss_is_unknown:
            # Unknown losses are infinite so the interval gets explored.
            self.losses_combined[a, x] = float("inf")
        # (no real point right of x) or (no real point left of b)
        right_loss_is_unknown = (x_right is None) or (not real and x_left is None)
        if (b is not None) and right_loss_is_unknown:
            self.losses_combined[x, b] = float("inf")
def _calc_error_in_mean(self, ys: Iterable[Real], y_avg: Real, n: int) -> float:
variance_in_mean = sum((y - y_avg) ** 2 for y in ys) / (n - 1)
t_student = scipy.stats.t.ppf(1 - self.alpha, df=n - 1)
return t_student * (variance_in_mean / n) ** 0.5
[docs] def tell_many( # type: ignore[override]
self, xs: Points | np.ndarray, ys: Sequence[Real] | np.ndarray
) -> None:
# Check that all x are within the bounds
# TODO: remove this requirement, all other learners add the data
# but ignore it going forward.
if not np.prod([x >= self.bounds[0] and x <= self.bounds[1] for _, x in xs]):
raise ValueError(
"x value out of bounds, "
"remove x or enlarge the bounds of the learner"
)
# Create a mapping of points to a list of samples
mapping: defaultdict[Real, defaultdict[Int, Real]] = defaultdict(
lambda: defaultdict(dict)
)
for (seed, x), y in zip(xs, ys):
mapping[x][seed] = y
for x, seed_y_mapping in mapping.items():
if len(seed_y_mapping) == 1:
seed, y = list(seed_y_mapping.items())[0]
self.tell((seed, x), y)
elif len(seed_y_mapping) > 1:
# If we stored more than 1 y-value for the previous x,
# use a more efficient routine to tell many samples
# simultaneously, before we move on to a new x
self.tell_many_at_point(x, seed_y_mapping)
[docs] def tell_many_at_point(self, x: Real, seed_y_mapping: dict[int, Real]) -> None:
"""Tell the learner about many samples at a certain location x.
Parameters
----------
x : float
Value from the function domain.
seed_y_mapping : Dict[int, Real]
Dictionary of ``seed`` -> ``y`` at ``x``.
"""
# Check x is within the bounds
if not np.prod(x >= self.bounds[0] and x <= self.bounds[1]):
raise ValueError(
"x value out of bounds, "
"remove x or enlarge the bounds of the learner"
)
# If x is a new point:
if x not in self.data:
# we make a copy because we don't want to modify the original dict
seed_y_mapping = seed_y_mapping.copy()
seed = next(iter(seed_y_mapping))
y = seed_y_mapping.pop(seed)
self._update_data(x, y, "new")
self._update_data_structures((seed, x), y, "new")
ys = np.array(list(seed_y_mapping.values()))
# If x is not a new point or if there were more than 1 sample in ys:
if len(ys) > 0:
self._data_samples[x].update(seed_y_mapping)
n = len(ys) + self._number_samples[x]
self.data[x] = (
np.mean(ys) * len(ys) + self.data[x] * self._number_samples[x]
) / n
self._number_samples[x] = n
# `self._update_data(x, y, "new")` included the point
# in _undersampled_points. We remove it if there are
# more than min_samples samples, disregarding neighbor_sampling.
if n > self.min_samples:
self._undersampled_points.discard(x)
self.error[x] = self._calc_error_in_mean(
self._data_samples[x].values(), self.data[x], n
)
self._update_distances(x)
self._update_rescaled_error_in_mean(x, "resampled")
if self.error[x] <= self.min_error or n >= self.max_samples:
self.rescaled_error.pop(x, None)
self._update_scale(x, min(self._data_samples[x].values()))
self._update_scale(x, max(self._data_samples[x].values()))
self._update_losses_resampling(x, real=True)
if self._scale[1] > self._recompute_losses_factor * self._oldscale[1]:
for interval in reversed(self.losses):
self._update_interpolated_loss_in_interval(*interval)
self._oldscale = deepcopy(self._scale)
    def _get_data(self) -> dict[Real, dict[Int, Real]]:  # type: ignore[override]
        # Used for (de)serialization: expose the full per-seed sample
        # mapping rather than only the means stored in ``self.data``.
        return self._data_samples
def _set_data(self, data: dict[Real, dict[Int, Real]]) -> None: # type: ignore[override]
if data:
for x, samples in data.items():
self.tell_many_at_point(x, samples)
[docs] def plot(self):
"""Returns a plot of the evaluated data with error bars.
This is only implemented for scalar functions, i.e., it requires ``vdim=1``.
Returns
-------
plot : `holoviews.element.Scatter * holoviews.element.ErrorBars *
holoviews.element.Path`
Plot of the evaluated data.
"""
hv = ensure_holoviews()
if not self.data:
p = hv.Scatter([]) * hv.ErrorBars([]) * hv.Path([])
elif not self.vdim > 1:
xs, ys = zip(*sorted(self.data.items()))
scatter = hv.Scatter(self.data)
error = hv.ErrorBars([(x, self.data[x], self.error[x]) for x in self.data])
line = hv.Path((xs, ys))
p = scatter * error * line
else:
raise Exception("plot() not implemented for vector functions.")
# Plot with 5% empty margins such that the boundary points are visible
margin = 0.05 * (self.bounds[1] - self.bounds[0])
plot_bounds = (self.bounds[0] - margin, self.bounds[1] + margin)
return p.redim(x={"range": plot_bounds})
def decreasing_dict() -> ItemSortedDict:
    """Return an ``ItemSortedDict`` whose items are kept ordered from the
    largest to the smallest value (sort key is the negated value)."""
    return ItemSortedDict(lambda key, value: -value, SortedDict())