Source code for qiskit_algorithms.optimizers.gradient_descent

# This code is part of a Qiskit project.
#
# (C) Copyright IBM 2021, 2023.
#
# This code is licensed under the Apache License, Version 2.0. You may
# obtain a copy of this license in the LICENSE.txt file in the root directory
# of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
#
# Any modifications or derivative works of this code must retain this
# copyright notice, and modified files need to carry a notice indicating
# that they have been altered from the originals.

"""A standard gradient descent optimizer."""
from __future__ import annotations

from collections.abc import Generator
from dataclasses import dataclass, field
from typing import Any, Callable, SupportsFloat
import numpy as np
from .optimizer import Optimizer, OptimizerSupportLevel, OptimizerResult, POINT
from .steppable_optimizer import AskData, TellData, OptimizerState, SteppableOptimizer
from .optimizer_utils import LearningRate

CALLBACK = Callable[[int, np.ndarray, float, SupportsFloat], None]


@dataclass
class GradientDescentState(OptimizerState):
    """State of :class:`~.GradientDescent`.

    Dataclass with all the information of an optimizer plus the learning_rate and the stepsize.
    """

    stepsize: float | None
    """Norm of the gradient on the last step."""

    learning_rate: LearningRate = field(compare=False)
    """Learning rate at the current step of the optimization process.

    It behaves like a generator (use ``next(learning_rate)`` to get the learning rate for the
    next step), but it can also return the current learning rate with ``learning_rate.current``.
    """
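
# Illustrative sketch (not part of the library): how the ``learning_rate`` field of the state
# behaves, assuming ``LearningRate`` (imported above) acts as documented: it wraps a constant,
# list/array or generator factory, is consumed with ``next()`` and caches the last yielded
# value on ``.current``.
def _learning_rate_sketch() -> None:  # hypothetical helper, for illustration only
    rate = LearningRate(learning_rate=0.01)  # a constant learning rate
    eta = next(rate)  # advance the generator: yields 0.01
    assert eta == 0.01
    assert rate.current == eta  # the most recently yielded value is available as ``.current``
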

class GradientDescent(SteppableOptimizer):
    r"""The gradient descent minimization routine.

    For a function :math:`f` and an initial point :math:`\vec\theta_0`, the standard (or "vanilla")
    gradient descent method is an iterative scheme to find the minimum :math:`\vec\theta^*` of
    :math:`f` by updating the parameters in the direction of the negative gradient of :math:`f`

    .. math::

        \vec\theta_{n+1} = \vec\theta_{n} - \eta_n \vec\nabla f(\vec\theta_{n}),

    for a small learning rate :math:`\eta_n > 0`.

    You can either provide the analytic gradient :math:`\vec\nabla f` as ``jac``
    in the :meth:`~.minimize` method, or, if you do not provide it, use a finite difference
    approximation of the gradient. To adapt the size of the perturbation in the finite difference
    gradients, set the ``perturbation`` property in the initializer.

    This optimizer supports a callback function. If provided in the initializer, the optimizer
    will call the callback in each iteration with the following information in this order:
    current number of function values, current parameters, current function value, norm of current
    gradient.

    Examples:

        A minimal example that uses finite difference gradients with the default perturbation
        of 0.01 and the default learning rate of 0.01.

        .. code-block:: python

            import numpy as np

            from qiskit_algorithms.optimizers import GradientDescent

            def f(x):
                return (np.linalg.norm(x) - 1) ** 2

            initial_point = np.array([1, 0.5, -0.2])

            optimizer = GradientDescent(maxiter=100)

            result = optimizer.minimize(fun=f, x0=initial_point)

            print(f"Found minimum {result.x} at a value "
                  f"of {result.fun} using {result.nfev} evaluations.")

        An example where the learning rate is an iterator and we supply the analytic gradient.
        Note how much faster this converges (i.e. fewer ``nfev``) compared to the previous
        example.

        .. code-block:: python

            import numpy as np

            from qiskit_algorithms.optimizers import GradientDescent

            def learning_rate():
                power = 0.6
                constant_coeff = 0.1

                def power_law():
                    n = 0
                    while True:
                        yield constant_coeff * (n ** power)
                        n += 1

                return power_law()

            def f(x):
                return (np.linalg.norm(x) - 1) ** 2

            def grad_f(x):
                return 2 * (np.linalg.norm(x) - 1) * x / np.linalg.norm(x)

            initial_point = np.array([1, 0.5, -0.2])

            optimizer = GradientDescent(maxiter=100, learning_rate=learning_rate)

            result = optimizer.minimize(fun=f, jac=grad_f, x0=initial_point)

            print(f"Found minimum {result.x} at a value "
                  f"of {result.fun} using {result.nfev} evaluations.")

        Another example, where the evaluation of the function has a chance of failing. The user,
        with specific knowledge about their function, can catch these errors and handle them before
        passing the result to the optimizer.

        .. code-block:: python

            import random

            import numpy as np

            from qiskit_algorithms.optimizers import GradientDescent
            from qiskit_algorithms.optimizers.steppable_optimizer import TellData

            def objective(x):
                if random.choice([True, False]):
                    return None
                else:
                    return (np.linalg.norm(x) - 1) ** 2

            def grad(x):
                if random.choice([True, False]):
                    return None
                else:
                    return 2 * (np.linalg.norm(x) - 1) * x / np.linalg.norm(x)

            initial_point = np.random.normal(0, 1, size=(100,))

            optimizer = GradientDescent(maxiter=20)
            optimizer.start(x0=initial_point, fun=objective, jac=grad)

            while optimizer.continue_condition():
                ask_data = optimizer.ask()
                evaluated_gradient = None

                while evaluated_gradient is None:
                    evaluated_gradient = grad(ask_data.x_jac)
                    optimizer.state.njev += 1

                optimizer.state.nit += 1

                tell_data = TellData(eval_jac=evaluated_gradient)
                optimizer.tell(ask_data=ask_data, tell_data=tell_data)

            result = optimizer.create_result()

        Users that aren't dealing with complicated functions and who are more familiar with
        step-by-step optimization algorithms can use the :meth:`~.step` method, which wraps the
        :meth:`~.ask` and :meth:`~.tell` methods. In the same spirit, the method :meth:`~.minimize`
        will optimize the function and return the result.

        To see other libraries that use this interface one can visit:
        https://optuna.readthedocs.io/en/stable/tutorial/20_recipes/009_ask_and_tell.html

    """

    def __init__(
        self,
        maxiter: int = 100,
        learning_rate: float
        | list[float]
        | np.ndarray
        | Callable[[], Generator[float, None, None]] = 0.01,
        tol: float = 1e-7,
        callback: CALLBACK | None = None,
        perturbation: float | None = None,
    ) -> None:
        """
        Args:
            maxiter: The maximum number of iterations.
            learning_rate: A constant, list, array or factory of generators yielding learning rates
                for the parameter updates. See the docstring for an example.
            tol: If the norm of the parameter update is smaller than this threshold, the
                optimizer has converged.
            callback: A callback function called in each iteration with, in this order: the current
                number of function evaluations, the current parameters, the current function value
                and the norm of the current gradient.
            perturbation: If no gradient is passed to :meth:`~.minimize` the gradient is
                approximated with a forward finite difference scheme using a perturbation of size
                ``perturbation`` (defaults to 1e-2 if required). Ignored when an explicit function
                for the gradient is given.

        Raises:
            ValueError: If ``learning_rate`` is an array and its length is less than ``maxiter``.
        """
        super().__init__(maxiter=maxiter)
        self.callback = callback
        self._state: GradientDescentState | None = None
        self._perturbation = perturbation
        self._tol = tol
        # if learning rate is an array, check it is sufficiently long.
        if isinstance(learning_rate, (list, np.ndarray)):
            if len(learning_rate) < maxiter:
                raise ValueError(
                    f"Length of learning_rate ({len(learning_rate)}) "
                    f"is smaller than maxiter ({maxiter})."
                )

        self.learning_rate = learning_rate

    @property  # type: ignore[override]
    def state(self) -> GradientDescentState:
        """Return the current state of the optimizer."""
        return self._state

    @state.setter
    def state(self, state: GradientDescentState) -> None:
        """Set the current state of the optimizer."""
        self._state = state

    @property
    def tol(self) -> float:
        """Returns the tolerance of the optimizer.

        Any step with smaller stepsize than this value will stop the optimization.
        """
        return self._tol

    @tol.setter
    def tol(self, tol: float) -> None:
        """Set the tolerance."""
        self._tol = tol

    @property
    def perturbation(self) -> float | None:
        """Returns the perturbation.

        This is the perturbation used in the finite difference gradient approximation.
""" return self._perturbation @perturbation.setter def perturbation(self, perturbation: float | None) -> None: """Set the perturbation.""" self._perturbation = perturbation def _callback_wrapper(self) -> None: """ Wraps the callback function to accommodate GradientDescent. Will call :attr:`~.callback` and pass the following arguments: current number of function values, current parameters, current function value, norm of current gradient. """ if self.callback is not None: self.callback( self.state.nfev, self.state.x, # type: ignore[arg-type] self.state.fun(self.state.x), self.state.stepsize, ) @property def settings(self) -> dict[str, Any]: # if learning rate or perturbation are custom iterators expand them learning_rate = self.learning_rate if callable(self.learning_rate): iterator = self.learning_rate() learning_rate = np.array([next(iterator) for _ in range(self.maxiter)]) return { "maxiter": self.maxiter, "tol": self.tol, "learning_rate": learning_rate, "perturbation": self.perturbation, "callback": self.callback, }

    def ask(self) -> AskData:
        """Returns an object with the data needed to evaluate the gradient.

        If this object contains a gradient function the gradient can be evaluated directly.
        Otherwise approximate it with a finite difference scheme.
        """
        return AskData(
            x_jac=self.state.x,
        )

    def tell(self, ask_data: AskData, tell_data: TellData) -> None:
        """
        Updates :attr:`~.GradientDescentState.x` by an amount proportional to the learning rate and
        the value of the gradient at that point.

        Args:
            ask_data: The data used to evaluate the function.
            tell_data: The data from the function evaluation.

        Raises:
            ValueError: If the gradient passed doesn't have the right dimension.
        """
        if np.shape(self.state.x) != np.shape(tell_data.eval_jac):  # type: ignore[arg-type]
            raise ValueError("The gradient does not have the correct dimension")
        self.state.x = self.state.x - next(self.state.learning_rate) * tell_data.eval_jac
        self.state.stepsize = np.linalg.norm(tell_data.eval_jac)  # type: ignore[arg-type,assignment]
        self.state.nit += 1
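
    # Worked example of the update above (illustrative numbers): with
    #   x = [1.0, 0.5], eval_jac = [0.2, -0.4] and next(learning_rate) = 0.01,
    # the new point is
    #   x - 0.01 * eval_jac = [1.0 - 0.002, 0.5 + 0.004] = [0.998, 0.504],
    # and the recorded stepsize is the gradient norm ||eval_jac|| = sqrt(0.2**2 + 0.4**2) ≈ 0.447.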

    def evaluate(self, ask_data: AskData) -> TellData:
        """Evaluates the gradient.

        It does so either by evaluating an analytic gradient or by approximating it with a
        finite difference scheme. It will either add ``1`` to the number of gradient evaluations
        or add ``N + 1`` to the number of function evaluations (where ``N`` is the dimension of
        the gradient).

        Args:
            ask_data: It contains the point where the gradient is to be evaluated and the gradient
                function or, in its absence, the objective function to perform a finite difference
                approximation.

        Returns:
            The data containing the gradient evaluation.
        """
        if self.state.jac is None:
            eps = 0.01 if (self.perturbation is None) else self.perturbation
            grad = Optimizer.gradient_num_diff(
                x_center=ask_data.x_jac,
                f=self.state.fun,
                epsilon=eps,
                max_evals_grouped=self._max_evals_grouped,
            )
            self.state.nfev += 1 + len(ask_data.x_jac)  # type: ignore[arg-type]
        else:
            grad = self.state.jac(ask_data.x_jac)  # type: ignore[arg-type]
            self.state.njev += 1

        return TellData(eval_jac=grad)
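
    # Sketch of the finite-difference branch above: for each coordinate ``i`` the gradient is
    # approximated by a forward difference,
    #   grad[i] ≈ (f(x + eps * e_i) - f(x)) / eps,
    # which costs one evaluation of ``f`` at ``x`` plus one per dimension, hence ``nfev`` grows
    # by ``N + 1``, while the analytic branch only increments ``njev`` by one.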

    def create_result(self) -> OptimizerResult:
        """Creates a result of the optimization process.

        This result contains the best point, the best function value, the number of
        function/gradient evaluations and the number of iterations.

        Returns:
            The result of the optimization process.
        """
        result = OptimizerResult()
        result.x = self.state.x
        result.fun = self.state.fun(self.state.x)
        result.nfev = self.state.nfev
        result.njev = self.state.njev
        result.nit = self.state.nit
        return result

    def start(
        self,
        fun: Callable[[POINT], float],
        x0: POINT,
        jac: Callable[[POINT], POINT] | None = None,
        bounds: list[tuple[float, float]] | None = None,
    ) -> None:
        self.state = GradientDescentState(
            fun=fun,
            jac=jac,
            x=np.asarray(x0),
            nit=0,
            nfev=0,
            njev=0,
            learning_rate=LearningRate(learning_rate=self.learning_rate),
            stepsize=None,
        )

    def continue_condition(self) -> bool:
        """
        Condition that indicates the optimization process should come to an end.

        When the stepsize is smaller than the tolerance, the optimization process is considered
        finished.

        Returns:
            ``True`` if the optimization process should continue, ``False`` otherwise.
        """
        if self.state.stepsize is None:
            return True
        else:
            return (self.state.stepsize > self.tol) and super().continue_condition()
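
    # For example: with the default ``tol=1e-7``, a step whose gradient norm (``stepsize``) is
    # 1e-8 stops the optimization, while the very first call (``stepsize is None``) always
    # continues so that at least one step is taken.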

    def get_support_level(self):
        """Get the support level dictionary."""
        return {
            "gradient": OptimizerSupportLevel.supported,
            "bounds": OptimizerSupportLevel.ignored,
            "initial_point": OptimizerSupportLevel.required,
        }
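

if __name__ == "__main__":
    # Illustrative demo only, not part of the library module: minimize a simple objective with a
    # callback that logs, in order, the number of function evaluations, the current parameters,
    # the current function value and the norm of the current gradient. Because this file uses
    # relative imports, run it as ``python -m qiskit_algorithms.optimizers.gradient_descent``.

    def _demo_objective(x: np.ndarray) -> float:
        # distance of ||x|| from 1, squared
        return float((np.linalg.norm(x) - 1) ** 2)

    def _demo_callback(nfev, x, fx, stepsize) -> None:
        print(f"nfev={nfev:4d}  x={np.round(x, 3)}  f(x)={fx:.6f}  |grad|={stepsize}")

    _demo_optimizer = GradientDescent(maxiter=20, callback=_demo_callback)
    _demo_result = _demo_optimizer.minimize(fun=_demo_objective, x0=np.array([1.0, 0.5, -0.2]))
    print(f"Found minimum {_demo_result.x} with value {_demo_result.fun}.")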