Enable numpy typing check
takuseno committed Oct 21, 2023
1 parent f68ad3c commit d7ad45f
Showing 41 changed files with 594 additions and 344 deletions.
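Note: throughout this commit, `np.ndarray` annotations are replaced by an `NDArray` alias imported from `d3rlpy/types.py`, which is not itself part of this excerpt. The following is only a rough sketch of what such an alias typically looks like with `numpy.typing`; the names below are assumptions, not the module's actual contents.

```python
# Sketch only: a plausible definition of the NDArray alias imported as
# `from ...types import NDArray` in the diffs below. d3rlpy/types.py is not
# shown in this commit page, so treat these names as assumptions.
from typing import Any

import numpy as np
import numpy.typing as npt

NDArray = npt.NDArray[Any]                 # ndarray of any dtype
Float32NDArray = npt.NDArray[np.float32]   # hypothetical stricter variant
```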
15 changes: 8 additions & 7 deletions d3rlpy/algos/qlearning/base.py
@@ -46,6 +46,7 @@
sync_optimizer_state,
train_api,
)
from ...types import NDArray
from ..utility import (
assert_action_space_with_dataset,
assert_action_space_with_env,
@@ -243,7 +244,7 @@ def _func(x: torch.Tensor) -> torch.Tensor:
# workaround until version 1.6
self._impl.modules.unfreeze()

def predict(self, x: Observation) -> np.ndarray:
def predict(self, x: Observation) -> NDArray:
"""Returns greedy actions.
.. code-block:: python
@@ -278,9 +279,9 @@ def predict(self, x: Observation) -> np.ndarray:
if self._config.action_scaler:
action = self._config.action_scaler.reverse_transform(action)

return action.cpu().detach().numpy()
return action.cpu().detach().numpy() # type: ignore

def predict_value(self, x: Observation, action: np.ndarray) -> np.ndarray:
def predict_value(self, x: Observation, action: NDArray) -> NDArray:
"""Returns predicted action-values.
.. code-block:: python
@@ -332,9 +333,9 @@ def predict_value(self, x: Observation, action: np.ndarray) -> np.ndarray:

value = self._impl.predict_value(torch_x, torch_action)

return value.cpu().detach().numpy()
return value.cpu().detach().numpy() # type: ignore

def sample_action(self, x: Observation) -> np.ndarray:
def sample_action(self, x: Observation) -> NDArray:
"""Returns sampled actions.
The sampled actions are identical to the output of `predict` method if
@@ -364,7 +365,7 @@ def sample_action(self, x: Observation) -> np.ndarray:
if self._config.action_scaler:
action = self._config.action_scaler.reverse_transform(action)

return action.cpu().detach().numpy()
return action.cpu().detach().numpy() # type: ignore

def fit(
self,
@@ -795,7 +796,7 @@ def collect(
clip_episode = terminal or truncated

# store observation
buffer.append(observation, action, reward)
buffer.append(observation, action, float(reward))

# reset if terminated
if clip_episode:
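Note: the `# type: ignore` comments added to the `.numpy()` return lines above are presumably needed because `torch.Tensor.numpy()` is typed loosely (effectively `Any`) in PyTorch's stubs, which mypy flags once the enclosing method is annotated to return `NDArray`. Likewise, the `float(reward)` cast presumably satisfies a `float` annotation on `buffer.append` when the environment yields a numpy scalar. A minimal, stand-alone reproduction of the first point (not d3rlpy code):

```python
# Minimal sketch: under strict mypy settings (e.g. warn_return_any), returning
# the loosely typed result of Tensor.numpy() from a function annotated with a
# numpy.typing alias needs an explicit suppression.
from typing import Any

import numpy.typing as npt
import torch

def to_numpy(t: torch.Tensor) -> npt.NDArray[Any]:
    return t.cpu().detach().numpy()  # type: ignore

print(to_numpy(torch.zeros(3)))
```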
57 changes: 28 additions & 29 deletions d3rlpy/algos/qlearning/explorers.py
@@ -1,10 +1,12 @@
from abc import ABCMeta, abstractmethod
from typing import Any, List, Optional, Union
from typing import Union

import numpy as np
from typing_extensions import Protocol

from ...preprocessing.action_scalers import ActionScaler, MinMaxActionScaler
from ...dataset import Observation
from ...interface import QLearningAlgoProtocol
from ...preprocessing.action_scalers import MinMaxActionScaler
from ...types import NDArray

__all__ = [
"Explorer",
@@ -14,24 +16,11 @@
]


class _ActionProtocol(Protocol):
def predict(self, x: Union[np.ndarray, List[Any]]) -> np.ndarray:
...

@property
def action_size(self) -> Optional[int]:
...

@property
def action_scaler(self) -> Optional[ActionScaler]:
...


class Explorer(metaclass=ABCMeta):
@abstractmethod
def sample(
self, algo: _ActionProtocol, x: np.ndarray, step: int
) -> np.ndarray:
self, algo: QLearningAlgoProtocol, x: Observation, step: int
) -> NDArray:
pass
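Note: the locally defined `_ActionProtocol` removed above is replaced by `QLearningAlgoProtocol` from `d3rlpy/interface.py`, which is outside this diff. Below is a minimal sketch of such a structural protocol, with its members inferred from how the explorers use it (an assumption, not the real definition):

```python
# Sketch only: member set inferred from usage in explorers.py
# (predict + action_size); the actual QLearningAlgoProtocol may differ.
from typing import Any, Optional

import numpy.typing as npt
from typing_extensions import Protocol

class QLearningAlgoProtocolSketch(Protocol):
    def predict(self, x: Any) -> npt.NDArray[Any]:
        ...

    @property
    def action_size(self) -> Optional[int]:
        ...
```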


@@ -48,11 +37,14 @@ def __init__(self, epsilon: float):
self._epsilon = epsilon

def sample(
self, algo: _ActionProtocol, x: np.ndarray, step: int
) -> np.ndarray:
self, algo: QLearningAlgoProtocol, x: Observation, step: int
) -> NDArray:
action_size = algo.action_size
assert action_size is not None
greedy_actions = algo.predict(x)
random_actions = np.random.randint(algo.action_size, size=x.shape[0])
is_random = np.random.random(x.shape[0]) < self._epsilon
batch_size = greedy_actions.shape[0]
random_actions = np.random.randint(action_size, size=batch_size)
is_random = np.random.random(batch_size) < self._epsilon
return np.where(is_random, random_actions, greedy_actions)
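Note: the random-action batch is now sized from `greedy_actions.shape[0]` rather than `x.shape[0]`, presumably because `x` is typed as `Observation` and may not be a plain array with a leading batch dimension. A stand-alone sketch of the epsilon-greedy selection pattern used by both explorers (illustrative names only, not d3rlpy code):

```python
# Epsilon-greedy selection over a batch of greedy actions: with probability
# epsilon, replace each greedy action with a uniformly random one.
import numpy as np

def epsilon_greedy(greedy_actions: np.ndarray, action_size: int, epsilon: float) -> np.ndarray:
    batch_size = greedy_actions.shape[0]
    random_actions = np.random.randint(action_size, size=batch_size)
    is_random = np.random.random(batch_size) < epsilon
    return np.where(is_random, random_actions, greedy_actions)

greedy = np.zeros(8, dtype=np.int64)  # pretend the policy always picks action 0
print(epsilon_greedy(greedy, action_size=4, epsilon=0.3))
```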


@@ -80,8 +72,8 @@ def __init__(
self._duration = duration

def sample(
self, algo: _ActionProtocol, x: np.ndarray, step: int
) -> np.ndarray:
self, algo: QLearningAlgoProtocol, x: Observation, step: int
) -> NDArray:
"""Returns :math:`\\epsilon`-greedy action.
Args:
@@ -92,9 +84,12 @@ def sample(
Returns:
:math:`\\epsilon`-greedy action.
"""
action_size = algo.action_size
assert action_size is not None
greedy_actions = algo.predict(x)
random_actions = np.random.randint(algo.action_size, size=x.shape[0])
is_random = np.random.random(x.shape[0]) < self.compute_epsilon(step)
batch_size = greedy_actions.shape[0]
random_actions = np.random.randint(action_size, size=batch_size)
is_random = np.random.random(batch_size) < self.compute_epsilon(step)
return np.where(is_random, random_actions, greedy_actions)

def compute_epsilon(self, step: int) -> float:
@@ -125,8 +120,8 @@ def __init__(self, mean: float = 0.0, std: float = 0.1):
self._std = std

def sample(
self, algo: _ActionProtocol, x: np.ndarray, step: int
) -> np.ndarray:
self, algo: QLearningAlgoProtocol, x: Observation, step: int
) -> NDArray:
"""Returns action with noise injection.
Args:
@@ -139,12 +134,16 @@ def sample(
action = algo.predict(x)
noise = np.random.normal(self._mean, self._std, size=action.shape)

minimum: Union[float, NDArray]
maximum: Union[float, NDArray]
if isinstance(algo.action_scaler, MinMaxActionScaler):
# scale noise
assert algo.action_scaler.minimum is not None
assert algo.action_scaler.maximum is not None
minimum = algo.action_scaler.minimum
maximum = algo.action_scaler.maximum
else:
minimum = -1.0
maximum = 1.0

return np.clip(action + noise, minimum, maximum)
return np.clip(action + noise, minimum, maximum) # type: ignore
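Note: the explicit `minimum`/`maximum` annotations are declared before the branch, presumably so mypy does not pin the variables to the type of their first assignment and then reject the other branch. A stand-alone sketch of the noise-injection-and-clip pattern (not d3rlpy code):

```python
# Add Gaussian noise to a batch of continuous actions and clip to bounds.
# Bounds may be scalars or per-dimension arrays, mirroring the Union above.
from typing import Any, Union

import numpy as np
import numpy.typing as npt

def noisy_action(
    action: npt.NDArray[Any],
    mean: float = 0.0,
    std: float = 0.1,
    minimum: Union[float, npt.NDArray[Any]] = -1.0,
    maximum: Union[float, npt.NDArray[Any]] = 1.0,
) -> npt.NDArray[Any]:
    noise = np.random.normal(mean, std, size=action.shape)
    return np.clip(action + noise, minimum, maximum)

print(noisy_action(np.zeros((2, 3), dtype=np.float64)))
```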
13 changes: 7 additions & 6 deletions d3rlpy/algos/qlearning/random_policy.py
@@ -7,6 +7,7 @@
from ...constants import ActionSpace
from ...dataset import Observation, Shape
from ...torch_utility import TorchMiniBatch
from ...types import NDArray
from .base import QLearningAlgoBase

__all__ = [
@@ -54,10 +55,10 @@ def inner_create_impl(
) -> None:
self._action_size = action_size

def predict(self, x: Observation) -> np.ndarray:
def predict(self, x: Observation) -> NDArray:
return self.sample_action(x)

def sample_action(self, x: Observation) -> np.ndarray:
def sample_action(self, x: Observation) -> NDArray:
x = np.asarray(x)
action_shape = (x.shape[0], self._action_size)

@@ -79,7 +80,7 @@ def sample_action(self, x: Observation) -> np.ndarray:

return action

def predict_value(self, x: Observation, action: np.ndarray) -> np.ndarray:
def predict_value(self, x: Observation, action: NDArray) -> NDArray:
raise NotImplementedError

def inner_update(self, batch: TorchMiniBatch) -> Dict[str, float]:
@@ -117,14 +118,14 @@ def inner_create_impl(
) -> None:
self._action_size = action_size

def predict(self, x: Observation) -> np.ndarray:
def predict(self, x: Observation) -> NDArray:
return self.sample_action(x)

def sample_action(self, x: Observation) -> np.ndarray:
def sample_action(self, x: Observation) -> NDArray:
x = np.asarray(x)
return np.random.randint(self._action_size, size=x.shape[0])

def predict_value(self, x: Observation, action: np.ndarray) -> np.ndarray:
def predict_value(self, x: Observation, action: NDArray) -> NDArray:
raise NotImplementedError

def inner_update(self, batch: TorchMiniBatch) -> Dict[str, float]:
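Note: a stand-alone analogue of the two random policies above, returning uniformly sampled continuous and discrete action batches with the same kind of array annotations (illustrative only, not d3rlpy code):

```python
from typing import Any

import numpy as np
import numpy.typing as npt

def sample_continuous(x: npt.NDArray[Any], action_size: int) -> npt.NDArray[Any]:
    # one action vector per observation, uniform in [-1, 1]
    return np.random.uniform(-1.0, 1.0, size=(x.shape[0], action_size))

def sample_discrete(x: npt.NDArray[Any], action_size: int) -> npt.NDArray[Any]:
    # one integer action index per observation
    return np.random.randint(action_size, size=x.shape[0])

obs = np.zeros((4, 8), dtype=np.float32)
print(sample_continuous(obs, action_size=2).shape)  # (4, 2)
print(sample_discrete(obs, action_size=3))          # e.g. [1 0 2 2]
```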
18 changes: 6 additions & 12 deletions d3rlpy/algos/transformer/action_samplers.py
@@ -3,6 +3,8 @@
import numpy as np
from typing_extensions import Protocol

from ...types import NDArray

__all__ = [
"TransformerActionSampler",
"IdentityTransformerActionSampler",
@@ -14,9 +16,7 @@
class TransformerActionSampler(Protocol):
r"""Interface of TransformerActionSampler."""

def __call__(
self, transformer_output: np.ndarray
) -> Union[np.ndarray, int]:
def __call__(self, transformer_output: NDArray) -> Union[NDArray, int]:
r"""Returns sampled action from Transformer output.
Args:
@@ -35,9 +35,7 @@ class IdentityTransformerActionSampler(TransformerActionSampler):
Sampled action is exactly the same as ``transformer_output``.
"""

def __call__(
self, transformer_output: np.ndarray
) -> Union[np.ndarray, int]:
def __call__(self, transformer_output: NDArray) -> Union[NDArray, int]:
return transformer_output


@@ -55,9 +53,7 @@ class SoftmaxTransformerActionSampler(TransformerActionSampler):
def __init__(self, temperature: float = 1.0):
self._temperature = temperature

def __call__(
self, transformer_output: np.ndarray
) -> Union[np.ndarray, int]:
def __call__(self, transformer_output: NDArray) -> Union[NDArray, int]:
assert transformer_output.ndim == 1
logits = transformer_output / self._temperature
x = np.exp(logits - np.max(logits))
@@ -73,8 +69,6 @@ class GreedyTransformerActionSampler(TransformerActionSampler):
probability distribution.
"""

def __call__(
self, transformer_output: np.ndarray
) -> Union[np.ndarray, int]:
def __call__(self, transformer_output: NDArray) -> Union[NDArray, int]:
assert transformer_output.ndim == 1
return int(np.argmax(transformer_output))
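Note: a stand-alone sketch of the temperature-scaled softmax sampling performed by `SoftmaxTransformerActionSampler` above (the tail of that method is cut off in this diff, so the final draw is an assumption):

```python
import numpy as np

def softmax_sample(logits: np.ndarray, temperature: float = 1.0) -> int:
    scaled = logits / temperature
    x = np.exp(scaled - np.max(scaled))   # numerically stable softmax
    probs = x / np.sum(x)
    return int(np.random.choice(probs.shape[0], p=probs))

print(softmax_sample(np.array([0.1, 2.0, 0.5])))
```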
12 changes: 7 additions & 5 deletions d3rlpy/algos/transformer/base.py
@@ -20,6 +20,7 @@
)
from ...metrics import evaluate_transformer_with_environment
from ...torch_utility import TorchTrajectoryMiniBatch, train_api
from ...types import NDArray
from ..utility import (
assert_action_space_with_dataset,
build_scalers_with_trajectory_slicer,
@@ -102,7 +103,7 @@ class StatefulTransformerWrapper(Generic[TTransformerImpl, TTransformerConfig]):
_action_sampler: TransformerActionSampler
_return_rest: float
_observations: Deque[Observation]
_actions: Deque[Union[np.ndarray, int]]
_actions: Deque[Union[NDArray, int]]
_rewards: Deque[float]
_returns_to_go: Deque[float]
_timesteps: Deque[int]
@@ -128,7 +129,7 @@ def __init__(
self._timesteps = deque([], maxlen=context_size)
self._timestep = 1

def predict(self, x: Observation, reward: float) -> Union[np.ndarray, int]:
def predict(self, x: Observation, reward: float) -> Union[NDArray, int]:
r"""Returns action.
Args:
@@ -173,8 +174,9 @@ def algo(
) -> "TransformerAlgoBase[TTransformerImpl, TTransformerConfig]":
return self._algo

def _get_pad_action(self) -> Union[int, np.ndarray]:
def _get_pad_action(self) -> Union[int, NDArray]:
assert self._algo.impl
pad_action: Union[int, NDArray]
if self._algo.get_action_type() == ActionSpace.CONTINUOUS:
pad_action = np.zeros(self._algo.impl.action_size, dtype=np.float32)
else:
@@ -186,7 +188,7 @@ class TransformerAlgoBase(
Generic[TTransformerImpl, TTransformerConfig],
LearnableBase[TTransformerImpl, TTransformerConfig],
):
def predict(self, inpt: TransformerInput) -> np.ndarray:
def predict(self, inpt: TransformerInput) -> NDArray:
"""Returns action.
This is for internal use. For evaluation, use
Expand All @@ -213,7 +215,7 @@ def predict(self, inpt: TransformerInput) -> np.ndarray:
if self._config.action_scaler:
action = self._config.action_scaler.reverse_transform(action)

return action.cpu().detach().numpy()
return action.cpu().detach().numpy() # type: ignore

def fit(
self,
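Note: a tiny sketch of the rolling context window kept by `StatefulTransformerWrapper`: bounded deques hold the last `context_size` entries, and the action deque is typed to accept both discrete (`int`) and continuous (`NDArray`) actions, matching the annotation change above (illustrative only, not d3rlpy code):

```python
from collections import deque
from typing import Any, Deque, Union

import numpy as np
import numpy.typing as npt

context_size = 3
actions: Deque[Union[npt.NDArray[Any], int]] = deque([], maxlen=context_size)

actions.append(0)                              # discrete action
actions.append(np.zeros(2, dtype=np.float32))  # continuous action
actions.append(1)
actions.append(2)                              # oldest entry drops out
print(list(actions))
```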