mjlab.rl

Contents

mjlab.rl#

Classes

Runner#

class mjlab.rl.MjlabOnPolicyRunner[source]#

Bases: OnPolicyRunner

Base runner that persists environment state across checkpoints.

__init__(env: rsl_rl.env.VecEnv, train_cfg: dict, log_dir: str | None = None, device: str = 'cpu') → None[source]#
export_policy_to_onnx(path: str, filename: str = 'policy.onnx', verbose: bool = False) → None[source]#

Export policy to ONNX format using legacy export path.

Overrides the base implementation to set dynamo=False, avoiding warnings about dynamic_axes being deprecated with the new TorchDynamo export path (torch>=2.9 default).

save(path: str, infos=None) → None[source]#

Save checkpoint.

Extends the base implementation to persist the environment’s common_step_counter and to respect the upload_model config flag.

load(path: str, load_cfg: dict | None = None, strict: bool = True, map_location: str | None = None) → dict[source]#

Load checkpoint.

Extends the base implementation to:

1. Restore common_step_counter to preserve curricula state.

2. Migrate legacy checkpoints (actor.* -> mlp.*, actor_obs_normalizer.* -> obs_normalizer.*) to the current format (rsl-rl>=4.0).

class mjlab.rl.RslRlVecEnvWrapper[source]#

Bases: VecEnv

__init__(env: ManagerBasedRlEnv, clip_actions: float | None = None)[source]#

Configuration#

class mjlab.rl.RslRlOnPolicyRunnerCfg[source]#
class_name: str = 'OnPolicyRunner'#

The runner class name. Default is OnPolicyRunner.

actor: RslRlModelCfg#

The actor configuration.

critic: RslRlModelCfg#

The critic configuration.

algorithm: RslRlPpoAlgorithmCfg#

The algorithm configuration.

class mjlab.rl.RslRlPpoAlgorithmCfg[source]#

Config for the PPO algorithm.

num_learning_epochs: int = 5#

The number of learning epochs per update.

num_mini_batches: int = 4#

The number of mini-batches per update. mini batch size = num_envs * num_steps / num_mini_batches

learning_rate: float = 0.001#

The learning rate.

schedule: Literal['adaptive', 'fixed'] = 'adaptive'#

The learning rate schedule.

gamma: float = 0.99#

The discount factor.

lam: float = 0.95#

The lambda parameter for Generalized Advantage Estimation (GAE).

entropy_coef: float = 0.005#

The coefficient for the entropy loss.

desired_kl: float = 0.01#

The desired KL divergence between the new and old policies.

max_grad_norm: float = 1.0#

The maximum gradient norm for the policy.

value_loss_coef: float = 1.0#

The coefficient for the value loss.

use_clipped_value_loss: bool = True#

Whether to use clipped value loss.

clip_param: float = 0.2#

The clipping parameter for the policy.

normalize_advantage_per_mini_batch: bool = False#

Whether to normalize the advantage per mini-batch. Default is False. If True, the advantage is normalized over the mini-batches only. Otherwise, the advantage is normalized over the entire collected trajectories.

optimizer: Literal['adam', 'adamw', 'sgd', 'rmsprop'] = 'adam'#

The optimizer to use.

share_cnn_encoders: bool = False#

Share CNN encoders between actor and critic.

class_name: str = 'PPO'#

Algorithm class name resolved by RSL-RL.

class mjlab.rl.RslRlModelCfg[source]#

Config for a single neural network model (Actor or Critic).

hidden_dims: Tuple[int, ...] = (128, 128, 128)#

The hidden dimensions of the network.

activation: str = 'elu'#

The activation function.

obs_normalization: bool = False#

Whether to normalize the observations. Default is False.

init_noise_std: float = 1.0#

The initial noise standard deviation.

noise_std_type: Literal['scalar', 'log'] = 'scalar'#

The type of noise standard deviation.

stochastic: bool = False#

Whether the model output is stochastic.

cnn_cfg: dict[str, Any] | None = None#

CNN encoder config. When set, class_name should be “CNNModel”.

Passed to rsl_rl.modules.CNN. Common keys: output_channels, kernel_size, stride, padding, activation, global_pool, max_pool.

class_name: str = 'MLPModel'#

Model class name resolved by RSL-RL (MLPModel or CNNModel).

class mjlab.rl.RslRlBaseRunnerCfg[source]#
seed: int = 42#

The seed for the experiment. Default is 42.

num_steps_per_env: int = 24#

The number of steps per environment update.

max_iterations: int = 300#

The maximum number of iterations.

save_interval: int = 50#

The number of iterations between saves.

experiment_name: str = 'exp1'#

Directory name used to group runs under logs/rsl_rl/{experiment_name}/.

run_name: str = ''#

Optional label appended to the timestamped run directory (e.g. 2025-01-27_14-30-00_{run_name}). Also becomes the display name for the run in wandb.

logger: Literal['wandb', 'tensorboard'] = 'wandb'#

The logger to use. Default is wandb.

wandb_project: str = 'mjlab'#

The wandb project name.

wandb_tags: Tuple[str, ...] = ()#

Tags for the wandb run. Default is empty tuple.

resume: bool = False#

Whether to resume the experiment. Default is False.

load_run: str = '.*'#

The run directory to load. Default is “.*” which means all runs. If regex expression, the latest (alphabetical order) matching run will be loaded.

load_checkpoint: str = 'model_.*.pt'#

The checkpoint file to load. Default is “model_.*.pt” (all). If regex expression, the latest (alphabetical order) matching file will be loaded.

clip_actions: float | None = None#

The clipping range for action values. If None (default), no clipping is applied.

upload_model: bool = True#

Whether to upload model files (.pt, .onnx) to W&B on save. Set to False to keep metric logging but avoid storage usage. Default is True.