PolicyGradient class.
Macros:
PolicyGradient environment definition (a minimal implementation sketch follows the member list below)
Random number generator
CapyRandom rng;
Number of possible actions
size_t nbAction;
Parameters for the actions probability evaluation
CapyVec paramAction;
Parameters for the value evaluation
CapyVec paramValue;
Feature values of the current state
CapyVec curState;
Output vector for the actions probability evaluation
CapyVec actionsProb;
Destructor
void (*destruct)(void);
Step the environment
Input argument(s):
action: the applied action
Output and side effect(s):
Update the current state according to the action, and return the transition.
CapyPGTransition (*step)(size_t const action);
Set the current state to an initial state
Output and side effect(s):
The current state is set to an initial state.
void (*setToInitialState)(void);
Check if the current state is an end state
Output and side effect(s):
Return true if the current state is an end state, else false.
bool (*isEndState)(void);
Get an action for a given state, sampled according to the action probabilities
Input argument(s):
state: the state to use for evaluation
Output and side effect(s):
Return the selected action.
size_t (*getAction)(CapyVec const* const state);
Get the action with highest probability for a given state
Input argument(s):
state: the state to use for evaluation
Output and side effect(s):
Return the selected action.
size_t (*getBestAction)(CapyVec const* const state);
Evaluate the action probabilities
Input argument(s):
state: the state used for evaluation
actionsProb: the vector receiving the evaluated action probabilities
Output and side effect(s):
'actionsProb' is updated.
void (*getActionsProb)(CapyVec const* const state, CapyVec* const actionsProb);
Evaluate the value
Input argument(s):
state: the state used for evaluation
Output and side effect(s):
Return the evaluated value.
double (*getValue)(CapyVec const* const state);
Evaluate the gradient of the value function
Input argument(s):
state: the state used for evaluation
gradValue: the result gradient
Output and side effect(s):
'gradValue' is updated.
void (*getGradientValue)(CapyVec const* const state, CapyVec* const gradValue);
Evaluate the gradient of an action's probability
Input argument(s):
state: the state used for evaluation
iAction: the action to be evaluated
gradProb: the result gradient
Output and side effect(s):
'gradProb' is updated.
void (*getGradientActionsProb)(CapyVec const* const state, size_t const iAction, CapyVec* const gradProb);
Evaluate the gradient of an action's log probability
Input argument(s):
state: the state used for evaluation
iAction: the action to be evaluated
gradProb: the result gradient
Output and side effect(s):
'gradProb' is updated.
void (*getGradientActionsLogProb)(CapyVec const* const state, size_t const iAction, CapyVec* const gradProb);
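As an illustration of how these members fit together, the sketch below implements the three user-supplied callbacks for a toy environment (a one-dimensional random walk with two actions). It is a rough sketch only: the header name, CapyPGTransitionCreate, the CapyVec 'vals' array and the CapyPGTransition field names are assumptions, not identifiers confirmed by this documentation.

#include <stdbool.h>
#include <stddef.h>
#include "capy.h" /* assumed header name */

/* Environment instance shared with the callbacks (the callbacks take no
** 'self' argument, matching the signatures above). */
static CapyPGEnvironment* env;

/* Set the current state to the initial position 0. */
static void WalkSetToInitialState(void) {
  env->curState.vals[0] = 0.0; /* assumes CapyVec exposes a 'vals' array */
}

/* The episode ends when the walker reaches either boundary. */
static bool WalkIsEndState(void) {
  return (env->curState.vals[0] <= -5.0 || env->curState.vals[0] >= 5.0);
}

/* Apply an action (0: move left, 1: move right) and return the transition. */
static CapyPGTransition WalkStep(size_t const action) {
  CapyPGTransition transition = CapyPGTransitionCreate(1); /* assumed name */
  transition.from.vals[0] = env->curState.vals[0]; /* assumed field names */
  transition.action = action;
  env->curState.vals[0] += (action == 0 ? -1.0 : 1.0);
  transition.to.vals[0] = env->curState.vals[0];
  transition.reward = (env->curState.vals[0] >= 5.0 ? 1.0 : 0.0);
  return transition;
}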
Enumerations:
None.
Typedefs:
CapyPGEnvironment object
Struct CapyPGTransition :
Struct CapyPGTransition's properties:
'from' state
Action
'to' state
Reward
Struct CapyPGTransition's methods:
None.
Struct CapyPGTransitionRecorder :
Struct CapyPGTransitionRecorder's properties:
Number of transitions
Size of the recorder memory (in number of transitions)
Recorded transitions
Struct CapyPGTransitionRecorder's methods:
Destructor
Reset the recorder
Output and side effect(s):
'nbTransition' is reset to 0.
Record one transition
Input argument(s):
transition: the transition to be recorded
Output and side effect(s):
A copy of the transition is appended to 'transitions', which is reallocated if necessary; 'nbTransition' and 'nbMaxTransition' are updated accordingly (a sketch of this append is given below).
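The recording method is a standard grow-on-demand append. The sketch below illustrates the idea using the field names listed above; the function name, the doubling growth policy and the shallow copy are assumptions (the actual implementation may deep-copy the 'from'/'to' vectors and raise CapyExc_MallocFailed on allocation failure).

#include <stdlib.h>

/* Append a copy of 'transition' to the recorder, growing the buffer if full. */
static void RecorderAdd(
  CapyPGTransitionRecorder* const that,
  CapyPGTransition const* const transition) {
  if (that->nbTransition == that->nbMaxTransition) {
    size_t const newSize =
      (that->nbMaxTransition == 0 ? 1 : 2 * that->nbMaxTransition);
    CapyPGTransition* const mem =
      realloc(that->transitions, newSize * sizeof(CapyPGTransition));
    if (mem == NULL) return; /* allocation failure handling simplified here */
    that->transitions = mem;
    that->nbMaxTransition = newSize;
  }
  that->transitions[that->nbTransition] = *transition;
  that->nbTransition += 1;
}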
Struct CapyPolicyGradient :
Struct CapyPolicyGradient's properties:
The trained environment
Learning rate for action probabilities (in ]0,1], should be small, default: 0.01)
Learning rate for state value (in ]0,1], should be small, default: 0.01)
Discount rate (in ]0,1], default: 0.9)
Max number of steps when sampling a trajectory (default: 1000)
Average reward during training
Average final reward during training
Average number of steps per episode during training
Clipping coefficient for PPO (in ]0,+inf[, default: 0.2; the lower the value, the more stable but the slower the learning)
Gradient descent for the action probabilities (Adam)
Gradient descent for the state values (standard)
Struct CapyPolicyGradient's methods:
Destructor
Learn the weights of the action probabilities and state value functions using the REINFORCE with baseline algorithm (see the update rules after this method list)
Input argument(s):
nbEpisode: number of training episodes
Output and side effect(s):
The environment's action probabilities parameters and state values parameters are updated.
Learn the weights of the action probabilities and state value functions using the proximal policy optimisation (PPO) algorithm (see the update rules after this method list)
Input argument(s):
nbEpisode: number of training episodes
Output and side effect(s):
The environment's action probabilities parameters and state values parameters are updated.
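For reference, the two learning methods correspond to the standard update rules of the algorithms they name. The notation below (theta for paramAction, w for paramValue, gamma for the discount rate, epsilon for the clipping coefficient, A_t for the advantage) is the textbook formulation, not a transcription of the implementation.

% REINFORCE with baseline, for each step t of a sampled episode:
G_t = \sum_{k=t}^{T} \gamma^{k-t} r_k, \qquad \delta_t = G_t - V_w(s_t)
w \leftarrow w + \alpha_V \, \delta_t \, \nabla_w V_w(s_t), \qquad
\theta \leftarrow \theta + \alpha_\pi \, \gamma^{t} \, \delta_t \, \nabla_\theta \log \pi_\theta(a_t \mid s_t)

% Proximal policy optimisation, clipped surrogate objective over recorded transitions:
r_t(\theta) = \frac{\pi_\theta(a_t \mid s_t)}{\pi_{\theta_{\mathrm{old}}}(a_t \mid s_t)}, \qquad
L^{\mathrm{CLIP}}(\theta) = \mathbb{E}_t\!\left[ \min\bigl( r_t(\theta) A_t,\ \mathrm{clip}(r_t(\theta),\, 1-\varepsilon,\, 1+\varepsilon)\, A_t \bigr) \right]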
Functions:
Create a CapyPGTransition
Input argument(s):
nbFeature: number of features describing an environment
Output and side effect(s):
Return a CapyPGTransition
Destruct a CapyPGTransition
Create a CapyPGEnvironment
Input argument(s):
nbFeature: number of features describing an environment
nbAction: number of possible actions
nbParamAction: number of parameters for actions probability evaluation
nbParamValue: number of parameters for value evaluation
seed: seed for the random number generator
Output and side effect(s):
Return a CapyPGEnvironment
Allocate memory for a new CapyPGEnvironment and create it
Input argument(s):
nbFeature: number of features describing an environment
nbAction: number of possible actions
nbParamAction: number of parameters for actions probability evaluation
nbParamValue: number of parameters for value evaluation
seed: seed for the random number generator
Output and side effect(s):
Return a CapyPGEnvironment
Exception(s):
May raise CapyExc_MallocFailed.
Free the memory used by a CapyPGEnvironment* and reset '*that' to NULL
Input argument(s):
that: a pointer to the CapyPGEnvironment to free
Create a CapyPGTransitionRecorder
Output and side effect(s):
Return a CapyPGTransitionRecorder
Create a CapyPolicyGradient
Input argument(s):
env: the environment to train
Output and side effect(s):
Return a CapyPolicyGradient
Allocate memory for a new CapyPolicyGradient and create it
Input argument(s):
env: the environment to train
Output and side effect(s):
Return a CapyPolicyGradient
Exception(s):
May raise CapyExc_MallocFailed.
Free the memory used by a CapyPolicyGradient* and reset '*that' to NULL
Input argument(s):
that: a pointer to the CapyPolicyGradient to free
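Putting the functions together, a typical training session could look like the sketch below. It reuses the random-walk callbacks from the environment sketch earlier in this document; the function and method names (CapyPGEnvironmentAlloc, CapyPolicyGradientAlloc, learnReinforce, learnPPO, the *Free functions) are assumptions chosen to match the descriptions above, not verified identifiers.

int main(void) {
  /* Allocate an environment: 1 feature, 2 actions, 4 action parameters,
  ** 2 value parameters, seed 0 (all counts arbitrary for this example). */
  env = CapyPGEnvironmentAlloc(1, 2, 4, 2, 0); /* assumed name */
  env->step = WalkStep;
  env->setToInitialState = WalkSetToInitialState;
  env->isEndState = WalkIsEndState;

  /* Train with REINFORCE with baseline, then refine with PPO. */
  CapyPolicyGradient* pg = CapyPolicyGradientAlloc(env); /* assumed name */
  pg->learnReinforce(1000); /* assumed method name, 1000 episodes */
  pg->learnPPO(1000);       /* assumed method name, 1000 episodes */

  /* Exploit the trained policy from the initial state. */
  env->setToInitialState();
  while (!env->isEndState()) {
    size_t const action = env->getBestAction(&(env->curState));
    (void)env->step(action);
  }

  CapyPolicyGradientFree(&pg);  /* assumed name */
  CapyPGEnvironmentFree(&env);  /* assumed name */
  return 0;
}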