MarkovDecisionProcess class.
Macros:
MarkovDecisionProcess policy definition
Number of states
size_t nbState;
Value of each state
double* values;
Optimal action for each state
size_t* actions;
Destructor
void (*destruct)(void);
Get the action for a given state
Input argument(s):
state: the state
Output and side effect(s):
Return the action
size_t (*getAction)(size_t const state);
Get the probability that a given action is selected given a state
Input argument(s):
state: the state
action: the action
Output and side effect(s):
Return the probability, in [0, 1]
double (*getProbAction)(size_t const state, size_t const action);
MarkovDecisionProcess environment definition
Destructor
void (*destruct)(void);
Get the resulting state for a given state and action
Input argument(s):
fromState: the 'from' state
action: the applied action
Output and side effect(s):
Return the resulting state
size_t (*step)(size_t const fromState, size_t const action);
Enumerations:
None.
Typedefs:
CapyMDPPolicy object
CapyMDPEnvironment object
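For illustration, a minimal sketch of a custom environment implementing the interface documented in the Macros section above. The 'step' member comes from the environment definition; the helper names GridStep and SetupGridEnvironment, and the direct assignment to the member, are assumptions for this sketch, not documented API.

  #include <stddef.h>

  /* Deterministic 5-state corridor: action 0 moves left, action 1 moves
     right, both clamped to [0, 4]. The signature matches the documented
     'step' member of the environment definition. */
  static size_t GridStep(size_t const fromState, size_t const action) {
    if (action == 0) return (fromState > 0 ? fromState - 1 : 0);
    return (fromState < 4 ? fromState + 1 : 4);
  }

  /* Hypothetical setup helper: plug the grid dynamics into an environment
     created beforehand (creation is documented in the Functions section below). */
  void SetupGridEnvironment(CapyMDPEnvironment* const env) {
    env->step = GridStep;
  }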
Struct CapyMDPPolicyEpsilonSoft :
Struct CapyMDPPolicyEpsilonSoft's properties:
Inherits CapyMDPPolicy
Random number generator
Epsilon constant for action selection
Number of actions
Struct CapyMDPPolicyEpsilonSoft's methods:
Destructor for the parent class
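As a rough sketch of the technique behind this policy (epsilon-greedy selection, a special case of epsilon-soft; not the library's actual implementation): with probability epsilon a uniformly random action is selected, otherwise the currently optimal action stored in the parent policy's 'actions' array is used. The helper below and its use of rand() are assumptions.

  #include <stdlib.h>

  /* Epsilon-soft action selection: explore with probability 'epsilon',
     otherwise exploit the per-state optimal action. */
  static size_t EpsilonSoftSelect(
    size_t const state, size_t const nbAction,
    size_t const* const actions, double const epsilon) {
    double const u = (double)rand() / ((double)RAND_MAX + 1.0);
    if (u < epsilon) return (size_t)rand() % nbAction;  /* explore */
    return actions[state];                              /* exploit */
  }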
Struct CapyMDPTransition :
Struct CapyMDPTransition's properties:
Index of the origin state of the transition
Index of the action of the transition
Index of the termination state of the transition
Probability of transition
Reward for transitioning through that transition
Action value of the transition
Number of times this transition has been visited
Number of times a transition with the same fromState and action has been visited
Struct CapyMDPTransition's methods:
None.
Struct CapyMDPTransitionRecorder :
Struct CapyMDPTransitionRecorder's properties:
Number of transitions
Size of the recorder's memory (in number of transitions)
Recorded transitions
Struct CapyMDPTransitionRecorder's methods:
Destructor
Reset the recorder
Output and side effect(s):
'nbTransition' is reset to 0.
Record one transition
Input argument(s):
transition: the transition to be recorded
Output and side effect(s):
A copy of the transition is added to the end of 'transitions', which is reallocated if necessary; 'nbTransition' and 'nbMaxTransition' are updated accordingly.
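A hedged sketch of the append-and-grow behaviour described above, using stand-in types (the doubling growth strategy and the stand-in names are assumptions; only the field names follow the documentation).

  #include <stdlib.h>

  /* Stand-ins for CapyMDPTransition and CapyMDPTransitionRecorder. */
  typedef struct {
    size_t fromState, action, toState;
    double probability, reward;
  } Transition_;
  typedef struct {
    size_t nbTransition;     /* number of recorded transitions */
    size_t nbMaxTransition;  /* capacity, in number of transitions */
    Transition_* transitions;
  } Recorder_;

  /* Append a copy of 'transition', growing the buffer when it is full. */
  int RecorderAdd(Recorder_* const that, Transition_ const* const transition) {
    if (that->nbTransition == that->nbMaxTransition) {
      size_t const newMax = (that->nbMaxTransition ? 2 * that->nbMaxTransition : 16);
      Transition_* const mem = realloc(that->transitions, newMax * sizeof(Transition_));
      if (mem == NULL) return 0;  /* allocation failure; error handling omitted */
      that->transitions = mem;
      that->nbMaxTransition = newMax;
    }
    that->transitions[that->nbTransition++] = *transition;
    return 1;
  }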
Struct CapyMarkovDecisionProcess :
Struct CapyMarkovDecisionProcess's properties:
Number of states
Number of actions
Number of transitions (nbState * nbAction * nbState; see the indexing sketch after the properties list)
Transition definition
Start state flags
End state flags
Optimal policy
Index of the current state (default: 0)
Number of steps executed (reset by setCurState() and incremented by step())
Maximum number of steps, to avoid infinite stepping (default: 1e9)
Pseudo-random generator used to step the process (initialised with the current time)
Discount factor (default: 0.9, in [0, 1]; the lower it is, the more weight near-future rewards carry compared to far-future ones)
Epsilon threshold for convergence during the search for the optimal policy (default: 1e-6)
Flag to select between "first visit" and "each visit" during Monte Carlo search (default: false)
Reference to the environment modelled by the MDP (default: NULL)
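Since the number of transitions is nbState * nbAction * nbState, each (fromState, action, toState) triple maps to exactly one transition. The flat row-major index below is only an illustration of one natural layout; the ordering actually used by the library is not specified here.

  #include <stddef.h>

  /* Illustrative flat index for the triple (fromState, action, toState). */
  static size_t TransitionIndex(
    size_t const nbState, size_t const nbAction,
    size_t const fromState, size_t const action, size_t const toState) {
    return (fromState * nbAction + action) * nbState + toState;
  }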
Struct CapyMarkovDecisionProcess's methods:
Destructor
Get a transition
Input argument(s):
fromState: index of the origin state
action: index of the action
toState: index of the termination state
Output and side effect(s):
Return a reference to the transition
Set the current state
Input argument(s):
state: index of the current state
Output and side effect(s):
The current state is set and the number of steps is reset
Get the current state
Output and side effect(s):
Return the index of the current state
Get the number of steps
Output and side effect(s):
Return the number of steps
Step the MDP according to its transition definitions
Output and side effect(s):
The current state and the number of steps are updated. Return the transition.
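A hedged sketch of how a step according to the transition definitions is typically performed (it illustrates the sampling technique, not the library's actual code): draw a uniform number in [0, 1) and walk the cumulative probabilities of the transitions leaving the current state.

  #include <stdlib.h>

  /* Sample the next state given prob[toState], the probabilities of the
     transitions from the current state under the selected action. */
  static size_t SampleNextState(size_t const nbState, double const* const prob) {
    double const u = (double)rand() / ((double)RAND_MAX + 1.0);
    double cumul = 0.0;
    for (size_t toState = 0; toState < nbState; ++toState) {
      cumul += prob[toState];
      if (u < cumul) return toState;
    }
    return nbState - 1;  /* guard against floating point rounding */
  }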
Step the MDP according to a given policy
Output and side effect(s):
The current state and the number of steps are updated. Return the transition. If the MDP's environment is known, it is used to get the resulting state.
Initialise the pseudo random generator
Input argument(s):
seed: the seed
Output and side effect(s):
The pseudo random generator is reset.
Search the optimal policy (given that the MDP's transitions are all set with the correct transition probabilities and rewards)
Output and side effect(s):
Calculate the optimal policy and update 'optimalPolicy', which is also used as the initial policy for the search
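The documentation does not name the algorithm; given that full transition probabilities and rewards are required and that convergence is controlled by the epsilon property, a standard choice is value iteration, sketched below under that assumption (the data layout and function name are illustrative only).

  #include <math.h>
  #include <stddef.h>

  /* Value iteration over a tabular MDP. P and R hold the transition
     probabilities and rewards, flattened as [(s * nbAction + a) * nbState + t].
     'values' should be initialised (e.g. to 0) by the caller; on return it
     holds the state values and 'actions' the greedy (optimal) action per state. */
  void ValueIteration(
    size_t const nbState, size_t const nbAction,
    double const* const P, double const* const R,
    double const gamma, double const epsilon,
    double* const values, size_t* const actions) {
    double delta;
    do {
      delta = 0.0;
      for (size_t s = 0; s < nbState; ++s) {
        double best = -INFINITY;
        size_t bestAction = 0;
        for (size_t a = 0; a < nbAction; ++a) {
          double q = 0.0;
          for (size_t t = 0; t < nbState; ++t) {
            size_t const i = (s * nbAction + a) * nbState + t;
            q += P[i] * (R[i] + gamma * values[t]);
          }
          if (q > best) { best = q; bestAction = a; }
        }
        if (fabs(best - values[s]) > delta) delta = fabs(best - values[s]);
        values[s] = best;
        actions[s] = bestAction;
      }
    } while (delta >= epsilon);
  }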
Get the expected sum of rewards
Input argument(s):
nbRun: number of runs used to calculate the expected reward
Output and side effect(s):
Return the expected sum of rewards, or 0.0 and raise CapyExc_UndefinedExecution if the MDP can't reach an end state within 'that->nbMaxStep' steps. The start state is selected at random. The transitions are selected at random according to their probabilities.
Get the expected sum of rewards from a given start state
Input argument(s):
fromState: the start state
nbRun: number of runs used to calculate the expected reward
Output and side effect(s):
Return the expected sum of rewards, or 0.0 and raise CapyExc_UndefinedExecution if the MDP can't reach an end state within 'that->nbMaxStep' steps. The transitions are selected at random according to their probabilities.
Get the expected sum of rewards using a given policy
Input argument(s):
nbRun: number of runs used to calculate the expected reward
policy: the policy
Output and side effect(s):
Return the expected sum of rewards, or 0.0 and raise CapyExc_UndefinedExecution if the MDP can't reach an end state within 'that->nbMaxStep' steps. The start state is selected at random. The transitions are selected according to the given policy.
Get the expected sum of rewards from a given start state using a given policy
Input argument(s):
fromState: the start state
nbRun: number of runs used to calculate the expected reward
policy: the policy
Output and side effect(s):
Return the expected sum of rewards, or 0.0 and raise CapyExc_UndefinedExecution if the MDP can't reach an end state within 'that->nbMaxStep' steps. The transitions are selected according to the given policy.
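All four 'expected sum of rewards' methods average the total reward observed over nbRun simulated runs. A minimal sketch of that averaging, where RunOneEpisode() is a hypothetical stand-in for stepping the MDP until an end state or the step limit is reached:

  #include <stddef.h>

  /* Hypothetical helper: runs one episode and returns its sum of rewards. */
  double RunOneEpisode(void);

  /* Average the per-episode returns over 'nbRun' runs. */
  double ExpectedReward(size_t const nbRun) {
    double sum = 0.0;
    for (size_t run = 0; run < nbRun; ++run) sum += RunOneEpisode();
    return sum / (double)nbRun;
  }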
Record a trajectory through the MDP given an initial state and a policy
Input argument(s):
recorder: the recorder
startState: the initial state of the trajectory
policy: the policy used to select transitions
Output and side effect(s):
The recorder is reset and updated with the trajectory. The trajectory stops when encountering an end state, or when it reaches 'that->nbMaxStep'. The current state of the MDP is modified.
Search the optimal policy using Q-Learning (converges to the optimal policy by exploring the environment instead of using transition probabilities, so only the transition rewards are needed; uses an epsilon-soft policy to explore the transitions)
Input argument(s):
epsilon: exploration coefficient (in ]0, 1])
alpha: learning rate (in ]0, 1])
nbEpisode: number of training episodes
Output and side effect(s):
Calculate the optimal policy and update 'optimalPolicy', which is also used as the initial policy for the search.
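A hedged sketch of the tabular Q-learning update this method relies on ('alpha' is the documented learning rate and 'gamma' the MDP's discount factor; the Q table layout and the function name are illustrative only).

  #include <stddef.h>

  /* One Q-learning update for the observed transition
     (fromState, action) -> (toState, reward).
     Q is flattened as Q[state * nbAction + action]. */
  void QLearningUpdate(
    double* const Q, size_t const nbAction,
    size_t const fromState, size_t const action,
    double const reward, size_t const toState,
    double const alpha, double const gamma) {
    /* Greedy value of the next state: max over a of Q(toState, a). */
    double best = Q[toState * nbAction];
    for (size_t a = 1; a < nbAction; ++a)
      if (Q[toState * nbAction + a] > best) best = Q[toState * nbAction + a];
    /* Move Q(fromState, action) toward the bootstrapped target. */
    double* const q = Q + fromState * nbAction + action;
    *q += alpha * (reward + gamma * best - *q);
  }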
Get a random start state
Output and side effect(s):
Return one of the start states. If there are no start states, return 0 by default.
Functions:
Create a CapyMDPPolicy
Input argument(s):
nbState: the number of states
Output and side effect(s):
Return a CapyMDPPolicy
Allocate memory for a new CapyMDPPolicy and create it
Input argument(s):
nbState: the number of states
Output and side effect(s):
Return a CapyMDPPolicy
Exception(s):
May raise CapyExc_MallocFailed.
Free the memory used by a CapyMDPPolicy* and reset '*that' to NULL
Input argument(s):
that: a pointer to the CapyMDPPolicy to free
Create a new CapyMDPPolicyEpsilonSoft
Input argument(s):
nbState: the number of states
nbAction: the number of actions
epsilon: the epsilon constant for the action selection
Output and side effect(s):
Return a CapyMDPPolicyEpsilonSoft
Allocate memory for a new CapyMDPPolicyEpsilonSoft and create it
Input argument(s):
nbState: the number of states
nbAction: the number of actions
epsilon: the epsilon constant for the action selection
Output and side effect(s):
Return a CapyMDPPolicyEpsilonSoft
Exception(s):
May raise CapyExc_MallocFailed.
Free the memory used by a CapyMDPPolicyEpsilonSoft* and reset '*that' to NULL
Input argument(s):
that: a pointer to the CapyMDPPolicyEpsilonSoft to free
Create a CapyMDPTransitionRecorder
Output and side effect(s):
Return a CapyMDPTransitionRecorder
Create a CapyMDPEnvironment
Output and side effect(s):
Return a CapyMDPEnvironment
Allocate memory for a new CapyMDPEnvironment and create it
Output and side effect(s):
Return a CapyMDPEnvironment
Exception(s):
May raise CapyExc_MallocFailed.
Free the memory used by a CapyMDPEnvironment* and reset '*that' to NULL
Input argument(s):
that: a pointer to the CapyMDPEnvironment to free
Create a CapyMarkovDecisionProcess
Input argument(s):
nbState: the number of states
nbAction: the number of actions
Output and side effect(s):
Return a CapyMarkovDecisionProcess
Allocate memory for a new CapyMarkovDecisionProcess and create it
Input argument(s):
nbState: the number of states
nbAction: the number of actions
Output and side effect(s):
Return a CapyMarkovDecisionProcess
Exception(s):
May raise CapyExc_MallocFailed.
Free the memory used by a CapyMarkovDecisionProcess* and reset '*that' to NULL
Input argument(s):
that: a pointer to the CapyMarkovDecisionProcess to free
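To close, a hedged lifecycle sketch for the Alloc/Free pair documented above. The identifiers CapyMarkovDecisionProcessAlloc and CapyMarkovDecisionProcessFree are assumptions derived from the type name and the descriptions; consult the header for the exact names.

  #include <stddef.h>

  void Example(void) {
    size_t const nbState = 3;
    size_t const nbAction = 2;
    /* Allocate and create (may raise CapyExc_MallocFailed); names assumed. */
    CapyMarkovDecisionProcess* mdp =
      CapyMarkovDecisionProcessAlloc(nbState, nbAction);
    /* ... set transition probabilities and rewards, flag start/end states,
       search the optimal policy, step the process ... */
    /* Free the memory and reset the pointer to NULL, as documented. */
    CapyMarkovDecisionProcessFree(&mdp);
  }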