// -------------------------- markovdecisionprocess.h -------------------------
/*
    LibCapy - a general purpose library of C functions and data structures
    Copyright (C) 2021-2025 Pascal Baillehache info@baillehachepascal.dev
    https://baillehachepascal.dev
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License
    along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef CAPY_MARKOVDECISIONPROCESS_H
#define CAPY_MARKOVDECISIONPROCESS_H
#include "externalHeaders.h"
#include "cext.h"
#include "random.h"

// Description:
// MarkovDecisionProcess class.

// MarkovDecisionProcess policy definition
//
// Brace-enclosed list of the members of a policy. It is used to define
// CapyMDPPolicy and is embedded in CapyMDPPolicyEpsilonSoft, which inherits
// from it. The members are documented here rather than in the macro body.
//
// Number of states
// size_t nbState;
//
// Values of the states
// double* values;
//
// Optimal actions of the states
// size_t* actions;
//
// Destructor
// void (*destruct)(void);
//
// Get the action for a given state
// Input:
//   state: the state
// Output:
//   Return the action
// size_t (*getAction)(size_t const state);
//
// Get the probability that a given action is selected given a state
// Input:
//   state: the state
//   action: the action
// Output:
//   Return the probability in [0,1]
// double (*getProbAction)(
//   size_t const state,
//   size_t const action);
#define CapyMDPPolicyDef {                  \
  size_t nbState;                           \
  double* values;                           \
  size_t* actions;                          \
  void (*destruct)(void);                   \
  size_t (*getAction)(size_t const state);  \
  double (*getProbAction)(                  \
    size_t const state,                     \
    size_t const action);                   \
}

// CapyMDPPolicy object (structure whose members are those listed in
// CapyMDPPolicyDef)
typedef struct CapyMDPPolicy CapyMDPPolicyDef CapyMDPPolicy;

// Create a CapyMDPPolicy
// Input:
//   nbState: the number of states
// Output:
//   Return a CapyMDPPolicy (by value)
CapyMDPPolicy CapyMDPPolicyCreate(size_t const nbState);

// Allocate memory for a new CapyMDPPolicy and create it
// Input:
//   nbState: the number of states
// Output:
//   Return a pointer to the new CapyMDPPolicy (see CapyMDPPolicyFree to
//   free it)
// Exception:
//   May raise CapyExc_MallocFailed.
CapyMDPPolicy* CapyMDPPolicyAlloc(size_t const nbState);

// Free the memory used by a CapyMDPPolicy* and reset '*that' to NULL
// Input:
//   that: the address of the pointer to the CapyMDPPolicy to free
void CapyMDPPolicyFree(CapyMDPPolicy** const that);

// CapyMDPPolicyEpsilonSoft object
// Policy inheriting from CapyMDPPolicy and adding a random number generator,
// an epsilon constant and a number of actions.
typedef struct CapyMDPPolicyEpsilonSoft {

  // Inherits CapyMDPPolicy (the members listed in CapyMDPPolicyDef are
  // embedded here as an anonymous structure)
  struct CapyMDPPolicyDef;

  // Destructor for the parent class
  void (*destructCapyMDPPolicy)(void);

  // Random number generator
  CapyRandom rng;

  // Epsilon constant for the action selection
  double epsilon;

  // Number of actions
  size_t nbAction;
} CapyMDPPolicyEpsilonSoft;

// Create a new CapyMDPPolicyEpsilonSoft
// Input:
//   nbState: the number of states
//   nbAction: the number of actions
//   epsilon: the epsilon constant for the action selection
// Output:
//   Return a CapyMDPPolicyEpsilonSoft (by value)
CapyMDPPolicyEpsilonSoft CapyMDPPolicyEpsilonSoftCreate(
  size_t const nbState,
  size_t const nbAction,
  double const epsilon);

// Allocate memory for a new CapyMDPPolicyEpsilonSoft and create it
// Input:
//   nbState: the number of states
//   nbAction: the number of actions
//   epsilon: the epsilon constant for the action selection
// Output:
//   Return a pointer to the new CapyMDPPolicyEpsilonSoft (see
//   CapyMDPPolicyEpsilonSoftFree to free it)
// Exception:
//   May raise CapyExc_MallocFailed.
CapyMDPPolicyEpsilonSoft* CapyMDPPolicyEpsilonSoftAlloc(
  size_t const nbState,
  size_t const nbAction,
  double const epsilon);

// Free the memory used by a CapyMDPPolicyEpsilonSoft* and reset '*that' to NULL
// Input:
//   that: the address of the pointer to the CapyMDPPolicyEpsilonSoft to free
void CapyMDPPolicyEpsilonSoftFree(CapyMDPPolicyEpsilonSoft** const that);

// MarkovDecisionProcess transition definition
// One transition is identified by its origin state, its action and its
// termination state (see CapyMarkovDecisionProcess.getTransition).
typedef struct CapyMDPTransition {

  // Index of the origin state of the transition
  size_t fromState;

  // Index of the action of the transition
  size_t action;

  // Index of the termination state of the transition
  size_t toState;

  // Probability of the transition
  double prob;

  // Reward for transitioning through that transition
  double reward;

  // Action value of the transition
  double value;

  // Number of times this transition has been visited
  size_t nbOccurence;

  // Number of times a transition with the same 'fromState' and 'action' has
  // been visited
  size_t nbVisit;
} CapyMDPTransition;

// MarkovDecisionProcess transition recorder definition
// Growable array of copies of transitions (see
// CapyMarkovDecisionProcess.recordTrajectory for one of its uses).
typedef struct CapyMDPTransitionRecorder {

  // Number of recorded transitions
  size_t nbTransition;

  // Size of the recorder memory (in number of transitions)
  size_t nbMaxTransition;

  // Recorded transitions
  CapyMDPTransition* transitions;

  // Destructor
  void (*destruct)(void);

  // Reset the recorder
  // Output:
  //   'nbTransition' is reset to 0.
  void (*reset)(void);

  // Record one transition
  // Input:
  //   transition: the transition to be recorded
  // Output:
  //   A copy of the transition is added to the end of 'transitions', which
  //   is reallocated if necessary. 'nbTransition' and 'nbMaxTransition' are
  //   updated as necessary.
  void (*addTransition)(CapyMDPTransition const* const transition);
} CapyMDPTransitionRecorder;

// Create a CapyMDPTransitionRecorder
// Output:
//   Return a CapyMDPTransitionRecorder (by value)
CapyMDPTransitionRecorder CapyMDPTransitionRecorderCreate(void);

// MarkovDecisionProcess environment definition
//
// Brace-enclosed list of the members of an environment, used to define
// CapyMDPEnvironment. The members are documented here rather than in the
// macro body.
//
// Destructor
// void (*destruct)(void);
//
// Get the result state for a given state and action
// Input:
//   fromState: the 'from' state
//   action: the applied action
// Output:
//   Return the result state
// size_t (*step)(
//   size_t const fromState,
//   size_t const action);
#define CapyMDPEnvironmentDef {                  \
  void (*destruct)(void);                        \
  size_t (*step)(                                \
    size_t const fromState,                      \
    size_t const action);                        \
}

// CapyMDPEnvironment object (structure whose members are those listed in
// CapyMDPEnvironmentDef)
typedef struct CapyMDPEnvironment CapyMDPEnvironmentDef CapyMDPEnvironment;

// Create a CapyMDPEnvironment
// Output:
//   Return a CapyMDPEnvironment (by value)
CapyMDPEnvironment CapyMDPEnvironmentCreate(void);

// Allocate memory for a new CapyMDPEnvironment and create it
// Output:
//   Return a pointer to the new CapyMDPEnvironment (see
//   CapyMDPEnvironmentFree to free it)
// Exception:
//   May raise CapyExc_MallocFailed.
CapyMDPEnvironment* CapyMDPEnvironmentAlloc(void);

// Free the memory used by a CapyMDPEnvironment* and reset '*that' to NULL
// Input:
//   that: the address of the pointer to the CapyMDPEnvironment to free
void CapyMDPEnvironmentFree(CapyMDPEnvironment** const that);

// MarkovDecisionProcess object
// Holds the definition of the process (states, actions, transitions, start
// and end states), its current stepping state, the parameters of the policy
// search, and the methods operating on them.
typedef struct CapyMarkovDecisionProcess {

  // Number of states
  size_t nbState;

  // Number of actions
  size_t nbAction;

  // Number of transitions (nbState * nbAction * nbState)
  size_t nbTransition;

  // Transitions definition
  CapyMDPTransition* transitions;

  // Start state flags
  bool* flagStartStates;

  // End state flags
  bool* flagEndStates;

  // Optimal policy
  CapyMDPPolicy optimalPolicy;

  // Index of the current state (default: 0)
  size_t curState;

  // Number of steps executed (reset by setCurState() and incremented by
  // step())
  size_t nbStep;

  // Maximum number of steps, to avoid infinite stepping (default: 1e9)
  size_t nbMaxStep;

  // Pseudo random generator to step the process (initialised with current time)
  CapyRandom rng;

  // Discount factor (default: 0.9, in [0,1]; the lower it is, the more
  // influential the near future rewards are compared to the far future ones)
  double discount;

  // Epsilon value for convergence during the search for the optimal policy
  // (default: 1e-6)
  double epsilon;

  // Flag to select between "first visit" and "every visit" during the
  // Monte Carlo search (default: false)
  bool flagEveryVisit;
  CapyPad(bool, flagEveryVisit);

  // Reference to the environment modelled by the MDP (default: NULL)
  CapyMDPEnvironment* environment;

  // Destructor
  void (*destruct)(void);

  // Get a transition
  // Input:
  //   fromState: index of the origin state
  //   action: index of the action
  //   toState: index of the termination state
  // Output:
  //   Return a reference to the transition
  CapyMDPTransition* (*getTransition)(
    size_t const fromState,
    size_t const action,
    size_t const toState);

  // Set the current state
  // Input:
  //   state: index of the current state
  // Output:
  //   The current state is set and the number of steps is reset
  void (*setCurState)(size_t const state);

  // Get the current state
  // Output:
  //   Return the index of the current state
  size_t (*getCurState)(void);

  // Get the number of steps
  // Output:
  //   Return the number of steps
  size_t (*getNbStep)(void);

  // Step the MDP according to its transitions definition
  // Output:
  //   The current state and the number of steps are updated. Return the
  //   transition.
  CapyMDPTransition* (*step)(void);

  // Step the MDP according to a given policy
  // Input:
  //   policy: the policy
  // Output:
  //   The current state and the number of steps are updated. Return the
  //   transition. If the MDP's environment is known it is used to get the
  //   result state.
  CapyMDPTransition* (*stepPolicy)(
    CapyMDPPolicy const* const policy);

  // Initialise the pseudo random generator
  // Input:
  //   seed: the seed
  // Output:
  //   The pseudo random generator is reset.
  void (*resetRng)(CapyRandomSeed_t const seed);

  // Search the optimal policy (given that the MDP's transitions are all set
  // with the correct transition probabilities and rewards)
  // Output:
  //   Calculate the optimal policy, update 'optimalPolicy' which is also
  //   used as the initial policy for the search
  void (*searchOptimalPolicy)(void);

  // Get the expected sum of rewards
  // Input:
  //   nbRun: number of runs used to calculate the expected reward
  // Output:
  //   Return the expected sum of rewards, or 0.0 and raise
  //   CapyExc_UndefinedExecution if the MDP can't reach an end state within
  //   'that->nbMaxStep' steps. The start state is selected at random. The
  //   transitions are selected randomly according to their probabilities.
  double (*getExpReward)(size_t const nbRun);

  // Get the expected sum of rewards from a given start state
  // Input:
  //   fromState: the start state
  //   nbRun: number of runs used to calculate the expected reward
  // Output:
  //   Return the expected sum of rewards, or 0.0 and raise
  //   CapyExc_UndefinedExecution if the MDP can't reach an end state within
  //   'that->nbMaxStep' steps. The transitions are selected randomly
  //   according to their probabilities.
  double (*getExpRewardFromState)(
    size_t const fromState,
    size_t const nbRun);

  // Get the expected sum of rewards using a given policy
  // Input:
  //   nbRun: number of runs used to calculate the expected reward
  //   policy: the policy
  // Output:
  //   Return the expected sum of rewards, or 0.0 and raise
  //   CapyExc_UndefinedExecution if the MDP can't reach an end state within
  //   'that->nbMaxStep' steps. The start state is selected at random. The
  //   transitions are selected according to the given policy.
  double (*getExpRewardForPolicy)(
                  size_t const nbRun,
    CapyMDPPolicy const* const policy);

  // Get the expected sum of rewards from a given start state using a given
  // policy
  // Input:
  //   fromState: the start state
  //   nbRun: number of runs used to calculate the expected reward
  //   policy: the policy
  // Output:
  //   Return the expected sum of rewards, or 0.0 and raise
  //   CapyExc_UndefinedExecution if the MDP can't reach an end state within
  //   'that->nbMaxStep' steps. The transitions are selected according to the
  //   given policy.
  double (*getExpRewardFromStateForPolicy)(
                  size_t const fromState,
                  size_t const nbRun,
    CapyMDPPolicy const* const policy);

  // Record a trajectory through the MDP given an initial state and a policy
  // Input:
  //   recorder: the recorder
  //   startState: the initial state of the trajectory
  //   policy: the policy used to select transitions
  // Output:
  //   The recorder is reset and updated with the trajectory. The trajectory
  //   stops when encountering an end state, or when it reaches
  //   'that->nbMaxStep'. The current state of the MDP is modified.
  void (*recordTrajectory)(
    CapyMDPTransitionRecorder* const recorder,
                        size_t const startState,
          CapyMDPPolicy const* const policy);

  // Search the optimal policy using Q-Learning (converges to the optimal
  // policy by exploring the environment instead of using the transition
  // probabilities, only needs the transition rewards; uses an
  // epsilon-soft policy to explore the transitions)
  // Input:
  //   epsilon: exploration coefficient (in ]0, 1])
  //   alpha: learning rate (in ]0, 1])
  //   nbEpisode: number of training episodes
  // Output:
  //   Calculate the optimal policy, update 'optimalPolicy' which is also
  //   used as the initial policy for the search.
  void (*qLearning)(
    double const epsilon,
    double const alpha,
    size_t const nbEpisode);

  // Get a random start state
  // Output:
  //   Return one of the start states. If there are no start states, return 0 by
  //   default.
  size_t (*getRndStartState)(void);
} CapyMarkovDecisionProcess;

// Create a CapyMarkovDecisionProcess
// Input:
//   nbState: the number of states
//   nbAction: the number of actions
// Output:
//   Return a CapyMarkovDecisionProcess (by value)
CapyMarkovDecisionProcess CapyMarkovDecisionProcessCreate(
  size_t const nbState,
  size_t const nbAction);

// Allocate memory for a new CapyMarkovDecisionProcess and create it
// Input:
//   nbState: the number of states
//   nbAction: the number of actions
// Output:
//   Return a pointer to the new CapyMarkovDecisionProcess (see
//   CapyMarkovDecisionProcessFree to free it)
// Exception:
//   May raise CapyExc_MallocFailed.
CapyMarkovDecisionProcess* CapyMarkovDecisionProcessAlloc(
  size_t const nbState,
  size_t const nbAction);

// Free the memory used by a CapyMarkovDecisionProcess* and reset '*that' to NULL
// Input:
//   that: the address of the pointer to the CapyMarkovDecisionProcess to free
void CapyMarkovDecisionProcessFree(CapyMarkovDecisionProcess** const that);
#endif
