// ----------------------------- policygradient.h ----------------------------
/*
    LibCapy - a general purpose library of C functions and data structures
    Copyright (C) 2021-2025 Pascal Baillehache info@baillehachepascal.dev
    https://baillehachepascal.dev
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License
    along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef CAPY_POLICYGRADIENT_H
#define CAPY_POLICYGRADIENT_H
#include "externalHeaders.h"
#include "cext.h"
#include "capymath.h"
#include "mathfun.h"
#include "gradientDescent.h"
#include "random.h"

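// Description:
// PolicyGradient class: training of an environment's action probabilities
// and state value parameters with the reinforce with baseline algorithm or
// the proximal policy optimisation algorithm.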

// PG (policy gradient) environment transition: one step of the environment,
// described by the state it started from, the applied action, the state it
// ended in and the associated reward.
typedef struct CapyPGTransition {

  // 'from' state (features value of the state the transition starts from)
  // NOTE(review): presumably sized with the 'nbFeature' given to
  // CapyPGTransitionCreate - confirm in the implementation
  CapyVec fromState;

  // Action applied during the transition
  // NOTE(review): presumably an index in [0, nbAction[ - confirm
  size_t action;

  // 'to' state (features value of the state the transition ends in)
  CapyVec toState;

  // Reward associated to the transition
  double reward;
} CapyPGTransition;

// Create a CapyPGTransition
// Input:
//   nbFeature: number of features describing a state of the environment
// Output:
//   Return a CapyPGTransition
// NOTE(review): the returned transition presumably owns its 'fromState' and
// 'toState' vectors and should be released with CapyPGTransitionDestruct -
// confirm in the implementation.
CapyPGTransition CapyPGTransitionCreate(size_t const nbFeature);

// Destruct a CapyPGTransition
// Input:
//   that: a pointer to the CapyPGTransition to destruct
void CapyPGTransitionDestruct(CapyPGTransition* const that);

// PolicyGradient environment definition
//
// The macro below expands to the brace-enclosed member list of a
// PolicyGradient environment structure. The members are documented here,
// in the order in which they appear in the macro.
// NOTE(review): the member list is presumably kept in a macro so that other
// structures can reuse it - confirm against the users of this header.
// NOTE(review): the methods take no explicit pointer to the instance; the
// binding to the instance is presumably provided by the mechanisms of
// cext.h - confirm.
//
// Random number generator
// CapyRandom rng;
//
// Number of possible actions
// size_t nbAction;
//
// Parameters for actions probability evaluation
// CapyVec paramAction;
//
// Parameters for value evaluation
// CapyVec paramValue;
//
// Current state features value
// CapyVec curState;
//
// Output vector for the actions probability evaluation
// CapyVec actionsProb;
//
// Destructor
// void (*destruct)(void);
//
// Set the current state to an initial state
// Output:
//   The current state is set to an initial state
// void (*setToInitialState)(void);
//
// Step the environment
// Input:
//   action: the applied action
// Output:
//   Update the current state according to the action, and return the
//   transition
// CapyPGTransition (*step)(size_t const action);
//
// Check if the current state is an end state
// Output:
//   Return true if the current state is an end state, else false
// bool (*isEndState)(void);
//
// Get an action for a given state according to the actions' probabilities
// Input:
//   state: the state to use for evaluation
// Output:
//   Return the selected action.
// size_t (*getAction)(CapyVec const* const state);
//
// Get the action with highest probability for a given state
// Input:
//   state: the state to use for evaluation
// Output:
//   Return the selected action.
// size_t (*getBestAction)(CapyVec const* const state);
//
// Evaluate the action probabilities
// Input:
//   state: the state used for evaluation
//   actionsProb: the evaluated actions probability
// Output:
//   'actionsProb' is updated.
// void (*getActionsProb)(
//   CapyVec const* const state,
//         CapyVec* const actionsProb);
//
// Evaluate the value
// Input:
//   state: the state used for evaluation
// Output:
//   Return the evaluated value
// double (*getValue)(CapyVec const* const state);
//
// Evaluate the gradient of values
// Input:
//   state: the state used for evaluation
//   gradValue: the result gradient
// Output:
//   'gradValue' is updated.
// void (*getGradientValue)(
//   CapyVec const* const state,
//         CapyVec* const gradValue);
//
// Evaluate the gradient of actions probability
// Input:
//   state: the state used for evaluation
//   iAction: the action to be evaluated
//   gradProb: the result gradient
// Output:
//   'gradProb' is updated.
// void (*getGradientActionsProb)(
//   CapyVec const* const state,
//           size_t const iAction,
//         CapyVec* const gradProb);
//
// Evaluate the gradient of actions log probability
// Input:
//   state: the state used for evaluation
//   iAction: the action to be evaluated
//   gradProb: the result gradient
// Output:
//   'gradProb' is updated.
// void (*getGradientActionsLogProb)(
//   CapyVec const* const state,
//           size_t const iAction,
//         CapyVec* const gradProb);
#define CapyPGEnvironmentDef {                          \
  CapyRandom rng;                                       \
  size_t nbAction;                                      \
  CapyVec paramAction;                                  \
  CapyVec paramValue;                                   \
  CapyVec curState;                                     \
  CapyVec actionsProb;                                  \
  void (*destruct)(void);                               \
  void (*setToInitialState)(void);                      \
  CapyPGTransition (*step)(size_t const action);        \
  bool (*isEndState)(void);                             \
  size_t (*getAction)(CapyVec const* const state);      \
  size_t (*getBestAction)(CapyVec const* const state);  \
  void (*getActionsProb)(                               \
    CapyVec const* const state,                         \
          CapyVec* const actionsProb);                  \
  double (*getValue)(CapyVec const* const state);       \
  void (*getGradientValue)(                             \
    CapyVec const* const state,                         \
          CapyVec* const gradValue);                    \
  void (*getGradientActionsProb)(                       \
    CapyVec const* const state,                         \
            size_t const iAction,                       \
          CapyVec* const gradProb);                     \
  void (*getGradientActionsLogProb)(                    \
    CapyVec const* const state,                         \
            size_t const iAction,                       \
          CapyVec* const gradProb);                     \
}

// CapyPGEnvironment object (a structure whose members are those listed in
// the CapyPGEnvironmentDef macro)
typedef struct CapyPGEnvironment CapyPGEnvironmentDef CapyPGEnvironment;

// Create a CapyPGEnvironment
// Input:
//   nbFeature: number of features describing a state of the environment
//   nbAction: number of possible actions
//   nbParamAction: number of parameters for actions probability evaluation
//   nbParamValue: number of parameters for value evaluation
//   seed: seed for the random number generator
// Output:
//   Return a CapyPGEnvironment
// NOTE(review): the arguments presumably give the dimensions of 'curState',
// 'actionsProb', 'paramAction' and 'paramValue' respectively, and 'seed'
// presumably initialises 'rng' - confirm in the implementation.
CapyPGEnvironment CapyPGEnvironmentCreate(
            size_t const nbFeature,
            size_t const nbAction,
            size_t const nbParamAction,
            size_t const nbParamValue,
  CapyRandomSeed_t const seed);

// Allocate memory for a new CapyPGEnvironment and create it
// Input:
//   nbFeature: number of features describing a state of the environment
//   nbAction: number of possible actions
//   nbParamAction: number of parameters for actions probability evaluation
//   nbParamValue: number of parameters for value evaluation
//   seed: seed for the random number generator
// Output:
//   Return a pointer to the new CapyPGEnvironment (see CapyPGEnvironmentFree
//   to free it)
// Exception:
//   May raise CapyExc_MallocFailed.
CapyPGEnvironment* CapyPGEnvironmentAlloc(
            size_t const nbFeature,
            size_t const nbAction,
            size_t const nbParamAction,
            size_t const nbParamValue,
  CapyRandomSeed_t const seed);

// Free the memory used by a CapyPGEnvironment* and reset '*that' to NULL
// Input:
//   that: a pointer to the pointer to the CapyPGEnvironment to free
// Output:
//   The memory is freed and '*that' is reset to NULL.
void CapyPGEnvironmentFree(CapyPGEnvironment** const that);

// Structure to record PGEnvironment transitions (a growable array of
// CapyPGTransition)
typedef struct CapyPGTransitionRecorder {

  // Number of transitions currently recorded
  size_t nbTransition;

  // Size of the recorder memory (in number of transitions)
  size_t nbMaxTransition;

  // Recorded transitions (the first 'nbTransition' entries are the
  // recorded ones)
  CapyPGTransition* transitions;

  // Destructor
  void (*destruct)(void);

  // Reset the recorder
  // Output:
  //   'nbTransition' is reset to 0.
  // NOTE(review): whether the memory of 'transitions' is kept for reuse is
  // not visible here - confirm in the implementation.
  void (*reset)(void);

  // Record one transition
  // Input:
  //   transition: the transition to be recorded
  // Output:
  //   A copy of the transition is added to the end of 'transitions', which
  //   is reallocated if necessary; 'nbTransition' and 'nbMaxTransition' are
  //   updated as necessary.
  void (*addTransition)(CapyPGTransition const* const transition);
} CapyPGTransitionRecorder;

// Create a CapyPGTransitionRecorder
// Output:
//   Return a CapyPGTransitionRecorder
// NOTE(review): the returned recorder is presumably empty ('nbTransition'
// equal to 0) and to be released through its 'destruct' method - confirm in
// the implementation.
CapyPGTransitionRecorder CapyPGTransitionRecorderCreate(void);

// PolicyGradient object: holds the training hyperparameters, the training
// statistics and the training methods for a CapyPGEnvironment
typedef struct CapyPolicyGradient {

  // The trained environment
  // NOTE(review): ownership is not visible here; confirm whether 'destruct'
  // releases the environment or leaves it to the caller.
  CapyPGEnvironment* env;

  // Learning rate for action probabilities (in ]0,1], should be small,
  // default: 0.01)
  double learnRateAction;

  // Learning rate for state value (in ]0,1], should be small,
  // default: 0.01)
  double learnRateState;

  // Discount rate (in ]0,1], default: 0.9)
  double discount;

  // Max number of steps when sampling a trajectory (default: 1000)
  size_t nbMaxStep;

  // Average reward during training
  double avgReward;

  // Average final reward during training
  double avgFinalReward;

  // Average number of steps per episode during training
  double avgNbStep;

  // Clipping coefficient for PPO (in ]0,+inf[, default: 0.2, the lower the
  // more stable but the slower the learning)
  double coeffClipping;

  // Gradient descent for the action probabilities (adam)
  CapyGradientDescent* gdAction;

  // Gradient descent for the state values (standard)
  CapyGradientDescent* gdValue;

  // Destructor
  void (*destruct)(void);

  // Learn the weights of the action probabilities and state value functions
  // using the reinforce with baseline algorithm
  // Input:
  //   nbEpisode: number of training episodes
  // Output:
  //   The environment's action probabilities parameters and state values
  //   parameters are updated.
  void (*reinforce)(size_t const nbEpisode);

  // Learn the weights of the action probabilities and state value functions
  // using the proximal policy optimisation (PPO) algorithm
  // Input:
  //   nbEpisode: number of training episodes
  // Output:
  //   The environment's action probabilities parameters and state values
  //   parameters are updated.
  void (*proximalPolicyOptimisation)(size_t const nbEpisode);
} CapyPolicyGradient;

// Create a CapyPolicyGradient
// Input:
//   env: the environment to train
// Output:
//   Return a CapyPolicyGradient
// NOTE(review): the hyperparameters are presumably initialised to the
// default values given in the CapyPolicyGradient structure's comments -
// confirm in the implementation.
CapyPolicyGradient CapyPolicyGradientCreate(CapyPGEnvironment* const env);

// Allocate memory for a new CapyPolicyGradient and create it
// Input:
//   env: the environment to train
// Output:
//   Return a pointer to the new CapyPolicyGradient (see
//   CapyPolicyGradientFree to free it)
// Exception:
//   May raise CapyExc_MallocFailed.
CapyPolicyGradient* CapyPolicyGradientAlloc(CapyPGEnvironment* const env);

// Free the memory used by a CapyPolicyGradient* and reset '*that' to NULL
// Input:
//   that: a pointer to the pointer to the CapyPolicyGradient to free
// Output:
//   The memory is freed and '*that' is reset to NULL.
void CapyPolicyGradientFree(CapyPolicyGradient** const that);
#endif
