// -------------------------------- dataset.h -------------------------------
/*
    LibCapy - a general purpose library of C functions and data structures
    Copyright (C) 2021-2025 Pascal Baillehache baillehache.pascal@gmail.com
    https://baillehachepascal.dev
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License
    along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef CAPY_DATASET_H
#define CAPY_DATASET_H
#include "externalHeaders.h"
#include "cext.h"
#include "capymath.h"
#include "array.h"
#include "pointCloud.h"

// Description:
// Dataset class.

// Type of the values held by a field.
// num: numerical value
// cat: categorical value
// datetime1: datetime formatted as dd-mm-yyyy hh:ii
// datetime2: datetime formatted as hh:ii
typedef enum CapyDatasetFieldType {
  capyDatasetFieldType_num,
  capyDatasetFieldType_cat,
  capyDatasetFieldType_datetime1,
  capyDatasetFieldType_datetime2,
} CapyDatasetFieldType;

// Interface of a field: tells whether the field is an input field or an
// output field of the dataset (cf. getNbInput and getNbOutput in
// CapyDataset).
typedef enum CapyDatasetFieldInterface {
  capyDatasetFieldInterface_in,
  capyDatasetFieldInterface_out,
} CapyDatasetFieldInterface;

// Range of row indices.
// Declared through the CapyDecRange macro, presumably with size_t as the
// type of the range bounds (TODO: confirm against the macro definition).
CapyDecRange(DatasetIdxRow, size_t)

// Dataset row object: one data row of a dataset, stored as the raw string
// of the row plus one pointer per field.
typedef struct CapyDatasetRow {

  // Original string of the row, with commas replaced with \0.
  char* str;

  // Pointers to each field in the row.
  char** fields;

  // Index of the row in the dataset, starting at 0 and not counting the
  // header lines.
  size_t idx;

  // Flag to memorise if the row contains null/unknown values.
  bool flagNullValue;
  // NOTE(review): CapyPad is a macro defined elsewhere, presumably adding
  // explicit padding after 'flagNullValue' (confirm against its definition).
  CapyPad(bool, flagNullValue);

  // Number of fields with a null/unknown value in that row.
  size_t nbFieldWithNullValue;

  // Destructor
  void (*destruct)(void);
} CapyDatasetRow;

// Create a CapyDatasetRow.
// Output:
//   Return the created CapyDatasetRow (by value).
CapyDatasetRow CapyDatasetRowCreate(void);

// Allocate memory for a new CapyDatasetRow and create it.
// Output:
//   Return a pointer to the newly allocated CapyDatasetRow (presumably to
//   be released with CapyDatasetRowFree).
// Exception:
//   May raise CapyExc_MallocFailed.
CapyDatasetRow* CapyDatasetRowAlloc(void);

// Free the memory used by a CapyDatasetRow* and reset '*that' to NULL.
// Input:
//   that: the address of the pointer to the CapyDatasetRow to free
// Output:
//   '*that' is reset to NULL.
void CapyDatasetRowFree(CapyDatasetRow** const that);

// Field description struct: label, type, interface and statistics of one
// field of a dataset.
typedef struct CapyDatasetFieldDesc {

  // Pointer to the field label.
  char* label;

  // Field type.
  CapyDatasetFieldType type;

  // Field interface (input or output).
  CapyDatasetFieldInterface interface;

  // Index of the field in the row.
  size_t idx;

  // For fields of categorical type, number of values in the category;
  // for fields of numerical type, number of rows in the dataset.
  size_t nbCategoryVal;

  // For fields of categorical type, array of pointers to the category's
  // values.
  char** categoryVals;

  // Range of values (converted to numerical if the field is not numerical)
  CapyRangeDouble range;

  // Average value (converted to numerical if the field is not numerical)
  double avgVal;

  // Median value (converted to numerical if the field is not numerical)
  double medianVal;

  // Flag to memorise if the field contains null/unknown values.
  bool flagNullValue;
  // NOTE(review): CapyPad is a macro defined elsewhere, presumably adding
  // explicit padding after 'flagNullValue' (confirm against its definition).
  CapyPad(bool, flagNullValue);

  // Number of rows with a null/unknown value in that field.
  size_t nbRowWithNullValue;

  // Destructor
  void (*destruct)(void);
} CapyDatasetFieldDesc;

// Create a CapyDatasetFieldDesc.
// Output:
//   Return the created CapyDatasetFieldDesc (by value).
CapyDatasetFieldDesc CapyDatasetFieldDescCreate(void);

// Allocate memory for a new CapyDatasetFieldDesc and create it.
// Output:
//   Return a pointer to the newly allocated CapyDatasetFieldDesc (presumably
//   to be released with CapyDatasetFieldDescFree).
// Exception:
//   May raise CapyExc_MallocFailed.
CapyDatasetFieldDesc* CapyDatasetFieldDescAlloc(void);

// Free the memory used by a CapyDatasetFieldDesc* and reset '*that' to NULL.
// Input:
//   that: the address of the pointer to the CapyDatasetFieldDesc to free
// Output:
//   '*that' is reset to NULL.
void CapyDatasetFieldDescFree(CapyDatasetFieldDesc** const that);

// Dataset object: rows of comma separated fields, plus a description of each
// field (interface, type, label, statistics) and methods to query and convert
// the data.
typedef struct CapyDataset {

  // Number of rows.
  size_t nbRow;

  // Number of rows with at least one null/unknown value.
  size_t nbRowWithNullValue;

  // Number of fields in each row.
  size_t nbField;

  // Number of fields with at least one null/unknown value.
  size_t nbFieldWithNullValue;

  // Fields interface row, with commas replaced with \0.
  char* fieldInterfaceStr;

  // Fields type row, with commas replaced with \0.
  char* fieldTypeStr;

  // Fields label row, with commas replaced with \0.
  char* fieldLabelStr;

  // Fields description.
  CapyDatasetFieldDesc* fields;

  // Array of rows.
  CapyDatasetRow* rows;

  // Number of threads for multithreaded operation (default: 10)
  size_t nbThread;

  // Destructor
  void (*destruct)(void);

  // Load the dataset from a file at a given path
  // Input:
  //   path: path to the dataset file
  // Exception:
  //   May raise CapyExc_MallocFailed, CapyExc_StreamReadError,
  //   CapyExc_InvalidStream.
  void (*loadFromPath)(char const* const path);

  // Print the description of the dataset on a given stream.
  // Input:
  //   stream: the stream to print onto
  void (*print)(FILE* const stream);

  // Print the data of the dataset on a given stream.
  // Inputs:
  //   stream: the stream to print onto
  //   nbRow: if not 0, print the first nbRow rows only
  void (*printData)(
     FILE* const stream,
    size_t const nbRow);

  // Get the number of input fields
  // Output:
  //   Return the number of input fields
  size_t (*getNbInput)(void);

  // Get the number of output fields
  // Output:
  //   Return the number of output fields
  size_t (*getNbOutput)(void);

  // Get the number of fields of a given type
  // Input:
  //   type: the type of field to be counted
  // Output:
  //   Return the number of fields of that type
  size_t (*getNbFieldOfType)(CapyDatasetFieldType const type);

  // Get the field index of the i-th input
  // Input:
  //   iInput: index of the input
  // Output:
  //   Return the index of the field
  // Exception:
  //   May raise CapyExc_InvalidElemIdx.
  size_t (*getIdxInputField)(size_t const iInput);

  // Get the field index of the i-th output
  // Input:
  //   iOutput: index of the output
  // Output:
  //   Return the index of the field
  // Exception:
  //   May raise CapyExc_InvalidElemIdx.
  size_t (*getIdxOutputField)(size_t const iOutput);

  // Get a value as a numeral.
  // Inputs:
  //   iRow: index of the row
  //   iField: index of the field
  // Output:
  //   For numerical fields return the value as it is, for categorical fields
  //   return the index of the value in the list of possible values
  //   (fieldDesc.categoryVals)
  // Exception:
  //   May raise CapyExc_UndefinedExecution, CapyExc_InvalidParameters.
  double (*getValAsNum)(
    size_t const iRow,
    size_t const iField);

  // Get a value as a uint64 encoded date.
  // Inputs:
  //   iRow: index of the row
  //   iField: index of the field
  // Output:
  //   Return the uint64 encoded date (see CapyDate)
  // Exception:
  //   May raise CapyExc_UndefinedExecution, CapyExc_InvalidParameters.
  uint64_t (*getValAsUInt64Date)(
    size_t const iRow,
    size_t const iField);

  // Get a value as a normalised numeral.
  // Inputs:
  //   iRow: index of the row
  //   iField: index of the field
  // Output:
  //   Return the value, converted to numerical if the field is categorical,
  //   after normalisation according to the 'range' property of the field
  //   description.
  // Exception:
  //   May raise CapyExc_UndefinedExecution.
  double (*getValAsNormalisedNum)(
    size_t const iRow,
    size_t const iField);

  // Convert a dataset to a matrix to be used by a single category predictor
  // Input:
  //   iOutput: index of the output
  // Output:
  //   Return a matrix with as many rows as there are rows in the dataset, and
  //   as many columns as there are inputs in the dataset plus one. The output
  //   must be of type capyDatasetFieldType_cat. Input values are converted
  //   using getValAsNum. The output value is assigned to the last
  //   column in the matrix, and is equal to the category index.
  // Exception:
  //   May raise CapyExc_UnsupportedFormat.
  CapyMat (*cvtToMatForSingleCatPredictor)(size_t const iOutput);

  // Convert a dataset to a matrix to be used by a numerical predictor
  // Input:
  //   iOutput: index of the output
  // Output:
  //   Return a matrix with as many rows as there are rows in the dataset, and
  //   as many columns as there are inputs in the dataset plus one. The output
  //   must be of type capyDatasetFieldType_num. Input and output values are
  //   converted using getValAsNum. The output value is assigned to the last
  //   column in the matrix.
  // Exception:
  //   May raise CapyExc_UnsupportedFormat.
  CapyMat (*cvtToMatForNumericalPredictor)(size_t const iOutput);

  // Get the number of different values for a given output
  // Input:
  //   iOutput: index of the output
  // Output:
  //   Return the number of different values for a categorical output field
  //   or the number of rows for a numerical output field
  size_t (*getNbValOutputField)(size_t const iOutput);

  // Convert a dataset to a matrix to be used by a one hot predictor
  // Input:
  //   iOutput: index of the output
  // Output:
  //   Return a matrix with as many rows as there are rows in the dataset, and
  //   as many columns as there are inputs in the dataset plus as many values
  //   as the given output takes. The output must be of type
  //   capyDatasetFieldType_cat. Input values are converted using
  //   getValAsNum. The one hot encoding of the output value is
  //   assigned to the last columns in the matrix, and takes values 0 for 'is
  //   not this category' and 1 for 'is this category'.
  // Exception:
  //   May raise CapyExc_UnsupportedFormat.
  CapyMat (*cvtToMatForOneHotPredictor)(size_t const iOutput);

  // Get the index of a field from its name
  // Input:
  //   name: the name of the field
  // Output:
  //   Return the index of the field.
  // Exception:
  //   Raise CapyExc_InvalidParameters if the field couldn't be found.
  size_t (*getIdxFieldFromName)(char const* const name);

  // Get the distribution of a field values as an array of bins.
  // Inputs:
  //   iField: the index of the field
  //   nbBin: the number of bins
  // Output:
  //   Return a CapyArrSize of size 'nbBin'.
  CapyArrSize* (*getDistAsBins)(
    size_t const iField,
    size_t const nbBin);

  // Get the distribution of a field values as an array of bins for a given
  // value of a given categorical field
  // Inputs:
  //   iField: the index of the field
  //   nbBin: the number of bins
  //   iCatField: the index of the categorical field
  //   valCatField: the value of the categorical field
  // Output:
  //   Return a CapyArrSize of size 'nbBin'.
  CapyArrSize* (*getDistAsBinsGivenCatValue)(
         size_t const iField,
         size_t const nbBin,
         size_t const iCatField,
    char const* const valCatField);

  // Get the number of rows containing a particular value for a given field.
  // Inputs:
  //   iField: the filtering field
  //   valField: the filtering value
  // Output:
  //   Return the number of rows.
  size_t (*getNbRowContainingVal)(
         size_t const iField,
    char const* const valField);

  // Get the pairs of values from two fields as two vectors.
  // Inputs:
  //   iField: the first field
  //   jField: the second field
  //   u: the vector receiving values from the first field
  //   v: the vector receiving values from the second field
  // Output:
  //   'u' and 'v' are destructed, created afresh and populated with values.
  //   Rows with null value in one or the other field are ignored.
  void (*getValuesFromTwoFieldsAsVectors)(
      size_t const iField,
      size_t const jField,
    CapyVec* const u,
    CapyVec* const v);

  // Check if a value is a null/unknown value
  // Input:
  //   val: the value to check
  // Output:
  //   Return true if the value is considered to be null/unknown. A null/unknown
  //   value is the empty string or a string equal to "nan" (case insensitive).
  bool (*isNullValue)(char const* const val);

  // Convert the dataset into a point cloud
  // Output:
  //   Return a CapyPointCloud of dimension equal to the number of fields
  //   and number of points equal to the number of samples. All values are
  //   converted to numerical values.
  CapyPointCloud* (*toPointCloud)(void);
} CapyDataset;

// Create a CapyDataset.
// Output:
//   Return the created CapyDataset (by value).
CapyDataset CapyDatasetCreate(void);

// Allocate memory for a new CapyDataset and create it.
// Output:
//   Return a pointer to the newly allocated CapyDataset (presumably to be
//   released with CapyDatasetFree).
// Exception:
//   May raise CapyExc_MallocFailed.
CapyDataset* CapyDatasetAlloc(void);

// Free the memory used by a CapyDataset* and reset '*that' to NULL.
// Input:
//   that: the address of the pointer to the CapyDataset to free
// Output:
//   '*that' is reset to NULL.
void CapyDatasetFree(CapyDataset** const that);
#endif
