// ------------------------------ dataset.c ------------------------------
/*
    LibCapy - a general purpose library of C functions and data structures
    Copyright (C) 2021-2025 Pascal Baillehache baillehache.pascal@gmail.com
    https://baillehachepascal.dev
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License
    along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "dataset.h"
#include "streamIo.h"
#include "date.h"
#include <inttypes.h>
#include <strings.h>

// Destructor of a CapyDatasetRow: release the two buffers owned by the row
// (the array of field pointers and the string they point into).
static void DatasetRowDestruct(void) {
  methodOf(CapyDatasetRow);

  // The two buffers are independent allocations, the order is irrelevant
  free(that->fields);
  free(that->str);
}

// Create a CapyDatasetRow
// Output:
//   Return a CapyDatasetRow
CapyDatasetRow CapyDatasetRowCreate(void) {
  return (CapyDatasetRow){
    .str = NULL,
    .fields = NULL,
    .idx = 0,
    .destruct = DatasetRowDestruct,
  };
}

// Allocate memory for a new CapyDatasetRow and create it
// Output:
//   Return a pointer to the new CapyDatasetRow, or NULL if the allocation
//   left the pointer null.
// Exception:
//   May raise CapyExc_MallocFailed.
CapyDatasetRow* CapyDatasetRowAlloc(void) {
  CapyDatasetRow* row = NULL;
  safeMalloc(row, 1);

  // Only initialise the row if the allocation succeeded
  if(row != NULL) *row = CapyDatasetRowCreate();
  return row;
}

// Free the memory used by a CapyDatasetRow* and reset '*that' to NULL
// Input:
//   that: a pointer to the CapyDatasetRow to free; nothing happens if
//         'that' or '*that' is NULL
void CapyDatasetRowFree(CapyDatasetRow** const that) {
  if(that != NULL && *that != NULL) {
    CapyDatasetRow* row = *that;

    // Release what the row owns, then the row itself
    $(row, destruct)();
    free(row);
    *that = NULL;
  }
}

// Destructor of a CapyDatasetFieldDesc: release the array of category
// values. The label is not freed here.
static void DatasetFieldDescDestruct(void) {
  methodOf(CapyDatasetFieldDesc);
  free(
    that->categoryVals);
}

// Create a CapyDatasetFieldDesc.
// Output:
//   Return a CapyDatasetFieldDesc.
CapyDatasetFieldDesc CapyDatasetFieldDescCreate(void) {
  CapyDatasetFieldDesc that = {
    .label = NULL,
    .type = 0,
    .interface = 0,
    .idx = 0,
    .nbCategoryVal = 0,
    .categoryVals = NULL,
    .destruct = DatasetFieldDescDestruct,
  };
  return that;
}

// Allocate memory for a new CapyDatasetFieldDesc and create it.
// Output:
//   Return a pointer to the new CapyDatasetFieldDesc, or NULL if the
//   allocation left the pointer null.
// Exception:
//   May raise CapyExc_MallocFailed.
CapyDatasetFieldDesc* CapyDatasetFieldDescAlloc(void) {
  CapyDatasetFieldDesc* desc = NULL;
  safeMalloc(desc, 1);

  // Only initialise the description if the allocation succeeded
  if(desc != NULL) *desc = CapyDatasetFieldDescCreate();
  return desc;
}

// Free the memory used by a CapyDatasetFieldDesc* and reset '*that' to NULL.
// Input:
//   that: a pointer to the CapyDatasetFieldDesc to free; nothing happens if
//         'that' or '*that' is NULL
void CapyDatasetFieldDescFree(CapyDatasetFieldDesc** const that) {
  if(that != NULL && *that != NULL) {
    CapyDatasetFieldDesc* desc = *that;

    // Release what the description owns, then the description itself
    $(desc, destruct)();
    free(desc);
    *that = NULL;
  }
}

// Split a string by a replacing its comma with '\0'
// Input:
//   str: the string to split
static void SplitAtCommas(char* const str) {
  char* ptr = str;
  while(ptr[0] != '\0') {
    if(ptr[0] == ',' || ptr[0] == '\r' || ptr[0] == '\n') ptr[0] = '\0';
    ++ptr;
  }
}

// Check if a value is a null/unknown value
// Input:
//   val: the value to check (a null terminated string, must not be NULL)
// Output:
//   Return true if the value is the empty string or is equal to "nan"
//   ignoring case, false otherwise.
static bool IsNullValue(char const* const val) {

  // The empty string stands for a missing value
  if(*val == '\0') return true;

  // Otherwise only the "nan" marker, in any case, is null/unknown
  return strcasecmp(val, "nan") == 0;
}

// Load the dataset from a file at a given path.
// The layout expected by the parsing below is:
//   line 0: number of fields
//   line 1: comma separated interface of each field ("in" or "out")
//   line 2: comma separated label of each field
//   line 3: comma separated type of each field ("cat", "num",
//           "datetime dd-mm-yyyy hh:ii" or "datetime hh:ii")
//   line 4: number of rows
//   line 5 and following: one row per line, values separated by commas
// After parsing, the list of category values, the range, the average and
// the median of each field are computed.
// Input:
//   path: path to the dataset file
// Exception:
//   May raise CapyExc_MallocFailed, CapyExc_StreamReadError,
//   CapyExc_InvalidStream.
static void LoadFromPath(char const* const path) {
  methodOf(CapyDataset);

  // Release whatever the dataset currently holds before loading
  $(that, destruct)();

  // Open the file
  CapyStreamIo file = CapyStreamIoCreate();
  $(&file, open)(path, "r");

  // Read all the lines from the file
  CapyListArrChar* lines = $(&file, readLines)();

  // Loop on the lines. The branches below are selected by line index and
  // are not written in index order (0, 4, 1, 3, 2, then the rows).
  forEach(line, lines->iter) {

    // Read the number of fields and allocate the field descriptions
    // NOTE(review): the result of sscanf is not checked, 'nbField' keeps its
    // previous value if the line does not start with a number.
    if(lines->iter.idx == 0) {
      sscanf(line.data, "%lu", &(that->nbField));
      safeMalloc(that->fields, that->nbField);
      if(that->fields) loop(i, that->nbField) {
        that->fields[i] = CapyDatasetFieldDescCreate();
        that->fields[i].idx = i;
        that->fields[i].nbCategoryVal = 0;
        that->fields[i].categoryVals = NULL;
      }

    // Read the number of rows, check it against the number of lines in the
    // file (5 header lines plus one line per row) and allocate the rows
    // NOTE(review): the result of sscanf is not checked here either.
    } else if(lines->iter.idx == 4) {
      sscanf(line.data, "%lu", &(that->nbRow));
      size_t nbLine = $(lines, getSize)();
      if(that->nbRow + 5 != nbLine) raiseExc(CapyExc_InvalidStream);
      safeMalloc(that->rows, that->nbRow);
      if(that->rows) loop(i, that->nbRow) {
        that->rows[i] = CapyDatasetRowCreate();
        safeMalloc(that->rows[i].fields, that->nbField);
      }

    // Read the interface of the fields. The line is copied, split in place
    // at the commas, then walked one token per field.
    } else if(lines->iter.idx == 1) {
      that->fieldInterfaceStr = strCreate("%s", line.data);
      SplitAtCommas(that->fieldInterfaceStr);
      char* ptr = that->fieldInterfaceStr;
      loop(idx, that->nbField) {
        if(strcmp(ptr, "in") == 0) {
          that->fields[idx].interface = capyDatasetFieldInterface_in;
        } else if(strcmp(ptr, "out") == 0) {
          that->fields[idx].interface = capyDatasetFieldInterface_out;
        } else raiseExc(CapyExc_InvalidStream);

        // Move to the character following the end of the current token
        while(ptr[0] != '\0') ++ptr;
        ++ptr;

        // An empty token while fields are still expected is an error
        // NOTE(review): if the line holds too few tokens, 'ptr' may already
        // be past the end of the copied string when it is read here.
        if(idx < that->nbField - 1 && ptr[0] == '\0') {
          raiseExc(CapyExc_InvalidStream);
        }
      }

    // Read the type of the fields, same parsing as for the interfaces
    } else if(lines->iter.idx == 3) {
      that->fieldTypeStr = strCreate("%s", line.data);
      SplitAtCommas(that->fieldTypeStr);
      char* ptr = that->fieldTypeStr;
      loop(idx, that->nbField) {
        if(strcmp(ptr, "cat") == 0) {
          that->fields[idx].type = capyDatasetFieldType_cat;
        } else if(strcmp(ptr, "num") == 0) {
          that->fields[idx].type = capyDatasetFieldType_num;
        } else if(strcmp(ptr, "datetime dd-mm-yyyy hh:ii") == 0) {
          that->fields[idx].type = capyDatasetFieldType_datetime1;
        } else if(strcmp(ptr, "datetime hh:ii") == 0) {
          that->fields[idx].type = capyDatasetFieldType_datetime2;
        } else raiseExc(CapyExc_InvalidStream);
        while(ptr[0] != '\0') ++ptr;
        ++ptr;
        if(idx < that->nbField - 1 && ptr[0] == '\0') {
          raiseExc(CapyExc_InvalidStream);
        }
      }

    // Read the label of the fields. The labels are not copied, they point
    // into 'fieldLabelStr'.
    } else if(lines->iter.idx == 2) {
      that->fieldLabelStr = strCreate("%s", line.data);
      SplitAtCommas(that->fieldLabelStr);
      char* ptr = that->fieldLabelStr;
      loop(idx, that->nbField) {
        that->fields[idx].label = ptr;
        while(ptr[0] != '\0') ++ptr;
        ++ptr;
        if(idx < that->nbField - 1 && ptr[0] == '\0') {
          raiseExc(CapyExc_InvalidStream);
        }
      }

    // Read the rows. The row index is the line index minus the 5 header
    // lines. The values are not copied, they point into the row's 'str'.
    // NOTE(review): unlike for the header lines, there is no check that the
    // line holds 'nbField' values before walking it.
    } else {
      if(lines->iter.idx - 5 >= that->nbRow) raiseExc(CapyExc_InvalidStream);
      that->rows[lines->iter.idx - 5].str = strCreate("%s", line.data);
      if(that->rows[lines->iter.idx - 5].str == NULL) {
        raiseExc(CapyExc_InvalidStream);
      }
      SplitAtCommas(that->rows[lines->iter.idx - 5].str);
      char* ptr = that->rows[lines->iter.idx - 5].str;
      loop(idx, that->nbField) {
        that->rows[lines->iter.idx - 5].fields[idx] = ptr;

        // Update the null/unknown value statistics: per row, per field and
        // for the whole dataset (each row and each field is counted once
        // at the dataset level thanks to its flag)
        bool isNullValue = $(that, isNullValue)(ptr);
        if(isNullValue) {
          if(that->rows[lines->iter.idx - 5].flagNullValue == false) {
            ++(that->nbRowWithNullValue);
            that->rows[lines->iter.idx - 5].flagNullValue = true;
          }
          ++(that->rows[lines->iter.idx - 5].nbFieldWithNullValue);
          if(that->fields[idx].flagNullValue == false) {
            ++(that->nbFieldWithNullValue);
            that->fields[idx].flagNullValue = true;
          }
          ++(that->fields[idx].nbRowWithNullValue);
        }
        while(ptr[0] != '\0') ++ptr;
        ++ptr;
      }
    }
  }

  // Update the values of fields with non-numerical type: collect the
  // distinct non null values of the field and store them in 'categoryVals'.
  // The stored pointers refer to the strings of the rows, they are not
  // copied. For numerical fields 'nbCategoryVal' is set to the number of
  // rows.
  loop (iField, that->nbField) {
    if(that->fields[iField].type != capyDatasetFieldType_num) {
      CapyListPtrChar listValues = CapyListPtrCharCreate();
      $(&listValues, initIterator)();
      loop(iRow, that->nbRow) {
        char* newVal = that->rows[iRow].fields[iField];
        bool isNullValue = $(that, isNullValue)(newVal);
        if(isNullValue == false) {

          // Add the value only if it is not already in the list
          bool flagFind = false;
          forEach(val, listValues.iter) {
            if(strcmp(val, newVal) == 0) flagFind = true;
          }
          if(flagFind == false) $(&listValues, add)(newVal);
        }
      }
      that->fields[iField].nbCategoryVal = $(&listValues, getSize)();
      safeMalloc(
        that->fields[iField].categoryVals,
        that->fields[iField].nbCategoryVal);

      // Move the values from the list to the array, in the order in which
      // the list pops them
      if(that->fields[iField].categoryVals) {
        loop(iVal, that->fields[iField].nbCategoryVal) {
          that->fields[iField].categoryVals[iVal] = $(&listValues, pop)();
        }
      }
      $(&listValues, destruct)();
    } else that->fields[iField].nbCategoryVal = that->nbRow;
  }

  // Update the value range, average and median of fields, using the non
  // null values converted with getValAsNum
  loop (iField, that->nbField) {
    double min = 0.0;
    double max = 0.0;
    double sum = 0.0;
    size_t nb = 0;

    // Buffer for the non null values, sized for the worst case (no null)
    CapyVec vals = CapyVecCreate(that->nbRow);
    bool firstVal = true;
    loop(iRow, that->nbRow) {
      bool isNullValue = $(that, isNullValue)(that->rows[iRow].fields[iField]);
      if(isNullValue == false) {
        double val = $(that, getValAsNum)(iRow, iField);
        if(firstVal || val < min) min = val;
        if(firstVal || val > max) max = val;
        firstVal = false;
        sum += val;
        vals.vals[nb] = val;
        nb += 1;
      }
    }
    if(nb > 0) {
      that->fields[iField].range = CapyRangeDoubleCreate(min, max);
      that->fields[iField].avgVal = sum / (double)nb;

      // Restrict the vector to the values actually set before selecting
      // the element of rank nb/2
      vals.dim = nb;
      that->fields[iField].medianVal = CapyVecQuickSelect(&vals, nb / 2);
    } else {

      // No usable value at all for that field
      that->fields[iField].range = CapyRangeDoubleCreate(NAN, NAN);
      that->fields[iField].avgVal = NAN;
      that->fields[iField].medianVal = NAN;
    }
    CapyVecDestruct(&vals);
  }

  // Free the memory used by the lines
  while($(lines, isEmpty)() == false) {
    CapyArrChar line = $(lines, pop)();
    $(&line, destruct)();
  }
  CapyListArrCharFree(&lines);

  // Close the file
  $(&file, destruct)();
}

// Print the description of the dataset on a given stream: the input fields
// then the output fields (label, type dependent statistics, number of
// unknown values), followed by the dataset level counters.
// Input:
//   stream: the stream to print onto
static void Print(FILE* const stream) {
  methodOf(CapyDataset);

  // Title of each section, indexed by the value of the field interface
  // (assumes capyDatasetFieldInterface_in is 0 and
  // capyDatasetFieldInterface_out is 1 - TODO confirm)
  char const* interfaces[2] = {"Inputs", "Outputs"};
  loop(iInterface, 2) {
    fprintf(stream, "%s:\n", interfaces[iInterface]);
    loop(iField, that->nbField) {
      CapyDatasetFieldDesc* const field = that->fields + iField;
      if(field->interface == (unsigned int)iInterface) {
        fprintf(stream, "%s, ", field->label);
        bool const isDate =
          field->type == capyDatasetFieldType_datetime1 ||
          field->type == capyDatasetFieldType_datetime2;
        if(field->type == capyDatasetFieldType_num) {

          // Numerical field: range, average and median
          fprintf(
            stream,
            "numerical values within [%lf,%lf], avg:%lf, med:%lf. ",
            field->range.min, field->range.max,
            field->avgVal, field->medianVal);
        } else if(field->type == capyDatasetFieldType_cat) {

          // Categorical field: list of values and median index
          fprintf(stream, "categorical values within {");
          loop(iVal, field->nbCategoryVal) {
            if(iVal > 0) fprintf(stream, ",");
            fprintf(stream, "%s", field->categoryVals[iVal]);
          }
          fprintf(stream, "}, med:%lu. ", (size_t)(field->medianVal));
        } else if(isDate) {

          // Date field: convert the bounds of the range back to dates
          CapyDate dateMin = CapyDateCreate();
          $(&dateMin, setDateFromNormalisedDouble)(field->range.min);
          CapyDate dateMax = CapyDateCreate();
          $(&dateMax, setDateFromNormalisedDouble)(field->range.max);
          fprintf(stream, "dates within [");
          $(&dateMin, print)(stream);
          fprintf(stream, ",");
          $(&dateMax, print)(stream);
          fprintf(stream, "]. ");
          $(&dateMin, destruct)();
          $(&dateMax, destruct)();
        }
        fprintf(stream, "%lu unknown values.\n", field->nbRowWithNullValue);
      }
    }
  }

  // Dataset level counters
  fprintf(
    stream, "%lu field(s) with null/unknown values.\n",
    that->nbFieldWithNullValue);
  fprintf(
    stream, "%lu rows. %lu row(s) with null/unknown values.\n",
    that->nbRow, that->nbRowWithNullValue);
}

// Print the data of the dataset on a given stream: one line with the
// labels of the fields, then one line per row, values separated by tabs.
// Input:
//   stream: the stream to print onto
//   nbRow: if not 0 print the first nbRow rows only. Values larger than the
//          number of rows in the dataset are clamped to that number.
static void PrintData(
   FILE* const stream,
  size_t const nbRow) {
  methodOf(CapyDataset);

  // Header line
  loop(iField, that->nbField) {
    fprintf(stream, "%s\t", that->fields[iField].label);
  }
  fprintf(stream, "\n");

  // Number of rows to print: all of them if nbRow is 0, and never more than
  // available to avoid reading past the end of 'rows'
  size_t n = nbRow;
  if(n == 0 || n > that->nbRow) n = that->nbRow;
  loop(iRow, n) {
    CapyDatasetRow* row = that->rows + iRow;
    loop(iField, that->nbField) fprintf(stream, "%s\t", row->fields[iField]);
    fprintf(stream, "\n");
  }
}

// Get the number of input fields
// Output:
//   Return the number of fields whose interface is
//   capyDatasetFieldInterface_in
static size_t GetNbInput(void) {
  methodOf(CapyDataset);
  size_t count = 0;
  loop(iField, that->nbField) {
    CapyDatasetFieldDesc const* const field = that->fields + iField;
    if(field->interface == capyDatasetFieldInterface_in) count += 1;
  }
  return count;
}

// Get the number of output fields
// Output:
//   Return the number of fields whose interface is
//   capyDatasetFieldInterface_out
static size_t GetNbOutput(void) {
  methodOf(CapyDataset);
  size_t count = 0;
  loop(iField, that->nbField) {
    CapyDatasetFieldDesc const* const field = that->fields + iField;
    if(field->interface == capyDatasetFieldInterface_out) count += 1;
  }
  return count;
}

// Get the number of fields of a given type
// Input:
//   type: the type of field to be counted
// Output:
//   Return the number of fields whose type is equal to 'type'
static size_t GetNbFieldOfType(CapyDatasetFieldType const type) {
  methodOf(CapyDataset);
  size_t count = 0;
  loop(iField, that->nbField) {
    if(type == that->fields[iField].type) count += 1;
  }
  return count;
}

// Get the field index of the i-th input field
// Input:
//   iInput: index of the input (0 for the first input field)
// Output:
//   Return the index, among all the fields, of the requested input field.
//   Return 0 after raising the exception if there is no such input.
// Exception:
//   May raise CapyExc_InvalidElemIdx.
static size_t GetIdxInputField(
  size_t const iInput) {
  methodOf(CapyDataset);

  // Number of input fields still to skip before reaching the requested one
  size_t nbToSkip = iInput;
  loop(iField, that->nbField) {
    if(that->fields[iField].interface == capyDatasetFieldInterface_in) {
      if(nbToSkip == 0) return iField;
      --nbToSkip;
    }
  }

  // Less than iInput+1 input fields in the dataset
  raiseExc(CapyExc_InvalidElemIdx);
  return 0;
}

// Get the field index of the i-th output field
// Input:
//   iOutput: index of the output (0 for the first output field)
// Output:
//   Return the index, among all the fields, of the requested output field.
//   Return 0 after raising the exception if there is no such output.
// Exception:
//   May raise CapyExc_InvalidElemIdx.
static size_t GetIdxOutputField(
  size_t const iOutput) {
  methodOf(CapyDataset);

  // Number of output fields still to skip before reaching the requested one
  size_t nbToSkip = iOutput;
  loop(iField, that->nbField) {
    if(that->fields[iField].interface == capyDatasetFieldInterface_out) {
      if(nbToSkip == 0) return iField;
      --nbToSkip;
    }
  }

  // Less than iOutput+1 output fields in the dataset
  raiseExc(CapyExc_InvalidElemIdx);
  return 0;
}

// Get a value as a numeral.
// Inputs:
//   iRow: index of the row
//   iField: index of the field
// Output:
//   For numeral fields return the value as it is, for categorical fields
//   return the index of the value in the list of possible values
//   (fieldDesc.categoryVals)
// Exception:
//   May raise CapyExc_UndefinedExecution, CapyExc_InvalidParameters.
static double GetValAsNum(
   size_t const iRow,
   size_t const iField) {
  methodOf(CapyDataset);
  double ret = 0.0;
  bool isNullValue = $(that, isNullValue)(that->rows[iRow].fields[iField]);
  if(isNullValue) {
    raiseExc(CapyExc_InvalidParameters);
  } else if(that->fields[iField].type == capyDatasetFieldType_num) {
    ret = atof(that->rows[iRow].fields[iField]);
  } else if(that->fields[iField].type == capyDatasetFieldType_datetime1) {
    CapyDate date = CapyDateCreate();
    sscanf(
      that->rows[iRow].fields[iField],
      "%hd-%hd-%hd %hd:%hd",
      date.vals + capyDateUnit_day,
      date.vals + capyDateUnit_month,
      date.vals + capyDateUnit_year,
      date.vals + capyDateUnit_hour,
      date.vals + capyDateUnit_minute);
    date.vals[capyDateUnit_day] -= 1;
    date.vals[capyDateUnit_month] -= 1;
    ret = $(&date, getDateAsNormalisedDouble)();
    $(&date, destruct)();
  } else if(that->fields[iField].type == capyDatasetFieldType_datetime2) {
    CapyDate date = CapyDateCreate();
    sscanf(
      that->rows[iRow].fields[iField],
      "%hd:%hd",
      date.vals + capyDateUnit_hour,
      date.vals + capyDateUnit_minute);
    ret = $(&date, getTimeAsNormalisedDouble)();
    $(&date, destruct)();
  } else if(that->fields[iField].type == capyDatasetFieldType_cat) {
    loop(iVal, that->fields[iField].nbCategoryVal) {
      int cmp = strcmp(
        that->rows[iRow].fields[iField],
        that->fields[iField].categoryVals[iVal]);
      if(cmp == 0) {
        ret = (double)iVal;
        return ret;
      }
    }
  } else raiseExc(CapyExc_UndefinedExecution);
  return ret;
}

// Get a value as a uint64 encoded date.
// Inputs:
//   iRow: index of the row
//   iField: index of the field
// Output:
//   Return the uint64 encoded date (see CapyDate). As computed below the
//   encoding is the decimal number yyyymmddhhiiss for fields of type
//   capyDatasetFieldType_datetime1 and hhiiss for fields of type
//   capyDatasetFieldType_datetime2, the seconds being always 0. Return 0
//   after raising an exception.
// Exception:
//   May raise CapyExc_UndefinedExecution (field is not a date),
//   CapyExc_InvalidParameters (null/unknown value).
static uint64_t GetValAsUInt64Date(
  size_t const iRow,
  size_t const iField) {
  methodOf(CapyDataset);
  uint64_t ret = 0;
  bool isNullValue = $(that, isNullValue)(that->rows[iRow].fields[iField]);
  if(isNullValue) {
    raiseExc(CapyExc_InvalidParameters);
  } else if(that->fields[iField].type == capyDatasetFieldType_datetime1) {
    uint64_t year = 0;
    uint64_t month = 0;
    uint64_t day = 0;
    uint64_t hour = 0;
    uint64_t minute = 0;
    uint64_t second = 0;

    // SCNu64 is the conversion specifier matching uint64_t on every
    // platform ("%lu" is only correct where uint64_t is unsigned long)
    sscanf(
      that->rows[iRow].fields[iField],
      "%" SCNu64 "-%" SCNu64 "-%" SCNu64 " %" SCNu64 ":%" SCNu64,
      &day, &month, &year, &hour, &minute);

    // Pack the components two decimal digits at a time
    uint64_t val = year * 100 + month;
    val = val * 100 + day;
    val = val * 100 + hour;
    val = val * 100 + minute;
    ret = val * 100 + second;
  } else if(that->fields[iField].type == capyDatasetFieldType_datetime2) {
    uint64_t hour = 0;
    uint64_t minute = 0;
    uint64_t second = 0;
    sscanf(
      that->rows[iRow].fields[iField],
      "%" SCNu64 ":%" SCNu64, &hour, &minute);
    uint64_t val = hour;
    val = val * 100 + minute;
    ret = val * 100 + second;
  } else raiseExc(CapyExc_UndefinedExecution);
  return ret;
}

// Get a value as a normalised numeral.
// Inputs:
//   iRow: index of the row
//   iField: index of the field
// Output:
//   Return the value, converted to numerical if the field is categorical,
//   after normalisation according to the 'range' property of the field
//   description.
// Exception:
//   May raise CapyExc_UndefinedExecution.
static double GetValAsNormalisedNum(
   size_t const iRow,
   size_t const iField) {
  methodOf(CapyDataset);

  // Get the value as a numerical
  double val = $(that, getValAsNum)(iRow, iField);

  // Normalise the value
  CapyRangeDouble normedRange = {.min = 0.0, .max = 1.0};
  if(equald(that->fields[iField].range.min, that->fields[iField].range.max)) {
    val = that->fields[iField].range.min;
  } else {
    val = CapyLerp(val, &(that->fields[iField].range), &normedRange);
  }

  // Return the normalised value
  return val;
}

// Convert a dataset to a matrix to be used by a single category predictor
// Input:
//   iOutput: index of the output
// Output:
//   Return a matrix with as many rows as there are rows in the dataset, and
//   as many columns as there are inputs in the dataset plus one. The output
//   must be of type capyDatasetFieldType_cat. Input values are converted
//   using getValAsNum. The output value is assigned to the last column in
//   the matrix, and equal to the category index.
//   NOTE(review): this comment used to mention getValAsNormalisedNum for the
//   inputs, but the thread function calls getValAsNum - confirm which one is
//   intended.
// Exception:
//   May raise CapyExc_UnsupportedFormat.

// Arguments given to each conversion thread
typedef struct CvtToMatForSingleCatPredictorArg {
  // The dataset to convert
  CapyDataset* that;
  // The rows converted by the thread
  CapyRangeSize range;
  // Number of input fields of the dataset
  size_t nbInput;
  // Index, among all the fields, of the output field
  size_t idxOutput;
  // The result matrix; the struct is copied by value and the threads write
  // through its 'vals' member
  CapyMat mat;
} CvtToMatForSingleCatPredictorArg;

static void* CvtToMatForSingleCatPredictorThread(void* arg) {
  CapyDataset* that = ((CvtToMatForSingleCatPredictorArg*)arg)->that;
  CapyRangeSize range = ((CvtToMatForSingleCatPredictorArg*)arg)->range;
  size_t nbInput =
    ((CvtToMatForSingleCatPredictorArg*)arg)->nbInput;
  size_t idxOutput =
    ((CvtToMatForSingleCatPredictorArg*)arg)->idxOutput;
  CapyMat mat = ((CvtToMatForSingleCatPredictorArg*)arg)->mat;
  loopRange(iRow, range) {
    loop(iInput, nbInput) {
      size_t iField = $(that, getIdxInputField)(iInput);
      mat.vals[iRow * mat.nbCol + iInput] =
        $(that, getValAsNum)(iRow, iField);
    }
    mat.vals[iRow * mat.nbCol + nbInput] =
      $(that, getValAsNum)(iRow, idxOutput);
  }
  return NULL;
}

// Convert the dataset to a matrix for a single category predictor, sharing
// the rows between several threads.
// Input:
//   iOutput: index of the output (among the output fields)
// Output:
//   Return a matrix with as many rows as the dataset and as many columns as
//   there are input fields plus one. The cells are filled by
//   CvtToMatForSingleCatPredictorThread. If the dataset has no row the
//   matrix is returned without starting any thread.
// Exception:
//   May raise CapyExc_UnsupportedFormat (no input field, or output field
//   not categorical), CapyExc_ForkFailed (a thread could not be created or
//   joined), and the exceptions of getIdxOutputField.
static CapyMat CvtToMatForSingleCatPredictor(size_t const iOutput) {
  methodOf(CapyDataset);

  // Ensure the dataset has at least one input and the output is of type
  // categorical
  size_t nbInput = $(that, getNbInput)();
  if(nbInput == 0) raiseExc(CapyExc_UnsupportedFormat);
  size_t idxOutput = $(that, getIdxOutputField)(iOutput);
  if(that->fields[idxOutput].type != capyDatasetFieldType_cat) {
    raiseExc(CapyExc_UnsupportedFormat);
  }

  // Create the result matrix
  CapyMat mat = CapyMatCreate(nbInput + 1, that->nbRow);

  // Without rows there is nothing to convert, and the row ranges computed
  // below would wrap around (nbRow - 1 with nbRow equal to 0)
  if(mat.nbRow == 0) return mat;

  // Split the conversion into several threads. Use a single thread if the
  // requested number is null (it sizes the arrays and divides below) or
  // larger than the number of rows.
  size_t nbThread = that->nbThread;
  if(nbThread == 0 || nbThread > mat.nbRow) nbThread = 1;
  pthread_t thread[nbThread];
  CvtToMatForSingleCatPredictorArg threadArgs[nbThread];
  size_t nbRowPerThread = mat.nbRow / nbThread;
  loop(i, nbThread) {
    threadArgs[i] = (CvtToMatForSingleCatPredictorArg){
      .that = that,
      .nbInput = nbInput,
      .idxOutput = idxOutput,
      .mat = mat,
      .range = {.min = i * nbRowPerThread, .max = (i + 1) * nbRowPerThread - 1},
    };

    // The last thread also takes the rows left by the integer division
    if(i == nbThread - 1) threadArgs[i].range.max = mat.nbRow - 1;
    int ret =
      pthread_create(
        thread + i, NULL, CvtToMatForSingleCatPredictorThread, threadArgs + i);
    if(ret != 0) raiseExc(CapyExc_ForkFailed);
  }

  // Wait for the threads to terminate
  loop(i, nbThread) {
    int ret = pthread_join(thread[i], NULL);
    if(ret != 0) raiseExc(CapyExc_ForkFailed);
  }

  // Return the matrix
  return mat;
}

// Convert a dataset to a matrix to be used by a numerical predictor
// Input:
//   iOutput: index of the output
// Output:
//   Return a matrix with as many rows as there are rows in the dataset, and
//   as many columns as there are inputs in the dataset plus one. Input and
//   output values are converted using getValAsNum. The output value is
//   assigned to the last column in the matrix.
//   NOTE(review): this comment used to state that the output must be of
//   type capyDatasetFieldType_num and that inputs are converted with
//   getValAsNormalisedNum; the code neither checks the type of the output
//   nor calls getValAsNormalisedNum - confirm what is intended.
// Exception:
//   May raise CapyExc_UnsupportedFormat.

// Arguments given to each conversion thread
typedef struct CvtToMatForNumericalPredictorArg {
  // The dataset to convert
  CapyDataset* that;
  // The rows converted by the thread
  CapyRangeSize range;
  // Number of input fields of the dataset
  size_t nbInput;
  // Index, among all the fields, of the output field
  size_t idxOutput;
  // The result matrix; the struct is copied by value and the threads write
  // through its 'vals' member
  CapyMat mat;
} CvtToMatForNumericalPredictorArg;

static void* CvtToMatForNumericalPredictorThread(void* arg) {
  CapyDataset* that = ((CvtToMatForNumericalPredictorArg*)arg)->that;
  CapyRangeSize range = ((CvtToMatForNumericalPredictorArg*)arg)->range;
  size_t nbInput =
    ((CvtToMatForNumericalPredictorArg*)arg)->nbInput;
  size_t idxOutput =
    ((CvtToMatForNumericalPredictorArg*)arg)->idxOutput;
  CapyMat mat = ((CvtToMatForNumericalPredictorArg*)arg)->mat;
  loopRange(iRow, range) {
    loop(iInput, nbInput) {
      size_t iField = $(that, getIdxInputField)(iInput);
      mat.vals[iRow * mat.nbCol + iInput] =
        $(that, getValAsNum)(iRow, iField);
    }
    mat.vals[iRow * mat.nbCol + nbInput] =
      $(that, getValAsNum)(iRow, idxOutput);
  }
  return NULL;
}

// Convert the dataset to a matrix for a numerical predictor, sharing the
// rows between several threads.
// Input:
//   iOutput: index of the output (among the output fields)
// Output:
//   Return a matrix with as many rows as the dataset and as many columns as
//   there are input fields plus one. The cells are filled by
//   CvtToMatForNumericalPredictorThread. If the dataset has no row the
//   matrix is returned without starting any thread. The type of the output
//   field is not checked.
// Exception:
//   May raise CapyExc_UnsupportedFormat (no input field),
//   CapyExc_ForkFailed (a thread could not be created or joined), and the
//   exceptions of getIdxOutputField.
static CapyMat CvtToMatForNumericalPredictor(size_t const iOutput) {
  methodOf(CapyDataset);

  // Ensure the dataset has at least one input
  size_t nbInput = $(that, getNbInput)();
  if(nbInput == 0) raiseExc(CapyExc_UnsupportedFormat);

  // Create the result matrix
  CapyMat mat = CapyMatCreate(nbInput + 1, that->nbRow);

  // Without rows there is nothing to convert, and the row ranges computed
  // below would wrap around (nbRow - 1 with nbRow equal to 0)
  if(mat.nbRow == 0) return mat;

  // Split the conversion into several threads. Use a single thread if the
  // requested number is null (it sizes the arrays and divides below) or
  // larger than the number of rows.
  size_t nbThread = that->nbThread;
  if(nbThread == 0 || nbThread > mat.nbRow) nbThread = 1;
  pthread_t thread[nbThread];
  CvtToMatForNumericalPredictorArg threadArgs[nbThread];
  size_t nbRowPerThread = mat.nbRow / nbThread;
  size_t idxOutput = $(that, getIdxOutputField)(iOutput);
  loop(i, nbThread) {
    threadArgs[i] = (CvtToMatForNumericalPredictorArg){
      .that = that,
      .nbInput = nbInput,
      .idxOutput = idxOutput,
      .mat = mat,
      .range = {.min = i * nbRowPerThread, .max = (i + 1) * nbRowPerThread - 1},
    };

    // The last thread also takes the rows left by the integer division
    if(i == nbThread - 1) threadArgs[i].range.max = mat.nbRow - 1;
    int ret =
      pthread_create(
        thread + i, NULL, CvtToMatForNumericalPredictorThread, threadArgs + i);
    if(ret != 0) raiseExc(CapyExc_ForkFailed);
  }

  // Wait for the threads to terminate
  loop(i, nbThread) {
    int ret = pthread_join(thread[i], NULL);
    if(ret != 0) raiseExc(CapyExc_ForkFailed);
  }

  // Return the matrix
  return mat;
}

// Get the number of different values for a given output
// Inputs:
//   iOutput: index of the output (among the output fields)
// Output:
//   Return the 'nbCategoryVal' property of the output field, i.e. the
//   number of different values for a categorical output field or the
//   number of rows for a numerical output field
static size_t GetNbValOutputField(
  size_t const iOutput) {
  methodOf(CapyDataset);

  // Convert the output index into a field index and read the property
  size_t const idxOutput = $(that, getIdxOutputField)(iOutput);
  return that->fields[idxOutput].nbCategoryVal;
}

// Free the memory used by a CapyDataset and reset it to an empty state.
// The pointers are set back to NULL and the counters to 0 so that the
// dataset can be loaded again, or destructed again, safely (LoadFromPath
// calls this destructor before reusing the dataset).
static void Destruct(void) {
  methodOf(CapyDataset);

  // Strings holding the header lines (the labels point into fieldLabelStr)
  free(that->fieldInterfaceStr);
  that->fieldInterfaceStr = NULL;
  free(that->fieldTypeStr);
  that->fieldTypeStr = NULL;
  free(that->fieldLabelStr);
  that->fieldLabelStr = NULL;

  // Field descriptions. The array may be null while nbField is not if its
  // allocation failed.
  if(that->fields != NULL) {
    loop(iField, that->nbField) $(that->fields + iField, destruct)();
  }
  free(that->fields);
  that->fields = NULL;
  that->nbField = 0;

  // Rows, same remark
  if(that->rows != NULL) {
    loop(iRow, that->nbRow) $(that->rows + iRow, destruct)();
  }
  free(that->rows);
  that->rows = NULL;
  that->nbRow = 0;

  // Statistics incremented while loading
  that->nbFieldWithNullValue = 0;
  that->nbRowWithNullValue = 0;
}

// Convert a dataset to a matrix to be used by a one hot predictor
// Input:
//   iOutput: index of the output
// Output:
//   Return a matrix with as many rows as there are rows in the dataset, and
//   as many columns as there are inputs in the dataset plus as many values
//   the given output takes. The output must be of type
//   capyDatasetFieldType_cat. Input values are converted using
//   getValAsNum. The one hot encoding of the output value is
//   assigned to the last columns in the matrix, and take values 0 for 'is
//   not this category' and 1 for 'is this category'.
//   NOTE(review): this comment used to mention getValAsNormalisedNum for the
//   inputs, but the thread function calls getValAsNum - confirm which one is
//   intended.
// Exception:
//   May raise CapyExc_UnsupportedFormat.

// Arguments given to each conversion thread
typedef struct CvtToMatForOneHotPredictorArg {
  // The dataset to convert
  CapyDataset* that;
  // The rows converted by the thread
  CapyRangeSize range;
  // Number of input fields of the dataset
  size_t nbInput;
  // presumably explicit padding after the previous member - TODO confirm
  // against the definition of CapyPad
  CapyPad(size_t, 0);
  // Number of values taken by the output field
  size_t nbValOutput;
  CapyPad(size_t, 1);
  // Index, among all the fields, of the output field
  size_t idxOutput;
  CapyPad(size_t, 2);
  // The result matrix; the struct is copied by value and the threads write
  // through its 'vals' member
  CapyMat mat;
} CvtToMatForOneHotPredictorArg;

static void* CvtToMatForOneHotPredictorThread(void* arg) {
  CapyDataset* that = ((CvtToMatForOneHotPredictorArg*)arg)->that;
  CapyRangeSize range = ((CvtToMatForOneHotPredictorArg*)arg)->range;
  size_t nbInput =
    ((CvtToMatForOneHotPredictorArg*)arg)->nbInput;
  size_t nbValOutput =
    ((CvtToMatForOneHotPredictorArg*)arg)->nbValOutput;
  size_t idxOutput =
    ((CvtToMatForOneHotPredictorArg*)arg)->idxOutput;
  CapyMat mat = ((CvtToMatForOneHotPredictorArg*)arg)->mat;
  loopRange(iRow, range) {
    loop(iInput, nbInput) {
      size_t iField = $(that, getIdxInputField)(iInput);
      mat.vals[iRow * mat.nbCol + iInput] =
        $(that, getValAsNum)(iRow, iField);
    }
    double val = $(that, getValAsNum)(iRow, idxOutput);
    size_t iCat = (size_t)lroundl(val);
    loop(iVal, nbValOutput) {
      mat.vals[iRow * mat.nbCol + nbInput + iVal] =
        (iVal == iCat ? 1.0 : 0.0);
    }
  }
  return NULL;
}

// Convert the dataset to a matrix for a one hot predictor, sharing the rows
// between several threads.
// Input:
//   iOutput: index of the output (among the output fields)
// Output:
//   Return a matrix with as many rows as the dataset and as many columns as
//   there are input fields plus the number of category values of the
//   output field. The cells are filled by CvtToMatForOneHotPredictorThread.
//   If the dataset has no row the matrix is returned without starting any
//   thread.
// Exception:
//   May raise CapyExc_UnsupportedFormat (no input field, or output field
//   not categorical), CapyExc_ForkFailed (a thread could not be created or
//   joined), and the exceptions of getIdxOutputField.
static CapyMat CvtToMatForOneHotPredictor(
  size_t const iOutput) {
  methodOf(CapyDataset);

  // Ensure the dataset has at least one input and the output is of type
  // categorical
  size_t nbInput = $(that, getNbInput)();
  if(nbInput == 0) raiseExc(CapyExc_UnsupportedFormat);
  size_t idxOutput = $(that, getIdxOutputField)(iOutput);
  if(that->fields[idxOutput].type != capyDatasetFieldType_cat) {
    raiseExc(CapyExc_UnsupportedFormat);
  }

  // Create the result matrix
  size_t nbValOutput = that->fields[idxOutput].nbCategoryVal;
  CapyMat mat = CapyMatCreate(nbInput + nbValOutput, that->nbRow);

  // Without rows there is nothing to convert, and the row ranges computed
  // below would wrap around (nbRow - 1 with nbRow equal to 0)
  if(mat.nbRow == 0) return mat;

  // Split the conversion into several threads. Use a single thread if the
  // requested number is null (it sizes the arrays and divides below) or
  // larger than the number of rows.
  size_t nbThread = that->nbThread;
  if(nbThread == 0 || nbThread > mat.nbRow) nbThread = 1;
  pthread_t thread[nbThread];
  CvtToMatForOneHotPredictorArg threadArgs[nbThread];
  size_t nbRowPerThread = mat.nbRow / nbThread;
  loop(i, nbThread) {
    threadArgs[i] = (CvtToMatForOneHotPredictorArg){
      .that = that,
      .nbInput = nbInput,
      .nbValOutput = nbValOutput,
      .idxOutput = idxOutput,
      .mat = mat,
      .range = {.min = i * nbRowPerThread, .max = (i + 1) * nbRowPerThread - 1},
    };

    // The last thread also takes the rows left by the integer division
    if(i == nbThread - 1) threadArgs[i].range.max = mat.nbRow - 1;
    int ret =
      pthread_create(
        thread + i, NULL, CvtToMatForOneHotPredictorThread, threadArgs + i);
    if(ret != 0) raiseExc(CapyExc_ForkFailed);
  }

  // Wait for the threads to terminate
  loop(i, nbThread) {
    int ret = pthread_join(thread[i], NULL);
    if(ret != 0) raiseExc(CapyExc_ForkFailed);
  }

  // Return the matrix
  return mat;
}

// Get the index of a field from its name
// Input:
//   name: the name, compared (case sensitive) to the label of the fields
// Output:
//   Return the index of the first field whose label is equal to 'name'. If
//   there is none, raise the exception CapyExc_InvalidParameters and
//   return 0.
static size_t GetIdxFieldFromName(char const* const name) {
  methodOf(CapyDataset);
  loop(iField, that->nbField) {
    char const* const label = that->fields[iField].label;
    if(strcmp(label, name) == 0) {
      return iField;
    }
  }

  // No field with that label
  raiseExc(CapyExc_InvalidParameters);
  return 0;
}

// Get the distribution of a field values as an array of bins.
// Input:
//   iField: the index of the field
//   nbBin: the number of bins
// Output:
//   Return a CapyArrSize of size 'nbBin'. Each non null value of the field
//   is normalised with getValAsNormalisedNum and counted in the bin of
//   index floor(val * (nbBin - 1)). Normalised values outside [0, 1] (or
//   not a number) are clamped into that interval first, so that the bin
//   index is always valid. If 'nbBin' is 0 the empty array is returned.
static CapyArrSize* GetDistAsBins(
  size_t const iField,
  size_t const nbBin) {
  methodOf(CapyDataset);
  CapyArrSize* bins = CapyArrSizeAlloc(nbBin);
  loop(i, nbBin) $(bins, set)(i, &(size_t){0});

  // Without bins there is nothing to count, and 'nbBin - 1' below would
  // wrap around
  if(nbBin == 0) return bins;
  loop(iRow, that->nbRow) {
    bool isNullValue = $(that, isNullValue)(that->rows[iRow].fields[iField]);
    if(isNullValue == false) {
      double val = $(that, getValAsNormalisedNum)(iRow, iField);

      // Keep the value in [0, 1]; the first test also catches NaN
      if(!(val > 0.0)) val = 0.0;
      else if(val > 1.0) val = 1.0;
      size_t iBin = (size_t)floor(val * (double)(nbBin - 1));
      size_t* n = $(bins, getPtr)(iBin);
      (*n)++;
    }
  }
  return bins;
}

// Get the distribution of a field values as an array of bins for a given
// value of a given categorical field
// Input:
//   iField: the index of the field
//   nbBin: the number of bins
//   iCatField: the index of the categorical field
//   valCatField: the value of the categorical field
// Output:
//   Return a CapyArrSize of size 'nbBin'. Only the rows whose value for the
//   field 'iCatField' is not null and equal to 'valCatField', and whose
//   value for the field 'iField' is not null, are counted. Each counted
//   value is normalised with getValAsNormalisedNum and added to the bin of
//   index floor(val * (nbBin - 1)). Normalised values outside [0, 1] (or
//   not a number) are clamped into that interval first, so that the bin
//   index is always valid. If 'nbBin' is 0 the empty array is returned.
static CapyArrSize* GetDistAsBinsGivenCatValue(
       size_t const iField,
       size_t const nbBin,
       size_t const iCatField,
  char const* const valCatField) {
  methodOf(CapyDataset);
  CapyArrSize* bins = CapyArrSizeAlloc(nbBin);
  loop(i, nbBin) $(bins, set)(i, &(size_t){0});

  // Without bins there is nothing to count, and 'nbBin - 1' below would
  // wrap around
  if(nbBin == 0) return bins;
  loop(iRow, that->nbRow) {
    char const* valCat = that->rows[iRow].fields[iCatField];
    bool isNullValueCat = $(that, isNullValue)(valCat);
    bool isNullValueField =
      $(that, isNullValue)(that->rows[iRow].fields[iField]);
    if(
      isNullValueCat == false &&
      strcmp(valCat, valCatField) == 0 &&
      isNullValueField == false
    ) {
      double val = $(that, getValAsNormalisedNum)(iRow, iField);

      // Keep the value in [0, 1]; the first test also catches NaN
      if(!(val > 0.0)) val = 0.0;
      else if(val > 1.0) val = 1.0;
      size_t iBin = (size_t)floor(val * (double)(nbBin - 1));
      size_t* n = $(bins, getPtr)(iBin);
      (*n)++;
    }
  }
  return bins;
}

// Count the rows whose value for a given field is equal to a given value.
// Input:
//   iField: the index of the field used to filter the rows
//   valField: the value the field must be equal to
// Output:
//   Return the number of rows for which the value of the field 'iField'
//   is not null and is equal to 'valField' (compared with strcmp).
static size_t GetNbRowContainingVal(
       size_t const iField,
  char const* const valField) {
  methodOf(CapyDataset);
  size_t count = 0;
  loop(iRow, that->nbRow) {
    char const* const rowVal = that->rows[iRow].fields[iField];

    // Null values never match, and are never given to strcmp
    bool const isNull = $(that, isNullValue)(rowVal);
    if(!isNull && strcmp(valField, rowVal) == 0) count += 1;
  }
  return count;
}

// Extract the values of two fields into two vectors, row by row.
// Input:
//   iField: the index of the first field
//   jField: the index of the second field
//   u: the vector receiving the values of the first field
//   v: the vector receiving the values of the second field
// Output:
//   'u' and 'v' are destructed, then created again with room for one value
//   per row, and filled with the numerical values of the rows having a non
//   null value in both fields (other rows are ignored). The dimension of
//   both vectors is finally set to the number of rows actually stored.
static void GetValuesFromTwoFieldsAsVectors(
    size_t const iField,
    size_t const jField,
  CapyVec* const u,
  CapyVec* const v) {
  methodOf(CapyDataset);

  // Reset the two vectors, sized for the case where no row is ignored
  CapyVecDestruct(u);
  CapyVecDestruct(v);
  *u = CapyVecCreate(that->nbRow);
  *v = CapyVecCreate(that->nbRow);

  // Copy the pairs of values, packing them at the beginning of the vectors
  size_t nbPair = 0;
  loop(iRow, that->nbRow) {
    bool const isNullI =
      $(that, isNullValue)(that->rows[iRow].fields[iField]);
    bool const isNullJ =
      $(that, isNullValue)(that->rows[iRow].fields[jField]);
    if(!(isNullI || isNullJ)) {
      u->vals[nbPair] = $(that, getValAsNum)(iRow, iField);
      v->vals[nbPair] = $(that, getValAsNum)(iRow, jField);
      nbPair += 1;
    }
  }

  // Only the first 'nbPair' values are meaningful
  u->dim = nbPair;
  v->dim = nbPair;
}

// Convert the dataset into a point cloud
// Output:
//   Return a CapyPointCloud of dimension equal to the number of fields
//   and number of points equal to the number of rows. All values are
//   converted to numerical values. Return NULL if the point cloud or its
//   array of points couldn't be allocated.
static CapyPointCloud* ToPointCloud(void) {
  methodOf(CapyDataset);
  CapyPointCloud* cloud = CapyPointCloudAlloc(that->nbField);
  if(cloud == NULL) return NULL;

  // One point per row of the dataset
  safeMalloc(cloud->points, that->nbRow);
  if(cloud->points != NULL) {
    cloud->size = that->nbRow;
    loop(iRow, that->nbRow) {

      // Each point has one coordinate per field of the row
      CapyVec* const point = cloud->points + iRow;
      *point = CapyVecCreate(that->nbField);
      loop(iField, that->nbField) {
        point->vals[iField] = $(that, getValAsNum)(iRow, iField);
      }
    }
    return cloud;
  }

  // The array of points couldn't be allocated, give up
  CapyPointCloudFree(&cloud);
  return NULL;
}

// Create a CapyDataset
// Output:
//   Return a CapyDataset
CapyDataset CapyDatasetCreate(void) {
  return (CapyDataset){
    .nbRow = 0,
    .nbField = 0,
    .fieldInterfaceStr = NULL,
    .fieldTypeStr = NULL,
    .fieldLabelStr = NULL,
    .fields = NULL,
    .rows = NULL,
    .nbThread = 10,
    .destruct = Destruct,
    .loadFromPath = LoadFromPath,
    .print = Print,
    .printData = PrintData,
    .getNbInput = GetNbInput,
    .getNbOutput = GetNbOutput,
    .getNbFieldOfType = GetNbFieldOfType,
    .getIdxInputField = GetIdxInputField,
    .getIdxOutputField = GetIdxOutputField,
    .getValAsNum = GetValAsNum,
    .getValAsUInt64Date = GetValAsUInt64Date,
    .getValAsNormalisedNum = GetValAsNormalisedNum,
    .cvtToMatForSingleCatPredictor = CvtToMatForSingleCatPredictor,
    .cvtToMatForOneHotPredictor = CvtToMatForOneHotPredictor,
    .cvtToMatForNumericalPredictor = CvtToMatForNumericalPredictor,
    .getNbValOutputField = GetNbValOutputField,
    .getIdxFieldFromName = GetIdxFieldFromName,
    .getDistAsBins = GetDistAsBins,
    .getDistAsBinsGivenCatValue = GetDistAsBinsGivenCatValue,
    .getNbRowContainingVal = GetNbRowContainingVal,
    .getValuesFromTwoFieldsAsVectors = GetValuesFromTwoFieldsAsVectors,
    .isNullValue = IsNullValue,
    .toPointCloud = ToPointCloud,
  };
}

// Allocate memory for a new CapyDataset and create it
// Output:
//   Return a pointer to the newly allocated and created CapyDataset, or
//   NULL if the memory couldn't be allocated.
// Exception:
//   May raise CapyExc_MallocFailed.
CapyDataset* CapyDatasetAlloc(void) {
  CapyDataset* dataset = NULL;
  safeMalloc(dataset, 1);

  // Initialise the instance only if the allocation succeeded
  if(dataset != NULL) *dataset = CapyDatasetCreate();
  return dataset;
}

// Free the memory used by a CapyDataset* and reset '*that' to NULL
// Input:
//   that: a pointer to the CapyDataset to free
// Output:
//   The dataset is destructed and freed, and '*that' is set to NULL.
//   Nothing happens if 'that' or '*that' is NULL.
void CapyDatasetFree(CapyDataset** const that) {
  if(that != NULL && *that != NULL) {
    CapyDataset* dataset = *that;

    // Release the content of the dataset before the dataset itself
    $(dataset, destruct)();
    free(dataset);
    *that = NULL;
  }
}
