"""
    OpenMLImport - A Python script to import datasets from openml.org in a
    format compatible with LibCapy.
    Copyright (C) 2022 Pascal Baillehache baillehache.pascal@gmail.com
    https://baillehachepascal.dev
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License
    along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
from pathlib import Path
import argparse
import openml

# Command line interface:
#   -o/--output-folder  folder receiving the generated files (default: ./)
#   -n/--name           base name of the generated files (default: dataset)
#   -i/--task-id        identifier of the OpenML task to import (mandatory)
#   -s/--split          flag, also export the cross validation split
#   -b/--best           flag, also export the best evaluation of the task
parser = argparse.ArgumentParser(
    prog='OpenMLImport',
    description=(
        'Script importing datasets from OpenML and converting '
        'them in a format compatible with LibCapy'
    ),
)
parser.add_argument('-o', '--output-folder', type=Path, default=Path('./'))
parser.add_argument('-n', '--name', default='dataset')
parser.add_argument('-i', '--task-id', type=int, required=True)
parser.add_argument('-s', '--split', action='store_true')
parser.add_argument('-b', '--best', action='store_true')
args = parser.parse_args()

# Summarise the request before starting the download
print(f"Importing {args.name} (id:{args.task_id}) to {args.output_folder}")

# Import the data from OpenML: 'inputs' is the dataframe of the features
# and 'outputs' is the target column of the task
task = openml.tasks.get_task(args.task_id)
inputs, outputs = task.get_X_and_y(dataset_format="dataframe")

# Read the dimensions directly from the shape of the dataframe: it avoids
# converting the whole dataframe into an array only to count its columns,
# and it also works when the dataset contains no sample at all
nb_sample, nb_input = inputs.shape

# The target returned by get_X_and_y is handled as a single output column
nb_output = 1
print(
    f"The dataset contains {nb_sample} samples with " +
    f"{nb_input} inputs and {nb_output} outputs")

# Make sure the output folder exists before creating any file in it
args.output_folder.mkdir(parents=True, exist_ok=True)

# Open the result CSV file
path_csv = args.output_folder / (args.name + '.csv')
with open(str(path_csv), "w") as f:

    # Create the header, made of five lines:
    #   number of fields
    #   role of each field ('in' or 'out')
    #   label of each field
    #   type of each field ('num' or 'cat')
    #   number of samples
    nb_field = nb_input + nb_output
    f.write(f"{nb_field}\n")
    categories = (['in'] * nb_input) + ['out']
    f.write(f"{','.join(categories)}\n")

    # Labels are converted to strings because join() rejects anything else
    # (e.g. integer column names)
    labels = [str(label) for label in inputs.columns] + [str(outputs.name)]
    f.write(f"{','.join(labels)}\n")

    # Inputs stored as float64/int64 are numerical, anything else is
    # considered categorical
    types = []
    for i, d in enumerate(inputs.dtypes):
        if str(d) == "float64" or str(d) == "int64":
            types.append('num')
        else:
            types.append('cat')
        print(f"{labels[i]}: {d} -> {types[-1]}")

    # The output is categorical only if its dtype is 'category'
    if str(outputs.dtypes) == "category":
        types.append('cat')
    else:
        types.append('num')
    print(f"{labels[-1]}: {outputs.dtypes} -> {types[-1]}")
    f.write(f"{','.join(types)}\n")
    f.write(f"{nb_sample}\n")

    # Create the samples, one per line. The array view of the inputs is
    # built only once (not once per sample), the target is read by position
    # rather than by index label, and it is explicitly converted to a
    # string so that non-string targets (e.g. numerical ones) can be
    # written too
    for values, target in zip(inputs.values, outputs):
        row = ','.join(values.astype(str)) + ',' + str(target) + '\n'
        f.write(row)

if args.split:
    # Check the dimensions of the split before creating the file, to avoid
    # leaving an empty split file behind when the task is rejected
    nb_repeats, nb_folds, nb_samples = task.get_split_dimensions()
    print("Importing k-fold cross validation data:")
    print(
        f"nb_repeats {nb_repeats} nb_folds {nb_folds} " +
        f"nb_samples {nb_samples}")
    if nb_repeats != 1 or nb_folds != 10 or nb_samples != 1:
        print(
            "The task must have one sample, 10 folds and one repetition")
    else:
        # Open the result split file
        path_split = args.output_folder / (args.name + '_split.txt')
        with open(str(path_split), "w") as f:
            for i_fold in range(nb_folds):
                train_indices, test_indices = \
                    task.get_train_test_split_indices(
                        repeat=0, fold=i_fold, sample=0)
                if i_fold == 0:
                    # Header line: number of folds, then the number of
                    # training and test samples of the first fold.
                    # NOTE(review): the sizes of the other folds are not
                    # written, they are presumably expected to be the
                    # same - confirm against the reader of this file
                    nb_training_sample = train_indices.shape[0]
                    nb_test_sample = test_indices.shape[0]
                    f.write(
                        f"{nb_folds} {nb_training_sample} " +
                        f"{nb_test_sample}\n")

                # One line per fold: the training indices followed by the
                # test indices, all separated by a space
                split_txt = ' '.join(map(str, train_indices)) + ' '
                split_txt += ' '.join(map(str, test_indices)) + '\n'
                f.write(split_txt)

if args.best:
    # Search the best run, i.e. the evaluation with the highest
    # predictive accuracy among those recorded for the task
    print('Importing best run')
    evals = openml.evaluations.list_evaluations(
        function="predictive_accuracy",
        tasks=[args.task_id], output_format="dataframe")
    if evals.empty:
        # Without this guard an empty result would raise when sorting or
        # when accessing the first row
        print('No evaluation found for this task, best run not exported.')
    else:
        evals = evals.sort_values(by="value", ascending=False)
        path_best = args.output_folder / (args.name + '_best.txt')
        with open(str(path_best), "w") as f:
            # Save the textual representation of the best evaluation
            f.write(str(evals.iloc[0]))

print('Importation completed.')