"""
OpenMLImport - A Python script to import datasets from openml.org in a
format compatible with LibCapy.

Copyright (C) 2022 Pascal Baillehache
baillehache.pascal@gmail.com
https://baillehachepascal.dev

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <https://www.gnu.org/licenses/>.

Files written into the output folder:
    <name>.csv        the dataset (always)
    <name>_split.txt  train/test indices of each fold (with --split)
    <name>_best.txt   best "predictive_accuracy" evaluation (with --best)
"""
from pathlib import Path
import argparse

import openml

# The script exports a single target column per task.
NB_OUTPUT = 1

# Number of folds a task must have for its split to be exported.
REQUIRED_NB_FOLDS = 10

# dtype.kind codes (shared by NumPy dtypes and pandas extension dtypes)
# of the columns exported as numerical: signed int, unsigned int, float.
NUMERIC_KINDS = ('i', 'u', 'f')


def _parse_args(argv=None):
    """Parse the command line (argv=None means sys.argv[1:])."""
    parser = argparse.ArgumentParser(
        prog='OpenMLImport',
        description='Script importing datasets from OpenML and converting '
                    'them in a format compatible with LibCapy')
    parser.add_argument(
        '-o', '--output-folder', default=Path('./'), type=Path,
        help='folder where the result files are written')
    parser.add_argument(
        '-n', '--name', default='dataset',
        help='base name of the result files')
    parser.add_argument(
        '-i', '--task-id', required=True, type=int,
        help='identifier of the OpenML task to import')
    parser.add_argument(
        '-s', '--split', action='store_true',
        help='also export the train/test indices of each fold')
    parser.add_argument(
        '-b', '--best', action='store_true',
        help='also export the best predictive_accuracy evaluation')
    return parser.parse_args(argv)


def _field_type(dtype):
    """Return 'num' for an integer or float dtype, 'cat' for anything else.

    The decision relies on dtype.kind rather than on the dtype name, so
    that every integer/float width (uint8, int32, float32, nullable
    Int64...) is numerical, while category, object, string and boolean
    columns are categorical.
    """
    if getattr(dtype, 'kind', 'O') in NUMERIC_KINDS:
        return 'num'
    return 'cat'


def _write_csv(path_csv, inputs, outputs):
    """Write the dataset to path_csv.

    The file starts with five header lines (number of fields, 'in'/'out'
    role of each field, field labels, 'num'/'cat' type of each field,
    number of samples) followed by one comma separated line per sample,
    inputs first and target last.

    inputs is the DataFrame of input columns and outputs the Series of
    target values, as returned by the task's get_X_and_y().
    """
    nb_sample, nb_input = inputs.shape
    labels = [str(label) for label in inputs.columns] + [str(outputs.name)]
    dtypes = list(inputs.dtypes) + [outputs.dtype]
    types = [_field_type(dtype) for dtype in dtypes]
    for label, dtype, field_type in zip(labels, dtypes, types):
        print(f"{label}: {dtype} -> {field_type}")
    # Materialise the values once: on a frame with mixed dtypes each access
    # to .values builds a new array of the whole dataset.
    values = inputs.values
    with path_csv.open("w", encoding="utf-8") as f:
        # Create the header
        f.write(f"{nb_input + NB_OUTPUT}\n")
        f.write(f"{','.join(['in'] * nb_input + ['out'] * NB_OUTPUT)}\n")
        f.write(f"{','.join(labels)}\n")
        f.write(f"{','.join(types)}\n")
        f.write(f"{nb_sample}\n")
        # Create the samples. Iterating over the Series is positional, and
        # the target is converted explicitly because it is not necessarily
        # a string (numerical target, numerical categories, missing value).
        # NOTE(review): values are written verbatim, a value containing a
        # comma would corrupt its line - confirm what the reader expects.
        for row, target in zip(values, outputs):
            f.write(','.join(row.astype(str)) + ',' + str(target) + '\n')


def _write_split(path_split, task):
    """Write the train/test indices of each fold of task to path_split.

    The first line holds the number of folds followed by the number of
    training and test samples of the first fold. Then comes one line per
    fold, made of the space separated training indices followed by the
    test indices.

    Nothing is written, and False is returned, when the task does not
    have exactly 1 repeat, REQUIRED_NB_FOLDS folds and 1 sample.
    """
    nb_repeats, nb_folds, nb_samples = task.get_split_dimensions()
    print("Importing k-fold cross validation data:")
    print(
        f"nb_repeats {nb_repeats} nb_folds {nb_folds} "
        f"nb_samples {nb_samples}")
    # Validate before opening the file, to avoid leaving an empty file
    # behind when the task is rejected.
    if nb_repeats != 1 or nb_folds != REQUIRED_NB_FOLDS or nb_samples != 1:
        print(
            f"The task must have 1 repeat, {REQUIRED_NB_FOLDS} folds "
            "and 1 sample")
        return False
    with path_split.open("w", encoding="utf-8") as f:
        for i_fold in range(nb_folds):
            train_indices, test_indices = \
                task.get_train_test_split_indices(
                    repeat=0, fold=i_fold, sample=0)
            if i_fold == 0:
                # NOTE(review): the header carries the sizes of the first
                # fold only - confirm that the reader copes with folds of
                # different sizes.
                f.write(
                    f"{nb_folds} {len(train_indices)} "
                    f"{len(test_indices)}\n")
            f.write(
                ' '.join(map(str, train_indices)) + ' ' +
                ' '.join(map(str, test_indices)) + '\n')
    return True


def _write_best(path_best, task_id):
    """Write the evaluation with the highest predictive accuracy.

    The evaluations of the task task_id are listed from OpenML and the
    one with the highest "value" is written to path_best as text.

    Nothing is written, and False is returned, when OpenML returns no
    evaluation for the task.
    """
    print('Importing best run')
    evals = openml.evaluations.list_evaluations(
        function="predictive_accuracy",
        tasks=[task_id],
        output_format="dataframe")
    if evals.empty:
        print('No evaluation found for this task.')
        return False
    best = evals.sort_values(by="value", ascending=False).iloc[0]
    with path_best.open("w", encoding="utf-8") as f:
        f.write(str(best))
    return True


def main(argv=None):
    """Import the task given on the command line and write its files."""
    args = _parse_args(argv)
    print(f"Importing {args.name} (id:{args.task_id}) to {args.output_folder}")
    # Fail early on an unusable destination, before downloading anything.
    args.output_folder.mkdir(parents=True, exist_ok=True)

    # Import the data from OpenML
    task = openml.tasks.get_task(args.task_id)
    inputs, outputs = task.get_X_and_y(dataset_format="dataframe")
    nb_sample, nb_input = inputs.shape
    print(
        f"The dataset contains {nb_sample} samples with "
        f"{nb_input} inputs and {NB_OUTPUT} outputs")

    _write_csv(args.output_folder / (args.name + '.csv'), inputs, outputs)
    if args.split:
        _write_split(args.output_folder / (args.name + '_split.txt'), task)
    if args.best:
        _write_best(
            args.output_folder / (args.name + '_best.txt'), args.task_id)
    print('Importation completed.')


if __name__ == "__main__":
    main()