Mercurial > repos > recetox > aplcms_to_ramclustr_converter
diff aplcms_to_ramclustr_converter.py @ 4:9ea34e24474f draft
"planemo upload for repository https://github.com/RECETOX/galaxytools/tools/aplcms_to_ramclustr_converter/ commit 2dd20229f0c7f43dacc0d201ea50fef3c993d30e"
author | recetox |
---|---|
date | Mon, 09 Aug 2021 15:29:08 +0000 |
parents | 07667688735e |
children |
line wrap: on
line diff
--- a/aplcms_to_ramclustr_converter.py Wed Feb 17 15:14:33 2021 +0000
+++ b/aplcms_to_ramclustr_converter.py Mon Aug 09 15:29:08 2021 +0000
@@ -2,56 +2,32 @@
 
 import argparse
 import sys
-import warnings
 
 import pandas as pd
 
-warnings.simplefilter('ignore')
-
 parser = argparse.ArgumentParser()
-parser.add_argument("--dataframe", help="Name of hdf dataframe")
-parser.add_argument("--table", help="Name of a table in the dataframe")
+parser.add_argument("--dataframe", help="Parquet dataframe")
 parser.add_argument('output')
 args = parser.parse_args()
 
 
-def extract_data(table):
-    num_samples = int((len(table.columns.tolist()) - 4) / 2)
-    mz_rt = table['mz'].map(str) + "_" + table['rt'].map(str)
+def main():
+    featureTable = pd.read_parquet(args.dataframe)
 
-    intensities = table.iloc[:, 4:(4 + num_samples)]
-    sample_labels = [label.split('.')[1] for label in intensities.columns.tolist()]
-    ramclustr_data = pd.DataFrame({'mz_rt': mz_rt})
-
-    for idx in range(num_samples):
-        label = sample_labels[idx]
-        ramclustr_data[label] = intensities.iloc[:, idx]
-
-    return ramclustr_data
-
+    # Concatenate "mz" and "rt" columns; select relevant columns; pivot the table
+    featureTable["mz_rt"] = featureTable["mz"].astype(str) + "_" + featureTable["rt"].astype(str)
+    featureTable = featureTable[["sample", "mz_rt", "sample_intensity"]]
+    featureTable = pd.pivot_table(featureTable, columns="mz_rt", index="sample", values="sample_intensity")
 
-def format_table(ramclustr_data):
-    ramclustr_data.set_index('mz_rt', inplace=True)
-    ramclustr_data = ramclustr_data.transpose()
-    ramclustr_data.index.rename('sample', inplace=True)
-    return ramclustr_data
-
-
-def main():
     try:
-        aplcms_table = pd.read_hdf(args.dataframe, args.table, errors='None')
-    except KeyError:
-        msg = "Selected table does not exist in HDF dataframe"
-        print(msg, file=sys.stderr)
-        sys.exit(1)
-
-    ramclustr_data = extract_data(aplcms_table)
-    ramclustr_table = format_table(ramclustr_data)
-
-    ramclustr_table.to_csv(args.output, sep=',')
-    msg = "Table '{}' of HDF dataset is converted to csv for RamClutsR".format(args.table)
-    print(msg, file=sys.stdout)
+        featureTable.to_csv(args.output, sep=',')
+        msg = f"Dataset of {len(featureTable)} samples is converted to a feature-by-sample table"
+        print(msg, file=sys.stdout)
+        return 0
+    except Exception:
+        print("Could not write the data", file=sys.stdout)
+        return 1
 
 
 if __name__ == "__main__":