diff aplcms_to_ramclustr_converter.py @ 4:9ea34e24474f draft

"planemo upload for repository https://github.com/RECETOX/galaxytools/tools/aplcms_to_ramclustr_converter/ commit 2dd20229f0c7f43dacc0d201ea50fef3c993d30e"
author recetox
date Mon, 09 Aug 2021 15:29:08 +0000
parents 07667688735e
children
line wrap: on
line diff
--- a/aplcms_to_ramclustr_converter.py	Wed Feb 17 15:14:33 2021 +0000
+++ b/aplcms_to_ramclustr_converter.py	Mon Aug 09 15:29:08 2021 +0000
@@ -2,56 +2,32 @@
 
 import argparse
 import sys
-import warnings
 
 import pandas as pd
 
 
-warnings.simplefilter('ignore')
-
 parser = argparse.ArgumentParser()
-parser.add_argument("--dataframe", help="Name of hdf dataframe")
-parser.add_argument("--table", help="Name of a table in the dataframe")
+parser.add_argument("--dataframe", help="Parquet dataframe")
 parser.add_argument('output')
 args = parser.parse_args()
 
 
-def extract_data(table):
-    num_samples = int((len(table.columns.tolist()) - 4) / 2)
-    mz_rt = table['mz'].map(str) + "_" + table['rt'].map(str)
+def main():
+    featureTable = pd.read_parquet(args.dataframe)
 
-    intensities = table.iloc[:, 4:(4 + num_samples)]
-    sample_labels = [label.split('.')[1] for label in intensities.columns.tolist()]
-    ramclustr_data = pd.DataFrame({'mz_rt': mz_rt})
-
-    for idx in range(num_samples):
-        label = sample_labels[idx]
-        ramclustr_data[label] = intensities.iloc[:, idx]
-
-    return ramclustr_data
-
+    # Concatenate "mz" and "rt" columns; select relevant columns; pivot the table
+    featureTable["mz_rt"] = featureTable["mz"].astype(str) + "_" + featureTable["rt"].astype(str)
+    featureTable = featureTable[["sample", "mz_rt", "sample_intensity"]]
+    featureTable = pd.pivot_table(featureTable, columns="mz_rt", index="sample", values="sample_intensity")
 
-def format_table(ramclustr_data):
-    ramclustr_data.set_index('mz_rt', inplace=True)
-    ramclustr_data = ramclustr_data.transpose()
-    ramclustr_data.index.rename('sample', inplace=True)
-    return ramclustr_data
-
-
-def main():
     try:
-        aplcms_table = pd.read_hdf(args.dataframe, args.table, errors='None')
-    except KeyError:
-        msg = "Selected table does not exist in HDF dataframe"
-        print(msg, file=sys.stderr)
-        sys.exit(1)
-
-    ramclustr_data = extract_data(aplcms_table)
-    ramclustr_table = format_table(ramclustr_data)
-
-    ramclustr_table.to_csv(args.output, sep=',')
-    msg = "Table '{}' of HDF dataset is converted to csv for RamClutsR".format(args.table)
-    print(msg, file=sys.stdout)
+        featureTable.to_csv(args.output, sep=',')
+        msg = f"Dataset of {len(featureTable)} samples is converted to a feature-by-sample table"
+        print(msg, file=sys.stdout)
+        return 0
+    except Exception:
+        print("Could not write the data", file=sys.stderr)
+        return 1
 
 
 if __name__ == "__main__":