comparison aplcms_to_ramclustr_converter.py @ 4:9ea34e24474f draft

"planemo upload for repository https://github.com/RECETOX/galaxytools/tools/aplcms_to_ramclustr_converter/ commit 2dd20229f0c7f43dacc0d201ea50fef3c993d30e"
author recetox
date Mon, 09 Aug 2021 15:29:08 +0000
parents 07667688735e
children
comparison
equal deleted inserted replaced
3:07667688735e 4:9ea34e24474f
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 2
3 import argparse 3 import argparse
4 import sys 4 import sys
5 import warnings
6 5
7 import pandas as pd 6 import pandas as pd
8 7
9 8
10 warnings.simplefilter('ignore')
11
12 parser = argparse.ArgumentParser() 9 parser = argparse.ArgumentParser()
13 parser.add_argument("--dataframe", help="Name of hdf dataframe") 10 parser.add_argument("--dataframe", help="Parquet dataframe")
14 parser.add_argument("--table", help="Name of a table in the dataframe")
15 parser.add_argument('output') 11 parser.add_argument('output')
16 args = parser.parse_args() 12 args = parser.parse_args()
17 13
18 14
19 def extract_data(table): 15 def main():
20 num_samples = int((len(table.columns.tolist()) - 4) / 2) 16 featureTable = pd.read_parquet(args.dataframe)
21 mz_rt = table['mz'].map(str) + "_" + table['rt'].map(str)
22 17
23 intensities = table.iloc[:, 4:(4 + num_samples)] 18 # Concatenate "mz" and "rt" columns; select relevant columns; pivot the table
24 sample_labels = [label.split('.')[1] for label in intensities.columns.tolist()] 19 featureTable["mz_rt"] = featureTable["mz"].astype(str) + "_" + featureTable["rt"].astype(str)
25 ramclustr_data = pd.DataFrame({'mz_rt': mz_rt}) 20 featureTable = featureTable[["sample", "mz_rt", "sample_intensity"]]
21 featureTable = pd.pivot_table(featureTable, columns="mz_rt", index="sample", values="sample_intensity")
26 22
27 for idx in range(num_samples):
28 label = sample_labels[idx]
29 ramclustr_data[label] = intensities.iloc[:, idx]
30
31 return ramclustr_data
32
33
34 def format_table(ramclustr_data):
35 ramclustr_data.set_index('mz_rt', inplace=True)
36 ramclustr_data = ramclustr_data.transpose()
37 ramclustr_data.index.rename('sample', inplace=True)
38 return ramclustr_data
39
40
41 def main():
42 try: 23 try:
43 aplcms_table = pd.read_hdf(args.dataframe, args.table, errors='None') 24 featureTable.to_csv(args.output, sep=',')
44 except KeyError: 25 msg = f"Dataset of {len(featureTable)} samples is converted to a feature-by-sample table"
45 msg = "Selected table does not exist in HDF dataframe" 26 print(msg, file=sys.stdout)
46 print(msg, file=sys.stderr) 27 return 0
47 sys.exit(1) 28 except Exception:
48 29 print("Could not write the data", file=sys.stdout)
49 ramclustr_data = extract_data(aplcms_table) 30 return 1
50 ramclustr_table = format_table(ramclustr_data)
51
52 ramclustr_table.to_csv(args.output, sep=',')
53 msg = "Table '{}' of HDF dataset is converted to csv for RamClutsR".format(args.table)
54 print(msg, file=sys.stdout)
55 31
56 32
57 if __name__ == "__main__": 33 if __name__ == "__main__":
58 main() 34 main()