comparison hdf_converter.py @ 2:644192cf22a5 draft

"planemo upload for repository https://github.com/RECETOX/galaxytools/tools/hdf_converter/ commit 5cdd2628a1a509b3e0ccc599eaab63d664bf031a"
author recetox
date Wed, 13 Jan 2021 15:55:42 +0000
parents 52470d439e50
children
comparison
equal deleted inserted replaced
1:52470d439e50 2:644192cf22a5
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 2
3 import optparse 3 import argparse
4 import sys 4 import sys
5 import warnings 5 import warnings
6 6
7 import pandas as pd 7 import pandas as pd
8 8
9 9
10 warnings.simplefilter('ignore') 10 warnings.simplefilter('ignore')
11 11
12 parser = optparse.OptionParser() 12 parser = argparse.ArgumentParser()
13 parser.add_option("--dataframe", help="Name of hdf dataframe") 13 parser.add_argument("--dataframe", help="Name of hdf dataframe")
14 parser.add_option("--table", help="Name of a table in the dataframe") 14 parser.add_argument("--table", help="Name of a table in the dataframe")
15 (options, args) = parser.parse_args() 15 parser.add_argument('output')
16 args = parser.parse_args()
16 17
17 18
18 def extract_samples(table, num_samples, idx): 19 def extract_data(table):
19 intensity_idx = 4 + idx 20 num_samples = int((len(table.columns.tolist()) - 4) / 2)
20 rt_idx = 4 + num_samples + idx 21 mz_rt = table['mz'].map(str) + "_" + table['rt'].map(str)
21 rt_idx_name = table.columns.tolist()[rt_idx] 22
22 table.dropna(subset=[rt_idx_name], inplace=True) 23 intensities = table.iloc[:, 4:(4 + num_samples)]
23 sample_name = table.columns.tolist()[intensity_idx].split('.')[1] 24 sample_labels = [label.split('.')[1] for label in intensities.columns.tolist()]
24 mzrt = table['mz'].map(str) + '_' + table.iloc[:, rt_idx].map(str) 25 ramclustr_data = pd.DataFrame({'mz_rt': mz_rt})
25 intensity = table.iloc[:, intensity_idx] 26
26 mzrt_intensity = {'mz_rt': mzrt, sample_name: intensity} 27 for idx in range(num_samples):
27 mzrt_intensity = pd.DataFrame(mzrt_intensity, columns=['mz_rt', sample_name]) 28 label = sample_labels[idx]
28 mzrt_intensity.set_index('mz_rt', inplace=True) 29 ramclustr_data[label] = intensities.iloc[:, idx]
29 return mzrt_intensity 30
31 return ramclustr_data
30 32
31 33
32 def join_samples(table): 34 def format_table(ramclustr_data):
33 num_samples = int((len(table.columns.tolist()) - 4) / 2) # 4 default columns: mz,rt,mz_min,mz_max. The rest is intensity and rt columns for each sample 35 ramclustr_data.set_index('mz_rt', inplace=True)
34 RamClustr_data = pd.DataFrame(columns=['mz_rt']) 36 ramclustr_data = ramclustr_data.transpose()
35 for sample in range(num_samples): 37 ramclustr_data.index.rename('sample', inplace=True)
36 sample_data = extract_samples(table, num_samples, sample) 38 return ramclustr_data
37 RamClustr_data = pd.merge(RamClustr_data, sample_data, on='mz_rt', how='outer')
38 return RamClustr_data
39
40
41 def convert_to_RamClustR(RamClustr_data):
42 RamClustr_data.fillna(0, inplace=True)
43 RamClustr_data.rename(columns={'mz_rt': 'sample'}, inplace=True)
44 RamClustr_data.set_index('sample', inplace=True)
45 RamClustr_data_transposed = RamClustr_data.transpose()
46 RamClustr_data_transposed.index.rename('sample', inplace=True)
47 return RamClustr_data_transposed
48 39
49 40
50 def main(): 41 def main():
51 try: 42 try:
52 aplcms_table = pd.read_hdf(options.dataframe, options.table, errors='None') 43 aplcms_table = pd.read_hdf(args.dataframe, args.table, errors='None')
53 except KeyError: 44 except KeyError:
54 sys.exit("Selected table does not exist in HDF dataframe") 45 msg = "Selected table does not exist in HDF dataframe"
46 print(msg, file=sys.stderr)
47 sys.exit(1)
55 48
56 RamClustr_data = join_samples(aplcms_table) 49 ramclustr_data = extract_data(aplcms_table)
57 RamClustr_data = convert_to_RamClustR(RamClustr_data) 50 ramclustr_table = format_table(ramclustr_data)
58 output = args[0] 51
59 RamClustr_data.to_csv(output, sep=';') 52 ramclustr_table.to_csv(args.output, sep=',')
60 print("Table '{}' of HDF dataset is converted to csv for RamClutsR".format(options.table)) 53 msg = "Table '{}' of HDF dataset is converted to csv for RamClutsR".format(args.table)
54 print(msg, file=sys.stdout)
61 55
62 56
63 if __name__ == "__main__": 57 if __name__ == "__main__":
64 main() 58 main()