Mercurial > repos > recetox > aplcms_to_ramclustr_converter
comparison hdf_converter.py @ 0:062f4c571a24 draft
"planemo upload for repository https://github.com/RECETOX/galaxytools/tools/hdf_converter/ commit 7c15608bc9e6d0cc28daed590341b2b22f9fcedf"
author | recetox |
---|---|
date | Tue, 15 Dec 2020 17:38:07 +0000 |
parents | |
children | 52470d439e50 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:062f4c571a24 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import optparse | |
4 import sys | |
5 import warnings | |
6 | |
7 import pandas as pd | |
8 | |
9 | |
# Suppress every warning category for the whole run of the script.
warnings.simplefilter('ignore')

# Command-line interface: two named options; any remaining positional
# arguments end up in ``args``. Parsing happens at import time.
parser = optparse.OptionParser()
parser.add_option(
    "--dataframe",
    dest="dataframe",
    help="Name of hdf dataframe",
)
parser.add_option(
    "--table",
    dest="table",
    help="Name of a table in the dataframe",
)
options, args = parser.parse_args()
16 | |
17 | |
def extract_samples(table, num_samples, idx):
    """Build the ``mz_rt`` -> intensity table for a single sample.

    The first four columns of ``table`` are skipped; the sample's intensity
    column is read at position ``4 + idx`` and its retention-time column at
    position ``4 + num_samples + idx``.

    Args:
        table: DataFrame with an ``mz`` column and the layout described above.
            It is NOT modified by this function.
        num_samples: Number of samples stored in ``table``.
        idx: Zero-based index of the sample to extract.

    Returns:
        A DataFrame indexed by ``mz_rt`` (``"<mz>_<sample rt>"`` strings) with
        one column, named after the sample, holding its intensities. Rows
        whose retention time for this sample is missing are left out.

    Raises:
        IndexError: If the intensity column name contains no ``'.'``; the
            sample name is taken as the second ``'.'``-separated field.
    """
    column_names = table.columns.tolist()
    intensity_idx = 4 + idx
    rt_idx = 4 + num_samples + idx
    rt_column = column_names[rt_idx]

    # Filter into a new frame instead of dropping rows in place: the caller
    # passes the same table for every sample, so an in-place drop would remove
    # rows that later samples still need and make the result order-dependent.
    present = table.dropna(subset=[rt_column])

    sample_name = column_names[intensity_idx].split('.')[1]
    mzrt = present['mz'].map(str) + '_' + present.iloc[:, rt_idx].map(str)
    intensity = present.iloc[:, intensity_idx]

    mzrt_intensity = pd.DataFrame(
        {'mz_rt': mzrt, sample_name: intensity},
        columns=['mz_rt', sample_name],
    )
    return mzrt_intensity.set_index('mz_rt')
30 | |
31 | |
def join_samples(table):
    """Combine the per-sample intensity tables of ``table`` into one frame.

    Each sample is extracted with ``extract_samples`` and outer-merged on
    ``mz_rt`` into an accumulator that starts as an empty frame with a single
    ``mz_rt`` column.

    Args:
        table: DataFrame whose first 4 columns are the defaults
            (mz, rt, mz_min, mz_max); the rest are the intensity and rt
            columns for each sample.

    Returns:
        The merged DataFrame with an ``mz_rt`` column plus the columns
        contributed by each sample.
    """
    # Everything after the 4 default columns comes in intensity/rt pairs.
    per_sample_columns = len(table.columns) - 4
    num_samples = int(per_sample_columns / 2)

    merged = pd.DataFrame(columns=['mz_rt'])
    # Lazy generator: samples are still extracted one at a time, in order.
    sample_frames = (
        extract_samples(table, num_samples, position)
        for position in range(num_samples)
    )
    for frame in sample_frames:
        merged = merged.merge(frame, on='mz_rt', how='outer')
    return merged
39 | |
40 | |
def convert_to_RamClustR(RamClustr_data):
    """Reshape the merged table so that every row is one sample.

    Missing values are replaced by 0, the ``mz_rt`` column is renamed to
    ``sample`` and used as the index, and the frame is transposed so the
    former value columns become rows.

    Args:
        RamClustr_data: DataFrame with an ``mz_rt`` column and one value
            column per sample. It is NOT modified by this function.

    Returns:
        The transposed DataFrame; its index is named ``sample`` and its
        columns are the former ``mz_rt`` values.
    """
    # Chain non-inplace operations so the caller's frame is left untouched.
    indexed = (
        RamClustr_data
        .fillna(0)
        .rename(columns={'mz_rt': 'sample'})
        .set_index('sample')
    )
    RamClustr_data_transposed = indexed.transpose()
    # After the transpose the row index is unnamed; label it so the first
    # header cell of the exported table reads "sample".
    RamClustr_data_transposed.index.name = 'sample'
    return RamClustr_data_transposed
48 | |
49 | |
def main():
    """Convert one table of an HDF file to a ';'-separated csv file.

    Reads the module-level ``options`` (``--dataframe``, ``--table``) and
    ``args`` (first positional argument = output path), converts the selected
    table with ``join_samples`` and ``convert_to_RamClustR``, writes the
    result and prints a confirmation message.

    Exits via ``sys.exit`` with an explanatory message when the output path
    or the HDF file argument is missing, or when the selected table does not
    exist in the HDF file.
    """
    # Validate the command line before doing any work, so a missing argument
    # gives a readable message instead of an IndexError traceback.
    if not args:
        sys.exit("Output file path must be given as a positional argument")
    if not options.dataframe:
        sys.exit("HDF file must be given with --dataframe")
    output = args[0]

    try:
        # NOTE(review): errors is passed as the string 'None', not the None
        # object - confirm this is the intended encoding-error handler.
        aplcms_table = pd.read_hdf(options.dataframe, options.table, errors='None')
    except KeyError:
        sys.exit("Selected table does not exist in HDF dataframe")

    joined_data = join_samples(aplcms_table)
    RamClustr_data = convert_to_RamClustR(joined_data)
    RamClustr_data.to_csv(output, sep=';')
    print("Table '{}' of HDF dataset is converted to csv for RamClutsR".format(options.table))


if __name__ == "__main__":
    main()