Mercurial > repos > recetox > aplcms_to_ramclustr_converter
comparison hdf_converter.py @ 2:644192cf22a5 draft
"planemo upload for repository https://github.com/RECETOX/galaxytools/tools/hdf_converter/ commit 5cdd2628a1a509b3e0ccc599eaab63d664bf031a"
| author | recetox |
|---|---|
| date | Wed, 13 Jan 2021 15:55:42 +0000 |
| parents | 52470d439e50 |
| children |
comparison
equal
deleted
inserted
replaced
| 1:52470d439e50 | 2:644192cf22a5 |
|---|---|
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
| 2 | 2 |
| 3 import optparse | 3 import argparse |
| 4 import sys | 4 import sys |
| 5 import warnings | 5 import warnings |
| 6 | 6 |
| 7 import pandas as pd | 7 import pandas as pd |
| 8 | 8 |
| 9 | 9 |
| 10 warnings.simplefilter('ignore') | 10 warnings.simplefilter('ignore') |
| 11 | 11 |
| 12 parser = optparse.OptionParser() | 12 parser = argparse.ArgumentParser() |
| 13 parser.add_option("--dataframe", help="Name of hdf dataframe") | 13 parser.add_argument("--dataframe", help="Name of hdf dataframe") |
| 14 parser.add_option("--table", help="Name of a table in the dataframe") | 14 parser.add_argument("--table", help="Name of a table in the dataframe") |
| 15 (options, args) = parser.parse_args() | 15 parser.add_argument('output') |
| | 16 args = parser.parse_args() |
| 16 | 17 |
| 17 | 18 |
| 18 def extract_samples(table, num_samples, idx): | 19 def extract_data(table): |
| 19 intensity_idx = 4 + idx | 20 num_samples = int((len(table.columns.tolist()) - 4) / 2) |
| 20 rt_idx = 4 + num_samples + idx | 21 mz_rt = table['mz'].map(str) + "_" + table['rt'].map(str) |
| 21 rt_idx_name = table.columns.tolist()[rt_idx] | 22 |
| 22 table.dropna(subset=[rt_idx_name], inplace=True) | 23 intensities = table.iloc[:, 4:(4 + num_samples)] |
| 23 sample_name = table.columns.tolist()[intensity_idx].split('.')[1] | 24 sample_labels = [label.split('.')[1] for label in intensities.columns.tolist()] |
| 24 mzrt = table['mz'].map(str) + '_' + table.iloc[:, rt_idx].map(str) | 25 ramclustr_data = pd.DataFrame({'mz_rt': mz_rt}) |
| 25 intensity = table.iloc[:, intensity_idx] | 26 |
| 26 mzrt_intensity = {'mz_rt': mzrt, sample_name: intensity} | 27 for idx in range(num_samples): |
| 27 mzrt_intensity = pd.DataFrame(mzrt_intensity, columns=['mz_rt', sample_name]) | 28 label = sample_labels[idx] |
| 28 mzrt_intensity.set_index('mz_rt', inplace=True) | 29 ramclustr_data[label] = intensities.iloc[:, idx] |
| 29 return mzrt_intensity | 30 |
| | 31 return ramclustr_data |
| 30 | 32 |
| 31 | 33 |
| 32 def join_samples(table): | 34 def format_table(ramclustr_data): |
| 33 num_samples = int((len(table.columns.tolist()) - 4) / 2) # 4 default columns: mz,rt,mz_min,mz_max. The rest is intensity and rt columns for each sample | 35 ramclustr_data.set_index('mz_rt', inplace=True) |
| 34 RamClustr_data = pd.DataFrame(columns=['mz_rt']) | 36 ramclustr_data = ramclustr_data.transpose() |
| 35 for sample in range(num_samples): | 37 ramclustr_data.index.rename('sample', inplace=True) |
| 36 sample_data = extract_samples(table, num_samples, sample) | 38 return ramclustr_data |
| 37 RamClustr_data = pd.merge(RamClustr_data, sample_data, on='mz_rt', how='outer') | |
| 38 return RamClustr_data | |
| 39 | |
| 40 | |
| 41 def convert_to_RamClustR(RamClustr_data): | |
| 42 RamClustr_data.fillna(0, inplace=True) | |
| 43 RamClustr_data.rename(columns={'mz_rt': 'sample'}, inplace=True) | |
| 44 RamClustr_data.set_index('sample', inplace=True) | |
| 45 RamClustr_data_transposed = RamClustr_data.transpose() | |
| 46 RamClustr_data_transposed.index.rename('sample', inplace=True) | |
| 47 return RamClustr_data_transposed | |
| 48 | 39 |
| 49 | 40 |
| 50 def main(): | 41 def main(): |
| 51 try: | 42 try: |
| 52 aplcms_table = pd.read_hdf(options.dataframe, options.table, errors='None') | 43 aplcms_table = pd.read_hdf(args.dataframe, args.table, errors='None') |
| 53 except KeyError: | 44 except KeyError: |
| 54 sys.exit("Selected table does not exist in HDF dataframe") | 45 msg = "Selected table does not exist in HDF dataframe" |
| | 46 print(msg, file=sys.stderr) |
| | 47 sys.exit(1) |
| 55 | 48 |
| 56 RamClustr_data = join_samples(aplcms_table) | 49 ramclustr_data = extract_data(aplcms_table) |
| 57 RamClustr_data = convert_to_RamClustR(RamClustr_data) | 50 ramclustr_table = format_table(ramclustr_data) |
| 58 output = args[0] | 51 |
| 59 RamClustr_data.to_csv(output, sep=';') | 52 ramclustr_table.to_csv(args.output, sep=',') |
| 60 print("Table '{}' of HDF dataset is converted to csv for RamClutsR".format(options.table)) | 53 msg = "Table '{}' of HDF dataset is converted to csv for RamClutsR".format(args.table) |
| | 54 print(msg, file=sys.stdout) |
| 61 | 55 |
| 62 | 56 |
| 63 if __name__ == "__main__": | 57 if __name__ == "__main__": |
| 64 main() | 58 main() |
