#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Provenance (recovered from the Mercurial changeset header this file was
# extracted from):
#   HG changeset patch
#   User bebatut
#   Date 1460444173 14400
#   Node ID 035a848cb73eb3b01d846d96da4453e75bdbf4eb
#   planemo upload for repository
#   https://github.com/asaim/galaxytools/tree/master/tools/compare_humann2_output
#   commit 5c45ed58045ce1686aa069403f8a9426ea20bac5-dirty
#   diff -r 000000000000 -r 035a848cb73e compare_humann2_output.py
#   --- /dev/null Thu Jan 01 00:00:00 1970 +0000
#   +++ b/compare_humann2_output.py Tue Apr 12 02:56:13 2016 -0400
#   @@ -0,0 +1,138 @@
"""Compare per-sample characteristic abundance tables.

Each input is a tab-separated file with one header line followed by
``<characteristic>\\t<abundance>`` rows.  The script writes:

* a table of the most abundant characteristics of every sample,
* a table of the characteristics found in all samples,
* one table per sample with the characteristics not shared by all samples.

Summary statistics are printed on standard output.
"""

import argparse
import collections
import heapq
import os
import re
import sys


def extract_abundances(filepath, nb_charact_to_extract):
    """Read one abundance table and rank its characteristics.

    Args:
        filepath: path of a tab-separated file; the first line is skipped as
            a header, every other non-blank line must hold a characteristic
            name in column 1 and a number in column 2.
        nb_charact_to_extract: how many of the most abundant characteristics
            to report.

    Returns:
        A tuple ``(abundances, more_abund_charact)`` where ``abundances``
        maps each characteristic to ``100 * value`` and
        ``more_abund_charact`` lists at most ``nb_charact_to_extract``
        characteristics by decreasing abundance.

    Raises:
        IndexError: a non-blank data line has fewer than two columns.
        ValueError: the second column is not a number.
    """
    abundances = {}
    with open(filepath, 'r') as abundance_file:
        next(abundance_file, None)  # the first line is a header
        for line in abundance_file:
            # Strip only the line terminator: slicing off the last character
            # would eat a digit when the file has no trailing newline.
            line = line.rstrip('\r\n')
            if not line.strip():
                continue
            split_line = line.split('\t')
            abundances[split_line[0]] = 100 * float(split_line[1])

    # Rank once the whole table is known, always comparing values on the
    # same (x100) scale.
    more_abund_charact = heapq.nlargest(
        nb_charact_to_extract, abundances, key=abundances.get)
    return abundances, more_abund_charact


def format_characteristic_name(all_name):
    """Split ``"<id>: <name>"`` into a cleaned ``(id, name)`` pair.

    In the name, ``/`` and ``-`` become spaces, apostrophes are removed and
    the first ``(...)`` group is dropped.  Without a colon the whole string
    is the id and the name is empty.

    Args:
        all_name: raw characteristic label.

    Returns:
        A tuple ``(charact_id, charact_name)``.
    """
    if ':' in all_name:
        # NOTE(review): only the text between the first and the second colon
        # is kept, and its first character (presumably a space) is dropped;
        # ids that themselves contain a colon are therefore truncated.
        pieces = all_name.split(':')
        charact_id = pieces[0]
        charact_name = pieces[1][1:]
    else:
        charact_id = all_name
        charact_name = ''

    charact_name = charact_name.replace('/', ' ')
    charact_name = charact_name.replace('-', ' ')
    charact_name = charact_name.replace("'", '')

    open_bracket = charact_name.find('(')
    if open_bracket != -1:
        # Look for the closing bracket after the opening one so that a stray
        # ')' earlier in the name cannot produce an overlapping slice.
        close_bracket = charact_name.find(')', open_bracket + 1)
        if close_bracket != -1:
            charact_name = (charact_name[:open_bracket] +
                            charact_name[close_bracket + 1:])
    return charact_id, charact_name


def write_more_abundant_charat(abundances, more_abund_charact,
                               output_filepath):
    """Write the abundance of the selected characteristics in every sample.

    Args:
        abundances: mapping ``sample -> {characteristic: abundance}``.
        more_abund_charact: characteristics to report, one row each.
        output_filepath: path of the tab-separated table to create.

    A characteristic missing from a sample is reported as ``0``.
    """
    samples = list(abundances)
    with open(output_filepath, 'w') as output_file:
        output_file.write('id\tname\t' + '\t'.join(samples) + '\n')
        for mac in more_abund_charact:
            charact_id, charact_name = format_characteristic_name(mac)
            row = [charact_id, charact_name]
            row.extend(str(abundances[sample].get(mac, 0))
                       for sample in samples)
            output_file.write('\t'.join(row) + '\n')


def extract_similar_characteristics(abundances, sim_output_filepath,
                                    specific_output_files):
    """Separate characteristics shared by all samples from the others.

    Args:
        abundances: mapping ``sample -> {characteristic: abundance}``.
        sim_output_filepath: path of the table listing the characteristics
            present in every sample, with their abundance in each sample.
        specific_output_files: one output path per sample, in the iteration
            order of ``abundances``; each receives the characteristics of
            that sample which are not shared by all samples.

    Returns:
        The set of characteristics present in every sample.

    Raises:
        IndexError: fewer output paths than samples were given.
    """
    samples = list(abundances)
    if samples:
        sim_characteristics = set(abundances[samples[0]])
        for sample in samples[1:]:
            sim_characteristics.intersection_update(abundances[sample])
    else:
        sim_characteristics = set()
    print('Similar between all samples: %d' % len(sim_characteristics))

    with open(sim_output_filepath, 'w') as sim_output_file:
        sim_output_file.write('id\tname\t' + '\t'.join(samples) + '\n')
        # Sorted so that the output does not depend on set iteration order.
        for charact in sorted(sim_characteristics):
            charact_id, charact_name = format_characteristic_name(charact)
            row = [charact_id, charact_name]
            row.extend(str(abundances[sample][charact]) for sample in samples)
            sim_output_file.write('\t'.join(row) + '\n')

    print('Specific to samples:')
    for i, sample in enumerate(samples):
        sample_abundances = abundances[sample]
        specific = set(sample_abundances) - sim_characteristics
        total = len(sample_abundances)
        # A sample without any characteristic has nothing specific.
        percentage = 100 * len(specific) / (1. * total) if total else 0.0

        print('  %s ' % sample)
        print(' All: %d' % total)
        print(' Number of specific characteristics: %d' % len(specific))
        print(' Percentage of specific characteristics: %s' % percentage)

        relative_abundance = 0
        with open(specific_output_files[i], 'w') as output_file:
            output_file.write('id\tname\tabundances\n')
            for charact in sorted(specific):
                charact_id, charact_name = format_characteristic_name(charact)
                output_file.write(charact_id + '\t' + charact_name + '\t')
                output_file.write(str(sample_abundances[charact]) + '\n')
                relative_abundance += sample_abundances[charact]
        print(' Relative abundance of specific characteristics(%%): %s'
              % relative_abundance)

    return sim_characteristics


def compare_humann2_output(args):
    """Run the whole comparison described by the parsed arguments.

    Args:
        args: namespace with ``sample_name``, ``charact_input_file`` and
            ``specific_output_file`` (parallel lists), plus
            ``most_abundant_characteristics_to_extract``,
            ``more_abundant_output_file`` and ``similar_output_file``.
    """
    # Ordered so that the i-th sample is always paired with the i-th
    # --specific_output_file, whatever the interpreter's dict ordering.
    abundances = collections.OrderedDict()
    more_abund_charact = set()

    for i, sample_name in enumerate(args.sample_name):
        abundances[sample_name], mac = extract_abundances(
            args.charact_input_file[i],
            args.most_abundant_characteristics_to_extract)
        more_abund_charact.update(mac)

    write_more_abundant_charat(abundances, sorted(more_abund_charact),
                               args.more_abundant_output_file)
    extract_similar_characteristics(abundances, args.similar_output_file,
                                    args.specific_output_file)


def main():
    """Parse the command line, validate it and run the comparison."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--sample_name', required=True, action='append')
    parser.add_argument('--charact_input_file', required=True,
                        action='append')
    parser.add_argument('--most_abundant_characteristics_to_extract',
                        required=True, type=int)
    parser.add_argument('--more_abundant_output_file', required=True)
    parser.add_argument('--similar_output_file', required=True)
    parser.add_argument('--specific_output_file', required=True,
                        action='append')
    args = parser.parse_args()

    if len(args.sample_name) != len(args.charact_input_file):
        raise ValueError("Same number of values (in same order) are expected for --sample_name and --charact_input_file")
    if len(args.sample_name) != len(args.specific_output_file):
        raise ValueError("Same number of values (in same order) are expected for --sample_name and --specific_output_file")

    compare_humann2_output(args)


if __name__ == '__main__':
    main()

# ---------------------------------------------------------------------------
# The same changeset also added compare_humann2_output.xml:
#   diff -r 000000000000 -r 035a848cb73e compare_humann2_output.xml
#   --- /dev/null Thu Jan 01 00:00:00 1970 +0000
#   +++ b/compare_humann2_output.xml Tue Apr 12 02:56:13 2016 -0400
#   @@ -0,0 +1,69 @@
# Its markup did not survive extraction; the only text left of it is
# "and extract information", "$log" and "]]>"
# (followed by "\ No newline at end of file").