Mercurial > repos > bebatut > format_cd_hit_output
annotate format_cd_hit_output.py @ 0:bbd903996900 draft
planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit ffb68b2ddd94854a34a2533105f7bc08884c6e38-dirty
author | bebatut |
---|---|
date | Wed, 27 Jan 2016 03:28:42 -0500 |
parents | |
children | 4ba41bcee051 |
rev | line source |
---|---|
0
bbd903996900
planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit ffb68b2ddd94854a34a2533105f7bc08884c6e38-dirty
bebatut
parents:
diff
changeset
|
1 #!/usr/bin/env python |
bbd903996900
planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit ffb68b2ddd94854a34a2533105f7bc08884c6e38-dirty
bebatut
parents:
diff
changeset
|
2 # -*- coding: utf-8 -*- |
bbd903996900
planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit ffb68b2ddd94854a34a2533105f7bc08884c6e38-dirty
bebatut
parents:
diff
changeset
|
3 |
bbd903996900
planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit ffb68b2ddd94854a34a2533105f7bc08884c6e38-dirty
bebatut
parents:
diff
changeset
|
4 import sys |
bbd903996900
planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit ffb68b2ddd94854a34a2533105f7bc08884c6e38-dirty
bebatut
parents:
diff
changeset
|
5 import os |
bbd903996900
planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit ffb68b2ddd94854a34a2533105f7bc08884c6e38-dirty
bebatut
parents:
diff
changeset
|
6 import argparse |
bbd903996900
planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit ffb68b2ddd94854a34a2533105f7bc08884c6e38-dirty
bebatut
parents:
diff
changeset
|
7 import copy |
bbd903996900
planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit ffb68b2ddd94854a34a2533105f7bc08884c6e38-dirty
bebatut
parents:
diff
changeset
|
8 import operator |
bbd903996900
planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit ffb68b2ddd94854a34a2533105f7bc08884c6e38-dirty
bebatut
parents:
diff
changeset
|
9 from sets import Set |
bbd903996900
planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit ffb68b2ddd94854a34a2533105f7bc08884c6e38-dirty
bebatut
parents:
diff
changeset
|
10 |
bbd903996900
planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit ffb68b2ddd94854a34a2533105f7bc08884c6e38-dirty
bebatut
parents:
diff
changeset
|
def extract_mapping_info(input_mapping_filepath):
    """Read a tab-separated mapping file of sequence names and categories.

    Every non-empty line must hold at least two tab-separated columns: the
    sequence name first, its category second. Extra columns are ignored.

    Args:
        input_mapping_filepath: path of the mapping file to read.

    Returns:
        A ``(mapping_info, categories)`` tuple. ``mapping_info`` maps each
        sequence name to its category (the first occurrence wins when a name
        is repeated). ``categories`` is the set of every category found in
        the second column.

    Raises:
        ValueError: if a non-empty line has fewer than two columns.
    """
    mapping_info = {}
    # Built-in set: the ``sets`` module does not exist any more in Python 3.
    categories = set()

    with open(input_mapping_filepath, 'r') as mapping_file:
        # Iterate lazily instead of loading the whole file with readlines().
        for raw_line in mapping_file:
            # Strip only the line terminator, so that a last line without a
            # trailing newline does not lose its final character.
            line = raw_line.rstrip('\r\n')
            if not line:
                continue
            split_line = line.split('\t')
            if len(split_line) < 2:
                raise ValueError("Malformed line in mapping file: " + line)
            mapping_info.setdefault(split_line[0], split_line[1])
            categories.add(split_line[1])

    return mapping_info, categories
bbd903996900
planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit ffb68b2ddd94854a34a2533105f7bc08884c6e38-dirty
bebatut
parents:
diff
changeset
|
22 |
bbd903996900
planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit ffb68b2ddd94854a34a2533105f7bc08884c6e38-dirty
bebatut
parents:
diff
changeset
|
def init_category_distribution(categories=None):
    """Build a fresh counter with one zeroed entry per category.

    Args:
        categories: iterable of category names, or None when no category
            information is available.

    Returns:
        A new dict mapping every category to 0; an empty dict when
        ``categories`` is None.
    """
    # Identity test: None is a singleton and must not go through __ne__.
    if categories is None:
        return {}
    return dict.fromkeys(categories, 0)
bbd903996900
planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit ffb68b2ddd94854a34a2533105f7bc08884c6e38-dirty
bebatut
parents:
diff
changeset
|
29 |
bbd903996900
planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit ffb68b2ddd94854a34a2533105f7bc08884c6e38-dirty
bebatut
parents:
diff
changeset
|
def flush_cluster_info(cluster_name, cluster_ref_seq, ref_seq_cluster,
                       cluster_category_distribution, categories,
                       output_category_distribution_file,
                       cluster_seq_number):
    """Record a finished cluster and, if requested, write its category row.

    Nothing is done when ``cluster_name`` is the empty string, which is the
    state before the first cluster header has been read.

    Args:
        cluster_name: name of the cluster being closed ('' if none yet).
        cluster_ref_seq: name of the cluster's reference sequence ('' if
            none was found).
        ref_seq_cluster: dict updated in place; maps a reference sequence
            name to its cluster name (an existing entry is kept).
        cluster_category_distribution: dict of per-category counts for the
            cluster; read only when a row is written.
        categories: iterable of category names fixing the column order, or
            None when no category information is available.
        output_category_distribution_file: writable text file object
            receiving the row, or None when no distribution is written.
        cluster_seq_number: number of sequences in the cluster.

    Raises:
        ValueError: if the cluster has no reference sequence.
    """
    if cluster_name == '':
        return

    # A row can only be written when both the category list and an open
    # output handle are available; otherwise only the reference is recorded.
    if (categories is not None
            and output_category_distribution_file is not None):
        fields = [cluster_name, str(cluster_seq_number)]
        fields.extend(
            str(cluster_category_distribution[category])
            for category in categories)
        output_category_distribution_file.write('\t'.join(fields) + '\n')

    if cluster_ref_seq == '':
        raise ValueError("No reference sequence found for " + cluster_name)

    ref_seq_cluster.setdefault(cluster_ref_seq, cluster_name)
bbd903996900
planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit ffb68b2ddd94854a34a2533105f7bc08884c6e38-dirty
bebatut
parents:
diff
changeset
|
48 |
bbd903996900
planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit ffb68b2ddd94854a34a2533105f7bc08884c6e38-dirty
bebatut
parents:
diff
changeset
|
def extract_cluster_info(args, mapping_info=None, categories=None):
    """Parse the cluster information file and map references to clusters.

    Lines starting with '>' open a new cluster; every other non-empty line
    describes one sequence of the current cluster. When
    ``args.output_category_distribution`` is set, a tab-separated table with
    one row per cluster (name, number of sequences, count per category) is
    written to that path.

    Args:
        args: parsed arguments; reads ``input_cluster_info``,
            ``output_category_distribution`` and ``number_sum``.
        mapping_info: dict mapping sequence names to categories, or None.
        categories: iterable of category names, or None.

    Returns:
        A dict mapping each reference sequence name to its cluster name
        (spaces in cluster names are replaced by underscores).

    Raises:
        ValueError: if a category distribution is requested without mapping
            information, if a sequence is missing from the mapping, if a
            cluster has several reference sequences, or if a cluster has
            none.
    """
    distribution_path = args.output_category_distribution
    if distribution_path is not None and (
            mapping_info is None or categories is None):
        string = "A file with category distribution is expected but "
        string += "no mapping information are available"
        raise ValueError(string)

    ref_seq_cluster = {}
    # Always bound, so the flush calls below never hit an unbound local when
    # no distribution output was requested.
    output_category_distribution_file = None

    try:
        if distribution_path is not None:
            output_category_distribution_file = open(distribution_path, 'w')
            header = ['Cluster', 'Sequence_number']
            header.extend(categories)
            output_category_distribution_file.write('\t'.join(header) + '\n')

        # Rows are written only when an output file exists; hiding the
        # categories from the flush keeps it from touching a None handle.
        flush_categories = None
        if output_category_distribution_file is not None:
            flush_categories = categories

        cluster_name = ''
        cluster_category_distribution = init_category_distribution(categories)
        cluster_ref_seq = ''
        cluster_seq_number = 0

        with open(args.input_cluster_info, 'r') as cluster_info_file:
            for raw_line in cluster_info_file:
                # Strip only the terminator: the last line may lack one and
                # CRLF files would otherwise leave a stray '\r'.
                line = raw_line.rstrip('\r\n')
                if not line:
                    continue

                if line[0] == '>':
                    # New cluster header: close the previous cluster first.
                    flush_cluster_info(
                        cluster_name, cluster_ref_seq, ref_seq_cluster,
                        cluster_category_distribution, flush_categories,
                        output_category_distribution_file,
                        cluster_seq_number)
                    cluster_name = line[1:].replace(' ', '_')
                    cluster_category_distribution = (
                        init_category_distribution(categories))
                    cluster_ref_seq = ''
                    cluster_seq_number = 0
                    continue

                seq_info = line.split('\t')[1].split(' ')
                # Drops the first character and the last three of the second
                # token -- presumably the leading '>' and trailing '...' of
                # a CD-HIT .clstr entry; TODO confirm against real input.
                seq_name = seq_info[1][1:-3]
                cluster_seq_number += 1

                if categories is not None:
                    seq_count = 1
                    if args.number_sum == 'false':
                        size_position = seq_name.find('size')
                        if size_position != -1:
                            # NOTE(review): the slice drops the last
                            # character of the name, so this assumes the
                            # annotation ends with a delimiter such as
                            # 'size=12;' -- confirm.
                            substring = seq_name[size_position:-1]
                            seq_count = int(substring.split('=')[1])
                    if seq_name not in mapping_info:
                        raise ValueError(seq_name + " not found in mapping")
                    category = mapping_info[seq_name]
                    cluster_category_distribution[category] += seq_count

                # A trailing '*' token marks the cluster's reference.
                if seq_info[-1] == '*':
                    if cluster_ref_seq != '':
                        string = "A reference sequence (" + cluster_ref_seq
                        string += ") already found for cluster " + cluster_name
                        string += " (" + seq_name + ")"
                        raise ValueError(string)
                    cluster_ref_seq = seq_name

        # Close the last cluster, which no following header flushes.
        flush_cluster_info(
            cluster_name, cluster_ref_seq, ref_seq_cluster,
            cluster_category_distribution, flush_categories,
            output_category_distribution_file, cluster_seq_number)
    finally:
        # Release the output handle even when parsing raised.
        if output_category_distribution_file is not None:
            output_category_distribution_file.close()

    return ref_seq_cluster
bbd903996900
planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit ffb68b2ddd94854a34a2533105f7bc08884c6e38-dirty
bebatut
parents:
diff
changeset
|
114 |
bbd903996900
planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit ffb68b2ddd94854a34a2533105f7bc08884c6e38-dirty
bebatut
parents:
diff
changeset
|
def rename_representative_sequences(args, ref_seq_cluster):
    """Copy the representative sequences, renaming each by its cluster.

    Every header line (starting with '>') of
    ``args.input_representative_sequences`` is replaced by '>' followed by
    the cluster name found in ``ref_seq_cluster``; all other lines are
    copied unchanged to ``args.output_representative_sequences``.

    Args:
        args: parsed arguments; reads ``input_representative_sequences``
            and ``output_representative_sequences`` (file paths).
        ref_seq_cluster: dict mapping reference sequence names to cluster
            names.

    Raises:
        ValueError: if a header names a sequence absent from
            ``ref_seq_cluster``.
    """
    with open(args.input_representative_sequences, 'r') as input_sequences:
        with open(args.output_representative_sequences, 'w') as output_sequences:
            # Iterate lazily instead of loading the file with readlines().
            for line in input_sequences:
                if not line.startswith('>'):
                    output_sequences.write(line)
                    continue
                # Strip only the terminator so a final header without a
                # trailing newline keeps its last character.
                seq_name = line[1:].rstrip('\r\n')
                # ``in`` replaces dict.has_key, which Python 3 removed.
                if seq_name not in ref_seq_cluster:
                    string = seq_name + " not found as reference sequence"
                    raise ValueError(string)
                output_sequences.write('>' + ref_seq_cluster[seq_name] + '\n')
bbd903996900
planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit ffb68b2ddd94854a34a2533105f7bc08884c6e38-dirty
bebatut
parents:
diff
changeset
|
127 |
bbd903996900
planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit ffb68b2ddd94854a34a2533105f7bc08884c6e38-dirty
bebatut
parents:
diff
changeset
|
def format_cd_hit_outputs(args):
    """Drive the formatting of CD-HIT outputs from parsed command-line options.

    Steps, in order:
      1. If ``args.input_mapping`` was given, load the mapping information
         and its categories with ``extract_mapping_info``; otherwise both
         are passed on as ``None``.
      2. Call ``extract_cluster_info`` with the arguments and the (possibly
         ``None``) mapping data.
      3. If ``args.input_representative_sequences`` was given, call
         ``rename_representative_sequences`` with the value returned by
         step 2.

    ``args`` is the namespace produced by the argument parser in the
    ``__main__`` block. Nothing is returned; results are produced by the
    helpers that are called.
    """
    # Compare to None by identity: "!= None" goes through __ne__ and is not
    # the recommended way to test for the None singleton.
    if args.input_mapping is not None:
        mapping_info, categories = extract_mapping_info(args.input_mapping)
    else:
        mapping_info = None
        categories = None

    # NOTE(review): ref_seq_cluster appears to map a reference sequence name
    # to its new name (it is indexed by sequence name in
    # rename_representative_sequences) -- confirm in extract_cluster_info.
    ref_seq_cluster = extract_cluster_info(args, mapping_info, categories)

    if args.input_representative_sequences is not None:
        rename_representative_sequences(args, ref_seq_cluster)
bbd903996900
planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit ffb68b2ddd94854a34a2533105f7bc08884c6e38-dirty
bebatut
parents:
diff
changeset
|
139 |
bbd903996900
planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit ffb68b2ddd94854a34a2533105f7bc08884c6e38-dirty
bebatut
parents:
diff
changeset
|
if __name__ == "__main__":
    # Command-line entry point: declare the options, parse them, and hand
    # the resulting namespace to format_cd_hit_outputs.
    parser = argparse.ArgumentParser()

    # The cluster information file is the only mandatory input.
    parser.add_argument('--input_cluster_info', required=True)

    # Every other option is optional and defaults to None when omitted.
    # They are registered in the same order as before so the generated
    # help text is unchanged.
    for optional_flag in (
        '--input_representative_sequences',
        '--output_representative_sequences',
        '--input_mapping',
        '--output_category_distribution',
        '--number_sum',
    ):
        parser.add_argument(optional_flag)

    args = parser.parse_args()
    format_cd_hit_outputs(args)