Mercurial > repos > bebatut > format_cd_hit_output
comparison format_cd_hit_output.py @ 1:4ba41bcee051 draft default tip
planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit 975a480d80c774a1de58c8fc80b71ea44c5c702b-dirty
| author | bebatut |
|---|---|
| date | Tue, 26 Apr 2016 08:54:26 -0400 |
| parents | bbd903996900 |
| children | |
comparison
equal
deleted
inserted
replaced
| 0:bbd903996900 | 1:4ba41bcee051 |
|---|---|
| 52 if args.output_category_distribution != None: | 52 if args.output_category_distribution != None: |
| 53 if mapping_info == None or categories == None: | 53 if mapping_info == None or categories == None: |
| 54 string = "A file with category distribution is expected but " | 54 string = "A file with category distribution is expected but " |
| 55 string += "no mapping information are available" | 55 string += "no mapping information are available" |
| 56 raise ValueError(string) | 56 raise ValueError(string) |
| 57 output_category_distribution_file = open( | 57 output_cat_distri_file = open(args.output_category_distribution, 'w') |
| 58 args.output_category_distribution, 'w') | 58 output_cat_distri_file.write('Cluster\tSequence_number') |
| 59 output_category_distribution_file.write('Cluster\tSequence_number') | |
| 60 for category in categories: | 59 for category in categories: |
| 61 output_category_distribution_file.write('\t' + category) | 60 output_cat_distri_file.write('\t' + category) |
| 62 | 61 |
| 63 output_category_distribution_file.write('\n') | 62 output_cat_distri_file.write('\n') |
| | 63 else: |
| | 64 output_cat_distri_file = None |
| 64 | 65 |
| 65 with open(args.input_cluster_info,'r') as cluster_info_file: | 66 with open(args.input_cluster_info,'r') as cluster_info_file: |
| 66 cluster_name = '' | 67 cluster_name = '' |
| 67 cluster_category_distribution = init_category_distribution(categories) | 68 cluster_category_distribution = init_category_distribution(categories) |
| 68 cluster_ref_seq = '' | 69 cluster_ref_seq = '' |
| 69 cluster_seq_number = 0 | 70 cluster_seq_number = 0 |
| 70 for line in cluster_info_file.readlines(): | 71 for line in cluster_info_file.readlines(): |
| 71 if line[0] == '>': | 72 if line[0] == '>': |
| 72 flush_cluster_info(cluster_name, cluster_ref_seq, ref_seq_cluster, | 73 flush_cluster_info(cluster_name, cluster_ref_seq, ref_seq_cluster, |
| 73 cluster_category_distribution, categories, | 74 cluster_category_distribution, categories, |
| 74 output_category_distribution_file, cluster_seq_number) | 75 output_cat_distri_file, cluster_seq_number) |
| 75 cluster_name = line[1:-1] | 76 cluster_name = line[1:-1] |
| 76 cluster_name = cluster_name.replace(' ','_') | 77 cluster_name = cluster_name.replace(' ','_') |
| 77 cluster_category_distribution = init_category_distribution(categories) | 78 cluster_category_distribution = init_category_distribution(categories) |
| 78 cluster_ref_seq = '' | 79 cluster_ref_seq = '' |
| 79 cluster_seq_number = 0 | 80 cluster_seq_number = 0 |
| 82 seq_name = seq_info[1][1:-3] | 83 seq_name = seq_info[1][1:-3] |
| 83 cluster_seq_number += 1 | 84 cluster_seq_number += 1 |
| 84 | 85 |
| 85 if categories != None: | 86 if categories != None: |
| 86 seq_count = 1 | 87 seq_count = 1 |
| 87 if args.number_sum == 'false': | 88 if args.number_sum != None: |
| 88 if seq_name.find('size') != -1: | 89 if seq_name.find('size') != -1: |
| 89 substring = seq_name[seq_name.find('size'):-1] | 90 substring = seq_name[seq_name.find('size'):-1] |
| 90 seq_count = int(substring.split('=')[1]) | 91 seq_count = int(substring.split('=')[1]) |
| 91 if not mapping_info.has_key(seq_name): | 92 if not mapping_info.has_key(seq_name): |
| 92 string = seq_name + " not found in mapping" | 93 string = seq_name + " not found in mapping" |
| 93 raise ValueError(string) | 94 raise ValueError(string) |
| 94 category = mapping_info[seq_name] | 95 category = mapping_info[seq_name] |
| 95 cluster_category_distribution[category] += seq_count | 96 cluster_category_distribution[category] += seq_count |
| 96 | |
| 97 | 97 |
| 98 if seq_info[-1] == '*': | 98 if seq_info[-1] == '*': |
| 99 if cluster_ref_seq != '': | 99 if cluster_ref_seq != '': |
| 100 string = "A reference sequence (" + cluster_ref_seq | 100 string = "A reference sequence (" + cluster_ref_seq |
| 101 string += ") already found for cluster " + cluster_name | 101 string += ") already found for cluster " + cluster_name |
| 102 string += " (" + seq_name + ")" | 102 string += " (" + seq_name + ")" |
| 103 raise ValueError(string) | 103 raise ValueError(string) |
| 104 cluster_ref_seq = seq_name | 104 cluster_ref_seq = seq_name |
| 105 | 105 |
| 106 flush_cluster_info(cluster_name, cluster_ref_seq, ref_seq_cluster, | 106 flush_cluster_info(cluster_name, cluster_ref_seq, ref_seq_cluster, |
| 107 cluster_category_distribution, categories, | 107 cluster_category_distribution, categories, output_cat_distri_file, |
| 108 output_category_distribution_file, cluster_seq_number) | 108 cluster_seq_number) |
| 109 | 109 |
| 110 if args.output_category_distribution != None: | 110 if args.output_category_distribution != None: |
| 111 output_category_distribution_file.close() | 111 output_cat_distri_file.close() |
| 112 | 112 |
| 113 return ref_seq_cluster | 113 return ref_seq_cluster |
| 114 | 114 |
| 115 def rename_representative_sequences(args, ref_seq_cluster): | 115 def rename_representative_sequences(args, ref_seq_cluster): |
| 116 with open(args.input_representative_sequences,'r') as input_sequences: | 116 with open(args.input_representative_sequences,'r') as input_sequences: |
