annotate split_fasta.py @ 1:87bdbac78136 draft default tip

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
author rnateam
date Mon, 21 Sep 2020 15:41:01 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
87bdbac78136 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff changeset
1 #!/usr/bin/env python
87bdbac78136 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff changeset
2
87bdbac78136 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff changeset
3 import os
87bdbac78136 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff changeset
4 import sys
87bdbac78136 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff changeset
5 from Bio import SeqIO
87bdbac78136 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff changeset
6
87bdbac78136 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff changeset
USAGE = "Usage: split_fasta.py <input_filename> [<num_chunks>]"


def parse_args(argv):
    """Parse the command line and return ``(input_filename, num_chunks)``.

    ``argv`` is the full argument vector including the script name.
    ``num_chunks`` is 0 when the optional second argument is absent, which
    selects the "one output file per record" mode.

    Exits with the usage message (non-zero status) when the argument count
    is wrong, or when ``<num_chunks>`` is not a non-negative integer.
    """
    if len(argv) not in (2, 3):
        sys.exit(USAGE)

    num_chunks = 0
    if len(argv) == 3:
        try:
            num_chunks = int(argv[2])
        except ValueError:
            # Report bad input as a usage error rather than a traceback.
            sys.exit(USAGE)
        if num_chunks < 0:
            # A negative chunk count has no meaning.
            sys.exit(USAGE)
    return argv[1], num_chunks


def count_records(input_filename):
    """Return the number of lines in the file that begin a FASTA record.

    A line counts when, after stripping leading whitespace, it starts
    with ``>``.
    """
    with open(input_filename) as input_file:
        return sum(
            1 for line in input_file if line.lstrip().startswith('>')
        )


def safe_output_name(record_id):
    """Return ``record_id`` made safe for use as a file name in ``splits``.

    Path separators are replaced with ``_`` so an ID cannot point into a
    different directory. The names ``''``, ``'.'`` and ``'..'`` are prefixed
    with ``_`` because they do not name a regular file. Any other ID is
    returned unchanged.
    """
    name = record_id
    for separator in (os.sep, os.altsep):
        if separator:  # os.altsep is None on POSIX
            name = name.replace(separator, '_')
    if name in ('', '.', '..'):
        name = '_' + name
    return name


def main():
    """Split the input FASTA file into files under ``./splits``.

    With ``num_chunks == 0`` every record is written to its own file named
    after the record ID. Otherwise records are grouped into files named
    ``part1``, ``part2``, ... with at most ``num_chunks`` parts.
    """
    input_filename, num_chunks = parse_args(sys.argv)

    # Raises if the directory already exists, so stale outputs from an
    # earlier run are never mixed with the new ones.
    os.mkdir('splits')

    records_per_chunk = 0
    if num_chunks != 0:
        # Chunk mode needs the total record count up front to size the
        # parts. round() may yield 0 when there are far fewer records than
        # chunks; each record is then flushed to its own part.
        records_per_chunk = round(
            float(count_records(input_filename)) / num_chunks
        )

    count = 1  # 1-based index of the part currently being filled
    records = []
    with open(input_filename) as input_file:
        for record in SeqIO.parse(input_file, 'fasta'):
            records.append(record)
            if num_chunks == 0:
                output_filename = os.path.join(
                    'splits', safe_output_name(record.id)
                )
            elif count < num_chunks and len(records) >= records_per_chunk:
                output_filename = os.path.join(
                    'splits', 'part{}'.format(count)
                )
            else:
                # Part not full yet, or this is the last part, which takes
                # every remaining record.
                continue
            SeqIO.write(records, output_filename, 'fasta')
            count += 1
            records = []

    if records:
        # Only reachable in chunk mode: per-record mode flushes after every
        # record, so nothing is ever left over there.
        output_filename = os.path.join('splits', 'part{}'.format(count))
        SeqIO.write(records, output_filename, 'fasta')


if __name__ == '__main__':
    main()