annotate gbk2fa.py @ 28:b1b327f475ac draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tool_collections/snpeff commit e4366b6a23223f84039a0590cf0d7079b83d8f84"
author iuc
date Wed, 13 Oct 2021 23:29:54 +0000
parents 718842784732
children 030fe29d4c47
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
14
a7c106b938dd planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tool_collections/snpeff commit eea43430ff90fe6b13b295f6d5efb2208401a7ef
iuc
parents:
diff changeset
1 import argparse
a7c106b938dd planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tool_collections/snpeff commit eea43430ff90fe6b13b295f6d5efb2208401a7ef
iuc
parents:
diff changeset
2 import bz2
a7c106b938dd planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tool_collections/snpeff commit eea43430ff90fe6b13b295f6d5efb2208401a7ef
iuc
parents:
diff changeset
3 import contextlib
a7c106b938dd planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tool_collections/snpeff commit eea43430ff90fe6b13b295f6d5efb2208401a7ef
iuc
parents:
diff changeset
4 import gzip
a7c106b938dd planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tool_collections/snpeff commit eea43430ff90fe6b13b295f6d5efb2208401a7ef
iuc
parents:
diff changeset
5 import sys
a7c106b938dd planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tool_collections/snpeff commit eea43430ff90fe6b13b295f6d5efb2208401a7ef
iuc
parents:
diff changeset
6
a7c106b938dd planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tool_collections/snpeff commit eea43430ff90fe6b13b295f6d5efb2208401a7ef
iuc
parents:
diff changeset
7 import magic
a7c106b938dd planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tool_collections/snpeff commit eea43430ff90fe6b13b295f6d5efb2208401a7ef
iuc
parents:
diff changeset
8 from Bio import SeqIO
a7c106b938dd planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tool_collections/snpeff commit eea43430ff90fe6b13b295f6d5efb2208401a7ef
iuc
parents:
diff changeset
9
a7c106b938dd planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tool_collections/snpeff commit eea43430ff90fe6b13b295f6d5efb2208401a7ef
iuc
parents:
diff changeset
# Command-line interface: two positional paths plus an optional flag that
# controls how record identifiers are written to the FASTA output.
parser = argparse.ArgumentParser()
parser.add_argument(
    "genbank_file",
    help="GenBank input file. Can be compressed with gzip or bzip2",
)
parser.add_argument("fasta_file", help="FASTA output dataset")
parser.add_argument(
    "--remove_version",
    dest="remove_version",
    action="store_true",
    help="Remove version number from NCBI formatted accession numbers. "
         "For example, this would convert 'B000657.2' to 'B000657'",
)
args = parser.parse_args()

# Short module-level aliases for the input and output paths.
gbk_filename = args.genbank_file
fa_filename = args.fasta_file
a7c106b938dd planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tool_collections/snpeff commit eea43430ff90fe6b13b295f6d5efb2208401a7ef
iuc
parents:
diff changeset
18
a7c106b938dd planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tool_collections/snpeff commit eea43430ff90fe6b13b295f6d5efb2208401a7ef
iuc
parents:
diff changeset
19
a7c106b938dd planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tool_collections/snpeff commit eea43430ff90fe6b13b295f6d5efb2208401a7ef
iuc
parents:
diff changeset
@contextlib.contextmanager
def get_file_handle(gbk_filename):
    """Yield a text-mode handle for a plain, gzip'ed, or bzip2'ed file.

    The MIME type reported by ``magic.from_file`` for ``gbk_filename``
    selects the opener, so the decision is based on the file content
    rather than on its extension.

    Args:
        gbk_filename: Path of the GenBank file to open.

    Yields:
        A file object opened for reading text.

    Raises:
        SystemExit: If the detected MIME type is not plain text, gzip,
            or bzip2.
    """
    # Inspect the file that is actually being opened (the parameter), not
    # the module-level command-line value.
    f_type = magic.from_file(gbk_filename, mime=True)
    if f_type == 'text/plain':
        input_handle = open(gbk_filename, "r")
    elif f_type == 'application/gzip' or f_type == 'application/x-gzip':
        input_handle = gzip.open(gbk_filename, "rt")
    elif f_type == 'application/x-bzip2':
        input_handle = bz2.open(gbk_filename, "rt")
    else:
        sys.exit("Cannot process file of type {}. Only plain, gzip'ed, and bzip2'ed genbank files are accepted ".format(f_type))
    try:
        yield input_handle
    finally:
        # Close the handle even when the body of the ``with`` block raises.
        input_handle.close()
a7c106b938dd planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tool_collections/snpeff commit eea43430ff90fe6b13b295f6d5efb2208401a7ef
iuc
parents:
diff changeset
33
a7c106b938dd planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tool_collections/snpeff commit eea43430ff90fe6b13b295f6d5efb2208401a7ef
iuc
parents:
diff changeset
34
a7c106b938dd planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tool_collections/snpeff commit eea43430ff90fe6b13b295f6d5efb2208401a7ef
iuc
parents:
diff changeset
# Convert every GenBank record into a two-line FASTA entry (header line
# followed by the full sequence on a single line).
with get_file_handle(gbk_filename) as input_handle:
    with open(fa_filename, "w") as output_handle:
        for record in SeqIO.parse(input_handle, "genbank"):
            # With --remove_version, keep only the text before the first dot.
            seq_id = record.id.partition('.')[0] if args.remove_version else record.id
            print('Writing FASTA record: {}'.format(seq_id))
            fasta_entry = ">{}\n{}\n".format(seq_id, record.seq)
            output_handle.write(fasta_entry)