Mercurial > repos > rnateam > graphclust_preprocessing

diff splitStockholm.py @ 6:e31c659be8bc draft
planemo upload for repository https://github.com/eteriSokhoyan/galaxytools/tree/branchForIterations/tools/GraphClust commit 6767a5ffb02052c844e9d862c79912f998f39d8e
author: rnateam
date: Mon, 20 Nov 2017 05:01:21 -0500
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/splitStockholm.py	Mon Nov 20 05:01:21 2017 -0500
@@ -0,0 +1,91 @@
+#!/usr/bin/env python
+
+########
+# This script reads multiple alignments merged in single stockholm file
+# and splits the alignment blocks according to data.names table
+# The first sequence of each alignment file assumed to match to names table entries
+# Author: M. Miladi
+########
+import os
+import re
+import sys
+
+from Bio import AlignIO, SeqIO
+try:
+    from StringIO import StringIO
+except ImportError:
+    from io import StringIO
+
+stk_file = sys.argv[1]
+print ("Parsing and splitting stk file:{}".format(stk_file))
+target_f = "alignment_data_split.stk"
+pattern = re.compile("^>.*$")
+toWriteID = ""
+
+count_for_id = 1
+seq_counter = 0
+new_id = ""
+
+seq_id = []
+seq_string = []
+orig_id = []
+name_file = "FASTA/data.names"
+array_all_chunks = []
+with open(name_file, 'r') as f:
+    for line in f:
+        if len(line.strip()) == 0:
+            continue
+        seq_id.append(int(line.split()[0]))
+        seq_string.append(line.split()[1])
+        orig_id_srt = line.split()[3]
+        orig_id_srt = orig_id_srt.rsplit('_',1)[0]
+        orig_id.append(orig_id_srt)
+
+
+
+with open(stk_file) as stk_in:
+    alignments = AlignIO.parse(stk_in, "stockholm")#, alphabet=IUPAC.ambiguous_rna)  
+    alignments_dic = {(a[0].id):a for a in alignments}
+
+
+regx_gaps = '[-.~_]'  # valid gap symbols all be converted to "-"
+str_gaps = '-.~_'  # valid gap symbols all be converted to "-"
+
+
+chunks = []
+with open(target_f, 'w') as out_stk_handle:
+    for i in range(len(orig_id)):
+        
+        #----------------------
+        # We need to map ungapped positions of the chunks to gapped positions of first sequence 
+        gap_count = 0
+        ungap_ind = 0
+        dic_gap_counts = dict()
+        cur_alignment = alignments_dic[orig_id[i]]
+        for c in cur_alignment[0].seq:
+            #print ungap_ind
+            if c in str_gaps:
+                gap_count += 1
+            else:
+                dic_gap_counts[ungap_ind] = gap_count
+                ungap_ind += 1
+        ID =  str(seq_id[i]) + " " + seq_string[i] 
+        chunks = re.findall(r'\d+', seq_string[i])
+        print (ID,chunks)
+
+        index_start, index_end =int(chunks[1])-1, int(chunks[2])-1
+        subalign = cur_alignment[:, index_start + dic_gap_counts[index_start]:
+                           index_end+dic_gap_counts[index_end]+1]
+        
+        #----------------------
+        # BioPython does not handel the GF ID entry for alignment
+        # So we add entry in the second line manually
+        siotmp = StringIO()
+        AlignIO.write(subalign, siotmp, format="stockholm")
+        stk_lines = siotmp.getvalue().split('\n')
+        out_stk_handle.write('{}\n'.format(stk_lines[0]))
+        out_stk_handle.write('#=GF ID {}\n'.format(ID))
+        out_stk_handle.writelines('\n'.join(stk_lines[1:]))
+        #print out_stk_handle.getvalue()
+
+
author	rnateam
date	Mon, 20 Nov 2017 05:01:21 -0500
parents
children