Mercurial > repos > rnateam > graphclust_postprocessing

diff addCdhitseqs.py @ 2:6c88ad83de28 draft
planemo upload for repository https://github.com/eteriSokhoyan/galaxytools/tree/branchForIterations/tools/GraphClust/CollectResults commit 287021573c592fdb70fdbbc88943aa16a8740fc0
author: rnateam
date: Fri, 13 Jan 2017 16:57:54 -0500
children: dbcea781900e
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/addCdhitseqs.py	Fri Jan 13 16:57:54 2017 -0500
@@ -0,0 +1,73 @@
+"""Append CD-HIT redundant sequences to the GraphClust cluster result files.
+
+Usage: addCdhitseqs.py <cdhit .clstr file>
+
+For every cluster in the CD-HIT cluster file, the first record after the
+">Cluster" header is taken as the representative and the remaining records
+as sequences that were collapsed into it.  Each RESULTS/*.cluster.all file
+whose text contains a representative id gets one "CD-Hit" record appended
+per collapsed sequence.
+"""
+import re
+import glob
+import sys
+
+cdhitcluster = sys.argv[1]
+
+cluster_seqs_stats_path = "RESULTS/*.cluster.all"
+cluster_seqs_stats_files = glob.glob(cluster_seqs_stats_path)
+
+# Maps a representative sequence id to the ids clustered with it.
+repSeqRedSeqdict = {}
+rep_FullId = None
+first = False
+
+# NOTE(review): CD-HIT flags the representative with a trailing "*"; this
+# script instead assumes it is the first record -- confirm that holds.
+# The cluster file is only read, so it is opened read-only.
+with open(cdhitcluster, 'r') as f:
+    for line in f:
+        if ">Cluster" in line:
+            # A new cluster starts; its next record is the representative.
+            first = True
+            rep_FullId = None
+            continue
+        if not line.strip():
+            # Blank lines (e.g. at the end of the file) carry no record.
+            continue
+        # The id is the third whitespace-separated field, minus the ">"
+        # and "..." decoration around it.
+        seq_FullId = line.split()[2].replace(">", "").replace("...", "")
+        if first:
+            rep_FullId = seq_FullId
+            first = False
+        elif rep_FullId is not None:
+            # Records ahead of the first header have no representative
+            # and are skipped.
+            repSeqRedSeqdict.setdefault(rep_FullId, []).append(seq_FullId)
+
+for singleFile in sorted(cluster_seqs_stats_files):
+    # Read first, append afterwards: a file opened with "a+" starts at its
+    # end on Python 3, so reading through that handle returns nothing.
+    with open(singleFile, "r") as clFile:
+        file_content = clFile.read()
+    # Collected per file so one file's records never leak into the next.
+    redundant_ids = []
+    for key, val in repSeqRedSeqdict.items():
+        # NOTE(review): substring test -- an id that is a prefix of another
+        # id matches too; confirm ids are unambiguous.
+        if key in file_content:
+            redundant_ids.extend(val)
+    if not redundant_ids:
+        continue
+    # New records reuse fields 1-3 and 6 of the file's first line.
+    first_line = file_content.split('\n')[0].split()
+    prefix = (first_line[0] + "  " + first_line[1] + "  " + first_line[2]
+              + "  " + " - " + "   " + "CD-Hit" + "    " + first_line[5]
+              + "  " + "ORIGID" + "  ")
+    toWrite = "".join(prefix + str(i) + "\n" for i in redundant_ids)
+    if not file_content.endswith("\n"):
+        # Keep the first new record off an unterminated last line.
+        toWrite = "\n" + toWrite
+    with open(singleFile, "a") as clFile:
+        clFile.write(toWrite)
author	rnateam
date	Fri, 13 Jan 2017 16:57:54 -0500
parents
children	dbcea781900e