Mercurial > repos > rnateam > graphclust_postprocessing
changeset 2:6c88ad83de28 draft
planemo upload for repository https://github.com/eteriSokhoyan/galaxytools/tree/branchForIterations/tools/GraphClust/CollectResults commit 287021573c592fdb70fdbbc88943aa16a8740fc0
author | rnateam |
---|---|
date | Fri, 13 Jan 2017 16:57:54 -0500 |
parents | e166d1382033 |
children | a8fde40f00fc |
files | addCdhitseqs.py evaluation.py glob_report.xml test-data/1.cluster.top5.alirna.png test-data/1.cluster.top5.aln.png test-data/2.cluster.top5.alirna.png test-data/2.cluster.top5.aln.png |
diffstat | 7 files changed, 120 insertions(+), 4 deletions(-) [+] |
line wrap: on
line diff
# addCdhitseqs.py
"""Append CD-Hit redundant sequences to the cluster result files.

Usage: python addCdhitseqs.py <cdhit_cluster_file>

The cluster file given on the command line is parsed into a mapping from
each cluster's representative sequence id to the ids of the other sequences
listed in the same cluster.  Every file matching ``RESULTS/*.cluster.all``
whose text contains a representative id then gets one extra line appended
per redundant sequence of that representative.
"""
import re
import glob
import sys

# Result files that receive the additional "CD-Hit" lines.
cluster_seqs_stats_path = "RESULTS/*.cluster.all"


def _sequence_id(clstr_line):
    """Return the id held in the third whitespace-separated token of a line.

    The ">" and "..." decorations around the id are removed.
    """
    return clstr_line.split()[2].replace(">", "").replace("...", "")


def parse_cdhit_clusters(lines):
    """Map each representative id to the other ids of its cluster.

    A line containing ">Cluster" starts a new cluster.  The first non-blank
    line after it is taken as the representative; every following non-blank
    line up to the next header is a redundant member.  Clusters that hold
    only the representative are not recorded.

    NOTE(review): this assumes the representative is always listed first in
    a cluster -- confirm against the CD-HIT output actually fed to the tool.

    :param lines: iterable of text lines (an open file or a list of str).
    :return: dict of representative id -> list of member ids.
    """
    rep_to_members = {}
    rep_line = None
    members = None
    expect_rep = False
    for line in lines:
        if ">Cluster" in line:
            expect_rep = True
            rep_line = None
            members = None
            continue
        if not line.strip():
            # Blank lines (e.g. a trailing newline) carry no sequence.
            continue
        if expect_rep:
            rep_line = line
            expect_rep = False
            continue
        if rep_line is None:
            # Member-looking line before any cluster header: nothing to
            # attach it to.
            continue
        if members is None:
            # First redundant member: only now is the representative worth
            # recording, with a fresh list for this cluster.
            members = []
            rep_to_members[_sequence_id(rep_line)] = members
        members.append(_sequence_id(line))
    return rep_to_members


def format_cdhit_lines(file_content, rep_to_members):
    """Build the lines to append to one cluster result file.

    A representative matches when its id occurs anywhere in *file_content*
    (substring test).  Each generated line reuses fields 0, 1, 2 and 5 of
    the file's first line.

    :param file_content: complete current text of the result file.
    :param rep_to_members: mapping produced by :func:`parse_cdhit_clusters`.
    :return: list of newline-terminated lines, empty when nothing matches.
    :raises ValueError: if a representative matches but the first line has
        fewer than six whitespace-separated fields.
    """
    matched = []
    for rep_id, members in rep_to_members.items():
        if rep_id in file_content:
            matched.extend(members)
    if not matched:
        return []

    fields = file_content.split('\n')[0].split()
    if len(fields) < 6:
        raise ValueError(
            "first line of cluster file has fewer than 6 fields: %r"
            % file_content.split('\n')[0])

    new_lines = []
    for member in matched:
        # " - " is joined with single spaces on both sides, which keeps the
        # double-space padding around the dash used by the output format.
        new_lines.append(" ".join([fields[0], fields[1], fields[2], " - ",
                                   "CD-Hit", fields[5], "ORIGID",
                                   str(member)]) + "\n")
    return new_lines


def append_cdhit_members(path, rep_to_members):
    """Append the redundant-sequence lines for *path* to that file.

    The file is read first and reopened for appending only when there is
    something to add, so every file receives only its own lines.

    :return: number of lines appended.
    """
    with open(path, "r") as handle:
        file_content = handle.read()

    new_lines = format_cdhit_lines(file_content, rep_to_members)
    if not new_lines:
        return 0

    with open(path, "a") as handle:
        if file_content and not file_content.endswith("\n"):
            # Do not glue the first new record onto an unterminated line.
            handle.write("\n")
        handle.write("".join(new_lines))
    return len(new_lines)


def main(argv=None):
    """Command-line entry point; ``argv[1]`` is the CD-Hit cluster file."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit("usage: addCdhitseqs.py <cdhit_cluster_file>")

    # The cluster file is only read, so plain read mode is enough.
    with open(argv[1], "r") as handle:
        rep_to_members = parse_cdhit_clusters(handle)

    for single_file in sorted(glob.glob(cluster_seqs_stats_path)):
        append_cdhit_members(single_file, rep_to_members)


if __name__ == "__main__":
    main()
--- a/evaluation.py Thu Dec 22 08:49:14 2016 -0500 +++ b/evaluation.py Fri Jan 13 16:57:54 2017 -0500 @@ -1,6 +1,7 @@ import glob from os import system import re +from sklearn import metrics def sh(script): system("bash -c '%s'" % script) @@ -46,3 +47,23 @@ toWrite += listOfClasses[i] + "\t" + listOfClusters[i] + '\n' with open("RESULTS/fullTab.tabular", "w") as full: full.write(toWrite) + + +pattern = re.compile("^RF.*$") + + +if len(listOfClasses) > 0 and pattern.match(str(listOfClasses[0])): + + completeness_score = metrics.completeness_score(listOfClasses, listOfClusters) + homogeneity_score = metrics.homogeneity_score(listOfClasses, listOfClusters) + adjusted_rand_score = metrics.adjusted_rand_score(listOfClasses, listOfClusters) + adjusted_mutual_info_score = metrics.adjusted_mutual_info_score(listOfClasses, listOfClusters) + v_measure_score = metrics.v_measure_score(listOfClasses, listOfClusters) + + toWrite = "completeness_score : " + str(completeness_score) + "\n" + "homogeneity_score : " + str(homogeneity_score) + "\n" + "adjusted_rand_score : " +str(adjusted_rand_score) + "\n" + "adjusted_mutual_info_score : " + str(adjusted_mutual_info_score)+ "\n" + "v_measure_score : " + str(v_measure_score) + +else: + toWrite = "completeness_score : NA \nhomogeneity_score : NA \nadjusted_rand_score : NA \nadjusted_mutual_info_score : NA \nv_measure_score : NA" + +with open("RESULTS/evaluation.txt", "w") as fOut: + fOut.write(toWrite)
--- a/glob_report.xml Thu Dec 22 08:49:14 2016 -0500 +++ b/glob_report.xml Fri Jan 13 16:57:54 2017 -0500 @@ -2,13 +2,18 @@ <requirements> <requirement type="package" version="0.1">graphclust-wrappers</requirement> <requirement type="package" version='0.5'>perl-array-utils</requirement> + <requirement type="package" version='0.18.1'>scikit-learn</requirement> + <requirement type="package" version='1.8.10'>locarna</requirement> + <requirement type="package" version='2.1'>rnaz</requirement> + <requirement type="package" version="1.1">infernal</requirement> + <requirement type="package" version='2.2.10'>viennarna</requirement> + <requirement type="package" version='1.3.23'>graphicsmagick</requirement> </requirements> <stdio> <exit_code range="1:" /> </stdio> <command> <![CDATA[ - unzip $FASTA &> /dev/null && #set $inputFiles = "" @@ -24,17 +29,30 @@ #set $inputFilesTrees += str($mods)+',' #end for #set $inputFilesTrees = $inputFilesTrees[:-1] - - 'glob_res.pl' '$inputFiles' $merge_cluster_ol $merge_overlap $min_cluster_size $cm_min_bitscore $cm_max_eval $cm_bitscore_sig $partition_type '' $cut_type '$inputFilesTrees' + glob_res.pl + '$inputFiles' + $merge_cluster_ol + $merge_overlap + $min_cluster_size + $cm_min_bitscore + $cm_max_eval + $cm_bitscore_sig + $partition_type '' + $cut_type + '$inputFilesTrees' + $results_top_num #if $iteration_num.iteration_num_selector: $iteration_num.CI - $final_partition_soft $final_partition_used_cmsearch #end if && python '$__tool_directory__/evaluation.py' + #if $cdhit: + && + python '$__tool_directory__/addCdhitseqs.py' '$cdhit' + #end if ]]> </command> <inputs> @@ -43,6 +61,7 @@ <param type="data" name="model_tree_files" format="txt" multiple="True"/> <param name="partition_type" type="boolean" checked="True" truevalue="0" falsevalue="1" label="Hard partition"/> <param name="cut_type" type="boolean" checked="True" truevalue="0" falsevalue="1" label="Use CM score for cutoff" help="otherwise use E-value"/> + <param type="data" 
name="cdhit" format="txt" optional="true"/> <conditional name="iteration_num"> <param name="iteration_num_selector" type="boolean" checked="no" label="Multiple iterations" help="for single iteration- NO, for multiple-YES"/> <when value="true"> @@ -58,18 +77,26 @@ <param name="cm_min_bitscore" type="integer" value="20" size="5" label="cm_min_bitscore" help=""/> <param name="cm_max_eval" type="float" value="0.001" size="5" label="cm_max_eval" help=""/> <param name="cm_bitscore_sig" type="integer" value="1" size="5" label="cm_bitscore_sig" help=""/> + <param name="results_top_num" type="integer" value="5" size="5" label="results_top_num" help=""/> </inputs> <outputs> <data name="final_stats" format="txt" from_work_dir="RESULTS/cluster.final.stats" label="cluster.final.stats" /> <data name="tableForEval" format="tabular" from_work_dir="RESULTS/fullTab.tabular" label="tableForEval" /> <data name="final_soft" format="txt" from_work_dir="RESULTS/partitions/final_partition.soft" label="soft_part" /> <data name="final_used_cmsearch" format="txt" from_work_dir="RESULTS/partitions/final_partition.used_cmsearch" label="final_partition_used_cmsearch" /> + <data name="evaluation" format="txt" from_work_dir="RESULTS/evaluation.txt" label="evaluation_of_clusters" /> <collection name="clusters" type="list" label="CLUSTERS"> <discover_datasets pattern="(?P<name>^.*\.all$)" directory="RESULTS" /> </collection> <collection name="partitions" type="list" label="Partitions"> <discover_datasets pattern="(?P<name>^.*$)" directory="RESULTS/partitions" /> </collection> + <collection name="topSecondaryStruct" type="list" label="Top $results_top_num alirna.ps"> + <discover_datasets format="png" pattern="(?P<name>^.*\.alirna.png$)" /> + </collection> + <collection name="topDot" type="list" label="Top $results_top_num aln.ps"> + <discover_datasets format="png" pattern="(?P<name>^.*\.aln.png$)" /> + </collection> </outputs> <tests> <test> @@ -111,6 +138,15 @@ <element name="final_partition.soft" 
file="RESULTS/partitions/final_partition.soft" /> <element name="final_partition.used_cmsearch" file="RESULTS/partitions/final_partition.used_cmsearch" compare="contains"/> </output_collection> + <param name="results_top_num" value="5"/> + <output_collection name="topSecondaryStruct" type="list"> + <element name="1.cluster.top5.alirna.png" file="1.cluster.top5.alirna.png" ftype="png" compare="sim_size" /> + <element name="2.cluster.top5.alirna.png" file="2.cluster.top5.alirna.png" ftype="png" compare="sim_size" /> + </output_collection> + <output_collection name="topDot" type="list"> + <element name="1.cluster.top5.aln.png" file="1.cluster.top5.aln.png" ftype="png" compare="sim_size"/> + <element name="2.cluster.top5.aln.png" file="2.cluster.top5.aln.png" ftype="png" compare="sim_size"/> + </output_collection> </test> </tests> <help>