Mercurial > repos > peterjc > venn_list
comparison tools/plotting/venn_list.py @ 0:aefc86eda5f6
Migrated tool version 0.0.3 from old tool shed archive to new tool shed repository
| author | peterjc |
|---|---|
| date | Tue, 07 Jun 2011 17:42:40 -0400 |
| parents | |
| children | 29fcd027d67a |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:aefc86eda5f6 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 """Plot up to 3-way Venn Diagram using R limma vennDiagram (via rpy) | |
| 3 | |
| 4 This script is copyright 2010 by Peter Cock, The James Hutton Institute | |
| 5 (formerly SCRI), UK. All rights reserved. | |
| 6 See accompanying text file for licence details (MIT/BSD style). | |
| 7 | |
| 8 This is version 0.0.3 of the script. | |
| 9 """ | |
| 10 | |
| 11 | |
| 12 import sys | |
| 13 import rpy | |
| 14 | |
| 15 def stop_err(msg, error_level=1): | |
| 16 """Print error message to stdout and quit with given error level.""" | |
| 17 sys.stderr.write("%s\n" % msg) | |
| 18 sys.exit(error_level) | |
| 19 | |
| 20 try: | |
| 21 import rpy | |
| 22 except ImportError: | |
| 23 stop_err("Requires the Python library rpy (to call R)") | |
| 24 | |
| 25 try: | |
| 26 rpy.r.library("limma") | |
| 27 except: | |
| 28 stop_err("Requires the R library limma (for vennDiagram function)") | |
| 29 | |
| 30 | |
| 31 if len(sys.argv)-1 not in [7, 10, 13]: | |
| 32 stop_err("Expected 7, 10 or 13 arguments (for 1, 2 or 3 sets), not %i" % (len(sys.argv)-1)) | |
| 33 | |
| 34 all_file, all_type, all_label = sys.argv[1:4] | |
| 35 set_data = [] | |
| 36 if len(sys.argv)-1 >= 7: | |
| 37 set_data.append(tuple(sys.argv[4:7])) | |
| 38 if len(sys.argv)-1 >= 10: | |
| 39 set_data.append(tuple(sys.argv[7:10])) | |
| 40 if len(sys.argv)-1 >= 13: | |
| 41 set_data.append(tuple(sys.argv[10:13])) | |
| 42 pdf_file = sys.argv[-1] | |
| 43 n = len(set_data) | |
| 44 print "Doing %i-way Venn Diagram" % n | |
| 45 | |
| 46 def load_ids(filename, filetype): | |
| 47 if filetype=="tabular": | |
| 48 for line in open(filename): | |
| 49 if not line.startswith("#"): | |
| 50 yield line.rstrip("\n").split("\t",1)[0] | |
| 51 elif filetype=="fasta": | |
| 52 for line in open(filename): | |
| 53 if line.startswith(">"): | |
| 54 yield line[1:].rstrip("\n").split(None,1)[0] | |
| 55 elif filetype.startswith("fastq"): | |
| 56 #Use the Galaxy library not Biopython to cope with CS | |
| 57 from galaxy_utils.sequence.fastq import fastqReader | |
| 58 handle = open(filename, "rU") | |
| 59 for record in fastqReader(handle): | |
| 60 #The [1:] is because the fastaReader leaves the @ on the identifer. | |
| 61 yield record.identifier.split()[0][1:] | |
| 62 handle.close() | |
| 63 elif filetype=="sff": | |
| 64 try: | |
| 65 from Bio.SeqIO import index | |
| 66 except ImportError: | |
| 67 stop_err("Require Biopython 1.54 or later (to read SFF files)") | |
| 68 #This will read the SFF index block if present (very fast) | |
| 69 for name in index(filename, "sff"): | |
| 70 yield name | |
| 71 else: | |
| 72 stop_err("Unexpected file type %s" % filetype) | |
| 73 | |
| 74 def load_ids_whitelist(filename, filetype, whitelist): | |
| 75 for name in load_ids(filename, filetype): | |
| 76 if name in whitelist: | |
| 77 yield name | |
| 78 else: | |
| 79 stop_err("Unexpected ID %s in %s file %s" % (name, filetype, filename)) | |
| 80 | |
| 81 if all_file in ["", "-", '""', '"-"']: | |
| 82 #Load without white list | |
| 83 sets = [set(load_ids(f,t)) for (f,t,c) in set_data] | |
| 84 #Take union | |
| 85 all = set() | |
| 86 for s in sets: | |
| 87 all.update(s) | |
| 88 print "Inferred total of %i IDs" % len(all) | |
| 89 else: | |
| 90 all = set(load_ids(all_file, all_type)) | |
| 91 print "Total of %i IDs" % len(all) | |
| 92 sets = [set(load_ids_whitelist(f,t,all)) for (f,t,c) in set_data] | |
| 93 | |
| 94 for s, (f,t,c) in zip(sets, set_data): | |
| 95 print "%i in %s" % (len(s), c) | |
| 96 | |
| 97 #Now call R library to draw simple Venn diagram | |
| 98 try: | |
| 99 #Create dummy Venn diagram counts object for three groups | |
| 100 cols = 'c("%s")' % '","'.join("Set%i" % (i+1) for i in range(n)) | |
| 101 rpy.r('groups <- cbind(%s)' % ','.join(['1']*n)) | |
| 102 rpy.r('colnames(groups) <- %s' % cols) | |
| 103 rpy.r('vc <- vennCounts(groups)') | |
| 104 #Populate the 2^n classes with real counts | |
| 105 #Don't make any assumptions about the class order | |
| 106 #print rpy.r('vc') | |
| 107 for index, row in enumerate(rpy.r('vc[,%s]' % cols)): | |
| 108 if isinstance(row, int) or isinstance(row, float): | |
| 109 #Hack for rpy being too clever for single element row | |
| 110 row = [row] | |
| 111 names = all | |
| 112 for wanted, s in zip(row, sets): | |
| 113 if wanted: | |
| 114 names = names.intersection(s) | |
| 115 else: | |
| 116 names = names.difference(s) | |
| 117 rpy.r('vc[%i,"Counts"] <- %i' % (index+1, len(names))) | |
| 118 #print rpy.r('vc') | |
| 119 if n == 1: | |
| 120 #Single circle, don't need to add (Total XXX) line | |
| 121 names = [c for (t,f,c) in set_data] | |
| 122 else: | |
| 123 names = ["%s\n(Total %i)" % (c, len(s)) for s, (f,t,c) in zip(sets, set_data)] | |
| 124 rpy.r.assign("names", names) | |
| 125 rpy.r.assign("colors", ["red","green","blue"][:n]) | |
| 126 rpy.r.pdf(pdf_file, 8, 8) | |
| 127 rpy.r("""vennDiagram(vc, include="both", names=names, | |
| 128 main="%s", sub="(Total %i)", | |
| 129 circle.col=colors) | |
| 130 """ % (all_label, len(all))) | |
| 131 rpy.r.dev_off() | |
| 132 except Exception, exc: | |
| 133 stop_err( "%s" %str( exc ) ) | |
| 134 rpy.r.quit( save="no" ) | |
| 135 print "Done" |
