annotate sRbowtieCascade.py @ 3:0052d1dd31df draft default tip

planemo upload for repository https://bitbucket.org/drosofff/gedtools/
author drosofff
date Mon, 29 Jun 2015 05:54:27 -0400
parents ecb041b49cd7
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
ecb041b49cd7 Imported from capsule None
drosofff
parents:
diff changeset
1 #!/usr/bin/env python
ecb041b49cd7 Imported from capsule None
drosofff
parents:
diff changeset
2 # small RNA oriented bowtie wrapper in cascade for small RNA data set genome annotation
ecb041b49cd7 Imported from capsule None
drosofff
parents:
diff changeset
3 # version 0.9 13-6-2014
ecb041b49cd7 Imported from capsule None
drosofff
parents:
diff changeset
4 # Usage sRbowtie_cascade.py see Parser() for valid arguments
ecb041b49cd7 Imported from capsule None
drosofff
parents:
diff changeset
5 # Christophe Antoniewski <drosofff@gmail.com>
ecb041b49cd7 Imported from capsule None
drosofff
parents:
diff changeset
6
ecb041b49cd7 Imported from capsule None
drosofff
parents:
diff changeset
7 import sys, os, subprocess, tempfile, shutil, argparse
ecb041b49cd7 Imported from capsule None
drosofff
parents:
diff changeset
8 from collections import defaultdict
ecb041b49cd7 Imported from capsule None
drosofff
parents:
diff changeset
9
ecb041b49cd7 Imported from capsule None
drosofff
parents:
diff changeset
def Parser(argv=None):
    """Build the command-line parser and return the parsed arguments.

    Args:
        argv: optional list of argument strings to parse. When None (the
            default) argparse falls back to ``sys.argv[1:]``, which is the
            original behaviour of this function.

    Returns:
        The ``argparse.Namespace`` holding ``output``, ``num_threads``,
        ``mismatch``, ``indexing_flags``, ``index``, ``indexName``, ``input``
        and ``label``. The ``nargs='+'`` options are returned as lists.
    """
    the_parser = argparse.ArgumentParser()
    the_parser.add_argument('--output', action="store", type=str, help="output file")
    the_parser.add_argument('--num-threads', dest="num_threads", action="store", type=str, help="number of bowtie threads")
    the_parser.add_argument('--mismatch', action="store", type=str, help="number of mismatches allowed")
    the_parser.add_argument('--indexing-flags', dest="indexing_flags", nargs='+', help="whether the index should be generated or not by bowtie-build")
    the_parser.add_argument('--index', nargs='+', help="paths to indexed or fasta references")
    the_parser.add_argument('--indexName', nargs='+', help="Names of the indexes")
    the_parser.add_argument('--input', nargs='+', help="paths to multiple input files")
    the_parser.add_argument('--label', nargs='+', help="labels of multiple input files")
    return the_parser.parse_args(argv)
ecb041b49cd7 Imported from capsule None
drosofff
parents:
diff changeset
22
ecb041b49cd7 Imported from capsule None
drosofff
parents:
diff changeset
def stop_err(msg):
    """Write *msg* to stderr followed by a newline, then abort the script.

    The process exits with status 1 so that callers and job runners can
    detect the failure from the exit code (a bare ``sys.exit()`` would
    report success).

    Raises:
        SystemExit: always.
    """
    sys.stderr.write('%s\n' % msg)
    sys.exit(1)
ecb041b49cd7 Imported from capsule None
drosofff
parents:
diff changeset
26
ecb041b49cd7 Imported from capsule None
drosofff
parents:
diff changeset
def bowtie_squash(fasta):
    """Build a temporary bowtie index from a fasta file.

    A fresh temporary directory is created, the fasta file is symlinked into
    it under a unique name, and ``bowtie-build`` is run with that name as
    both the reference and the index base name. The command line is echoed
    on stdout; the output of ``bowtie-build`` itself is discarded.

    Args:
        fasta: path to the fasta reference to index.

    Returns:
        The index base path (bowtie fashion, without extension), located in
        the temporary directory. The directory is NOT removed on success:
        the caller must delete it once the alignments are done.

    On any failure (symlink error, ``bowtie-build`` missing or exiting with a
    non-zero status) the temporary directory is removed and ``stop_err`` is
    called, which terminates the script.
    """
    tmp_index_dir = tempfile.mkdtemp()  # temp directory that will hold the bowtie index
    # Reserve a unique file name inside the directory: closing the
    # NamedTemporaryFile deletes the file but the name stays usable.
    ref_file = tempfile.NamedTemporaryFile(dir=tmp_index_dir)
    ref_file_name = ref_file.name
    ref_file.close()
    # Human-readable form of the command, only used for logging.
    cmd1 = 'bowtie-build -f %s %s' % (ref_file_name, ref_file_name)
    try:
        # symlink between the fasta source file and the reserved name
        os.symlink(fasta, ref_file_name)
        with open(os.devnull, 'w') as fnull:
            # Argument list instead of a shell string: no quoting issues.
            returncode = subprocess.call(
                ['bowtie-build', '-f', ref_file_name, ref_file_name],
                cwd=tmp_index_dir, stderr=fnull, stdout=fnull)
        sys.stdout.write(cmd1 + "\n")
        if returncode != 0:
            raise RuntimeError('bowtie-build exited with status %s' % returncode)
    except Exception as e:
        # clean up temp dir before aborting
        if os.path.exists(tmp_index_dir):
            shutil.rmtree(tmp_index_dir)
        stop_err('Error indexing reference sequence\n' + str(e))
    # No cleaning on success: tmp_index_dir has to be cleaned after the alignments.
    # ref_file_name was created inside tmp_index_dir; if it is already an
    # absolute path, os.path.join returns it unchanged.
    index_full_path = os.path.join(tmp_index_dir, ref_file_name)
    return index_full_path
ecb041b49cd7 Imported from capsule None
drosofff
parents:
diff changeset
51
ecb041b49cd7 Imported from capsule None
drosofff
parents:
diff changeset
def make_working_dir():
    """Create a new temporary directory and return its path."""
    return tempfile.mkdtemp()
ecb041b49cd7 Imported from capsule None
drosofff
parents:
diff changeset
55
ecb041b49cd7 Imported from capsule None
drosofff
parents:
diff changeset
def Clean_TempDir(directory):
    """Recursively delete *directory* if it exists; do nothing otherwise."""
    if not os.path.exists(directory):
        return
    shutil.rmtree(directory)
ecb041b49cd7 Imported from capsule None
drosofff
parents:
diff changeset
60
ecb041b49cd7 Imported from capsule None
drosofff
parents:
diff changeset
def bowtie_alignment(command_line="None", working_dir=""):
    """Run a bowtie command and count the reads it reported as aligned.

    The command is executed through the shell with *working_dir* as current
    directory; its stdout and stderr are discarded. The command line is then
    echoed on stdout, followed by an ``Aligned:`` line giving the number of
    LINES found in ``<working_dir>/al.fasta``.

    Args:
        command_line: shell command string to execute.
        working_dir: directory in which the command runs and in which
            ``al.fasta`` is looked up.

    Returns:
        The number of lines of ``al.fasta`` integer-divided by 2 (the code
        assumes two lines per fasta record). 0 when the file cannot be
        opened, e.g. because no read aligned and bowtie did not create it.
    """
    # NOTE(review): shell=True with a pre-built string; the paths embedded in
    # command_line are not quoted, so they must not contain shell metacharacters.
    with open(os.devnull, 'w') as fnull:
        subprocess.call(command_line, cwd=working_dir, shell=True, stderr=fnull, stdout=fnull)
    sys.stdout.write("%s\n" % command_line)
    aligned = 0
    try:
        # hacked at gcc2014: in case of no alignment, no al.fasta file is generated (?)
        with open("%s/al.fasta" % working_dir, "r") as al_file:
            for _ in al_file:
                aligned += 1
    except IOError:
        # A missing/unreadable al.fasta simply means "nothing aligned".
        pass
    sys.stdout.write("Aligned: %s\n" % aligned)
    # Floor division keeps the result an int on Python 3 as well.
    return aligned // 2
ecb041b49cd7 Imported from capsule None
drosofff
parents:
diff changeset
78
ecb041b49cd7 Imported from capsule None
drosofff
parents:
diff changeset
def CommandLiner(v_mis="1", pslots="12", index="dum/my", input="dum/my", working_dir=""):
    """Return the bowtie command line for one step of the cascade.

    The command reports at most one best alignment per read (``-k 1 --best``)
    with ``v_mis`` mismatches on ``pslots`` threads, writes aligned reads to
    ``<working_dir>/al.fasta`` and unaligned reads to
    ``<working_dir>/unal.fasta``, and suppresses all eight output columns.
    """
    template = (
        "bowtie -v %(mismatches)s -k 1 --best -p %(threads)s"
        " --al %(workdir)s/al.fasta --un %(workdir)s/unal.fasta"
        " --suppress 1,2,3,4,5,6,7,8 %(index)s -f %(reads)s"
    )
    fields = {
        "mismatches": v_mis,
        "threads": pslots,
        "workdir": working_dir,
        "index": index,
        "reads": input,
    }
    return template % fields
ecb041b49cd7 Imported from capsule None
drosofff
parents:
diff changeset
81
ecb041b49cd7 Imported from capsule None
drosofff
parents:
diff changeset
def _count_fasta_reads(path):
    """Return the number of lines of *path* divided by 2, or 0 if it cannot be opened."""
    try:
        with open(path, "r") as handle:
            return sum(1 for _ in handle) // 2
    except IOError:
        return 0


def _stage_cascade_input(source, destination, stale_outputs):
    """Rename *source* to *destination* and delete leftover step outputs.

    Returns True when *source* existed (there is something to align), False
    otherwise. The files listed in *stale_outputs* are removed in both cases
    so that a step producing no output file cannot be confused with the
    output of an earlier step.
    """
    staged = os.path.exists(source)
    if staged:
        os.rename(source, destination)
    for leftover in stale_outputs:
        if os.path.exists(leftover):
            os.unlink(leftover)
    return staged


def __main__():
    """Run the bowtie cascade for every sample and write the summary table.

    Steps:
      1. Make all indexes available: references flagged "history" are
         indexed with ``bowtie_squash`` (temporary, removed at the end);
         the others are used as given.
      2. For each (label, input) pair, align the input on the first index;
         the reads ALIGNED there are aligned on the second index; for every
         further index the reads UNALIGNED at the previous step are used.
         The per-step counts returned by ``bowtie_alignment`` are collected,
         followed by the count of reads left in ``unal.fasta``.
      3. Write a tab-separated table to ``args.output``: one row per name in
         ``args.indexName`` plus a "Remaining Unmatched" row, one column per
         label, counts formatted with thousands separators.

    Temporary working directories and temporary indexes are removed even
    when a step fails.
    """
    args = Parser()
    # The list alternates indexPath and "toClear" or "DoNotDelete".
    BowtieIndexList = []
    ResultDict = defaultdict(list)
    try:
        for indexing_flag, bowtiePath in zip(args.indexing_flags, args.index):
            if indexing_flag == "history":
                BowtieIndexList.append(bowtie_squash(bowtiePath))
                BowtieIndexList.append("toClear")
            else:
                BowtieIndexList.append(bowtiePath)
                BowtieIndexList.append("DoNotDelete")
        # The main cascade, iterating over samples and bowtie indexes.
        for label, input_path in zip(args.label, args.input):
            workingDir = make_working_dir()
            try:
                al_path = "%s/al.fasta" % workingDir
                unal_path = "%s/unal.fasta" % workingDir
                to_align_path = "%s/toAlign.fasta" % workingDir
                for step, index_path in enumerate(BowtieIndexList[::2]):
                    if step == 0:
                        step_input = input_path
                    else:
                        # Second step consumes the reads aligned at the first
                        # step; later steps consume the previous unaligned reads.
                        source = al_path if step == 1 else unal_path
                        step_input = to_align_path
                        if not _stage_cascade_input(source, to_align_path, (al_path, unal_path)):
                            # Nothing left to align: this step matches 0 reads.
                            ResultDict[label].append(0)
                            continue
                    cmd = CommandLiner(v_mis=args.mismatch, pslots=args.num_threads,
                                       index=index_path, input=step_input,
                                       working_dir=workingDir)
                    ResultDict[label].append(
                        bowtie_alignment(command_line=cmd, working_dir=workingDir))
                # To finish, compute the number of unmatched reads.
                ResultDict[label].append(_count_fasta_reads(unal_path))
            finally:
                Clean_TempDir(workingDir)  # clean the sample working directory
    finally:
        # Temporary indexes must be deleted at the end: remove the directory
        # that contains each index flagged "toClear".
        for IndexPath, IndexFlag in zip(BowtieIndexList[::2], BowtieIndexList[1::2]):
            if IndexFlag == "toClear":
                Clean_TempDir(os.path.dirname(IndexPath))

    with open(args.output, "w") as F:
        F.write("alignment reference\t%s\n" % "\t".join(args.label))
        for i, reference in enumerate(args.indexName):
            F.write("%s" % reference)
            for sample in args.label:
                F.write("\t%s" % "{:,}".format(ResultDict[sample][i]))
            F.write("\n")
        F.write("Remaining Unmatched")
        for sample in args.label:
            F.write("\t%s" % "{:,}".format(ResultDict[sample][-1]))
        F.write("\n")
ecb041b49cd7 Imported from capsule None
drosofff
parents:
diff changeset
140
ecb041b49cd7 Imported from capsule None
drosofff
parents:
diff changeset
# Script entry point: run the cascade only when executed directly.
if __name__ == "__main__":
    __main__()