0
|
1 #!/usr/bin/env python
|
|
2 import argparse
|
|
3 import os
|
|
4 import shutil
|
|
5 import string
|
|
6 import subprocess
|
|
7 import sys
|
|
8 import tempfile
|
|
9
|
|
10 BUFFSIZE = 1048576
|
|
11 # Translation table for reverse Complement, with ambiguity codes.
|
|
12 DNA_COMPLEMENT = string.maketrans("ACGTRYKMBDHVacgtrykmbdhv", "TGCAYRMKVHDBtgcayrmkvhdb")
|
|
13
|
|
14
|
|
15 def get_stderr(tmp_stderr):
|
|
16 tmp_stderr.seek(0)
|
|
17 stderr = ''
|
|
18 try:
|
|
19 while True:
|
|
20 stderr += tmp_stderr.read(BUFFSIZE)
|
|
21 if not stderr or len(stderr) % BUFFSIZE != 0:
|
|
22 break
|
|
23 except OverflowError:
|
|
24 pass
|
|
25 return stderr
|
|
26
|
|
27
|
|
28 def reverse(sequence):
|
|
29 # Reverse sequence string.
|
|
30 return sequence[::-1]
|
|
31
|
|
32
|
|
33 def dna_complement(sequence):
|
|
34 # Complement DNA sequence string.
|
|
35 return sequence.translate(DNA_COMPLEMENT)
|
|
36
|
|
37
|
|
38 def dna_reverse_complement(sequence):
|
|
39 # Returns the reverse complement of the sequence.
|
|
40 sequence = reverse(sequence)
|
|
41 return dna_complement(sequence)
|
|
42
|
|
43
|
|
44 def stop_err(msg):
|
|
45 sys.stderr.write(msg)
|
|
46 sys.exit(1)
|
|
47
|
|
48 parser = argparse.ArgumentParser()
|
|
49 parser.add_argument('--input_motifs', dest='input_motifs', help='MEME output formatted files for input to fimo')
|
|
50 parser.add_argument('--input_fasta', dest='input_fasta', help='Fassta sequence file')
|
|
51 parser.add_argument('--options_type', dest='options_type', help='Basic or Advance options')
|
|
52 parser.add_argument('--input_psp', dest='input_psp', default=None, help='File containing position specific priors')
|
|
53 parser.add_argument('--input_prior_dist', dest='input_prior_dist', default=None, help='File containing binned distribution of priors')
|
|
54 parser.add_argument('--alpha', dest='alpha', type=float, default=1.0, help='The alpha parameter for calculating position specific priors')
|
|
55 parser.add_argument('--bgfile', dest='bgfile', default=None, help='Background file type, used only if not "default"')
|
|
56 parser.add_argument('--max_strand', action='store_true', help='If matches on both strands at a given position satisfy the output threshold, only report the match for the strand with the higher score')
|
|
57 parser.add_argument('--max_stored_scores', dest='max_stored_scores', type=int, help='Maximum score count to store')
|
|
58 parser.add_argument('--motif', dest='motifs', action='append', default=[], help='Specify motif by id')
|
6
|
59 parser.add_argument('--output_separate_motifs', dest='output_separate_motifs', default='no', help='Output one dataset per motif')
|
0
|
60 parser.add_argument('--motif_pseudo', dest='motif_pseudo', type=float, default=0.1, help='Pseudocount to add to counts in motif matrix')
|
|
61 parser.add_argument('--no_qvalue', action='store_true', help='Do not compute a q-value for each p-value')
|
|
62 parser.add_argument('--norc', action='store_true', help='Do not score the reverse complement DNA strand')
|
|
63 parser.add_argument('--output_path', dest='output_path', help='Output files directory')
|
6
|
64 parser.add_argument('--parse_genomic_coord', dest='parse_genomic_coord', default='no', help='Check each sequence header for UCSC style genomic coordinates')
|
|
65 parser.add_argument('--remove_duplicate_coords', dest='remove_duplicate_coords', default='no', help='Remove duplicate entries in unique GFF coordinates')
|
0
|
66 parser.add_argument('--qv_thresh', action='store_true', help='Use q-values for the output threshold')
|
|
67 parser.add_argument('--thresh', dest='thresh', type=float, help='p-value threshold')
|
|
68 parser.add_argument('--gff_output', dest='gff_output', help='Gff output file')
|
|
69 parser.add_argument('--html_output', dest='html_output', help='HTML output file')
|
|
70 parser.add_argument('--interval_output', dest='interval_output', help='Interval output file')
|
|
71 parser.add_argument('--txt_output', dest='txt_output', help='Text output file')
|
|
72 parser.add_argument('--xml_output', dest='xml_output', help='XML output file')
|
|
73 args = parser.parse_args()
|
|
74
|
|
75 fimo_cmd_list = ['fimo']
|
|
76 if args.options_type == 'advanced':
|
|
77 fimo_cmd_list.append('--alpha %4f' % args.alpha)
|
|
78 if args.bgfile is not None:
|
|
79 fimo_cmd_list.append('--bgfile "%s"' % args.bgfile)
|
|
80 if args.max_strand:
|
|
81 fimo_cmd_list.append('--max-strand')
|
|
82 fimo_cmd_list.append('--max-stored-scores %d' % args.max_stored_scores)
|
|
83 if len(args.motifs) > 0:
|
|
84 for motif in args.motifs:
|
|
85 fimo_cmd_list.append('--motif "%s"' % motif)
|
|
86 fimo_cmd_list.append('--motif-pseudo %4f' % args.motif_pseudo)
|
|
87 if args.no_qvalue:
|
|
88 fimo_cmd_list.append('--no-qvalue')
|
|
89 if args.norc:
|
|
90 fimo_cmd_list.append('--norc')
|
5
|
91 if args.parse_genomic_coord == 'yes':
|
0
|
92 fimo_cmd_list.append('--parse-genomic-coord')
|
|
93 if args.qv_thresh:
|
|
94 fimo_cmd_list.append('--qv-thresh')
|
|
95 fimo_cmd_list.append('--thresh %4f' % args.thresh)
|
|
96 if args.input_psp is not None:
|
|
97 fimo_cmd_list.append('--psp "%s"' % args.input_psp)
|
|
98 if args.input_prior_dist is not None:
|
|
99 fimo_cmd_list.append('--prior-dist "%s"' % args.input_prior_dist)
|
|
100 fimo_cmd_list.append('--o "%s"' % (args.output_path))
|
|
101 fimo_cmd_list.append('--verbosity 1')
|
|
102 fimo_cmd_list.append(args.input_motifs)
|
|
103 fimo_cmd_list.append(args.input_fasta)
|
|
104
|
|
105 fimo_cmd = ' '.join(fimo_cmd_list)
|
|
106
|
|
107 try:
|
|
108 tmp_stderr = tempfile.NamedTemporaryFile()
|
|
109 proc = subprocess.Popen(args=fimo_cmd, shell=True, stderr=tmp_stderr)
|
|
110 returncode = proc.wait()
|
|
111 if returncode != 0:
|
|
112 stderr = get_stderr(tmp_stderr)
|
|
113 stop_err(stderr)
|
|
114 except Exception, e:
|
|
115 stop_err('Error running FIMO:\n%s' % str(e))
|
|
116
|
|
117 shutil.move(os.path.join(args.output_path, 'fimo.txt'), args.txt_output)
|
|
118
|
|
119 gff_file = os.path.join(args.output_path, 'fimo.gff')
|
|
120 if args.remove_duplicate_coords == 'yes':
|
|
121 tmp_stderr = tempfile.NamedTemporaryFile()
|
2
|
122 # Identify and eliminating identical motif occurrences. These
|
|
123 # are identical if the combination of chrom, start, end and
|
|
124 # motif id are identical.
|
|
125 cmd = 'sort -k1,1 -k4,4n -k5,5n -k9.1,9.6 -u -o %s %s' % (gff_file, gff_file)
|
0
|
126 proc = subprocess.Popen(args=cmd, stderr=tmp_stderr, shell=True)
|
|
127 returncode = proc.wait()
|
|
128 if returncode != 0:
|
|
129 stderr = get_stderr(tmp_stderr)
|
|
130 stop_err(stderr)
|
2
|
131 # Sort GFF output by a combination of chrom, score, start.
|
3
|
132 cmd = 'sort -k1,1 -k4,4n -k6,6n -o %s %s' % (gff_file, gff_file)
|
0
|
133 proc = subprocess.Popen(args=cmd, stderr=tmp_stderr, shell=True)
|
|
134 returncode = proc.wait()
|
|
135 if returncode != 0:
|
|
136 stderr = get_stderr(tmp_stderr)
|
|
137 stop_err(stderr)
|
|
138 if args.output_separate_motifs == 'yes':
|
|
139 # Create the collection output directory.
|
|
140 collection_path = (os.path.join(os.getcwd(), 'output'))
|
|
141 # Keep track of motif occurrences.
|
|
142 header_line = None
|
|
143 motif_ids = []
|
|
144 file_handles = []
|
|
145 for line in open(gff_file, 'r'):
|
|
146 if line.startswith('#'):
|
|
147 if header_line is None:
|
|
148 header_line = line
|
|
149 continue
|
|
150 items = line.split('\t')
|
|
151 attribute = items[8]
|
|
152 attributes = attribute.split(';')
|
|
153 name = attributes[0]
|
|
154 motif_id = name.split('=')[1]
|
|
155 file_name = os.path.join(collection_path, 'MOTIF%s.gff' % motif_id)
|
|
156 if motif_id in motif_ids:
|
|
157 i = motif_ids.index(motif_id)
|
|
158 fh = file_handles[i]
|
|
159 fh.write(line)
|
|
160 else:
|
|
161 fh = open(file_name, 'wb')
|
|
162 if header_line is not None:
|
|
163 fh.write(header_line)
|
|
164 fh.write(line)
|
|
165 motif_ids.append(motif_id)
|
|
166 file_handles.append(fh)
|
|
167 for file_handle in file_handles:
|
|
168 file_handle.close()
|
|
169 else:
|
|
170 shutil.move(gff_file, args.gff_output)
|
|
171 shutil.move(os.path.join(args.output_path, 'fimo.xml'), args.xml_output)
|
|
172 shutil.move(os.path.join(args.output_path, 'fimo.html'), args.html_output)
|
|
173
|
|
174 out_file = open(args.interval_output, 'wb')
|
|
175 out_file.write("#%s\n" % "\t".join(("chr", "start", "end", "pattern name", "score", "strand", "matched sequence", "p-value", "q-value")))
|
|
176 for line in open(args.txt_output):
|
|
177 if line.startswith('#'):
|
|
178 continue
|
|
179 fields = line.rstrip("\n\r").split("\t")
|
|
180 start, end = int(fields[2]), int(fields[3])
|
|
181 sequence = fields[7]
|
|
182 if start > end:
|
|
183 # Flip start and end and set strand.
|
|
184 start, end = end, start
|
|
185 strand = "-"
|
|
186 # We want sequences relative to strand; FIMO always provides + stranded sequence.
|
|
187 sequence = dna_reverse_complement(sequence)
|
|
188 else:
|
|
189 strand = "+"
|
|
190 # Make 0-based start position.
|
|
191 start -= 1
|
|
192 out_file.write("%s\n" % "\t".join([fields[1], str(start), str(end), fields[0], fields[4], strand, sequence, fields[5], fields[6]]))
|
|
193 out_file.close()
|