Mercurial > repos > drosofff > repenrich
comparison RepEnrich_setup.py @ 0:041de602103e draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich commit ef0b32c10d178e61faf9042bc5e0d3cc66a10729
author | drosofff |
---|---|
date | Tue, 23 May 2017 03:08:45 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:041de602103e |
---|---|
1 #!/usr/bin/env python | |
2 import argparse | |
3 import csv | |
4 import os | |
5 import shlex | |
6 import subprocess | |
7 import sys | |
8 from Bio import SeqIO | |
9 from Bio.Seq import Seq | |
10 from Bio.SeqRecord import SeqRecord | |
11 from Bio.Alphabet import IUPAC | |
12 | |
13 parser = argparse.ArgumentParser(description='Part I: Prepartion of repetive element psuedogenomes and repetive element bamfiles. This script prepares the annotation used by downstream applications to analyze for repetitive element enrichment. For this script to run properly bowtie must be loaded. The repeat element psuedogenomes are prepared in order to analyze reads that map to multiple locations of the genome. The repeat element bamfiles are prepared in order to use a region sorter to analyze reads that map to a single location of the genome.You will 1) annotation_file: The repetitive element annotation file downloaded from RepeatMasker.org database for your organism of interest. 2) genomefasta: Your genome of interest in fasta format, 3)setup_folder: a folder to contain repeat element setup files command-line usage EXAMPLE: python master_setup.py /users/nneretti/data/annotation/mm9/mm9_repeatmasker.txt /users/nneretti/data/annotation/mm9/mm9.fa /users/nneretti/data/annotation/mm9/setup_folder', prog='getargs_genome_maker.py') | |
14 parser.add_argument('--version', action='version', version='%(prog)s 0.1') | |
15 parser.add_argument('annotation_file', action= 'store', metavar='annotation_file', help='List annotation file. The annotation file contains the repeat masker annotation for the genome of interest and may be downloaded at RepeatMasker.org Example /data/annotation/mm9/mm9.fa.out') | |
16 parser.add_argument('genomefasta', action= 'store', metavar='genomefasta', help='File name and path for genome of interest in fasta format. Example /data/annotation/mm9/mm9.fa') | |
17 parser.add_argument('setup_folder', action= 'store', metavar='setup_folder', help='List folder to contain bamfiles for repeats and repeat element psuedogenomes. Example /data/annotation/mm9/setup') | |
18 parser.add_argument('--nfragmentsfile1', action= 'store', dest='nfragmentsfile1', metavar='nfragmentsfile1', default='./repnames_nfragments.txt', help='Output location of a description file that saves the number of fragments processed per repname. Default ./repnames_nfragments.txt') | |
19 parser.add_argument('--gaplength', action= 'store', dest='gaplength', metavar='gaplength', default= '200', type=int, help='Length of the spacer used to build repeat psuedogeneomes. Default 200') | |
20 parser.add_argument('--flankinglength', action= 'store', dest='flankinglength', metavar='flankinglength', default= '25', type=int, help='Length of the flanking region adjacent to the repeat element that is used to build repeat psuedogeneomes. The flanking length should be set according to the length of your reads. Default 25') | |
21 parser.add_argument('--is_bed', action= 'store', dest='is_bed', metavar='is_bed', default= 'FALSE', help='Is the annotation file a bed file. This is also a compatible format. The file needs to be a tab seperated bed with optional fields. Ex. format chr\tstart\tend\tName_element\tclass\tfamily. The class and family should identical to name_element if not applicable. Default FALSE change to TRUE') | |
22 args = parser.parse_args() | |
23 | |
24 # parameters and paths specified in args_parse | |
25 gapl = args.gaplength | |
26 flankingl = args.flankinglength | |
27 annotation_file = args.annotation_file | |
28 genomefasta = args.genomefasta | |
29 setup_folder = args.setup_folder | |
30 nfragmentsfile1 = args.nfragmentsfile1 | |
31 is_bed = args.is_bed | |
32 | |
33 ################################################################################ | |
34 # check that the programs we need are available | |
35 try: | |
36 subprocess.call(shlex.split("bowtie --version"), stdout=open(os.devnull, 'wb'), stderr=open(os.devnull, 'wb')) | |
37 except OSError: | |
38 print ("Error: Bowtie or BEDTools not loaded") | |
39 raise | |
40 | |
41 ################################################################################ | |
42 # Define a text importer | |
43 csv.field_size_limit(sys.maxsize) | |
44 def import_text(filename, separator): | |
45 for line in csv.reader(open(os.path.realpath(filename)), delimiter=separator, | |
46 skipinitialspace=True): | |
47 if line: | |
48 yield line | |
49 # Make a setup folder | |
50 if not os.path.exists(setup_folder): | |
51 os.makedirs(setup_folder) | |
52 | |
53 ################################################################################ | |
54 # load genome into dictionary | |
55 print ("loading genome...") | |
56 g = SeqIO.to_dict(SeqIO.parse(genomefasta, "fasta")) | |
57 | |
58 print ("Precomputing length of all chromosomes...") | |
59 idxgenome = {} | |
60 lgenome = {} | |
61 genome = {} | |
62 allchrs = g.keys() | |
63 k = 0 | |
64 for chr in allchrs: | |
65 genome[chr] = str(g[chr].seq) | |
66 # del g[chr] | |
67 lgenome[chr] = len(genome[chr]) | |
68 idxgenome[chr] = k | |
69 k = k + 1 | |
70 del g | |
71 | |
72 ################################################################################ | |
73 # Build a bedfile of repeatcoordinates to use by RepEnrich region_sorter | |
74 if is_bed == "FALSE": | |
75 repeat_elements= [] | |
76 fout = open(os.path.realpath(setup_folder + os.path.sep + 'repnames.bed'), 'w') | |
77 fin = import_text(annotation_file, ' ') | |
78 x = 0 | |
79 rep_chr = {} | |
80 rep_start = {} | |
81 rep_end = {} | |
82 x = 0 | |
83 for line in fin: | |
84 if x>2: | |
85 line9 = line[9].replace("(","_").replace(")","_").replace("/","_") | |
86 repname = line9 | |
87 if not repname in repeat_elements: | |
88 repeat_elements.append(repname) | |
89 repchr = line[4] | |
90 repstart = int(line[5]) | |
91 repend = int(line[6]) | |
92 # print >> fout, str(repchr) + '\t'+str(repstart)+ '\t'+str(repend)+ '\t'+str(repname) | |
93 fout.write(str(repchr) + '\t'+str(repstart)+ '\t'+str(repend)+ '\t'+str(repname)+ '\n') | |
94 # if rep_chr.has_key(repname): | |
95 if repname in rep_chr: | |
96 rep_chr[repname].append(repchr) | |
97 rep_start[repname].append(int(repstart)) | |
98 rep_end[repname].append(int(repend)) | |
99 else: | |
100 rep_chr[repname] = [repchr] | |
101 rep_start[repname] = [int(repstart)] | |
102 rep_end[repname] = [int(repend)] | |
103 x +=1 | |
104 if is_bed == "TRUE": | |
105 repeat_elements= [] | |
106 fout = open(os.path.realpath(setup_folder + os.path.sep + 'repnames.bed'), 'w') | |
107 fin = open(os.path.realpath(annotation_file), 'r') | |
108 x =0 | |
109 rep_chr = {} | |
110 rep_start = {} | |
111 rep_end = {} | |
112 x =0 | |
113 for line in fin: | |
114 line=line.strip('\n') | |
115 line=line.split('\t') | |
116 line3 = line[3].replace("(","_").replace(")","_").replace("/","_") | |
117 repname = line3 | |
118 if not repname in repeat_elements: | |
119 repeat_elements.append(repname) | |
120 repchr = line[0] | |
121 repstart = int(line[1]) | |
122 repend = int(line[2]) | |
123 # print >> fout, str(repchr) + '\t'+str(repstart)+ '\t'+str(repend)+ '\t'+str(repname) | |
124 fout.write(str(repchr) + '\t'+str(repstart)+ '\t'+str(repend)+ '\t'+str(repname) + '\n') | |
125 # if rep_chr.has_key(repname): | |
126 if repname in rep_chr: | |
127 rep_chr[repname].append(repchr) | |
128 rep_start[repname].append(int(repstart)) | |
129 rep_end[repname].append(int(repend)) | |
130 else: | |
131 rep_chr[repname] = [repchr] | |
132 rep_start[repname] = [int(repstart)] | |
133 rep_end[repname] = [int(repend)] | |
134 | |
135 fin.close() | |
136 fout.close() | |
137 repeat_elements = sorted(repeat_elements) | |
138 print ("Writing a key for all repeats...") | |
139 #print to fout the binary key that contains each repeat type with the associated binary number; sort the binary key: | |
140 fout = open(os.path.realpath(setup_folder + os.path.sep + 'repgenomes_key.txt'), 'w') | |
141 x = 0 | |
142 for repeat in repeat_elements: | |
143 # print >> fout, str(repeat) + '\t' + str(x) | |
144 fout.write(str(repeat) + '\t' + str(x) + '\n') | |
145 x +=1 | |
146 fout.close() | |
147 ################################################################################ | |
148 # generate spacer for psuedogenomes | |
149 spacer = "" | |
150 for i in range(gapl): | |
151 spacer = spacer + "N" | |
152 | |
153 # save file with number of fragments processed per repname | |
154 print ("Saving number of fragments processed per repname to " + nfragmentsfile1) | |
155 fout1 = open(os.path.realpath(nfragmentsfile1),"w") | |
156 for repname in rep_chr.keys(): | |
157 rep_chr_current = rep_chr[repname] | |
158 # print >>fout1, str(len(rep_chr[repname])) + "\t" + repname | |
159 fout1.write(str(len(rep_chr[repname])) + "\t" + repname + '\n') | |
160 fout1.close() | |
161 | |
162 # generate metagenomes and save them to FASTA files | |
163 k = 1 | |
164 nrepgenomes = len(rep_chr.keys()) | |
165 for repname in rep_chr.keys(): | |
166 metagenome = "" | |
167 newname = repname.replace("(","_").replace(")","_").replace("/","_") | |
168 print ("processing repgenome " + newname + ".fa" + " (" + str(k) + " of " + str(nrepgenomes) + ")") | |
169 rep_chr_current = rep_chr[repname] | |
170 rep_start_current = rep_start[repname] | |
171 rep_end_current = rep_end[repname] | |
172 print ("-------> " + str(len(rep_chr[repname])) + " fragments") | |
173 for i in range(len(rep_chr[repname])): | |
174 try: | |
175 chr = rep_chr_current[i] | |
176 rstart = max(rep_start_current[i] - flankingl, 0) | |
177 rend = min(rep_end_current[i] + flankingl, lgenome[chr]-1) | |
178 metagenome = metagenome + spacer + genome[chr][rstart:(rend+1)] | |
179 except KeyError: | |
180 print ("Unrecognised Chromosome: "+chr) | |
181 pass | |
182 | |
183 # Convert metagenome to SeqRecord object (required by SeqIO.write) | |
184 record = SeqRecord(Seq(metagenome, IUPAC.unambiguous_dna), id = "repname", name = "", description = "") | |
185 print ("saving repgenome " + newname + ".fa" + " (" + str(k) + " of " + str(nrepgenomes) + ")") | |
186 fastafilename = os.path.realpath(setup_folder + os.path.sep + newname + ".fa") | |
187 SeqIO.write(record, fastafilename, "fasta") | |
188 print ("indexing repgenome " + newname + ".fa" + " (" + str(k) + " of " + str(nrepgenomes) + ")") | |
189 command = shlex.split('bowtie-build -f ' + fastafilename + ' ' + setup_folder + os.path.sep + newname) | |
190 p = subprocess.Popen(command).communicate() | |
191 k += 1 | |
192 | |
193 print ("... Done") | |
194 |