Mercurial > repos > eganrol > rnaseqdataannotation
comparison RNAseqDataAnnotation/RNAseqDataAnnotation.R @ 6:22da2dc0e103 draft
Uploaded
author | eganrol |
---|---|
date | Wed, 19 Nov 2014 12:00:59 -0500 |
parents | de3907c85546 |
children |
comparison
equal
deleted
inserted
replaced
5:2c23b93b1026 | 6:22da2dc0e103 |
---|---|
1 #Author : keime / lornage | |
2 #Date : 2014/11 | |
3 | |
4 | |
5 ######################################################################################################## | |
6 #This function concatenates htseq-count result files, normalizes data and annotates data using Ensembl annotations | |
7 | |
8 #arguments | |
9 #path2htseqfiles : path to htseq-count result files | |
10 #samplenamefile : path ta a tabulated text file with 2 columns : 1. File name 2. Sample names and an header | |
11 #Species : latin name of the species | |
12 #ensversion : version of Ensembl to use | |
13 #fileout : .txt file containing for each library ; gene id, raw read counts, normalized data as well as normalized data/gene length | |
14 #conversionensembleversion : tab-delimited file allowing conversion of the Ensembl version to the host | |
15 # (Column1 : Version Column2 : Host) | |
16 #conversionensemblname : tab-delimited file allowing conversion of species name to the name of the Ensembl dataset to use | |
17 # (Column1 : Specie Column2 : Dataset) | |
18 | |
19 #output : a data.frame with the following columns : | |
20 #ensembl gene id | |
21 #raw read counts for each library (one column per library) | |
22 #normalized data for each library (one column per library) | |
23 #normalized data divided by gene length for each library (one column per library) | |
24 #Gene name | |
25 #Description | |
26 | |
27 #require : biomaRt and DESeq2 Bioconductor packages / package plyr1.8.1 | |
28 | |
29 #Methods : | |
30 #Considering that the resulting files of HTSeq-count have 5 lines of comments in the end | |
31 #Normalization is performed using the method described in Genome Biology 2010;11(10):R106 | |
32 #and implemented in the DESeq2 Bioconductor package | |
33 #Gene length correspond to the median of the size of all transcripts corresponding to this gene | |
34 ######################################################################################################### | |
35 | |
36 | |
37 | |
38 RNAseqDataAnnotation = function(path2htseqfiles, samplenamefile, Species, ensversion, fileout, conversionensemblversion, conversionensemblname){ | |
39 | |
40 #Create a list with the file names in path2htseqfiles | |
41 sampleFiles=list.files(path2htseqfiles) | |
42 sampleFiles=strsplit(sampleFiles,".txt") | |
43 #_noSpikes_htseq | |
44 nfiles=length(sampleFiles) | |
45 | |
46 #Read the data in samplenamefile. Create a data frame establishing the correspondence between file names and sample names | |
47 corresp = read.table(samplenamefile,header=T,sep="\t",colClasses=c("character","character")) | |
48 corresp$File = strsplit(corresp$File,".fastq.gz") | |
49 | |
50 #Create a string vector called libnames that contains the name of the samples in the same order as in sampleFiles | |
51 libnames=rep("",nfiles) | |
52 for (i in 1:nfiles){ | |
53 libnames[i]=corresp$Sample_name[corresp$File==sampleFiles[[i]]] | |
54 } | |
55 | |
56 #For all files located in path2htseqfiles read the corresponding file into R | |
57 library(plyr) | |
58 datalist = list() | |
59 for(i in 1:nfiles){ | |
60 rawdata=read.table(paste(paste(path2htseqfiles,sampleFiles[i],sep="/"),"txt",sep=".")) | |
61 #noSpikes_htseq. | |
62 nbrrows=nrow(rawdata) | |
63 datalist[[i]]=rawdata[1:(nbrrows-5), ] # skip the last 5 lines of HTSeq-count files | |
64 colnames(datalist[[i]]) = c("ID",libnames[i]) | |
65 } | |
66 | |
67 #Join all the files in a data.frame called datafile with rownames = gene id | |
68 datafile = join_all(datalist, by = "ID", type = "left", match = "all") | |
69 | |
70 #Calculate the number of geneID pro file | |
71 nbID=data.frame(rep("",nfiles)) | |
72 for(i in 1:nfiles){ | |
73 nbID[,i]=nrow(datalist[[i]]) | |
74 } | |
75 totalnbID=apply((nbID[,1:4]),1,sum) | |
76 | |
77 #Verify that all the files contain the same gene ID | |
78 if (nrow(datafile)*4==totalnbID[1]){ | |
79 | |
80 #Suppress genes not expressed in all samples | |
81 datafile = datafile[apply(datafile[,2:(nfiles+1)],1,sum)!=0,] | |
82 row.names(datafile)=datafile[,1] | |
83 data=datafile[,-1] | |
84 | |
85 #Number of libraries | |
86 nblib= dim(data)[2] | |
87 #Determine Data + normalization if the specie is not known | |
88 if (Species==""){ | |
89 #Normalized data calculation | |
90 nbcol = dim(data)[2] #nb of column in the data.frame | |
91 library(DESeq2) | |
92 conds = factor(1:nblib) | |
93 design = data.frame(Condition=conds) | |
94 dds = DESeqDataSetFromMatrix(countData=data, colData=design, design=~Condition) | |
95 dds = estimateSizeFactors(dds) | |
96 datanorm = t(t(data)/sizeFactors(dds)) | |
97 | |
98 #Data + normalization | |
99 dataall = data.frame(row.names(datafile), data, datanorm ) | |
100 | |
101 #Renames columns | |
102 colnames(dataall) = c("Ensembl gene id", paste(libnames,"(raw read counts)"), paste(libnames,"(normalized)")) | |
103 write.table(dataall, file=fileout, sep="\t", quote=F, row.names=F) | |
104 } | |
105 | |
106 #Determine Data + normalization + annotation if the specie is known | |
107 else{ | |
108 #Add annotations and calculate gene length | |
109 library(biomaRt) | |
110 | |
111 #Convert Ensembl version to host | |
112 conversionfile = read.table(conversionensemblversion,header=T,sep="\t",colClasses=c("numeric","character")) | |
113 correspondingdate = conversionfile[conversionfile$Version == ensversion, 2] | |
114 host = paste(correspondingdate, ".archive.ensembl.org/biomart/martservice/", sep="") | |
115 | |
116 #Convert species name to the name of the corresponding bmdataset | |
117 conversion = read.table(conversionensemblname,header=T,sep="\t",colClasses=c("character","character")) | |
118 bmdataset = conversion[conversion$Specie == Species, 2] | |
119 ensembl=useMart("ENSEMBL_MART_ENSEMBL", host=host, dataset=bmdataset) | |
120 if (ensversion<=75){ | |
121 annotation1 = getBM(attributes=c("ensembl_gene_id","external_gene_id","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"),filters="ensembl_gene_id", values=rownames(data), mart=ensembl) | |
122 } | |
123 else{ | |
124 annotation1 = getBM(attributes=c("ensembl_gene_id","external_gene_name","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"),filters="ensembl_gene_id", values=rownames(data), mart=ensembl) | |
125 } | |
126 | |
127 #because all the annotations are not always found in a first step | |
128 not = rownames(data)[!rownames(data) %in% unique(annotation1$ensembl_gene_id)] | |
129 if (length(not) !=0){ | |
130 annotationnot = getBM(attributes=c("ensembl_gene_id","external_gene_id","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"), filters="ensembl_gene_id", values=not, mart=ensembl) | |
131 annotation = rbind(annotation1, annotationnot) | |
132 } | |
133 else{ | |
134 annotation = annotation1 | |
135 } | |
136 | |
137 #Exon length | |
138 ensinfos.exlen = data.frame(annotation$ensembl_gene_id, annotation$ensembl_transcript_id, abs(annotation$exon_chrom_start - annotation$exon_chrom_end)+1) | |
139 colnames(ensinfos.exlen) = c("ensembl_gene_id", "ensembl_transcript_id", "exon_length") | |
140 | |
141 #Transcript length | |
142 tlen = tapply(ensinfos.exlen$exon_length, ensinfos.exlen$ensembl_transcript_id, sum) | |
143 tlen.gene = merge(tlen, unique(ensinfos.exlen[,1:2]), by.x="row.names", by.y="ensembl_transcript_id") | |
144 colnames(tlen.gene) = c("ensembl_transcript_id", "transcript_length","ensembl_gene_id") | |
145 | |
146 #Gene length = median of the size of all transcripts corresponding to this gene | |
147 glen = tapply(tlen.gene$transcript_length, tlen.gene$ensembl_gene_id, median) | |
148 | |
149 #Data with gene length | |
150 datalen = merge(data, glen, by="row.names") | |
151 colnames(datalen) = c("Ensembl_gene_id",colnames(data), "Gene_length") | |
152 | |
153 #Data with annotations and gene length | |
154 annotationgene = unique(annotation[,1:3]) | |
155 dataannot = merge(datalen, annotationgene, by.x="Ensembl_gene_id", by.y="ensembl_gene_id") | |
156 | |
157 #To keep only the first part of the gene description (before [) | |
158 tmpdesc = strsplit(as.character(dataannot$description),"[", fixed=T) | |
159 f = function(l){ | |
160 if (length(l)>=1){ | |
161 return(l[[1]]) | |
162 } | |
163 else{ | |
164 return("") | |
165 } | |
166 } | |
167 tmpdescok = unlist(lapply(tmpdesc, f)) | |
168 dataannot$description = tmpdescok | |
169 | |
170 #Normalized data calculation | |
171 nbcol = dim(dataannot)[2] #nb of column in the data.frame | |
172 library(DESeq2) | |
173 conds = factor(1:nblib) | |
174 design = data.frame(Condition=conds) | |
175 dds = DESeqDataSetFromMatrix(countData=dataannot[,-c(1,nbcol,nbcol-1,nbcol-2)], colData=design, design=~Condition) | |
176 dds = estimateSizeFactors(dds) | |
177 datanorm = t(t(dataannot[,-c(1,nbcol,nbcol-1,nbcol-2)])/sizeFactors(dds)) | |
178 | |
179 #Normalized data adjusted for gene length (normalized data / gene length) | |
180 rpkn = datanorm / (as.vector(dataannot[,nbcol-2]/1000 )) | |
181 | |
182 #Data + annotations + rpkn | |
183 dataall = data.frame(dataannot[,-c(nbcol,nbcol-1,nbcol-2)] , datanorm, rpkn, dataannot[,c(nbcol-1,nbcol)] ) | |
184 | |
185 #Renames columns | |
186 colnames(dataall) = c("Ensembl gene id", paste(libnames,"(raw read counts)"), paste(libnames,"(normalized)"), paste(libnames,"(normalized and divided by gene length in kb)"), "Gene name", "Description") | |
187 write.table(dataall, file=fileout, sep="\t", quote=F, row.names=F) | |
188 | |
189 #Return(dataall) | |
190 | |
191 } | |
192 } | |
193 else{ | |
194 print("The files are not the same length") | |
195 } | |
196 } | |
197 | |
198 args <- commandArgs(trailingOnly = TRUE) | |
199 print(args) | |
200 | |
201 RNAseqDataAnnotation(args[1], args[2],args[3], args[4], args[5], args[6], args[7]) | |
202 | |
203 #R --slave --vanilla --verbose --file=/home/lornage/Bureau/Pour_galaxy/RNAseqDataAnnotation.R --args /home/lornage/Bureau/Test_function /home/lornage/Bureau/ichierconvertitnames.txt Homo_sapiens 75 /home/lornage/Bureau/testttttt5.txt /home/lornage/Bureau/Script_R/Ensembl_Version_Host.txt /home/lornage/Bureau/Script_R/Ensemble_Specie_Dataset.txt | |
204 | |
205 | |
206 | |
207 | |
208 | |
209 |