0
|
1 #Author : keime / lornage
|
|
2 #Date : 2014/11
|
|
3
|
|
4
|
|
5 ########################################################################################################
|
|
6 #This function concatenates htseq-count result files, normalizes data and annotates data using Ensembl annotations
|
|
7
|
|
8 #arguments
|
|
9 #path2htseqfiles : path to htseq-count result files
|
|
10 #samplenamefile : path ta a tabulated text file with 2 columns : 1. File name 2. Sample names and an header
|
|
11 #Species : latin name of the species
|
|
12 #ensversion : version of Ensembl to use
|
|
13 #fileout : .txt file containing for each library ; gene id, raw read counts, normalized data as well as normalized data/gene length
|
|
14 #conversionensembleversion : tab-delimited file allowing conversion of the Ensembl version to the host
|
|
15 # (Column1 : Version Column2 : Host)
|
|
16 #conversionensemblname : tab-delimited file allowing conversion of species name to the name of the Ensembl dataset to use
|
|
17 # (Column1 : Specie Column2 : Dataset)
|
|
18
|
|
19 #output : a data.frame with the following columns :
|
|
20 #ensembl gene id
|
|
21 #raw read counts for each library (one column per library)
|
|
22 #normalized data for each library (one column per library)
|
|
23 #normalized data divided by gene length for each library (one column per library)
|
|
24 #Gene name
|
|
25 #Description
|
|
26
|
|
27 #require : biomaRt and DESeq2 Bioconductor packages / package plyr1.8.1
|
|
28
|
|
29 #Methods :
|
|
30 #Considering that the resulting files of HTSeq-count have 5 lines of comments in the end
|
|
31 #Normalization is performed using the method described in Genome Biology 2010;11(10):R106
|
|
32 #and implemented in the DESeq2 Bioconductor package
|
|
33 #Gene length correspond to the median of the size of all transcripts corresponding to this gene
|
|
34 #########################################################################################################
|
|
35
|
|
36
|
|
37
|
|
38 RNAseqDataAnnotation = function(path2htseqfiles, samplenamefile, Species, ensversion, fileout, conversionensemblversion, conversionensemblname){
|
|
39
|
|
40 #Create a list with the file names in path2htseqfiles
|
|
41 sampleFiles=list.files(path2htseqfiles)
|
|
42 sampleFiles=strsplit(sampleFiles,".txt")
|
|
43 #_noSpikes_htseq
|
|
44 nfiles=length(sampleFiles)
|
|
45
|
|
46 #Read the data in samplenamefile. Create a data frame establishing the correspondence between file names and sample names
|
|
47 corresp = read.table(samplenamefile,header=T,sep="\t",colClasses=c("character","character"))
|
|
48 corresp$File = strsplit(corresp$File,".fastq.gz")
|
|
49
|
|
50 #Create a string vector called libnames that contains the name of the samples in the same order as in sampleFiles
|
|
51 libnames=rep("",nfiles)
|
|
52 for (i in 1:nfiles){
|
|
53 libnames[i]=corresp$Sample_name[corresp$File==sampleFiles[[i]]]
|
|
54 }
|
|
55
|
|
56 #For all files located in path2htseqfiles read the corresponding file into R
|
|
57 library(plyr)
|
|
58 datalist = list()
|
|
59 for(i in 1:nfiles){
|
|
60 rawdata=read.table(paste(paste(path2htseqfiles,sampleFiles[i],sep="/"),"txt",sep="."))
|
|
61 #noSpikes_htseq.
|
|
62 nbrrows=nrow(rawdata)
|
|
63 datalist[[i]]=rawdata[1:(nbrrows-5), ] # skip the last 5 lines of HTSeq-count files
|
|
64 colnames(datalist[[i]]) = c("ID",libnames[i])
|
|
65 }
|
|
66
|
|
67 #Join all the files in a data.frame called datafile with rownames = gene id
|
|
68 datafile = join_all(datalist, by = "ID", type = "left", match = "all")
|
|
69
|
|
70 #Calculate the number of geneID pro file
|
|
71 nbID=data.frame(rep("",nfiles))
|
|
72 for(i in 1:nfiles){
|
|
73 nbID[,i]=nrow(datalist[[i]])
|
|
74 }
|
|
75 totalnbID=apply((nbID[,1:4]),1,sum)
|
|
76
|
|
77 #Verify that all the files contain the same gene ID
|
|
78 if (nrow(datafile)*4==totalnbID[1]){
|
|
79
|
|
80 #Suppress genes not expressed in all samples
|
|
81 datafile = datafile[apply(datafile[,2:(nfiles+1)],1,sum)!=0,]
|
|
82 row.names(datafile)=datafile[,1]
|
|
83 data=datafile[,-1]
|
|
84
|
|
85 #Number of libraries
|
|
86 nblib= dim(data)[2]
|
|
87 #Determine Data + normalization if the specie is not known
|
|
88 if (Species==""){
|
|
89 #Normalized data calculation
|
|
90 nbcol = dim(data)[2] #nb of column in the data.frame
|
|
91 library(DESeq2)
|
|
92 conds = factor(1:nblib)
|
|
93 design = data.frame(Condition=conds)
|
|
94 dds = DESeqDataSetFromMatrix(countData=data, colData=design, design=~Condition)
|
|
95 dds = estimateSizeFactors(dds)
|
|
96 datanorm = t(t(data)/sizeFactors(dds))
|
|
97
|
|
98 #Data + normalization
|
|
99 dataall = data.frame(row.names(datafile), data, datanorm )
|
|
100
|
|
101 #Renames columns
|
|
102 colnames(dataall) = c("Ensembl gene id", paste(libnames,"(raw read counts)"), paste(libnames,"(normalized)"))
|
|
103 write.table(dataall, file=fileout, sep="\t", quote=F, row.names=F)
|
|
104 }
|
|
105
|
|
106 #Determine Data + normalization + annotation if the specie is known
|
|
107 else{
|
|
108 #Add annotations and calculate gene length
|
|
109 library(biomaRt)
|
|
110
|
|
111 #Convert Ensembl version to host
|
|
112 conversionfile = read.table(conversionensemblversion,header=T,sep="\t",colClasses=c("numeric","character"))
|
|
113 correspondingdate = conversionfile[conversionfile$Version == ensversion, 2]
|
|
114 host = paste(correspondingdate, ".archive.ensembl.org/biomart/martservice/", sep="")
|
|
115
|
|
116 #Convert species name to the name of the corresponding bmdataset
|
|
117 conversion = read.table(conversionensemblname,header=T,sep="\t",colClasses=c("character","character"))
|
|
118 bmdataset = conversion[conversion$Specie == Species, 2]
|
|
119 ensembl=useMart("ENSEMBL_MART_ENSEMBL", host=host, dataset=bmdataset)
|
|
120 if (ensversion<=75){
|
|
121 annotation1 = getBM(attributes=c("ensembl_gene_id","external_gene_id","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"),filters="ensembl_gene_id", values=rownames(data), mart=ensembl)
|
|
122 }
|
|
123 else{
|
|
124 annotation1 = getBM(attributes=c("ensembl_gene_id","external_gene_name","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"),filters="ensembl_gene_id", values=rownames(data), mart=ensembl)
|
|
125 }
|
|
126
|
|
127 #because all the annotations are not always found in a first step
|
|
128 not = rownames(data)[!rownames(data) %in% unique(annotation1$ensembl_gene_id)]
|
|
129 if (length(not) !=0){
|
|
130 annotationnot = getBM(attributes=c("ensembl_gene_id","external_gene_id","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"), filters="ensembl_gene_id", values=not, mart=ensembl)
|
|
131 annotation = rbind(annotation1, annotationnot)
|
|
132 }
|
|
133 else{
|
|
134 annotation = annotation1
|
|
135 }
|
|
136
|
|
137 #Exon length
|
|
138 ensinfos.exlen = data.frame(annotation$ensembl_gene_id, annotation$ensembl_transcript_id, abs(annotation$exon_chrom_start - annotation$exon_chrom_end)+1)
|
|
139 colnames(ensinfos.exlen) = c("ensembl_gene_id", "ensembl_transcript_id", "exon_length")
|
|
140
|
|
141 #Transcript length
|
|
142 tlen = tapply(ensinfos.exlen$exon_length, ensinfos.exlen$ensembl_transcript_id, sum)
|
|
143 tlen.gene = merge(tlen, unique(ensinfos.exlen[,1:2]), by.x="row.names", by.y="ensembl_transcript_id")
|
|
144 colnames(tlen.gene) = c("ensembl_transcript_id", "transcript_length","ensembl_gene_id")
|
|
145
|
|
146 #Gene length = median of the size of all transcripts corresponding to this gene
|
|
147 glen = tapply(tlen.gene$transcript_length, tlen.gene$ensembl_gene_id, median)
|
|
148
|
|
149 #Data with gene length
|
|
150 datalen = merge(data, glen, by="row.names")
|
|
151 colnames(datalen) = c("Ensembl_gene_id",colnames(data), "Gene_length")
|
|
152
|
|
153 #Data with annotations and gene length
|
|
154 annotationgene = unique(annotation[,1:3])
|
|
155 dataannot = merge(datalen, annotationgene, by.x="Ensembl_gene_id", by.y="ensembl_gene_id")
|
|
156
|
|
157 #To keep only the first part of the gene description (before [)
|
|
158 tmpdesc = strsplit(as.character(dataannot$description),"[", fixed=T)
|
|
159 f = function(l){
|
|
160 if (length(l)>=1){
|
|
161 return(l[[1]])
|
|
162 }
|
|
163 else{
|
|
164 return("")
|
|
165 }
|
|
166 }
|
|
167 tmpdescok = unlist(lapply(tmpdesc, f))
|
|
168 dataannot$description = tmpdescok
|
|
169
|
|
170 #Normalized data calculation
|
|
171 nbcol = dim(dataannot)[2] #nb of column in the data.frame
|
|
172 library(DESeq2)
|
|
173 conds = factor(1:nblib)
|
|
174 design = data.frame(Condition=conds)
|
|
175 dds = DESeqDataSetFromMatrix(countData=dataannot[,-c(1,nbcol,nbcol-1,nbcol-2)], colData=design, design=~Condition)
|
|
176 dds = estimateSizeFactors(dds)
|
|
177 datanorm = t(t(dataannot[,-c(1,nbcol,nbcol-1,nbcol-2)])/sizeFactors(dds))
|
|
178
|
|
179 #Normalized data adjusted for gene length (normalized data / gene length)
|
|
180 rpkn = datanorm / (as.vector(dataannot[,nbcol-2]/1000 ))
|
|
181
|
|
182 #Data + annotations + rpkn
|
|
183 dataall = data.frame(dataannot[,-c(nbcol,nbcol-1,nbcol-2)] , datanorm, rpkn, dataannot[,c(nbcol-1,nbcol)] )
|
|
184
|
|
185 #Renames columns
|
|
186 colnames(dataall) = c("Ensembl gene id", paste(libnames,"(raw read counts)"), paste(libnames,"(normalized)"), paste(libnames,"(normalized and divided by gene length in kb)"), "Gene name", "Description")
|
|
187 write.table(dataall, file=fileout, sep="\t", quote=F, row.names=F)
|
|
188
|
|
189 #Return(dataall)
|
|
190
|
|
191 }
|
|
192 }
|
|
193 else{
|
|
194 print("The files are not the same length")
|
|
195 }
|
|
196 }
|
|
197
|
|
198 args <- commandArgs(trailingOnly = TRUE)
|
|
199 print(args)
|
|
200
|
|
201 RNAseqDataAnnotation(args[1], args[2],args[3], args[4], args[5], args[6], args[7])
|
|
202
|
|
203 #R --slave --vanilla --verbose --file=/home/lornage/Bureau/Pour_galaxy/RNAseqDataAnnotation.R --args /home/lornage/Bureau/Test_function /home/lornage/Bureau/ichierconvertitnames.txt Homo_sapiens 75 /home/lornage/Bureau/testttttt5.txt /home/lornage/Bureau/Script_R/Ensembl_Version_Host.txt /home/lornage/Bureau/Script_R/Ensemble_Specie_Dataset.txt
|
|
204
|
|
205
|
|
206
|
|
207
|
|
208
|
|
209
|