Mercurial > repos > eganrol > rnaseqdataannotation
comparison RNAseqDataAnnotation/RNAseqDataAnnotation.R @ 26:f183f8648c5a draft default tip
Uploaded
| author | eganrol | 
|---|---|
| date | Wed, 10 Dec 2014 06:42:21 -0500 | 
| parents | |
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| 25:ea3f7a6504e0 | 26:f183f8648c5a | 
|---|---|
| 1 #Author : keime / lornage | |
| 2 #Date : 2014/11 | |
| 3 | |
| 4 | |
| 5 ######################################################################################################## | |
| 6 #This function concatenates htseq-count result files, normalizes data and annotates data using Ensembl annotations | |
| 7 | |
| 8 #arguments | |
| 9 #Species : Name of the species | |
| 10 #ensversion : version of Ensembl to use | |
| 11 #fileout : tab-delimited file containing for each library ; gene id, raw read counts, normalized data as well as normalized data/gene length | |
| 12 #corresp : data.frame linking the file loaded into galaxy to the corresponding condition | |
| 13 #nfiles : number of files(conditions) | |
| 14 | |
| 15 #output : a data.frame with the following columns : | |
| 16 #ensembl gene id | |
| 17 #raw read counts for each library (one column per library) | |
| 18 #normalized data for each library (one column per library) | |
| 19 #normalized data divided by gene length for each library (one column per library) | |
| 20 #Gene name | |
| 21 #Description | |
| 22 | |
| 23 #require : biomaRt and DESeq2 Bioconductor packages / package plyr1.8.1 | |
| 24 | |
| 25 #Methods : | |
| 26 #Normalization is performed using the method described in Genome Biology 2010;11(10):R106 | |
| 27 #and implemented in the DESeq2 Bioconductor package | |
| 28 #Gene length correspond to the median of the size of all transcripts corresponding to this gene | |
| 29 ######################################################################################################### | |
| 30 | |
| 31 | |
| 32 | |
| 33 RNAseqDataAnnotation = function(Species, ensversion, fileout, corresp ,nfiles){ | |
| 34 | |
| 35 #Create a string vector called libnames that contains the name of the samples | |
| 36 libnames=rep("",nfiles) | |
| 37 for (i in 1:nfiles){ | |
| 38 libnames[i]=toString(corresp$Sample_name[i]) | |
| 39 } | |
| 40 | |
| 41 #For all files in corresp read the corresponding file into R | |
| 42 suppressPackageStartupMessages(library(plyr, lib.loc = NULL, character.only = FALSE, logical.return = FALSE, warn.conflicts = FALSE, verbose=FALSE, quietly = TRUE)) | |
| 43 datalist = list() | |
| 44 for(i in 1:nfiles){ | |
| 45 rawdata=read.table(toString(corresp$Files[i]), header =T, sep ="\t") | |
| 46 #noSpikes_htseq. | |
| 47 nbrrows=nrow(rawdata) | |
| 48 datalist[[i]]=rawdata[1:(nbrrows-5), ] # skip the last 5 lines of HTSeq-count files | |
| 49 colnames(datalist[[i]]) = c("ID",libnames[i]) | |
| 50 } | |
| 51 | |
| 52 #Join all the files in a data.frame called datafile with rownames = gene id | |
| 53 datafile = join_all(datalist, by = "ID", type = "left", match = "all") | |
| 54 | |
| 55 #Calculate the number of geneID pro file | |
| 56 nbID=data.frame(rep("",nfiles)) | |
| 57 for(i in 1:nfiles){ | |
| 58 nbID[,i]=nrow(datalist[[i]]) | |
| 59 } | |
| 60 totalnbID=apply((nbID[,1:nfiles]),1,sum) | |
| 61 | |
| 62 #Verify that all the files contain the same gene ID | |
| 63 if (nrow(datafile)*nfiles==totalnbID[1]){ | |
| 64 | |
| 65 #Suppress genes not expressed in all samples | |
| 66 datafile = datafile[apply(datafile[,2:(nfiles+1)],1,sum)!=0,] | |
| 67 row.names(datafile)=datafile[,1] | |
| 68 data=datafile[,-1] | |
| 69 | |
| 70 #Number of libraries | |
| 71 nblib= dim(data)[2] | |
| 72 #Determine Data + normalization if the specie is not known | |
| 73 if (Species=="None"){ | |
| 74 #Normalized data calculation | |
| 75 nbcol = dim(data)[2] #nb of column in the data.frame | |
| 76 suppressPackageStartupMessages(library(DESeq2, lib.loc = NULL, character.only = FALSE, logical.return = FALSE, warn.conflicts = FALSE, verbose=FALSE, quietly = TRUE)) | |
| 77 conds = factor(1:nblib) | |
| 78 design = data.frame(Condition=conds) | |
| 79 dds = DESeqDataSetFromMatrix(countData=data, colData=design, design=~Condition) | |
| 80 dds = estimateSizeFactors(dds) | |
| 81 datanorm = t(t(data)/sizeFactors(dds)) | |
| 82 | |
| 83 #Data + normalization | |
| 84 dataall = data.frame(row.names(datafile), data, datanorm ) | |
| 85 | |
| 86 #Renames columns | |
| 87 colnames(dataall) = c("Ensembl gene id", paste(libnames,"(raw read counts)"), paste(libnames,"(normalized)")) | |
| 88 write.table(dataall, file=fileout, sep="\t", quote=F, row.names=F) | |
| 89 } | |
| 90 | |
| 91 #Determine Data + normalization + annotation if the specie is known | |
| 92 else{ | |
| 93 #Add annotations and calculate gene length | |
| 94 suppressPackageStartupMessages(library(biomaRt, lib.loc = NULL, character.only = FALSE, logical.return = FALSE, warn.conflicts = FALSE,verbose=FALSE, quietly = TRUE)) | |
| 95 | |
| 96 #Convert Ensembl version to host | |
| 97 correspondingdate = toString(ensversion) | |
| 98 host = paste(correspondingdate, ".archive.ensembl.org/biomart/martservice/", sep="") | |
| 99 | |
| 100 #Load the correct bmdataset | |
| 101 bmdataset = toString(Species) | |
| 102 ensembl=useMart("ENSEMBL_MART_ENSEMBL", host=host, dataset=bmdataset) | |
| 103 if (toString(ensversion)=="oct2014" | toString(ensversion)=="aug2014" ) { | |
| 104 annotation1 = getBM(attributes=c("ensembl_gene_id","external_gene_name","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"),filters="ensembl_gene_id", values=rownames(data), mart=ensembl) | |
| 105 } | |
| 106 else{ | |
| 107 annotation1 = getBM(attributes=c("ensembl_gene_id","external_gene_id","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"),filters="ensembl_gene_id", values=rownames(data), mart=ensembl) | |
| 108 } | |
| 109 | |
| 110 #because all the annotations are not always found in a first step | |
| 111 not = rownames(data)[!rownames(data) %in% unique(annotation1$ensembl_gene_id)] | |
| 112 if (length(not) !=0){ | |
| 113 if (toString(ensversion)=="oct2014" | toString(ensversion)=="aug2014" ) { | |
| 114 annotationnot = getBM(attributes=c("ensembl_gene_id","external_gene_name","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"),filters="ensembl_gene_id", values=not, mart=ensembl) | |
| 115 annotation2 = rbind(annotation1, annotationnot) | |
| 116 } | |
| 117 else { | |
| 118 annotationnot = getBM(attributes=c("ensembl_gene_id","external_gene_id","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"), filters="ensembl_gene_id", values=not, mart=ensembl) | |
| 119 annotation2 = rbind(annotation1, annotationnot) | |
| 120 } | |
| 121 } | |
| 122 else{ | |
| 123 annotation2 = annotation1 | |
| 124 } | |
| 125 | |
| 126 | |
| 127 #because all the annotations are not always found in a first or second step | |
| 128 not = rownames(data)[!rownames(data) %in% unique(annotation2$ensembl_gene_id)] | |
| 129 if (length(not) !=0){ | |
| 130 if (toString(ensversion)=="oct2014" | toString(ensversion)=="aug2014" ) { | |
| 131 annotationnot = getBM(attributes=c("ensembl_gene_id","external_gene_name","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"),filters="ensembl_gene_id", values=not, mart=ensembl) | |
| 132 annotation = rbind(annotation2, annotationnot) | |
| 133 } | |
| 134 else { | |
| 135 annotationnot = getBM(attributes=c("ensembl_gene_id","external_gene_id","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"), filters="ensembl_gene_id", values=not, mart=ensembl) | |
| 136 annotation = rbind(annotation2, annotationnot) | |
| 137 } | |
| 138 } | |
| 139 else{ | |
| 140 annotation = annotation2 | |
| 141 } | |
| 142 | |
| 143 #Exon length | |
| 144 ensinfos.exlen = data.frame(annotation$ensembl_gene_id, annotation$ensembl_transcript_id, abs(annotation$exon_chrom_start - annotation$exon_chrom_end)+1) | |
| 145 colnames(ensinfos.exlen) = c("ensembl_gene_id", "ensembl_transcript_id", "exon_length") | |
| 146 | |
| 147 #Transcript length | |
| 148 tlen = tapply(ensinfos.exlen$exon_length, ensinfos.exlen$ensembl_transcript_id, sum) | |
| 149 tlen.gene = merge(tlen, unique(ensinfos.exlen[,1:2]), by.x="row.names", by.y="ensembl_transcript_id") | |
| 150 colnames(tlen.gene) = c("ensembl_transcript_id", "transcript_length","ensembl_gene_id") | |
| 151 | |
| 152 #Gene length = median of the size of all transcripts corresponding to this gene | |
| 153 glen = tapply(tlen.gene$transcript_length, tlen.gene$ensembl_gene_id, median) | |
| 154 | |
| 155 #Data with gene length | |
| 156 datalen = merge(data, glen, by="row.names") | |
| 157 colnames(datalen) = c("Ensembl_gene_id",colnames(data), "Gene_length") | |
| 158 | |
| 159 #Data with annotations and gene length | |
| 160 annotationgene = unique(annotation[,1:3]) | |
| 161 dataannot = merge(datalen, annotationgene, by.x="Ensembl_gene_id", by.y="ensembl_gene_id") | |
| 162 | |
| 163 #To keep only the first part of the gene description (before [) | |
| 164 tmpdesc = strsplit(as.character(dataannot$description),"[", fixed=T) | |
| 165 f = function(l){ | |
| 166 if (length(l)>=1){ | |
| 167 return(l[[1]]) | |
| 168 } | |
| 169 else{ | |
| 170 return("") | |
| 171 } | |
| 172 } | |
| 173 tmpdescok = unlist(lapply(tmpdesc, f)) | |
| 174 dataannot$description = tmpdescok | |
| 175 | |
| 176 #Normalized data calculation | |
| 177 nbcol = dim(dataannot)[2] #nb of column in the data.frame | |
| 178 suppressPackageStartupMessages(library(DESeq2, lib.loc = NULL, character.only = FALSE, logical.return = FALSE, warn.conflicts = FALSE,verbose=FALSE, quietly = TRUE)) | |
| 179 conds = factor(1:nblib) | |
| 180 design = data.frame(Condition=conds) | |
| 181 dds = DESeqDataSetFromMatrix(countData=dataannot[,-c(1,nbcol,nbcol-1,nbcol-2)], colData=design, design=~Condition) | |
| 182 dds = estimateSizeFactors(dds) | |
| 183 datanorm = t(t(dataannot[,-c(1,nbcol,nbcol-1,nbcol-2)])/sizeFactors(dds)) | |
| 184 | |
| 185 #Normalized data adjusted for gene length (normalized data / gene length) | |
| 186 rpkn = datanorm / (as.vector(dataannot[,nbcol-2]/1000 )) | |
| 187 | |
| 188 #Data + annotations + rpkn | |
| 189 dataall = data.frame(dataannot[,-c(nbcol,nbcol-1,nbcol-2)] , datanorm, rpkn, dataannot[,c(nbcol-1,nbcol)] ) | |
| 190 | |
| 191 #Renames columns | |
| 192 colnames(dataall) = c("Ensembl gene id", paste(libnames,"(raw read counts)"), paste(libnames,"(normalized)"), paste(libnames,"(normalized and divided by gene length in kb)"), "Gene name", "Description") | |
| 193 write.table(dataall, file=fileout, sep="\t", quote=F, row.names=F) | |
| 194 | |
| 195 } | |
| 196 } | |
| 197 else{ | |
| 198 print("The files are not the same length") | |
| 199 } | |
| 200 } | |
| 201 | |
| 202 # Build a dataframe containing the files loaded into galaxy and their corresponding condition | |
| 203 | |
| 204 args <- commandArgs(trailingOnly = TRUE) | |
| 205 | |
| 206 Files=c() | |
| 207 Sample_name =c() | |
| 208 nbcells = (length(args)-3) | |
| 209 for (i in seq(1,nbcells,2)){ | |
| 210 Files = c(Files, args[3+i]) | |
| 211 Sample_name = c(Sample_name, args[4+i]) | |
| 212 } | |
| 213 nfiles=nbcells/2 | |
| 214 corresp = data.frame(Files=Files, Sample_name=Sample_name) | |
| 215 | |
| 216 # Take the informations given by the galaxy user to run the script | |
| 217 RNAseqDataAnnotation(args[1], args[2],args[3],corresp,nfiles) | |
| 218 | |
| 219 | |
| 220 | |
| 221 | |
| 222 | |
| 223 | 
