view RNAseqDataAnnotation/RNAseqDataAnnotation.R @ 26:f183f8648c5a draft default tip

Uploaded
author eganrol
date Wed, 10 Dec 2014 06:42:21 -0500
parents
children
line wrap: on
line source

#Author : keime / lornage
#Date : 2014/11


########################################################################################################
#This function concatenates htseq-count result files, normalizes data and annotates data using Ensembl annotations

#arguments
#Species : Name of the species
#ensversion : version of Ensembl to use
#fileout : tab-delimited file containing for each library ; gene id, raw read counts, normalized data as well as normalized data/gene length
#corresp : data.frame linking the file loaded into galaxy to the corresponding condition
#nfiles : number of files (one per condition)

#output : a data.frame with the following columns :
#ensembl gene id
#raw read counts for each library (one column per library)
#normalized data for each library (one column per library) 
#normalized data divided by gene length for each library (one column per library)
#Gene name
#Description

#requires : the biomaRt and DESeq2 Bioconductor packages and the plyr package (1.8.1)

#Methods : 
#Normalization is performed using the method described in Genome Biology 2010;11(10):R106 
#and implemented in the DESeq2 Bioconductor package
#Gene length corresponds to the median of the sizes of all transcripts of this gene
#########################################################################################################



# Concatenate htseq-count result files, normalize the counts with DESeq2 size
# factors and, unless Species is "None", annotate the genes through the Ensembl
# BioMart. The resulting table is written to `fileout` as a tab-delimited file.
#
# Arguments:
#   Species    : BioMart dataset name, or "None" to skip the annotation step
#   ensversion : Ensembl archive label, used as prefix of the archive host
#                (e.g. "oct2014")
#   fileout    : path of the tab-delimited file to write
#   corresp    : data.frame with the columns `Files` and `Sample_name`
#   nfiles     : number of files to process (the first `nfiles` rows of corresp)
#
# Value: the data.frame that was written to `fileout`, returned invisibly.
#
# Errors: stops when `nfiles` is not a positive number covered by `corresp`, or
# when the files do not all contain the same gene IDs.
RNAseqDataAnnotation <- function(Species, ensversion, fileout, corresp, nfiles) {

  # Truncate fractional values the way `1:nfiles` used to, then reject
  # empty input before any file or package is touched
  nfiles <- suppressWarnings(as.integer(nfiles))
  if (length(nfiles) != 1 || is.na(nfiles) || nfiles < 1) {
    stop("nfiles must be a positive number of htseq-count files", call. = FALSE)
  }
  if (nfiles > nrow(corresp)) {
    stop("nfiles is larger than the number of rows in corresp", call. = FALSE)
  }

  # Names of the samples and paths of the corresponding count files
  sample_rows <- seq_len(nfiles)
  libnames <- as.character(corresp$Sample_name[sample_rows])
  count_files <- as.character(corresp$Files[sample_rows])

  # Read every count file into a two-column data.frame (ID, <sample name>)
  datalist <- lapply(sample_rows, function(i) {
    rawdata <- read.table(count_files[i], header = TRUE, sep = "\t")
    # skip the last 5 lines of HTSeq-count files
    counts <- head(rawdata, -5)
    colnames(counts) <- c("ID", libnames[i])
    counts
  })

  # Verify that all the files contain the same gene IDs. Comparing the IDs
  # themselves (not only the number of rows) prevents the left join below from
  # silently introducing NA counts.
  reference_ids <- as.character(datalist[[1]]$ID)
  same_ids <- vapply(datalist, function(counts) {
    ids <- as.character(counts$ID)
    length(ids) == length(reference_ids) && setequal(ids, reference_ids)
  }, logical(1))
  if (anyDuplicated(reference_ids) > 0 || !all(same_ids)) {
    stop("The files do not all contain the same set of gene IDs", call. = FALSE)
  }

  # Join all the files in a data.frame called datafile
  suppressPackageStartupMessages(
    library(plyr, warn.conflicts = FALSE, quietly = TRUE)
  )
  datafile <- join_all(datalist, by = "ID", type = "left", match = "all")

  # Remove the genes that have no read in any sample
  # (drop = FALSE keeps a data.frame even with a single library)
  expressed <- rowSums(datafile[, 2:(nfiles + 1), drop = FALSE]) != 0
  datafile <- datafile[expressed, ]
  row.names(datafile) <- datafile[, 1]
  data <- datafile[, -1, drop = FALSE]

  # Number of libraries
  nblib <- ncol(data)

  # Divide every library by its DESeq2 size factor. DESeq2 is only loaded
  # here so that packages are attached in the same order as before.
  normalize_counts <- function(counts) {
    suppressPackageStartupMessages(
      library(DESeq2, warn.conflicts = FALSE, quietly = TRUE)
    )
    design <- data.frame(Condition = factor(seq_len(nblib)))
    dds <- DESeqDataSetFromMatrix(
      countData = counts, colData = design, design = ~Condition
    )
    dds <- estimateSizeFactors(dds)
    t(t(counts) / sizeFactors(dds))
  }

  # Data + normalization if the species is not known
  if (Species == "None") {
    datanorm <- normalize_counts(data)
    dataall <- data.frame(row.names(datafile), data, datanorm)
    colnames(dataall) <- c(
      "Ensembl gene id",
      paste(libnames, "(raw read counts)"),
      paste(libnames, "(normalized)")
    )
    write.table(dataall, file = fileout, sep = "\t", quote = FALSE, row.names = FALSE)
    return(invisible(dataall))
  }

  # Data + normalization + annotation if the species is known
  suppressPackageStartupMessages(
    library(biomaRt, warn.conflicts = FALSE, quietly = TRUE)
  )

  # Convert the Ensembl version to the host of the matching archive
  ens_release <- toString(ensversion)
  host <- paste0(ens_release, ".archive.ensembl.org/biomart/martservice/")

  # Load the requested BioMart dataset
  ensembl <- useMart("ENSEMBL_MART_ENSEMBL", host = host, dataset = toString(Species))

  # The gene name attribute is requested as external_gene_name for the
  # aug2014 and oct2014 archives and as external_gene_id for any other one
  if (ens_release %in% c("oct2014", "aug2014")) {
    gene_name_attribute <- "external_gene_name"
  } else {
    gene_name_attribute <- "external_gene_id"
  }
  bm_attributes <- c(
    "ensembl_gene_id", gene_name_attribute, "description",
    "ensembl_transcript_id", "exon_chrom_start", "exon_chrom_end"
  )
  fetch_annotation <- function(gene_ids) {
    getBM(
      attributes = bm_attributes, filters = "ensembl_gene_id",
      values = gene_ids, mart = ensembl
    )
  }

  # All the annotations are not always found in the first query: ask again,
  # at most twice, for the genes that are still missing
  annotation <- fetch_annotation(rownames(data))
  for (attempt in 1:2) {
    not_found <- setdiff(rownames(data), annotation$ensembl_gene_id)
    if (length(not_found) == 0) {
      break
    }
    annotation <- rbind(annotation, fetch_annotation(not_found))
  }

  # Exon length
  ensinfos.exlen <- data.frame(
    annotation$ensembl_gene_id,
    annotation$ensembl_transcript_id,
    abs(annotation$exon_chrom_start - annotation$exon_chrom_end) + 1
  )
  colnames(ensinfos.exlen) <- c("ensembl_gene_id", "ensembl_transcript_id", "exon_length")

  # Transcript length = sum of the lengths of its exons
  tlen <- tapply(ensinfos.exlen$exon_length, ensinfos.exlen$ensembl_transcript_id, sum)
  tlen.gene <- merge(
    tlen, unique(ensinfos.exlen[, 1:2]),
    by.x = "row.names", by.y = "ensembl_transcript_id"
  )
  colnames(tlen.gene) <- c("ensembl_transcript_id", "transcript_length", "ensembl_gene_id")

  # Gene length = median of the size of all transcripts corresponding to this gene
  glen <- tapply(tlen.gene$transcript_length, tlen.gene$ensembl_gene_id, median)

  # Data with gene length
  datalen <- merge(data, glen, by = "row.names")
  colnames(datalen) <- c("Ensembl_gene_id", colnames(data), "Gene_length")

  # Data with annotations and gene length
  annotationgene <- unique(annotation[, 1:3])
  dataannot <- merge(datalen, annotationgene, by.x = "Ensembl_gene_id", by.y = "ensembl_gene_id")

  # Keep only the first part of the gene description (before "[")
  description_parts <- strsplit(as.character(dataannot$description), "[", fixed = TRUE)
  dataannot$description <- vapply(description_parts, function(parts) {
    if (length(parts) >= 1) parts[[1]] else ""
  }, character(1))

  # Column layout of dataannot: gene id, one column per library, gene length,
  # gene name, description
  nbcol <- ncol(dataannot)
  count_columns <- 2:(nbcol - 3)

  # Normalized data calculation
  datanorm <- normalize_counts(dataannot[, count_columns, drop = FALSE])

  # Normalized data adjusted for gene length (normalized data / gene length in kb)
  rpkn <- datanorm / (as.vector(dataannot[, nbcol - 2] / 1000))

  # Data + normalized data + rpkn + annotations
  dataall <- data.frame(
    dataannot[, 1:(nbcol - 3), drop = FALSE],
    datanorm,
    rpkn,
    dataannot[, c(nbcol - 1, nbcol)]
  )

  # Rename the columns
  colnames(dataall) <- c(
    "Ensembl gene id",
    paste(libnames, "(raw read counts)"),
    paste(libnames, "(normalized)"),
    paste(libnames, "(normalized and divided by gene length in kb)"),
    "Gene name",
    "Description"
  )
  write.table(dataall, file = fileout, sep = "\t", quote = FALSE, row.names = FALSE)

  invisible(dataall)
}

# Build a data.frame linking each file loaded into Galaxy to its sample name.
# Expected command-line arguments:
#   <species> <ensembl version> <output file> followed by one or more
#   <count file> <sample name> pairs.

args <- commandArgs(trailingOnly = TRUE)

# Everything after the three leading arguments must come in file/name pairs
pair_args <- args[-(1:3)]
if (length(args) < 5 || length(pair_args) %% 2 != 0) {
  stop(
    "Usage: Rscript RNAseqDataAnnotation.R <species> <ensembl version> ",
    "<output file> <count file> <sample name> [<count file> <sample name> ...]",
    call. = FALSE
  )
}

# Odd positions hold the files, even positions the matching sample names
Files <- pair_args[c(TRUE, FALSE)]
Sample_name <- pair_args[c(FALSE, TRUE)]
nfiles <- length(Files)
corresp <- data.frame(Files = Files, Sample_name = Sample_name)

# Use the information given by the Galaxy user to run the analysis
RNAseqDataAnnotation(args[1], args[2], args[3], corresp, nfiles)