Mercurial > repos > eganrol > rnaseqdataannotation
view RNAseqDataAnnotation/RNAseqDataAnnotation.R @ 6:22da2dc0e103 draft
Uploaded
author | eganrol |
---|---|
date | Wed, 19 Nov 2014 12:00:59 -0500 |
parents | de3907c85546 |
children |
line wrap: on
line source
#Author : keime / lornage #Date : 2014/11 ######################################################################################################## #This function concatenates htseq-count result files, normalizes data and annotates data using Ensembl annotations #arguments #path2htseqfiles : path to htseq-count result files #samplenamefile : path ta a tabulated text file with 2 columns : 1. File name 2. Sample names and an header #Species : latin name of the species #ensversion : version of Ensembl to use #fileout : .txt file containing for each library ; gene id, raw read counts, normalized data as well as normalized data/gene length #conversionensembleversion : tab-delimited file allowing conversion of the Ensembl version to the host # (Column1 : Version Column2 : Host) #conversionensemblname : tab-delimited file allowing conversion of species name to the name of the Ensembl dataset to use # (Column1 : Specie Column2 : Dataset) #output : a data.frame with the following columns : #ensembl gene id #raw read counts for each library (one column per library) #normalized data for each library (one column per library) #normalized data divided by gene length for each library (one column per library) #Gene name #Description #require : biomaRt and DESeq2 Bioconductor packages / package plyr1.8.1 #Methods : #Considering that the resulting files of HTSeq-count have 5 lines of comments in the end #Normalization is performed using the method described in Genome Biology 2010;11(10):R106 #and implemented in the DESeq2 Bioconductor package #Gene length correspond to the median of the size of all transcripts corresponding to this gene ######################################################################################################### RNAseqDataAnnotation = function(path2htseqfiles, samplenamefile, Species, ensversion, fileout, conversionensemblversion, conversionensemblname){ #Create a list with the file names in path2htseqfiles sampleFiles=list.files(path2htseqfiles) sampleFiles=strsplit(sampleFiles,".txt") #_noSpikes_htseq nfiles=length(sampleFiles) #Read the data in samplenamefile. Create a data frame establishing the correspondence between file names and sample names corresp = read.table(samplenamefile,header=T,sep="\t",colClasses=c("character","character")) corresp$File = strsplit(corresp$File,".fastq.gz") #Create a string vector called libnames that contains the name of the samples in the same order as in sampleFiles libnames=rep("",nfiles) for (i in 1:nfiles){ libnames[i]=corresp$Sample_name[corresp$File==sampleFiles[[i]]] } #For all files located in path2htseqfiles read the corresponding file into R library(plyr) datalist = list() for(i in 1:nfiles){ rawdata=read.table(paste(paste(path2htseqfiles,sampleFiles[i],sep="/"),"txt",sep=".")) #noSpikes_htseq. nbrrows=nrow(rawdata) datalist[[i]]=rawdata[1:(nbrrows-5), ] # skip the last 5 lines of HTSeq-count files colnames(datalist[[i]]) = c("ID",libnames[i]) } #Join all the files in a data.frame called datafile with rownames = gene id datafile = join_all(datalist, by = "ID", type = "left", match = "all") #Calculate the number of geneID pro file nbID=data.frame(rep("",nfiles)) for(i in 1:nfiles){ nbID[,i]=nrow(datalist[[i]]) } totalnbID=apply((nbID[,1:4]),1,sum) #Verify that all the files contain the same gene ID if (nrow(datafile)*4==totalnbID[1]){ #Suppress genes not expressed in all samples datafile = datafile[apply(datafile[,2:(nfiles+1)],1,sum)!=0,] row.names(datafile)=datafile[,1] data=datafile[,-1] #Number of libraries nblib= dim(data)[2] #Determine Data + normalization if the specie is not known if (Species==""){ #Normalized data calculation nbcol = dim(data)[2] #nb of column in the data.frame library(DESeq2) conds = factor(1:nblib) design = data.frame(Condition=conds) dds = DESeqDataSetFromMatrix(countData=data, colData=design, design=~Condition) dds = estimateSizeFactors(dds) datanorm = t(t(data)/sizeFactors(dds)) #Data + normalization dataall = data.frame(row.names(datafile), data, datanorm ) #Renames columns colnames(dataall) = c("Ensembl gene id", paste(libnames,"(raw read counts)"), paste(libnames,"(normalized)")) write.table(dataall, file=fileout, sep="\t", quote=F, row.names=F) } #Determine Data + normalization + annotation if the specie is known else{ #Add annotations and calculate gene length library(biomaRt) #Convert Ensembl version to host conversionfile = read.table(conversionensemblversion,header=T,sep="\t",colClasses=c("numeric","character")) correspondingdate = conversionfile[conversionfile$Version == ensversion, 2] host = paste(correspondingdate, ".archive.ensembl.org/biomart/martservice/", sep="") #Convert species name to the name of the corresponding bmdataset conversion = read.table(conversionensemblname,header=T,sep="\t",colClasses=c("character","character")) bmdataset = conversion[conversion$Specie == Species, 2] ensembl=useMart("ENSEMBL_MART_ENSEMBL", host=host, dataset=bmdataset) if (ensversion<=75){ annotation1 = getBM(attributes=c("ensembl_gene_id","external_gene_id","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"),filters="ensembl_gene_id", values=rownames(data), mart=ensembl) } else{ annotation1 = getBM(attributes=c("ensembl_gene_id","external_gene_name","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"),filters="ensembl_gene_id", values=rownames(data), mart=ensembl) } #because all the annotations are not always found in a first step not = rownames(data)[!rownames(data) %in% unique(annotation1$ensembl_gene_id)] if (length(not) !=0){ annotationnot = getBM(attributes=c("ensembl_gene_id","external_gene_id","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"), filters="ensembl_gene_id", values=not, mart=ensembl) annotation = rbind(annotation1, annotationnot) } else{ annotation = annotation1 } #Exon length ensinfos.exlen = data.frame(annotation$ensembl_gene_id, annotation$ensembl_transcript_id, abs(annotation$exon_chrom_start - annotation$exon_chrom_end)+1) colnames(ensinfos.exlen) = c("ensembl_gene_id", "ensembl_transcript_id", "exon_length") #Transcript length tlen = tapply(ensinfos.exlen$exon_length, ensinfos.exlen$ensembl_transcript_id, sum) tlen.gene = merge(tlen, unique(ensinfos.exlen[,1:2]), by.x="row.names", by.y="ensembl_transcript_id") colnames(tlen.gene) = c("ensembl_transcript_id", "transcript_length","ensembl_gene_id") #Gene length = median of the size of all transcripts corresponding to this gene glen = tapply(tlen.gene$transcript_length, tlen.gene$ensembl_gene_id, median) #Data with gene length datalen = merge(data, glen, by="row.names") colnames(datalen) = c("Ensembl_gene_id",colnames(data), "Gene_length") #Data with annotations and gene length annotationgene = unique(annotation[,1:3]) dataannot = merge(datalen, annotationgene, by.x="Ensembl_gene_id", by.y="ensembl_gene_id") #To keep only the first part of the gene description (before [) tmpdesc = strsplit(as.character(dataannot$description),"[", fixed=T) f = function(l){ if (length(l)>=1){ return(l[[1]]) } else{ return("") } } tmpdescok = unlist(lapply(tmpdesc, f)) dataannot$description = tmpdescok #Normalized data calculation nbcol = dim(dataannot)[2] #nb of column in the data.frame library(DESeq2) conds = factor(1:nblib) design = data.frame(Condition=conds) dds = DESeqDataSetFromMatrix(countData=dataannot[,-c(1,nbcol,nbcol-1,nbcol-2)], colData=design, design=~Condition) dds = estimateSizeFactors(dds) datanorm = t(t(dataannot[,-c(1,nbcol,nbcol-1,nbcol-2)])/sizeFactors(dds)) #Normalized data adjusted for gene length (normalized data / gene length) rpkn = datanorm / (as.vector(dataannot[,nbcol-2]/1000 )) #Data + annotations + rpkn dataall = data.frame(dataannot[,-c(nbcol,nbcol-1,nbcol-2)] , datanorm, rpkn, dataannot[,c(nbcol-1,nbcol)] ) #Renames columns colnames(dataall) = c("Ensembl gene id", paste(libnames,"(raw read counts)"), paste(libnames,"(normalized)"), paste(libnames,"(normalized and divided by gene length in kb)"), "Gene name", "Description") write.table(dataall, file=fileout, sep="\t", quote=F, row.names=F) #Return(dataall) } } else{ print("The files are not the same length") } } args <- commandArgs(trailingOnly = TRUE) print(args) RNAseqDataAnnotation(args[1], args[2],args[3], args[4], args[5], args[6], args[7]) #R --slave --vanilla --verbose --file=/home/lornage/Bureau/Pour_galaxy/RNAseqDataAnnotation.R --args /home/lornage/Bureau/Test_function /home/lornage/Bureau/ichierconvertitnames.txt Homo_sapiens 75 /home/lornage/Bureau/testttttt5.txt /home/lornage/Bureau/Script_R/Ensembl_Version_Host.txt /home/lornage/Bureau/Script_R/Ensemble_Specie_Dataset.txt