Mercurial > repos > eganrol > rnaseqdataannotation
changeset 7:80216215973c draft
Deleted selected files
| author | eganrol | 
|---|---|
| date | Wed, 19 Nov 2014 12:05:17 -0500 | 
| parents | 22da2dc0e103 | 
| children | 73ea91916c9d | 
| files | RNAseqDataAnnotation/RNAseqDataAnnotation.R RNAseqDataAnnotation/RNAseqDataAnnotation.xml RNAseqDataAnnotation/packages.R RNAseqDataAnnotation/tool_dependencies.xml | 
| diffstat | 4 files changed, 0 insertions(+), 288 deletions(-) [+] | 
line wrap: on
 line diff
--- a/RNAseqDataAnnotation/RNAseqDataAnnotation.R Wed Nov 19 12:00:59 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,209 +0,0 @@ -#Author : keime / lornage -#Date : 2014/11 - - -######################################################################################################## -#This function concatenates htseq-count result files, normalizes data and annotates data using Ensembl annotations - -#arguments -#path2htseqfiles : path to htseq-count result files -#samplenamefile : path ta a tabulated text file with 2 columns : 1. File name 2. Sample names and an header -#Species : latin name of the species -#ensversion : version of Ensembl to use -#fileout : .txt file containing for each library ; gene id, raw read counts, normalized data as well as normalized data/gene length -#conversionensembleversion : tab-delimited file allowing conversion of the Ensembl version to the host -# (Column1 : Version Column2 : Host) -#conversionensemblname : tab-delimited file allowing conversion of species name to the name of the Ensembl dataset to use -# (Column1 : Specie Column2 : Dataset) - -#output : a data.frame with the following columns : -#ensembl gene id -#raw read counts for each library (one column per library) -#normalized data for each library (one column per library) -#normalized data divided by gene length for each library (one column per library) -#Gene name -#Description - -#require : biomaRt and DESeq2 Bioconductor packages / package plyr1.8.1 - -#Methods : -#Considering that the resulting files of HTSeq-count have 5 lines of comments in the end -#Normalization is performed using the method described in Genome Biology 2010;11(10):R106 -#and implemented in the DESeq2 Bioconductor package -#Gene length correspond to the median of the size of all transcripts corresponding to this gene -######################################################################################################### - - - -RNAseqDataAnnotation = function(path2htseqfiles, samplenamefile, Species, ensversion, fileout, conversionensemblversion, conversionensemblname){ - - #Create a list with the file names in path2htseqfiles - sampleFiles=list.files(path2htseqfiles) - sampleFiles=strsplit(sampleFiles,".txt") - #_noSpikes_htseq - nfiles=length(sampleFiles) - - #Read the data in samplenamefile. Create a data frame establishing the correspondence between file names and sample names - corresp = read.table(samplenamefile,header=T,sep="\t",colClasses=c("character","character")) - corresp$File = strsplit(corresp$File,".fastq.gz") - - #Create a string vector called libnames that contains the name of the samples in the same order as in sampleFiles - libnames=rep("",nfiles) - for (i in 1:nfiles){ - libnames[i]=corresp$Sample_name[corresp$File==sampleFiles[[i]]] - } - - #For all files located in path2htseqfiles read the corresponding file into R - library(plyr) - datalist = list() - for(i in 1:nfiles){ - rawdata=read.table(paste(paste(path2htseqfiles,sampleFiles[i],sep="/"),"txt",sep=".")) - #noSpikes_htseq. - nbrrows=nrow(rawdata) - datalist[[i]]=rawdata[1:(nbrrows-5), ] # skip the last 5 lines of HTSeq-count files - colnames(datalist[[i]]) = c("ID",libnames[i]) - } - - #Join all the files in a data.frame called datafile with rownames = gene id - datafile = join_all(datalist, by = "ID", type = "left", match = "all") - - #Calculate the number of geneID pro file - nbID=data.frame(rep("",nfiles)) - for(i in 1:nfiles){ - nbID[,i]=nrow(datalist[[i]]) - } - totalnbID=apply((nbID[,1:4]),1,sum) - - #Verify that all the files contain the same gene ID - if (nrow(datafile)*4==totalnbID[1]){ - - #Suppress genes not expressed in all samples - datafile = datafile[apply(datafile[,2:(nfiles+1)],1,sum)!=0,] - row.names(datafile)=datafile[,1] - data=datafile[,-1] - - #Number of libraries - nblib= dim(data)[2] - #Determine Data + normalization if the specie is not known - if (Species==""){ - #Normalized data calculation - nbcol = dim(data)[2] #nb of column in the data.frame - library(DESeq2) - conds = factor(1:nblib) - design = data.frame(Condition=conds) - dds = DESeqDataSetFromMatrix(countData=data, colData=design, design=~Condition) - dds = estimateSizeFactors(dds) - datanorm = t(t(data)/sizeFactors(dds)) - - #Data + normalization - dataall = data.frame(row.names(datafile), data, datanorm ) - - #Renames columns - colnames(dataall) = c("Ensembl gene id", paste(libnames,"(raw read counts)"), paste(libnames,"(normalized)")) - write.table(dataall, file=fileout, sep="\t", quote=F, row.names=F) - } - - #Determine Data + normalization + annotation if the specie is known - else{ - #Add annotations and calculate gene length - library(biomaRt) - - #Convert Ensembl version to host - conversionfile = read.table(conversionensemblversion,header=T,sep="\t",colClasses=c("numeric","character")) - correspondingdate = conversionfile[conversionfile$Version == ensversion, 2] - host = paste(correspondingdate, ".archive.ensembl.org/biomart/martservice/", sep="") - - #Convert species name to the name of the corresponding bmdataset - conversion = read.table(conversionensemblname,header=T,sep="\t",colClasses=c("character","character")) - bmdataset = conversion[conversion$Specie == Species, 2] - ensembl=useMart("ENSEMBL_MART_ENSEMBL", host=host, dataset=bmdataset) - if (ensversion<=75){ - annotation1 = getBM(attributes=c("ensembl_gene_id","external_gene_id","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"),filters="ensembl_gene_id", values=rownames(data), mart=ensembl) - } - else{ - annotation1 = getBM(attributes=c("ensembl_gene_id","external_gene_name","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"),filters="ensembl_gene_id", values=rownames(data), mart=ensembl) - } - - #because all the annotations are not always found in a first step - not = rownames(data)[!rownames(data) %in% unique(annotation1$ensembl_gene_id)] - if (length(not) !=0){ - annotationnot = getBM(attributes=c("ensembl_gene_id","external_gene_id","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"), filters="ensembl_gene_id", values=not, mart=ensembl) - annotation = rbind(annotation1, annotationnot) - } - else{ - annotation = annotation1 - } - - #Exon length - ensinfos.exlen = data.frame(annotation$ensembl_gene_id, annotation$ensembl_transcript_id, abs(annotation$exon_chrom_start - annotation$exon_chrom_end)+1) - colnames(ensinfos.exlen) = c("ensembl_gene_id", "ensembl_transcript_id", "exon_length") - - #Transcript length - tlen = tapply(ensinfos.exlen$exon_length, ensinfos.exlen$ensembl_transcript_id, sum) - tlen.gene = merge(tlen, unique(ensinfos.exlen[,1:2]), by.x="row.names", by.y="ensembl_transcript_id") - colnames(tlen.gene) = c("ensembl_transcript_id", "transcript_length","ensembl_gene_id") - - #Gene length = median of the size of all transcripts corresponding to this gene - glen = tapply(tlen.gene$transcript_length, tlen.gene$ensembl_gene_id, median) - - #Data with gene length - datalen = merge(data, glen, by="row.names") - colnames(datalen) = c("Ensembl_gene_id",colnames(data), "Gene_length") - - #Data with annotations and gene length - annotationgene = unique(annotation[,1:3]) - dataannot = merge(datalen, annotationgene, by.x="Ensembl_gene_id", by.y="ensembl_gene_id") - - #To keep only the first part of the gene description (before [) - tmpdesc = strsplit(as.character(dataannot$description),"[", fixed=T) - f = function(l){ - if (length(l)>=1){ - return(l[[1]]) - } - else{ - return("") - } - } - tmpdescok = unlist(lapply(tmpdesc, f)) - dataannot$description = tmpdescok - - #Normalized data calculation - nbcol = dim(dataannot)[2] #nb of column in the data.frame - library(DESeq2) - conds = factor(1:nblib) - design = data.frame(Condition=conds) - dds = DESeqDataSetFromMatrix(countData=dataannot[,-c(1,nbcol,nbcol-1,nbcol-2)], colData=design, design=~Condition) - dds = estimateSizeFactors(dds) - datanorm = t(t(dataannot[,-c(1,nbcol,nbcol-1,nbcol-2)])/sizeFactors(dds)) - - #Normalized data adjusted for gene length (normalized data / gene length) - rpkn = datanorm / (as.vector(dataannot[,nbcol-2]/1000 )) - - #Data + annotations + rpkn - dataall = data.frame(dataannot[,-c(nbcol,nbcol-1,nbcol-2)] , datanorm, rpkn, dataannot[,c(nbcol-1,nbcol)] ) - - #Renames columns - colnames(dataall) = c("Ensembl gene id", paste(libnames,"(raw read counts)"), paste(libnames,"(normalized)"), paste(libnames,"(normalized and divided by gene length in kb)"), "Gene name", "Description") - write.table(dataall, file=fileout, sep="\t", quote=F, row.names=F) - - #Return(dataall) - - } - } - else{ - print("The files are not the same length") - } -} - -args <- commandArgs(trailingOnly = TRUE) -print(args) - -RNAseqDataAnnotation(args[1], args[2],args[3], args[4], args[5], args[6], args[7]) - -#R --slave --vanilla --verbose --file=/home/lornage/Bureau/Pour_galaxy/RNAseqDataAnnotation.R --args /home/lornage/Bureau/Test_function /home/lornage/Bureau/ichierconvertitnames.txt Homo_sapiens 75 /home/lornage/Bureau/testttttt5.txt /home/lornage/Bureau/Script_R/Ensembl_Version_Host.txt /home/lornage/Bureau/Script_R/Ensemble_Specie_Dataset.txt - - - - - -
--- a/RNAseqDataAnnotation/RNAseqDataAnnotation.xml Wed Nov 19 12:00:59 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,52 +0,0 @@ -<tool id="RNAseqDataAnnotation" name="RNAseqDataAnnotation" version="1.0.0"> - <description>tool for RNAseq Data Normalisation and Annotation</description> - <requirements> - <!--<requirement type="set_environment">SCRIPT_PATH</requirement>--> - <requirement type="package" version="3.0.2">R_3_0_2</requirement> - <requirement type="package" version="1.0">DESeq2biomaRt</requirement> - </requirements> - - <command> - R --slave --vanilla --file=RNAseqDataAnnotation.R --args - $path2htseqfiles - $samplenamefile - $Species - $ensversion - $conversionensemblversion - $conversionensemblname - $fileout - </command> - - <inputs> - <param name="path2htseqfiles" label="Path to the directory containing the files from HTSeq-count" type="text"/> - <param name="samplenamefile" label="Conversion file sample/conditions" type="data" format="tabular" help="file should be tab-delimited"/> - <param name="Species" type="select" label="Select the specie for your data" help="If your specie of interest is not listed, your data will be normalized but no annotation will be added. Contact us if you want us to add your specie." > - <option value="Homo_sapiens">Homo sapiens</option> - <option value="Mus_musculus">Mus musculus</option> - <option value="">Other specie</option> - </param> - <param name="ensversion" type="select" label="Select the version of Ensembl to use" > - <option value="67">Version 67</option> - <option value="68">Version 68</option> - <option value="69">Version 69</option> - <option value="70">Version 70</option> - <option value="71">Version 71</option> - <option value="72">Version 72</option> - <option value="73">Version 73</option> - <option value="74">Version 74</option> - <option value="75">Version 75</option> - <option value="76">Version 76</option> - <option value="77">Version 77</option> - </param> - <param name="conversionensemblversion" label="File for conversion Ensembl to version" type="data" format="tabular" help="Tab-delimited input file" /> - <param name="conversionensemblname" label="File for conversion Ensemble name of the specie " type="data" format="tabular" help="Tab-delimited input file"/> - </inputs> - - <outputs> -<param name="fileout" label="Path where the resulting file should be stored" type="data" format="tabular"/> - </outputs> - <help> -**What it does* -**Example** - </help> - </tool>
--- a/RNAseqDataAnnotation/packages.R Wed Nov 19 12:00:59 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,5 +0,0 @@ -source("http://bioconductor.org/biocLite.R") -biocLite() -biocLite("DESeq2") -biocLite("biomaRt") -install.packages("plyr")
--- a/RNAseqDataAnnotation/tool_dependencies.xml Wed Nov 19 12:00:59 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,22 +0,0 @@ -<?xml version="1.0"?> -<tool_dependency> - <package name="R_3_0_2" version="3.0.2"> - <repository changeset_revision="b6fe8ca3230d" name="package_r_3_0_2" owner="iuc" prior_installation_required="True" toolshed="https://testtoolshed.g2.bx.psu.edu" /> - </package> - <package name="DESeq2biomaRt" version="1.0"> - <install version="1.0"> - <actions> - <action type="set_environment_for_install"> - <repository changeset_revision="b6fe8ca3230d" name="package_r_3_0_2" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu"> - <package name="R_3_0_2" version="3.0.2" /> - </repository> - </action> - <action type="shell_command">R CMD BATCH packages.R</action> - <action type="shell_command">echo "export PATH=$PATH" > $INSTALL_DIR/env.sh </action> - <action type="shell_command">chmod 755 $INSTALL_DIR/env.sh </action> - </actions> - </install> - <readme> - </readme> - </package> -</tool_dependency>
