Repository 'rnaseqdataannotation'
hg clone https://eddie.galaxyproject.org/repos/eganrol/rnaseqdataannotation

Changeset 15:a7ec3c9cb0b7 (2014-11-20)
Previous changeset 14:e6d7a8ca8d94 (2014-11-20) Next changeset 16:e7e14643be0d (2014-11-20)
Commit message:
Deleted selected files
removed:
RNAseqDataAnnotation/RNAseqDataAnnotation.R
RNAseqDataAnnotation/RNAseqDataAnnotation.xml
RNAseqDataAnnotation/packages.R
RNAseqDataAnnotation/test-data/Ensembl_Version_Host.txt
RNAseqDataAnnotation/test-data/Ensemble_Specie_Dataset.txt
RNAseqDataAnnotation/test-data/Fichier1.txt
RNAseqDataAnnotation/test-data/Fichier2.txt
RNAseqDataAnnotation/test-data/Fichier3.txt
RNAseqDataAnnotation/test-data/Fichier4.txt
RNAseqDataAnnotation/test-data/ichierconvertitnames.txt
RNAseqDataAnnotation/tool_dependencies.xml
b
diff -r e6d7a8ca8d94 -r a7ec3c9cb0b7 RNAseqDataAnnotation/RNAseqDataAnnotation.R
--- a/RNAseqDataAnnotation/RNAseqDataAnnotation.R Thu Nov 20 03:35:14 2014 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
b'@@ -1,209 +0,0 @@\n-#Author : keime / lornage\n-#Date : 2014/11\n-\n-\n-########################################################################################################\n-#This function concatenates htseq-count result files, normalizes data and annotates data using Ensembl annotations\n-\n-#arguments\n-#path2htseqfiles : path to htseq-count result files\n-#samplenamefile : path ta a tabulated text file with 2 columns : 1. File name 2. Sample names and an header\n-#Species : latin name of the species\n-#ensversion : version of Ensembl to use\n-#fileout : .txt file containing for each library ; gene id, raw read counts, normalized data as well as normalized data/gene length\n-#conversionensembleversion : tab-delimited file allowing conversion of the Ensembl version to the host \n-#\t\t\t\t\t\t\t (Column1 : Version\t Column2 : Host)  \n-#conversionensemblname : tab-delimited file allowing conversion of species name to the name of the Ensembl dataset to use\n-#\t\t\t\t\t\t (Column1 : Specie Column2 : Dataset)  \n-\n-#output : a data.frame with the following columns :\n-#ensembl gene id\n-#raw read counts for each library (one column per library)\n-#normalized data for each library (one column per library) \n-#normalized data divided by gene length for each library (one column per library)\n-#Gene name\n-#Description\n-\n-#require : biomaRt and DESeq2 Bioconductor packages / package plyr1.8.1\n-\n-#Methods : \n-#Considering that the resulting files of HTSeq-count have 5 lines of comments in the end\n-#Normalization is performed using the method described in Genome Biology 2010;11(10):R106 \n-#and implemented in the DESeq2 Bioconductor package\n-#Gene length correspond to the median of the size of all transcripts corresponding to this gene\n-#########################################################################################################\n-\n-\n-\n-RNAseqDataAnnotation = function(path2htseqfiles, samplenamefile, Species, ensversion, fileout, 
conversionensemblversion, conversionensemblname){\n-  \t\t\t\t\t\t\t\t\t\t\t\t\n-  #Create a list with the file names in path2htseqfiles \n-\tsampleFiles=list.files(path2htseqfiles)\n-\tsampleFiles=strsplit(sampleFiles,".txt")\n-\t#_noSpikes_htseq\n-\tnfiles=length(sampleFiles) \n-\n-  #Read the data in samplenamefile. Create a data frame establishing the correspondence between file names and sample names\n-\tcorresp = read.table(samplenamefile,header=T,sep="\\t",colClasses=c("character","character"))\n-\tcorresp$File = strsplit(corresp$File,".fastq.gz")\n-\t\n-  #Create a string vector called libnames that contains the name of the samples in the same order as in sampleFiles\n-\tlibnames=rep("",nfiles)\n-\tfor (i in 1:nfiles){\n-\t\tlibnames[i]=corresp$Sample_name[corresp$File==sampleFiles[[i]]]\n-\t}\n-\n-  #For all files located in path2htseqfiles read the corresponding file into R\n-\tlibrary(plyr)\n-\tdatalist = list()\n-\tfor(i in 1:nfiles){\n-\t\trawdata=read.table(paste(paste(path2htseqfiles,sampleFiles[i],sep="/"),"txt",sep="."))\n-\t\t#noSpikes_htseq.\n-\t\tnbrrows=nrow(rawdata)\n-\t\tdatalist[[i]]=rawdata[1:(nbrrows-5), ] # skip the last 5 lines of HTSeq-count files\n-\t\tcolnames(datalist[[i]]) = c("ID",libnames[i])\t\t\n-\t}  \n-\t\t\n-  #Join all the files in a data.frame called datafile with rownames = gene id\n-\tdatafile = join_all(datalist, by = "ID", type = "left", match = "all")\n-\t\n-  #Calculate the number of geneID pro file\n-\tnbID=data.frame(rep("",nfiles))\n-\tfor(i in 1:nfiles){\n-\t\tnbID[,i]=nrow(datalist[[i]])\n-\t}\n-\ttotalnbID=apply((nbID[,1:4]),1,sum)\n-\t\n-  #Verify that all the files contain the same gene ID\n-\tif (nrow(datafile)*4==totalnbID[1]){\n-  \n-  #Suppress genes not expressed in all samples                                                                                                                                                              \n-\t\tdatafile = 
datafile[apply(datafile[,2:(nfiles+1)],1,sum)!=0,]\n-\t\trow.names(datafile)=datafile[,1]\n-\t\tdata=datafile[,-1]\n-\t\t\n-  #Number of libraries\n-\t\tnblib= dim(data)[2]\t\n-  #Determine Data + normalization if the specie is not kno'..b'sion<=75){  \n-\t\t\t\tannotation1 = getBM(attributes=c("ensembl_gene_id","external_gene_id","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"),filters="ensembl_gene_id", values=rownames(data), mart=ensembl)\n-\t\t\t}\n-\t\t\telse{\n-\t\t\t\tannotation1 = getBM(attributes=c("ensembl_gene_id","external_gene_name","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"),filters="ensembl_gene_id", values=rownames(data), mart=ensembl)\n-\t\t\t}\t\n-\t\t\t\n-  #because all the annotations are not always found in a first step \n-\t\t\tnot = rownames(data)[!rownames(data) %in% unique(annotation1$ensembl_gene_id)]\n-\t\t\tif (length(not) !=0){\n-\t\t\t\tannotationnot = getBM(attributes=c("ensembl_gene_id","external_gene_id","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"), filters="ensembl_gene_id", values=not, mart=ensembl)\n-\t\t\tannotation = rbind(annotation1, annotationnot)\t\t\n-\t\t\t}\n-\t\t\telse{\n-\t\t\t\tannotation = annotation1\n-\t\t\t}\n-\t\n-  #Exon length\n-\t\t\tensinfos.exlen = data.frame(annotation$ensembl_gene_id, annotation$ensembl_transcript_id, abs(annotation$exon_chrom_start - annotation$exon_chrom_end)+1)\n-\t\t\tcolnames(ensinfos.exlen) = c("ensembl_gene_id", "ensembl_transcript_id", "exon_length")\n-\t\n-  #Transcript length\n-\t\t\ttlen = tapply(ensinfos.exlen$exon_length, ensinfos.exlen$ensembl_transcript_id, sum)\n-\t\t\ttlen.gene = merge(tlen, unique(ensinfos.exlen[,1:2]), by.x="row.names", by.y="ensembl_transcript_id")\n-\t\t\tcolnames(tlen.gene) = c("ensembl_transcript_id", "transcript_length","ensembl_gene_id")\n-\t\n-  #Gene length = median of the size of all transcripts corresponding to this gene\n-\t\t\tglen = 
tapply(tlen.gene$transcript_length, tlen.gene$ensembl_gene_id, median)\n-\t\n-  #Data with gene length\n-\t\t\tdatalen = merge(data, glen, by="row.names") \n-\t\t\tcolnames(datalen) = c("Ensembl_gene_id",colnames(data), "Gene_length")\n-\t\n-  #Data with annotations and gene length\n-\t\t\tannotationgene = unique(annotation[,1:3])\n-\t\t\tdataannot = merge(datalen, annotationgene, by.x="Ensembl_gene_id", by.y="ensembl_gene_id")\n-\t\n-  #To keep only the first part of the gene description (before [)\n-\t\t\ttmpdesc = strsplit(as.character(dataannot$description),"[", fixed=T)\n-\t\t\tf = function(l){\n-\t\t\t\tif (length(l)>=1){\n-\t\t\t\t\treturn(l[[1]])\n-\t\t\t\t}\n-\t\t\t\telse{\n-\t\t\t\t\treturn("")\n-\t\t\t\t}\n-\t\t\t}\n-\t\t\ttmpdescok = unlist(lapply(tmpdesc, f))\n-\t\t\tdataannot$description = tmpdescok\n-\t\n-  #Normalized data calculation\n-\t\t\tnbcol = dim(dataannot)[2] #nb of column in the data.frame\n-\t\t\tlibrary(DESeq2)\n-\t\t\tconds = factor(1:nblib)\n-\t\t\tdesign = data.frame(Condition=conds)\n-\t\t\tdds = DESeqDataSetFromMatrix(countData=dataannot[,-c(1,nbcol,nbcol-1,nbcol-2)], colData=design, design=~Condition)\n-\t\t\tdds = estimateSizeFactors(dds)\n-\t\t\tdatanorm = t(t(dataannot[,-c(1,nbcol,nbcol-1,nbcol-2)])/sizeFactors(dds))\n-\t\n-  #Normalized data adjusted for gene length (normalized data / gene length)\n-\t\t\trpkn = datanorm / (as.vector(dataannot[,nbcol-2]/1000 ))\n-\t\n-  #Data + annotations + rpkn\n-\t\t\tdataall = data.frame(dataannot[,-c(nbcol,nbcol-1,nbcol-2)] , datanorm, rpkn, dataannot[,c(nbcol-1,nbcol)]  )\n-\t\t\n-  #Renames columns\n-\t\t\tcolnames(dataall) = c("Ensembl gene id", paste(libnames,"(raw read counts)"), paste(libnames,"(normalized)"), paste(libnames,"(normalized and divided by gene length in kb)"), "Gene name", "Description")\n-\t\t\twrite.table(dataall, file=fileout, sep="\\t", quote=F, row.names=F)\n-\n-  #Return(dataall)\n-\t\n-\t\t}\n-\t}\n-\telse{\n-\t\tprint("The files are not the same 
length")\n-\t}\n-}\n-\n-args <- commandArgs(trailingOnly = TRUE)\n-print(args)\n-\t\t\n-RNAseqDataAnnotation(args[1], args[2],args[3], args[4], args[5], args[6], args[7])\n-\n-#R --slave --vanilla --verbose --file=/home/lornage/Bureau/Pour_galaxy/RNAseqDataAnnotation.R --args /home/lornage/Bureau/Test_function /home/lornage/Bureau/ichierconvertitnames.txt Homo_sapiens 75 /home/lornage/Bureau/testttttt5.txt /home/lornage/Bureau/Script_R/Ensembl_Version_Host.txt /home/lornage/Bureau/Script_R/Ensemble_Specie_Dataset.txt\n-\n-\n-\n-\n-\n-\n'
b
diff -r e6d7a8ca8d94 -r a7ec3c9cb0b7 RNAseqDataAnnotation/RNAseqDataAnnotation.xml
--- a/RNAseqDataAnnotation/RNAseqDataAnnotation.xml Thu Nov 20 03:35:14 2014 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,52 +0,0 @@
-<tool id="RNAseqDataAnnotation" name="RNAseqDataAnnotation" version="1.0.0">
-  <description>tool for RNAseq Data Normalisation and Annotation</description>
-  <requirements>
-    <!--<requirement type="set_environment">SCRIPT_PATH</requirement>-->
-    <requirement type="package" version="3.0.2">R_3_0_2</requirement>
-    <requirement type="package" version="1.0">DESeq2biomaRt</requirement>
-  </requirements>
-  <!-- Arguments are positional. RNAseqDataAnnotation.R reads args[1] to args[7] as: path2htseqfiles, samplenamefile, Species, ensversion, fileout, conversionensemblversion, conversionensemblname. Every value is quoted so that the empty "Other specie" value still occupies its position. -->
-  <command>
-    R --slave --vanilla --file=RNAseqDataAnnotation.R --args
-    "$path2htseqfiles"
-    "$samplenamefile"
-    "$Species"
-    "$ensversion"
-    "$fileout"
-    "$conversionensemblversion"
-    "$conversionensemblname"
-  </command>
-  <!-- The Species and ensversion option values match the first column of the bundled conversion tables (Specie/Dataset and Version/Host in test-data). -->
-  <inputs>
-    <param name="path2htseqfiles" label="Path to the directory containing the files from HTSeq-count" type="text"/>
-    <param name="samplenamefile" label="Conversion file sample/conditions" type="data" format="tabular" help="file should be tab-delimited"/>
-    <param name="Species" type="select" label="Select the specie for your data" help="If your specie of interest is not listed, your data will be normalized but no annotation will be added. Contact us if you want us to add your specie." >
-      <option value="Homo_sapiens">Homo sapiens</option>
-      <option value="Mus_musculus">Mus musculus</option>
-      <option value="">Other specie</option>
-    </param>
-    <param name="ensversion" type="select" label="Select the version of Ensembl to use" >
-      <option value="67">Version 67</option>
-      <option value="68">Version 68</option>
-      <option value="69">Version 69</option>
-      <option value="70">Version 70</option>
-      <option value="71">Version 71</option>
-      <option value="72">Version 72</option>
-      <option value="73">Version 73</option>
-      <option value="74">Version 74</option>
-      <option value="75">Version 75</option>
-      <option value="76">Version 76</option>
-      <option value="77">Version 77</option>
-    </param>
-    <param name="conversionensemblversion" label="File for conversion Ensembl to version" type="data" format="tabular" help="Tab-delimited input file" />
-    <param name="conversionensemblname" label="File for conversion Ensemble name of the specie " type="data" format="tabular" help="Tab-delimited input file"/>
-  </inputs>
-  <!-- Output datasets are declared with a data element (not param); Galaxy substitutes the dataset path for $fileout in the command above. -->
-  <outputs>
-    <data name="fileout" format="tabular" label="RNAseqDataAnnotation: normalized and annotated counts"/>
-  </outputs>
-  <help>
-**What it does** Concatenates HTSeq-count result files, normalizes the read counts with DESeq2 and annotates the genes using Ensembl (through biomaRt).
-**Example** The sample/conditions file is tab-delimited with a header line and two columns, File and Sample_name, for instance: Fichier1.fastq.gz siLuc2
-  </help>
-</tool>
b
diff -r e6d7a8ca8d94 -r a7ec3c9cb0b7 RNAseqDataAnnotation/packages.R
--- a/RNAseqDataAnnotation/packages.R Thu Nov 20 03:35:14 2014 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,5 +0,0 @@
-source("http://bioconductor.org/biocLite.R")  # loads the Bioconductor installer, which defines biocLite()
-biocLite()  # called without arguments, exactly as in the original script
-biocLite("DESeq2")  # DESeq2: used by RNAseqDataAnnotation.R for size-factor normalization
-biocLite("biomaRt")  # biomaRt: used by RNAseqDataAnnotation.R for Ensembl queries (getBM)
-install.packages("plyr", repos="http://cran.r-project.org")  # plyr: provides join_all; the mirror is explicit because R CMD BATCH is non-interactive and cannot prompt for one
b
diff -r e6d7a8ca8d94 -r a7ec3c9cb0b7 RNAseqDataAnnotation/test-data/Ensembl_Version_Host.txt
--- a/RNAseqDataAnnotation/test-data/Ensembl_Version_Host.txt Thu Nov 20 03:35:14 2014 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,12 +0,0 @@
-Version Host
-77 oct2014
-76 aug2014
-75 feb2014
-74 dec2013
-73 sep2013
-72 jun2013
-71 apr2013
-70 jan2013
-69 oct2012
-68 jul2012
-67 may2012
b
diff -r e6d7a8ca8d94 -r a7ec3c9cb0b7 RNAseqDataAnnotation/test-data/Ensemble_Specie_Dataset.txt
--- a/RNAseqDataAnnotation/test-data/Ensemble_Specie_Dataset.txt Thu Nov 20 03:35:14 2014 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,3 +0,0 @@
-Specie Dataset
-Homo_sapiens hsapiens_gene_ensembl
-Mus_musculus mmusculus_gene_ensembl
b
diff -r e6d7a8ca8d94 -r a7ec3c9cb0b7 RNAseqDataAnnotation/test-data/Fichier1.txt
--- a/RNAseqDataAnnotation/test-data/Fichier1.txt Thu Nov 20 03:35:14 2014 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,19 +0,0 @@
-ENSG00000000005 0
-ENSG00000000419 2661
-ENSG00000000457 602
-ENSG00000000460 2077
-ENSG00000000938 2
-ENSG00000000971 75
-ENSG00000001036 2389
-ENSG00000001084 1730
-ENSG00000001167 1473
-ENSG00000001460 387
-ENSG00000001461 905
-ENSG00000001497 2975
-ENSG00000001561 19
-ENSG00000001617 118
-ENSG00000001626 2
-ENSG00000001629 2559
-ENSG00000001630 314
-ENSG00000001631 1581
-ENSG00000002016 307
b
diff -r e6d7a8ca8d94 -r a7ec3c9cb0b7 RNAseqDataAnnotation/test-data/Fichier2.txt
--- a/RNAseqDataAnnotation/test-data/Fichier2.txt Thu Nov 20 03:35:14 2014 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,19 +0,0 @@
-ENSG00000000005 0
-ENSG00000000419 3409
-ENSG00000000457 706
-ENSG00000000460 2385
-ENSG00000000938 0
-ENSG00000000971 100
-ENSG00000001036 2876
-ENSG00000001084 2154
-ENSG00000001167 1695
-ENSG00000001460 405
-ENSG00000001461 1010
-ENSG00000001497 3344
-ENSG00000001561 27
-ENSG00000001617 132
-ENSG00000001626 1
-ENSG00000001629 3042
-ENSG00000001630 352
-ENSG00000001631 1865
-ENSG00000002016 375
b
diff -r e6d7a8ca8d94 -r a7ec3c9cb0b7 RNAseqDataAnnotation/test-data/Fichier3.txt
--- a/RNAseqDataAnnotation/test-data/Fichier3.txt Thu Nov 20 03:35:14 2014 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,19 +0,0 @@
-ENSG00000000005 0
-ENSG00000000419 2171
-ENSG00000000457 484
-ENSG00000000460 1056
-ENSG00000000938 1
-ENSG00000000971 157
-ENSG00000001036 2019
-ENSG00000001084 1580
-ENSG00000001167 1290
-ENSG00000001460 311
-ENSG00000001461 1607
-ENSG00000001497 2217
-ENSG00000001561 34
-ENSG00000001617 54
-ENSG00000001626 2
-ENSG00000001629 2116
-ENSG00000001630 207
-ENSG00000001631 1501
-ENSG00000002016 263
b
diff -r e6d7a8ca8d94 -r a7ec3c9cb0b7 RNAseqDataAnnotation/test-data/Fichier4.txt
--- a/RNAseqDataAnnotation/test-data/Fichier4.txt Thu Nov 20 03:35:14 2014 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,19 +0,0 @@
-ENSG00000000005 0
-ENSG00000000419 2495
-ENSG00000000457 521
-ENSG00000000460 1092
-ENSG00000000938 1
-ENSG00000000971 192
-ENSG00000001036 2217
-ENSG00000001084 1685
-ENSG00000001167 1509
-ENSG00000001460 362
-ENSG00000001461 1622
-ENSG00000001497 2369
-ENSG00000001561 41
-ENSG00000001617 69
-ENSG00000001626 4
-ENSG00000001629 2361
-ENSG00000001630 215
-ENSG00000001631 1626
-ENSG00000002016 295
b
diff -r e6d7a8ca8d94 -r a7ec3c9cb0b7 RNAseqDataAnnotation/test-data/ichierconvertitnames.txt
--- a/RNAseqDataAnnotation/test-data/ichierconvertitnames.txt Thu Nov 20 03:35:14 2014 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,5 +0,0 @@
-File Sample_name
-Fichier1.fastq.gz siLuc2
-Fichier2.fastq.gz siLuc3
-Fichier3.fastq.gz siMitf3
-Fichier4.fastq.gz siMitf4
b
diff -r e6d7a8ca8d94 -r a7ec3c9cb0b7 RNAseqDataAnnotation/tool_dependencies.xml
--- a/RNAseqDataAnnotation/tool_dependencies.xml Thu Nov 20 03:35:14 2014 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,22 +0,0 @@
-<?xml version="1.0"?>
-<tool_dependency>
-    <package name="R_3_0_2" version="3.0.2">
- <repository changeset_revision="b6fe8ca3230d" name="package_r_3_0_2" owner="iuc" prior_installation_required="True" toolshed="https://testtoolshed.g2.bx.psu.edu" />
-    </package>
-    <package name="DESeq2biomaRt" version="1.0">
- <install version="1.0">
- <actions> <!-- Install steps for the DESeq2biomaRt package: first reference the R 3.0.2 package from the package_r_3_0_2 repository (presumably so that R is available while installing; confirm against the Tool Shed documentation), then run packages.R. -->
-                <action type="set_environment_for_install">
- <repository changeset_revision="b6fe8ca3230d" name="package_r_3_0_2" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu">
- <package name="R_3_0_2" version="3.0.2" />
- </repository> 
- </action>
-                <action type="shell_command">R CMD BATCH packages.R</action> <!-- Runs packages.R in batch mode; in this changeset that script installs DESeq2, biomaRt and plyr. -->
- <!--<action type="shell_command">echo "export PATH=$PATH" > $INSTALL_DIR/env.sh </action>-->
- <!--<action type="shell_command">chmod 755 $INSTALL_DIR/env.sh </action>-->
- </actions> 
-       </install> 
-     <readme>
-     </readme> 
-   </package> 
-</tool_dependency>