annotate get_length_and_gc_content.r @ 6:0e9424413ab0 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit f95b47ed1a09ce14d3b565e8ea56d8bf12c35814-dirty
author mvdbeek
date Thu, 03 Mar 2016 09:56:51 -0500
parents a6427f7893b0
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
a6427f7893b0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
1 # originally by Devon Ryan, https://www.biostars.org/p/84467/
6
0e9424413ab0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit f95b47ed1a09ce14d3b565e8ea56d8bf12c35814-dirty
mvdbeek
parents: 0
diff changeset
2
0e9424413ab0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit f95b47ed1a09ce14d3b565e8ea56d8bf12c35814-dirty
mvdbeek
parents: 0
diff changeset
3 options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
0e9424413ab0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit f95b47ed1a09ce14d3b565e8ea56d8bf12c35814-dirty
mvdbeek
parents: 0
diff changeset
4
0e9424413ab0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit f95b47ed1a09ce14d3b565e8ea56d8bf12c35814-dirty
mvdbeek
parents: 0
diff changeset
5 # we need that to not crash galaxy with an UTF8 error on German LC settings.
0e9424413ab0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit f95b47ed1a09ce14d3b565e8ea56d8bf12c35814-dirty
mvdbeek
parents: 0
diff changeset
6 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
0
a6427f7893b0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
7
6
0e9424413ab0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit f95b47ed1a09ce14d3b565e8ea56d8bf12c35814-dirty
mvdbeek
parents: 0
diff changeset
8 suppressPackageStartupMessages({
0e9424413ab0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit f95b47ed1a09ce14d3b565e8ea56d8bf12c35814-dirty
mvdbeek
parents: 0
diff changeset
9 library("GenomicRanges")
0e9424413ab0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit f95b47ed1a09ce14d3b565e8ea56d8bf12c35814-dirty
mvdbeek
parents: 0
diff changeset
10 library("rtracklayer")
0e9424413ab0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit f95b47ed1a09ce14d3b565e8ea56d8bf12c35814-dirty
mvdbeek
parents: 0
diff changeset
11 library("Rsamtools")
0e9424413ab0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit f95b47ed1a09ce14d3b565e8ea56d8bf12c35814-dirty
mvdbeek
parents: 0
diff changeset
12 library("optparse")
0e9424413ab0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit f95b47ed1a09ce14d3b565e8ea56d8bf12c35814-dirty
mvdbeek
parents: 0
diff changeset
13 library("data.table")
0e9424413ab0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit f95b47ed1a09ce14d3b565e8ea56d8bf12c35814-dirty
mvdbeek
parents: 0
diff changeset
14 })
0
a6427f7893b0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
15
a6427f7893b0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
16 option_list <- list(
a6427f7893b0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
17 make_option(c("-g","--gtf"), type="character", help="Input GTF file with gene / exon information."),
a6427f7893b0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
18 make_option(c("-f","--fasta"), type="character", default=FALSE, help="Fasta file that corresponds to the supplied GTF."),
6
0e9424413ab0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit f95b47ed1a09ce14d3b565e8ea56d8bf12c35814-dirty
mvdbeek
parents: 0
diff changeset
19 make_option(c("-l","--length"), type="character", default=FALSE, help="Output file with gene name and length."),
0e9424413ab0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit f95b47ed1a09ce14d3b565e8ea56d8bf12c35814-dirty
mvdbeek
parents: 0
diff changeset
20 make_option(c("-gc","--gc_content"), type="character", default=FALSE, help="Output file with gene name and GC content.")
0
a6427f7893b0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
21 )
a6427f7893b0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
22
a6427f7893b0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
23 parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
a6427f7893b0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
24 args = parse_args(parser)
a6427f7893b0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
25
a6427f7893b0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
26 GTFfile = args$gtf
a6427f7893b0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
27 FASTAfile = args$fasta
6
0e9424413ab0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit f95b47ed1a09ce14d3b565e8ea56d8bf12c35814-dirty
mvdbeek
parents: 0
diff changeset
28 length = args$length
0e9424413ab0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit f95b47ed1a09ce14d3b565e8ea56d8bf12c35814-dirty
mvdbeek
parents: 0
diff changeset
29 gc_content = args$gc_content
0
a6427f7893b0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
30
a6427f7893b0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
31 #Load the annotation and reduce it
a6427f7893b0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
32 GTF <- import.gff(GTFfile, format="gtf", genome=NA, feature.type="exon")
a6427f7893b0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
33 grl <- reduce(split(GTF, elementMetadata(GTF)$gene_id))
a6427f7893b0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
34 reducedGTF <- unlist(grl, use.names=T)
a6427f7893b0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
35 elementMetadata(reducedGTF)$gene_id <- rep(names(grl), elementLengths(grl))
a6427f7893b0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
36
a6427f7893b0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
37 #Open the fasta file
a6427f7893b0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
38 FASTA <- FaFile(FASTAfile)
a6427f7893b0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
39 open(FASTA)
a6427f7893b0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
40
a6427f7893b0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
41 #Add the GC numbers
a6427f7893b0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
42 elementMetadata(reducedGTF)$nGCs <- letterFrequency(getSeq(FASTA, reducedGTF), "GC")[,1]
a6427f7893b0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
43 elementMetadata(reducedGTF)$widths <- width(reducedGTF)
a6427f7893b0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
44
a6427f7893b0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
45 #Create a list of the ensembl_id/GC/length
a6427f7893b0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
46 calc_GC_length <- function(x) {
a6427f7893b0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
47 nGCs = sum(elementMetadata(x)$nGCs)
a6427f7893b0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
48 width = sum(elementMetadata(x)$widths)
a6427f7893b0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
49 c(width, nGCs/width)
a6427f7893b0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
50 }
a6427f7893b0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
51 output <- t(sapply(split(reducedGTF, elementMetadata(reducedGTF)$gene_id), calc_GC_length))
6
0e9424413ab0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit f95b47ed1a09ce14d3b565e8ea56d8bf12c35814-dirty
mvdbeek
parents: 0
diff changeset
52 output <- data.frame(setDT(data.frame(output), keep.rownames = TRUE)[])
0e9424413ab0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit f95b47ed1a09ce14d3b565e8ea56d8bf12c35814-dirty
mvdbeek
parents: 0
diff changeset
53
0
a6427f7893b0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
54
6
0e9424413ab0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit f95b47ed1a09ce14d3b565e8ea56d8bf12c35814-dirty
mvdbeek
parents: 0
diff changeset
55 write.table(output[,c(1,2)], file=length, col.names=FALSE, row.names=FALSE, quote=FALSE, sep="\t")
0e9424413ab0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit f95b47ed1a09ce14d3b565e8ea56d8bf12c35814-dirty
mvdbeek
parents: 0
diff changeset
56 write.table(output[,c(1,3)], file=gc_content, col.names=FALSE, row.names=FALSE, quote=FALSE, sep="\t")
0e9424413ab0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit f95b47ed1a09ce14d3b565e8ea56d8bf12c35814-dirty
mvdbeek
parents: 0
diff changeset
57
0
a6427f7893b0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
58
a6427f7893b0 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit fdd0811efc61c31f88ff17096fbe8ee8cfacd766-dirty
mvdbeek
parents:
diff changeset
59 sessionInfo()