Mercurial > repos > galaxyp > mt2mq
annotate MT2MQ.R @ 7:b8ec209d0928 draft default tip
"planemo upload commit 657f131e8dd4182938d1b391d10c5ea5c280d5b7"
| author | galaxyp | 
|---|---|
| date | Wed, 21 Oct 2020 18:08:41 +0000 | 
| parents | 3088377510dc | 
| children | 
| rev | line source | 
|---|---|
| 
0
 
8822dd8bfc71
"planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
 
galaxyp 
parents:  
diff
changeset
 | 
1 # MT2MQ: prepares metatranscriptomic outputs from ASaiM (HUMAnN2 and MetaPhlAn) for metaQuantome | 
| 
 
8822dd8bfc71
"planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
 
galaxyp 
parents:  
diff
changeset
 | 
2 | 
| 
 
8822dd8bfc71
"planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
 
galaxyp 
parents:  
diff
changeset
 | 
3 # Load libraries | 
| 
 
8822dd8bfc71
"planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
 
galaxyp 
parents:  
diff
changeset
 | 
4 suppressPackageStartupMessages(library(tidyverse)) | 
| 
2
 
3088377510dc
"planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
 
galaxyp 
parents: 
1 
diff
changeset
 | 
5 suppressPackageStartupMessages(library(taxize)) | 
| 
0
 
8822dd8bfc71
"planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
 
galaxyp 
parents:  
diff
changeset
 | 
6 | 
| 
 
8822dd8bfc71
"planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
 
galaxyp 
parents:  
diff
changeset
 | 
# Set parameters from arguments
#
# Positional command-line arguments:
#   1. data      full path to a file or directory:
#                - in functional or f-t mode, a tsv file of HUMAnN2 gene
#                  families, after regrouping and renaming to GO, joining
#                  samples, and renormalizing to CPM
#                - in taxonomic mode, a directory of tsv files of metaphlan
#                  genus-level results
#   2. mode      "f" (function), "t" (taxonomy) or "ft" (function-taxonomy)
#   3. ontology  only used in function or f-t mode: the GO namespace(s) to
#                include, separated by commas, e.g. to include all:
#                "molecular_function,biological_process,cellular_component"
#   4. int_file  full path, file name and extension of the intensity file
#   5. func_file full path, file name and extension of the func file
#                (written in "f" mode only)
#   6. tax_file  full path, file name and extension of the tax file
#                (written in "t" mode only)
args <- commandArgs(trailingOnly = TRUE)

if (length(args) < 2) {
  stop(
    "Usage: MT2MQ.R <data> <mode> <ontology> <int_file> ",
    "[<func_file> [<tax_file>]]",
    call. = FALSE
  )
}

data <- args[1]
mode <- args[2]

# Number of positional arguments each mode actually reads; an unknown mode
# would otherwise fall through every `if (mode == ...)` and do nothing.
n_required <- switch(mode, f = 5L, t = 6L, ft = 4L, NULL)
if (is.null(n_required)) {
  stop(
    "Unknown mode '", mode, "': expected \"f\", \"t\" or \"ft\".",
    call. = FALSE
  )
}
if (length(args) < n_required) {
  stop(
    "Mode '", mode, "' needs ", n_required, " arguments but only ",
    length(args), " were supplied.",
    call. = FALSE
  )
}

# Whitespace around the comma-separated namespaces is dropped so that
# "a, b" selects the same namespaces as "a,b".
ontology <- trimws(unlist(strsplit(args[3], split = ",")))

int_file <- args[4]
func_file <- args[5]
tax_file <- args[6]
| 
 
3088377510dc
"planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
 
galaxyp 
parents: 
1 
diff
changeset
 | 
30 | 
| 
0
 
8822dd8bfc71
"planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
 
galaxyp 
parents:  
diff
changeset
 | 
31 | 
| 
 
8822dd8bfc71
"planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
 
galaxyp 
parents:  
diff
changeset
 | 
# Functional mode
# Reads the gene-family table in `data`, keeps the rows that are not
# stratified by genus and whose GO namespace is listed in `ontology`, and
# writes the intensity table to `int_file` and the func table to `func_file`.
if (mode == "f") {
  int <- read.delim(file = data, header = TRUE, sep = "\t") %>%
    # Drop rows stratified by genus (anything containing "g__").
    # NOTE(review): a stratified row that carries no "g__" label (for example
    # an "...|unclassified" row, if the input has any) passes this filter --
    # confirm that is intended.
    filter(!grepl(".+g__.+", X..Gene.Family)) %>%
    # "<id>: <rest>". Rows without ": " get an NA id and are removed by the
    # namespace filter below. `extra = "merge"` keeps names that themselves
    # contain ": " intact instead of truncating them.
    separate(
      col = X..Gene.Family, into = c("id", "Extra"),
      sep = ": ", fill = "left", extra = "merge"
    ) %>%
    # "<[tag]> <name>": the first space separates the namespace tag.
    separate(
      col = Extra, into = c("namespace", "name"),
      sep = " ", fill = "left", extra = "merge"
    ) %>%
    # Map each tag explicitly; an unrecognised or missing tag becomes NA
    # rather than being reported as cellular_component.
    mutate(
      namespace = case_when(
        namespace == "[MF]" ~ "molecular_function",
        namespace == "[BP]" ~ "biological_process",
        namespace == "[CC]" ~ "cellular_component",
        TRUE ~ NA_character_
      )
    ) %>%
    filter(namespace %in% ontology) %>%
    # Annotation columns first, then every sample column in input order.
    # everything() also works when the table has no sample columns.
    select(id, name, namespace, everything())

  # The func file pairs every id with itself in a `gos` column.
  func <- int %>%
    transmute(id, gos = id)

  write.table(
    x = int, file = int_file, quote = FALSE, sep = "\t", row.names = FALSE
  )
  write.table(
    x = func, file = func_file, quote = FALSE, sep = "\t", row.names = FALSE
  )
}
| 
 
8822dd8bfc71
"planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
 
galaxyp 
parents:  
diff
changeset
 | 
47 | 
| 
 
8822dd8bfc71
"planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
 
galaxyp 
parents:  
diff
changeset
 | 
# Taxonomic mode
# Reads every file in the directory `data` (each is expected to provide a
# `genus` and an `abundance` column), spreads the abundances into one column
# per sample, looks up an id for each genus with taxize::get_uid(), and
# writes the intensity table to `int_file` and the tax table to `tax_file`.
if (mode == "t") {
  files <- dir(path = data)

  int <- tibble(filename = files) %>%
    mutate(
      file_contents = map(
        filename,
        function(f) {
          read.delim(file = file.path(data, f), header = TRUE, sep = "\t")
        }
      )
    ) %>%
    unnest(cols = c(file_contents)) %>%
    rename(sample = filename) %>%
    # The sample name is the file name without a trailing ".tsv". The dot is
    # escaped and the match anchored so that only the literal extension is
    # removed (an unescaped ".tsv" also matches e.g. "atsv" inside a name).
    mutate(sample = sub("\\.tsv$", "", sample)) %>%
    pivot_wider(names_from = sample, values_from = abundance) %>%
    mutate(rank = "genus") %>%
    rename(name = genus) %>%
    mutate(name = as.character(name)) %>%
    mutate(id = get_uid(name, key = NULL, messages = FALSE)) %>%
    # Annotation columns first, then every sample column in input order.
    select(id, name, rank, everything())

  # The tax file pairs every id with itself in a `tax` column.
  tax <- int %>%
    transmute(id, tax = id)

  write.table(
    x = int, file = int_file, quote = FALSE, sep = "\t", row.names = FALSE
  )
  write.table(
    x = tax, file = tax_file, quote = FALSE, sep = "\t", row.names = FALSE
  )
}
| 
 
8822dd8bfc71
"planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
 
galaxyp 
parents:  
diff
changeset
 | 
68 | 
| 
 
8822dd8bfc71
"planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
 
galaxyp 
parents:  
diff
changeset
 | 
# Function-taxonomy mode
# Reads the gene-family table in `data`, keeps only the rows stratified by
# genus whose GO namespace is listed in `ontology`, splits the taxon label
# into `genus` and `species`, and writes the result to `int_file`.
if (mode == "ft") {
  ft <- read.delim(file = data, header = TRUE, sep = "\t") %>%
    # Keep only rows stratified by genus (containing "g__").
    filter(grepl(".+g__.+", X..Gene.Family)) %>%
    # "<id>: <rest>". `extra = "merge"` keeps names that themselves contain
    # ": " intact instead of truncating them.
    separate(
      col = X..Gene.Family, into = c("id", "Extra"),
      sep = ": ", fill = "left", extra = "merge"
    ) %>%
    # "<[tag]> <name>|<taxa>": the first space separates the namespace tag.
    separate(
      col = Extra, into = c("namespace", "name"),
      sep = " ", fill = "left", extra = "merge"
    ) %>%
    # "<name>|<taxa>": the first "|" separates the term name from the taxon.
    separate(
      col = name, into = c("name", "taxa"), sep = "\\|", extra = "merge"
    ) %>%
    # Splitting on "__" leaves the rank prefix in `Extra`, "<genus>.s" in
    # `genus` and the species label in `species`.
    separate(col = taxa, into = c("Extra", "genus", "species"), sep = "__") %>%
    select(-Extra) %>%
    mutate(
      # Remove only the ".s" left at the end of the genus by the split above;
      # other text columns are no longer stripped of ".s".
      genus = str_replace(genus, pattern = "\\.s$", replacement = ""),
      species = str_replace_all(species, pattern = "_", replacement = " "),
      # Map each tag explicitly; an unrecognised or missing tag becomes NA
      # rather than being reported as cellular_component.
      namespace = case_when(
        namespace == "[MF]" ~ "molecular_function",
        namespace == "[BP]" ~ "biological_process",
        namespace == "[CC]" ~ "cellular_component",
        TRUE ~ NA_character_
      )
    ) %>%
    filter(namespace %in% ontology) %>%
    # id, name, namespace first, then genus, species and the sample columns.
    select(id, name, namespace, everything())

  write.table(
    x = ft, file = int_file, quote = FALSE, sep = "\t", row.names = FALSE
  )
}
