Mercurial > repos > proteore > proteore_prot_features
changeset 4:759850de6ed2 draft
planemo upload commit c599cfc156c77626df2b674bdfbd437b9f664ab9
| author | proteore |
|---|---|
| date | Thu, 13 Dec 2018 03:57:57 -0500 |
| parents | 7746af0f8209 |
| children | bb4a5f1b415f |
| files | add_protein_features.R add_protein_features.xml prot_features.xml protein_features.R |
| diffstat | 4 files changed, 351 insertions(+), 351 deletions(-) [+] |
line wrap: on
line diff
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/add_protein_features.R Thu Dec 13 03:57:57 2018 -0500 @@ -0,0 +1,202 @@
#
# add_protein_features.R
# Adds neXtProt annotations (physico-chemical features, localization,
# diseases) to a list of protein identifiers given either as a pasted list or
# as a column of a tab-separated file.

# Read a tab-separated file and return its content as a data.frame.
#
# path:   path of the file to read.
# header: TRUE when the first line of the file holds the column names.
#
# Rows in which every cell is NA or "" are dropped. Stops with an error when
# the file cannot be read.
read_file <- function(path, header) {
  content <- tryCatch(
    read.table(path, header = header, sep = "\t", stringsAsFactors = FALSE,
               quote = "", check.names = FALSE),
    error = function(e) {
      # Keep the historical message, but also report the real cause: the
      # failure is not always a missing file (e.g. ragged lines).
      stop("File not found ! (", path, "): ", conditionMessage(e),
           call. = FALSE)
    }
  )
  is_blank <- is.na(content) | content == ""
  content[!apply(is_blank, 1, all), , drop = FALSE]
}

# Move the first column of `df` (the merge key, which merge() puts first)
# back to position `ncol`.
#
# df:      merged data.frame whose first column is the ID column.
# ncol:    position the ID column had in the input table.
# id_type: "Uniprot_AC" or "NextprotID"; for "Uniprot_AC" the key is the
#          NextprotID column appended as the last column of `file`.
# file:    the input table that was merged.
#
# Returns the data.frame with its columns reordered.
order_columns <- function(df, ncol, id_type, file) {
  if (id_type == "Uniprot_AC") {
    ncol <- base::ncol(file)
  }
  if (ncol == 1) { # already at the right position
    return(df)
  }
  total <- base::ncol(df)
  # (ncol + 1):total would count backwards when the key is the last column.
  trailing <- if (ncol < total) seq(ncol + 1, total) else integer(0)
  df[, c(seq_len(ncol)[-1L], 1L, trailing), drop = FALSE]
}

# Turn a pasted list of identifiers into a character vector.
#
# list: a single string of IDs separated by spaces, tabs or line breaks.
#
# Entries that are empty or exactly "NA" are dropped and isoform suffixes
# (e.g. "-2") are removed.
get_list_from_cp <- function(list) {
  ids <- strsplit(list, "[[:space:]]+")[[1]]
  # Drop only entries that ARE "NA": removing the substring would corrupt
  # real accessions such as Q8NA03.
  ids <- ids[!is.na(ids) & !ids %in% c("", "NA")]
  ids <- sub("-.+", "", ids) # remove isoform accession number (e.g. "-2")
  ids[ids != ""]
}

# Parse the command line ("--key=value" arguments) into a named list.
#
# Prints the usage and quits when called without arguments or with --help.
# Only the first "=" separates key and value, so values may contain "=".
# An argument without a value is stored as "".
get_args <- function() {
  args <- commandArgs(TRUE)

  # Default setting when no arguments are passed
  if (length(args) < 1) {
    args <- c("--help")
  }

  if ("--help" %in% args) {
    cat("Add protein features (neXtProt)
      Arguments:
        --inputtype: type of input (copy_paste or file)
        --input: list of IDs or path of the input file
        --nextprot: path to nextprot information file
        --column: the column holding the IDs (e.g. c1)
        --header: true/false if your file contains a header
        --type: the type of input IDs (Uniprot_AC/NextprotID)
        --pc_features: IsoPoint,SeqLength,MW,TMDomains,ProteinExistence
        --localization: Chr,SubcellLocations
        --diseases_info: true/false, add the Diseases column
        --output: text output filename \n")

    q(save = "no")
  }

  eq_pos <- as.integer(regexpr("=", args, fixed = TRUE))
  has_value <- eq_pos > 0
  keys <- ifelse(has_value, substr(args, 1L, eq_pos - 1L), args)
  keys <- sub("^--", "", keys)
  values <- ifelse(has_value, substr(args, eq_pos + 1L, nchar(args)), "")

  parsed <- as.list(values)
  names(parsed) <- keys
  parsed
}

# Convert "t"/"true"/"f"/"false" (any case) to a logical.
# Returns NULL for any other value.
str2bool <- function(x) {
  value <- tolower(x)
  if (any(c("t", "true") %in% value)) {
    TRUE
  } else if (any(c("f", "false") %in% value)) {
    FALSE
  } else {
    NULL
  }
}

# Expand one row whose ID cell holds several ";"-separated IDs.
#
# line: a one-row data.frame.
# ncol: index of the ID column.
#
# Returns a data.frame with the same columns and one row per ID; the other
# cells are repeated. Blanks (including non-breaking spaces) are removed from
# the ID cell. A row without any ID is returned as is, with its cell cleaned.
split_ids_per_line <- function(line, ncol) {
  cell <- gsub("[[:blank:]]|\u00A0", "", as.character(line[[ncol]]))
  ids <- strsplit(cell, ";", fixed = TRUE)[[1]]
  ids <- ids[!is.na(ids) & ids != ""]
  if (length(ids) == 0) {
    line[[ncol]] <- cell
    return(line)
  }
  lines <- line[rep(1L, length(ids)), , drop = FALSE]
  lines[[ncol]] <- ids
  rownames(lines) <- NULL
  lines
}

# Create new lines when a cell of the ID column holds more than one ID, so
# that the result has exactly one ID per line.
#
# tab:  input data.frame.
# ncol: index of the ID column.
one_id_one_line <- function(tab, ncol) {
  if (base::ncol(tab) > 1) {
    if (nrow(tab) == 0) {
      return(tab)
    }
    # Build all pieces first and bind once instead of growing with rbind().
    pieces <- lapply(seq_len(nrow(tab)), function(i) {
      split_ids_per_line(tab[i, , drop = FALSE], ncol)
    })
    res <- do.call(rbind, pieces)
    rownames(res) <- NULL
  } else {
    ids <- gsub("[[:blank:]]|\u00A0", "", as.character(tab[[1]]))
    ids <- as.character(unlist(strsplit(ids, ";", fixed = TRUE),
                               use.names = FALSE))
    # Filter on the vector itself; indexing it with positions computed on an
    # already filtered copy keeps blanks and loses trailing IDs.
    ids <- ids[!is.na(ids) & ids != ""]
    res <- data.frame(ids, stringsAsFactors = FALSE)
    colnames(res) <- colnames(tab)
  }
  res
}

# Get information from neXtProt.
#
# nextprot:      reference data.frame with a "NextprotID" column.
# input:         vector of neXtProt IDs to look up.
# pc_features:   names of physico-chemical columns ("None" is ignored).
# localization:  names of localization columns ("None" is ignored).
# diseases_info: TRUE to add the "Diseases" column.
#
# Returns a data.frame with one row per element of `input` (NA rows for IDs
# that are not in the reference) and the selected columns.
get_nextprot_info <- function(nextprot, input, pc_features, localization,
                              diseases_info) {
  cols <- c("NextprotID", pc_features, localization)
  if (isTRUE(diseases_info)) {
    cols <- c(cols, "Diseases")
  }
  cols <- cols[cols != "None"]

  missing_cols <- setdiff(cols, colnames(nextprot))
  if (length(missing_cols) > 0) {
    stop("Unknown neXtProt feature(s): ",
         paste(missing_cols, collapse = ", "), call. = FALSE)
  }
  # drop = FALSE: keep a data.frame even when only NextprotID is selected.
  nextprot[match(input, nextprot$NextprotID), cols, drop = FALSE]
}

# Parse a column specification such as "c3" or "3" into a column index.
parse_column_index <- function(column) {
  index <- suppressWarnings(as.numeric(gsub("c", "", column)))
  if (length(index) != 1 || is.na(index) || index < 1 || index %% 1 != 0) {
    stop("Please enter a valid column number (e.g. c1)", call. = FALSE)
  }
  index
}

# Split a comma-separated argument; a missing argument gives character(0).
split_csv_arg <- function(value) {
  if (is.null(value) || is.na(value)) {
    return(character(0))
  }
  strsplit(value, ",", fixed = TRUE)[[1]]
}

# Replace "" and " " by NA in the character/factor columns of a data.frame.
# Works column by column: apply() would go through as.matrix(), which
# format()s numeric columns (space padding, common number of digits).
blank_to_na <- function(df) {
  df[] <- lapply(df, function(column) {
    if (is.factor(column)) {
      column <- as.character(column)
    }
    if (is.character(column)) {
      column[column %in% c("", " ")] <- NA
    }
    column
  })
  df
}

# Entry point: read the arguments, load the IDs and the neXtProt reference,
# add the requested annotation columns and write the result to --output.
protein_features <- function() {
  args <- get_args()

  inputtype <- args$inputtype
  if (is.null(inputtype) || !inputtype %in% c("copy_paste", "file")) {
    stop("--inputtype must be 'copy_paste' or 'file'", call. = FALSE)
  }
  id_type <- args$type
  if (is.null(id_type)) {
    stop("--type is required (Uniprot_AC or NextprotID)", call. = FALSE)
  }

  if (inputtype == "copy_paste") {
    file <- data.frame(input = get_list_from_cp(args$input),
                       stringsAsFactors = FALSE)
    id_col <- 1
  } else {
    id_col <- parse_column_index(args$column)
    file <- read_file(args$input, str2bool(args$header)) # Get file content
    if (id_col > ncol(file)) {
      stop("Column ", id_col, " does not exist in the input file",
           call. = FALSE)
    }
    if (any(grepl(";", file[, id_col], fixed = TRUE))) {
      file <- one_id_one_line(file, id_col)
    }
    if (id_type == "NextprotID") {
      # The ID column must be the only one named "NextprotID".
      existing <- match("NextprotID", colnames(file))
      if (!is.na(existing) && existing != id_col) {
        colnames(file)[existing] <- "old_NextprotID"
      }
      colnames(file)[id_col] <- "NextprotID"
    }
  }

  # Read reference file
  nextprot <- read_file(args$nextprot, TRUE)
  if (!"NextprotID" %in% colnames(nextprot)) {
    stop("The neXtProt reference file has no 'NextprotID' column",
         call. = FALSE)
  }

  # Parse arguments
  pc_features <- split_csv_arg(args$pc_features)
  localization <- split_csv_arg(args$localization)
  diseases_info <- isTRUE(str2bool(args$diseases_info))
  output <- args$output

  # Uniprot accessions are turned into neXtProt IDs ("NX_" prefix) in an
  # extra last column, which then becomes the merge key.
  if (id_type == "Uniprot_AC") {
    accessions <- as.character(file[, id_col])
    NextprotID <- paste0("NX_", accessions)
    NextprotID[is.na(accessions)] <- NA_character_
    NextprotID[!is.na(accessions) & accessions == ""] <- ""
    file <- data.frame(file, NextprotID = NextprotID, check.names = FALSE,
                       stringsAsFactors = FALSE)
    if (inputtype == "copy_paste") {
      colnames(file)[1] <- "Uniprot-AC"
    }
    id_col <- ncol(file)
  }
  NextprotID <- file[, id_col]

  # Select user input protein ids in nextprot
  if (!any(NextprotID %in% nextprot[["NextprotID"]])) {
    write.table("None of the input ids can be found in Nextprot",
                file = output, sep = "\t", quote = FALSE, col.names = TRUE,
                row.names = FALSE)
  } else {
    res <- get_nextprot_info(nextprot, NextprotID, pc_features, localization,
                             diseases_info)
    res <- res[!duplicated(res$NextprotID), , drop = FALSE]
    output_content <- merge(file, res, by.x = id_col, by.y = "NextprotID",
                            incomparables = NA, all.x = TRUE)
    output_content <- order_columns(output_content, id_col, id_type, file)
    output_content <- blank_to_na(output_content) # convert "" and " " to NA
    write.table(output_content, output, row.names = FALSE, sep = "\t",
                quote = FALSE)
  }
}

# Run the tool only when this file is executed as a script (Rscript), not
# when it is source()d, so the functions above can be loaded on their own.
if (sys.nframe() == 0L) {
  protein_features()
}
<!-- add_protein_features.xml: new file in this changeset (diff hunk -0,0 +1,149, Thu Dec 13 03:57:57 2018 -0500).
     Galaxy wrapper for add_protein_features.R: adds neXtProt annotations to a list of protein IDs. -->
<tool id="prot_features" name="Add protein features" version="2018.12.12">
<description>[neXtProt]
</description>
<requirements>
    <requirement type="package" version="3.4.1">R</requirement>
</requirements>
<stdio>
    <exit_code range="1:" />
</stdio>
<!-- Paths built from the tool directory are quoted so that an installation path containing spaces still works. -->
<command><![CDATA[

    Rscript '$__tool_directory__/add_protein_features.R'
    --inputtype="$inputtype.filetype"
    --input='$inputtype.genelist'

    #if $inputtype.filetype == "file"
        --column='$inputtype.column'
        --header='$inputtype.header'
    #end if

    --type='$idtype'
    --pc_features='$Nextprot_params.pc_features'
    --localization='$Nextprot_params.localization'
    --diseases_info='$Nextprot_params.diseases_info'
    --output='$output'
    --nextprot='$__tool_directory__/tool-data/result_nextprot.txt'

]]></command>

<inputs>
    <conditional name="inputtype">
        <param name="filetype" type="select" label="Select your type of input file">
            <option value="file" selected="true">Input file containing your identifiers (neXtProt or Uniprot ID)</option>
            <option value="copy_paste">Copy/paste your list of IDs</option>
        </param>
        <when value="copy_paste">
            <param name="genelist" type="text" label="Enter a list of identifiers separated by tab, space or carriage return into the form field" help="for example : A0AVI2 A6NGB0">
                <!-- Whitespace sources are written as character references: a literal line break or tab is not a usable attribute value. -->
                <sanitizer invalid_char="">
                    <valid initial="string.printable">
                        <remove value="'"/>
                    </valid>
                    <mapping initial="none">
                        <add source="'" target="__sq__"/>
                        <add source="&#x20;" target=""/>
                        <add source="&#xA;" target=""/>
                        <add source="&#xD;" target=""/>
                        <add source="&#x9;" target=""/>
                    </mapping>
                </sanitizer>
            </param>
        </when>
        <when value="file">
            <param name="genelist" type="data" format="txt,tabular" label="Choose a file that contains your list of IDs" help="This file must imperatively have 1 column filled with IDs consistent with the neXtprot database (Uniprot accession number or neXtProt ID). If this is not the case, please use the ID_Converter tool."/>
            <param name="column" type="text" label="Please specify the column where are your IDs (e.g : Enter c1 for column n°1)" value="c1"/>
            <param name="header" type="boolean" checked="true" truevalue="true" falsevalue="false" label="Does your input file have a header?" />

        </when>
    </conditional>

    <param name="idtype" type="select" label="Type of your input ids" multiple="false" optional="false">
        <option value="Uniprot_AC" selected="true">Uniprot accession number</option>
        <option value="NextprotID" selected="false">neXtProt IDs</option>
    </param>
    <section name="Nextprot_params" title="Select features of interest (compulsory step)" expanded="True">
        <param name="pc_features" type="select" label="Physico-Chemical Features" multiple="true" help="Choose the information you want to add to your data from Nextprot" display="checkboxes" optional="true">
            <option value="SeqLength" selected="false">Sequence Length</option>
            <option value="MW" selected="false">Molecular Weight</option>
            <option value="IsoPoint" selected="false">Isoelectric point</option>
            <option value="TMDomains" selected="false">Number of transmembrane domains</option>
            <option value="ProteinExistence" selected="false">Protein Existence (evidence score from 1 to 5)</option>
        </param>

        <param name="localization" type="select" label="Localization" multiple="true" help="Choose the information you want to add to your data from Nextprot" display="checkboxes" optional="true">
            <option value="Chr" selected="false">Chromosome</option>
            <option value="SubcellLocations" selected="false">Subcellular Location</option>
        </param>

        <param name="diseases_info" type="boolean" checked="false" truevalue="true" falsevalue="false" label="Diseases informations" />

    </section>

</inputs>


<!-- NOTE(review): both outputs are named "output" and the filter compares the whole "inputtype" conditional
     with a string; presumably inputtype['filetype'] == "file" was intended. Left unchanged; confirm against
     the Galaxy output filter documentation before relying on the file-specific label. -->
<outputs>
    <data name="output" format="tsv" label="Add_information_from_neXtProt on ${inputtype.genelist.name}">
        <filter>inputtype=="file"</filter>
    </data>
    <data name="output" format="tsv" label="Add_information_from_neXtProt"/>
</outputs>

<tests>
    <test>
        <conditional name="inputtype">
            <param name="filetype" value="file"/>
            <param name="genelist" value="FKW_ID_Converter_Lacombe_et_al_2017_OK.tsv"/>
            <param name="column" value="c1"/>
            <param name="header" value="true"/>
        </conditional>

        <!-- Must be one of the option values declared for "idtype". -->
        <param name="idtype" value="Uniprot_AC"/>

        <section name="Nextprot_params">
            <param name="pc_features" value="SeqLength,MW,IsoPoint,TMDomains,ProteinExistence"/>
            <param name="localization" value="Chr,SubcellLocations"/>
            <param name="diseases_info" value="true"/>
        </section>

        <output name="output" file="Add_information_from_neXtProt.tsv"/>
    </test>
</tests>

<help><![CDATA[

This tool adds annotations (protein features) from the neXtProt database (knowledge base on human proteins) to your list of protein IDs.

**Input**

Input can be a file containing multiple fields but with **at least one column of Uniprot accession number or neXtProt IDs**. If your input file contains other type of IDs, please use the ID_Converter tool.

**Databases**

Annotations have been retrieved from the neXtProt released on 21/02/2018 using the latest data from peptideAtlas (release Human 2018-1)

using a REST API (https://academic.oup.com/nar/article/43/D1/D764/2439066#40348985) (Gaudet et al., 2017)

**Outputs**

The output is a tabular file. The initial columns are kept and columns are added according to the annotations you have selected.

-----

.. class:: infomark

**Authors**

David Christiany, Lisa Peru, T.P. Lien Nguyen, Florence Combes, Yves Vandenbrouck CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR

Sandra Dérozier, Olivier Rué, Christophe Caron, Valentin Loux INRA, Paris-Saclay University, MAIAGE Unit, Migale Bioinformatics platform

This work has been partially funded through the French National Agency for Research (ANR) IFB project.

Contact support@proteore.org for any questions or concerns about the Galaxy implementation of this tool.

    ]]></help>
    <citations>
    </citations>

</tool>
<!-- prot_features.xml: file removed by this changeset (diff hunk -1,149 +0,0, previous revision Fri Dec 07 05:09:57 2018 -0500).
     Galaxy wrapper for protein_features.R: adds neXtProt annotations to a list of protein IDs. -->
<tool id="prot_features" name="Add human protein features" version="2018.12.07">
<description>(neXtProt)
</description>
<requirements>
    <requirement type="package" version="3.4.1">R</requirement>
</requirements>
<stdio>
    <exit_code range="1:" />
</stdio>
<!-- Paths built from the tool directory are quoted so that an installation path containing spaces still works. -->
<command><![CDATA[

    Rscript '$__tool_directory__/protein_features.R'
    --inputtype="$inputtype.filetype"
    --input='$inputtype.genelist'

    #if $inputtype.filetype == "file"
        --column='$inputtype.column'
        --header='$inputtype.header'
    #end if

    --type='$idtype'
    --pc_features='$Nextprot_params.pc_features'
    --localization='$Nextprot_params.localization'
    --diseases_info='$Nextprot_params.diseases_info'
    --output='$output'
    --nextprot='$__tool_directory__/tool-data/result_nextprot.txt'

]]></command>

<inputs>
    <conditional name="inputtype">
        <param name="filetype" type="select" label="Select your type of input file">
            <option value="file" selected="true">Input file containing your identifiers (neXtProt or Uniprot ID)</option>
            <option value="copy_paste">Copy/paste your list of IDs</option>
        </param>
        <when value="copy_paste">
            <param name="genelist" type="text" label="Enter a list of identifiers separated by tab,space or carriage return into the form field" help="for example : A0AVI2 A6NGB0">
                <!-- Whitespace sources are written as character references: a literal line break or tab is not a usable attribute value. -->
                <sanitizer invalid_char="">
                    <valid initial="string.printable">
                        <remove value="'"/>
                    </valid>
                    <mapping initial="none">
                        <add source="'" target="__sq__"/>
                        <add source="&#x20;" target=""/>
                        <add source="&#xA;" target=""/>
                        <add source="&#xD;" target=""/>
                        <add source="&#x9;" target=""/>
                    </mapping>
                </sanitizer>
            </param>
        </when>
        <when value="file">
            <param name="genelist" type="data" format="txt,tabular" label="Choose a file that contains your list of IDs" help="This file must imperatively have 1 column filled with IDs consistent with the neXtprot database (Uniprot accession number or neXtProt ID). If this is not the case, please use the ID_Converter tool."/>
            <param name="column" type="text" label="Please specify the column where are your IDs (e.g : Enter c1 for column n°1)" value="c1"/>
            <param name="header" type="boolean" checked="true" truevalue="true" falsevalue="false" label="Does your input file have a header?" />

        </when>
    </conditional>

    <param name="idtype" type="select" label="Type of your input ids" multiple="false" optional="false">
        <option value="Uniprot_AC" selected="true">Uniprot accession number</option>
        <option value="NextprotID" selected="false">neXtProt IDs</option>
    </param>
    <section name="Nextprot_params" title="Select features of interest (compulsory step)" expanded="True">
        <param name="pc_features" type="select" label="Physico-Chemical Features" multiple="true" help="Choose the information you want to add to your data from Nextprot" display="checkboxes" optional="true">
            <option value="SeqLength" selected="false">Sequence Length</option>
            <option value="MW" selected="false">Molecular Weight</option>
            <option value="IsoPoint" selected="false">Isoelectric point</option>
            <option value="TMDomains" selected="false">Number of transmembrane domains</option>
            <option value="ProteinExistence" selected="false">Protein Existence (evidence score from 1 to 5)</option>
        </param>

        <param name="localization" type="select" label="Localization" multiple="true" help="Choose the information you want to add to your data from Nextprot" display="checkboxes" optional="true">
            <option value="Chr" selected="false">Chromosome</option>
            <option value="SubcellLocations" selected="false">Subcellular Location</option>
        </param>

        <param name="diseases_info" type="boolean" checked="false" truevalue="true" falsevalue="false" label="Diseases informations" />

    </section>

</inputs>


<!-- NOTE(review): both outputs are named "output" and the filter compares the whole "inputtype" conditional
     with a string; presumably inputtype['filetype'] == "file" was intended. Left unchanged; confirm against
     the Galaxy output filter documentation before relying on the file-specific label. -->
<outputs>
    <data name="output" format="tsv" label="Add_information_from_neXtProt on ${inputtype.genelist.name}">
        <filter>inputtype=="file"</filter>
    </data>
    <data name="output" format="tsv" label="Add_information_from_neXtProt"/>
</outputs>

<tests>
    <test>
        <conditional name="inputtype">
            <param name="filetype" value="file"/>
            <param name="genelist" value="FKW_ID_Converter_Lacombe_et_al_2017_OK.tsv"/>
            <param name="column" value="c1"/>
            <param name="header" value="true"/>
        </conditional>

        <!-- Must be one of the option values declared for "idtype". -->
        <param name="idtype" value="Uniprot_AC"/>

        <section name="Nextprot_params">
            <param name="pc_features" value="SeqLength,MW,IsoPoint,TMDomains,ProteinExistence"/>
            <param name="localization" value="Chr,SubcellLocations"/>
            <param name="diseases_info" value="true"/>
        </section>

        <output name="output" file="Add_information_from_neXtProt.tsv"/>
    </test>
</tests>

<help><![CDATA[

This tool adds annotations (protein features) from the neXtProt database (knowledge base on human proteins) to your list of protein IDs.

**Input**

Input can be a file containing multiple fields but with **at least one column of Uniprot accession number or neXtProt IDs**. If your input file contains other type of IDs, please use the ID_Converter tool.

**Databases**

Annotations have been retrieved from the neXtProt released on 21/02/2018 using the latest data from peptideAtlas (release Human 2018-1)

using a REST API (https://academic.oup.com/nar/article/43/D1/D764/2439066#40348985) (Gaudet et al., 2017)

**Outputs**

The output is a tabular file. The initial columns are kept and columns are added according to the annotations you have selected.

-----

.. class:: infomark

**Authors**

David Christiany, Lisa Peru, T.P. Lien Nguyen, Florence Combes, Yves Vandenbrouck CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR

Sandra Dérozier, Olivier Rué, Christophe Caron, Valentin Loux INRA, Paris-Saclay University, MAIAGE Unit, Migale Bioinformatics platform

This work has been partially funded through the French National Agency for Research (ANR) IFB project.

Contact support@proteore.org for any questions or concerns about the Galaxy implementation of this tool.

    ]]></help>
    <citations>
    </citations>

</tool>
# --- a/protein_features.R Fri Dec 07 05:09:57 2018 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,202 +0,0 @@
#
# protein_features.R (file removed by this changeset)
# Adds neXtProt annotations (physico-chemical features, localization,
# diseases) to a list of protein identifiers given either as a pasted list or
# as a column of a tab-separated file.

# Read a tab-separated file and return its content as a data.frame.
#
# path:   path of the file to read.
# header: TRUE when the first line of the file holds the column names.
#
# Rows in which every cell is NA or "" are dropped. Stops with an error when
# the file cannot be read.
read_file <- function(path, header) {
  content <- tryCatch(
    read.table(path, header = header, sep = "\t", stringsAsFactors = FALSE,
               quote = "", check.names = FALSE),
    error = function(e) {
      # Keep the historical message, but also report the real cause: the
      # failure is not always a missing file (e.g. ragged lines).
      stop("File not found ! (", path, "): ", conditionMessage(e),
           call. = FALSE)
    }
  )
  is_blank <- is.na(content) | content == ""
  content[!apply(is_blank, 1, all), , drop = FALSE]
}

# Move the first column of `df` (the merge key, which merge() puts first)
# back to position `ncol`.
#
# df:      merged data.frame whose first column is the ID column.
# ncol:    position the ID column had in the input table.
# id_type: "Uniprot_AC" or "NextprotID"; for "Uniprot_AC" the key is the
#          NextprotID column appended as the last column of `file`.
# file:    the input table that was merged.
#
# Returns the data.frame with its columns reordered.
order_columns <- function(df, ncol, id_type, file) {
  if (id_type == "Uniprot_AC") {
    ncol <- base::ncol(file)
  }
  if (ncol == 1) { # already at the right position
    return(df)
  }
  total <- base::ncol(df)
  # (ncol + 1):total would count backwards when the key is the last column.
  trailing <- if (ncol < total) seq(ncol + 1, total) else integer(0)
  df[, c(seq_len(ncol)[-1L], 1L, trailing), drop = FALSE]
}

# Turn a pasted list of identifiers into a character vector.
#
# list: a single string of IDs separated by spaces, tabs or line breaks.
#
# Entries that are empty or exactly "NA" are dropped and isoform suffixes
# (e.g. "-2") are removed.
get_list_from_cp <- function(list) {
  ids <- strsplit(list, "[[:space:]]+")[[1]]
  # Drop only entries that ARE "NA": removing the substring would corrupt
  # real accessions such as Q8NA03.
  ids <- ids[!is.na(ids) & !ids %in% c("", "NA")]
  ids <- sub("-.+", "", ids) # remove isoform accession number (e.g. "-2")
  ids[ids != ""]
}

# Parse the command line ("--key=value" arguments) into a named list.
#
# Prints the usage and quits when called without arguments or with --help.
# Only the first "=" separates key and value, so values may contain "=".
# An argument without a value is stored as "".
get_args <- function() {
  args <- commandArgs(TRUE)

  # Default setting when no arguments are passed
  if (length(args) < 1) {
    args <- c("--help")
  }

  if ("--help" %in% args) {
    cat("Add protein features (neXtProt)
      Arguments:
        --inputtype: type of input (copy_paste or file)
        --input: list of IDs or path of the input file
        --nextprot: path to nextprot information file
        --column: the column holding the IDs (e.g. c1)
        --header: true/false if your file contains a header
        --type: the type of input IDs (Uniprot_AC/NextprotID)
        --pc_features: IsoPoint,SeqLength,MW,TMDomains,ProteinExistence
        --localization: Chr,SubcellLocations
        --diseases_info: true/false, add the Diseases column
        --output: text output filename \n")

    q(save = "no")
  }

  eq_pos <- as.integer(regexpr("=", args, fixed = TRUE))
  has_value <- eq_pos > 0
  keys <- ifelse(has_value, substr(args, 1L, eq_pos - 1L), args)
  keys <- sub("^--", "", keys)
  values <- ifelse(has_value, substr(args, eq_pos + 1L, nchar(args)), "")

  parsed <- as.list(values)
  names(parsed) <- keys
  parsed
}

# Convert "t"/"true"/"f"/"false" (any case) to a logical.
# Returns NULL for any other value.
str2bool <- function(x) {
  value <- tolower(x)
  if (any(c("t", "true") %in% value)) {
    TRUE
  } else if (any(c("f", "false") %in% value)) {
    FALSE
  } else {
    NULL
  }
}

# Expand one row whose ID cell holds several ";"-separated IDs.
#
# line: a one-row data.frame.
# ncol: index of the ID column.
#
# Returns a data.frame with the same columns and one row per ID; the other
# cells are repeated. Blanks (including non-breaking spaces) are removed from
# the ID cell. A row without any ID is returned as is, with its cell cleaned.
split_ids_per_line <- function(line, ncol) {
  cell <- gsub("[[:blank:]]|\u00A0", "", as.character(line[[ncol]]))
  ids <- strsplit(cell, ";", fixed = TRUE)[[1]]
  ids <- ids[!is.na(ids) & ids != ""]
  if (length(ids) == 0) {
    line[[ncol]] <- cell
    return(line)
  }
  lines <- line[rep(1L, length(ids)), , drop = FALSE]
  lines[[ncol]] <- ids
  rownames(lines) <- NULL
  lines
}

# Create new lines when a cell of the ID column holds more than one ID, so
# that the result has exactly one ID per line.
#
# tab:  input data.frame.
# ncol: index of the ID column.
one_id_one_line <- function(tab, ncol) {
  if (base::ncol(tab) > 1) {
    if (nrow(tab) == 0) {
      return(tab)
    }
    # Build all pieces first and bind once instead of growing with rbind().
    pieces <- lapply(seq_len(nrow(tab)), function(i) {
      split_ids_per_line(tab[i, , drop = FALSE], ncol)
    })
    res <- do.call(rbind, pieces)
    rownames(res) <- NULL
  } else {
    ids <- gsub("[[:blank:]]|\u00A0", "", as.character(tab[[1]]))
    ids <- as.character(unlist(strsplit(ids, ";", fixed = TRUE),
                               use.names = FALSE))
    # Filter on the vector itself; indexing it with positions computed on an
    # already filtered copy keeps blanks and loses trailing IDs.
    ids <- ids[!is.na(ids) & ids != ""]
    res <- data.frame(ids, stringsAsFactors = FALSE)
    colnames(res) <- colnames(tab)
  }
  res
}

# Get information from neXtProt.
#
# nextprot:      reference data.frame with a "NextprotID" column.
# input:         vector of neXtProt IDs to look up.
# pc_features:   names of physico-chemical columns ("None" is ignored).
# localization:  names of localization columns ("None" is ignored).
# diseases_info: TRUE to add the "Diseases" column.
#
# Returns a data.frame with one row per element of `input` (NA rows for IDs
# that are not in the reference) and the selected columns.
get_nextprot_info <- function(nextprot, input, pc_features, localization,
                              diseases_info) {
  cols <- c("NextprotID", pc_features, localization)
  if (isTRUE(diseases_info)) {
    cols <- c(cols, "Diseases")
  }
  cols <- cols[cols != "None"]

  missing_cols <- setdiff(cols, colnames(nextprot))
  if (length(missing_cols) > 0) {
    stop("Unknown neXtProt feature(s): ",
         paste(missing_cols, collapse = ", "), call. = FALSE)
  }
  # drop = FALSE: keep a data.frame even when only NextprotID is selected.
  nextprot[match(input, nextprot$NextprotID), cols, drop = FALSE]
}

# Parse a column specification such as "c3" or "3" into a column index.
parse_column_index <- function(column) {
  index <- suppressWarnings(as.numeric(gsub("c", "", column)))
  if (length(index) != 1 || is.na(index) || index < 1 || index %% 1 != 0) {
    stop("Please enter a valid column number (e.g. c1)", call. = FALSE)
  }
  index
}

# Split a comma-separated argument; a missing argument gives character(0).
split_csv_arg <- function(value) {
  if (is.null(value) || is.na(value)) {
    return(character(0))
  }
  strsplit(value, ",", fixed = TRUE)[[1]]
}

# Replace "" and " " by NA in the character/factor columns of a data.frame.
# Works column by column: apply() would go through as.matrix(), which
# format()s numeric columns (space padding, common number of digits).
blank_to_na <- function(df) {
  df[] <- lapply(df, function(column) {
    if (is.factor(column)) {
      column <- as.character(column)
    }
    if (is.character(column)) {
      column[column %in% c("", " ")] <- NA
    }
    column
  })
  df
}

# Entry point: read the arguments, load the IDs and the neXtProt reference,
# add the requested annotation columns and write the result to --output.
protein_features <- function() {
  args <- get_args()

  inputtype <- args$inputtype
  if (is.null(inputtype) || !inputtype %in% c("copy_paste", "file")) {
    stop("--inputtype must be 'copy_paste' or 'file'", call. = FALSE)
  }
  id_type <- args$type
  if (is.null(id_type)) {
    stop("--type is required (Uniprot_AC or NextprotID)", call. = FALSE)
  }

  if (inputtype == "copy_paste") {
    file <- data.frame(input = get_list_from_cp(args$input),
                       stringsAsFactors = FALSE)
    id_col <- 1
  } else {
    id_col <- parse_column_index(args$column)
    file <- read_file(args$input, str2bool(args$header)) # Get file content
    if (id_col > ncol(file)) {
      stop("Column ", id_col, " does not exist in the input file",
           call. = FALSE)
    }
    if (any(grepl(";", file[, id_col], fixed = TRUE))) {
      file <- one_id_one_line(file, id_col)
    }
    if (id_type == "NextprotID") {
      # The ID column must be the only one named "NextprotID".
      existing <- match("NextprotID", colnames(file))
      if (!is.na(existing) && existing != id_col) {
        colnames(file)[existing] <- "old_NextprotID"
      }
      colnames(file)[id_col] <- "NextprotID"
    }
  }

  # Read reference file
  nextprot <- read_file(args$nextprot, TRUE)
  if (!"NextprotID" %in% colnames(nextprot)) {
    stop("The neXtProt reference file has no 'NextprotID' column",
         call. = FALSE)
  }

  # Parse arguments
  pc_features <- split_csv_arg(args$pc_features)
  localization <- split_csv_arg(args$localization)
  diseases_info <- isTRUE(str2bool(args$diseases_info))
  output <- args$output

  # Uniprot accessions are turned into neXtProt IDs ("NX_" prefix) in an
  # extra last column, which then becomes the merge key.
  if (id_type == "Uniprot_AC") {
    accessions <- as.character(file[, id_col])
    NextprotID <- paste0("NX_", accessions)
    NextprotID[is.na(accessions)] <- NA_character_
    NextprotID[!is.na(accessions) & accessions == ""] <- ""
    file <- data.frame(file, NextprotID = NextprotID, check.names = FALSE,
                       stringsAsFactors = FALSE)
    if (inputtype == "copy_paste") {
      colnames(file)[1] <- "Uniprot-AC"
    }
    id_col <- ncol(file)
  }
  NextprotID <- file[, id_col]

  # Select user input protein ids in nextprot
  if (!any(NextprotID %in% nextprot[["NextprotID"]])) {
    write.table("None of the input ids can be found in Nextprot",
                file = output, sep = "\t", quote = FALSE, col.names = TRUE,
                row.names = FALSE)
  } else {
    res <- get_nextprot_info(nextprot, NextprotID, pc_features, localization,
                             diseases_info)
    res <- res[!duplicated(res$NextprotID), , drop = FALSE]
    output_content <- merge(file, res, by.x = id_col, by.y = "NextprotID",
                            incomparables = NA, all.x = TRUE)
    output_content <- order_columns(output_content, id_col, id_type, file)
    output_content <- blank_to_na(output_content) # convert "" and " " to NA
    write.table(output_content, output, row.names = FALSE, sep = "\t",
                quote = FALSE)
  }
}

# Run the tool only when this file is executed as a script (Rscript), not
# when it is source()d, so the functions above can be loaded on their own.
if (sys.nframe() == 0L) {
  protein_features()
}
