Mercurial > repos > proteore > proteore_prot_features

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/add_protein_features.R	Thu Dec 13 03:57:57 2018 -0500
@@ -0,0 +1,202 @@
+# Read a tab-separated file into a data.frame (quotes kept literally, column names unchanged) and drop rows whose cells are all NA or "".
+# path: file to read; header: TRUE when the first line holds the column names. Stops with the underlying read error on failure.
+read_file <- function(path,header){
+  file <- tryCatch(
+    read.table(path, header = header, sep = "\t", stringsAsFactors = FALSE, quote = "", check.names = FALSE),
+    error = function(e) stop("File not found or unreadable: ", path, " (", conditionMessage(e), ")", call. = FALSE)
+  )
+  blank_row <- apply(is.na(file) | file == "", 1, all)   # TRUE for rows that carry no information at all
+  file[!blank_row, , drop = FALSE]
+}
+
+order_columns <- function (df,ncol,id_type,file){
+  # Put column 1 of df (where the caller's merge() left the key) back at position ncol; returns the reordered data.frame.
+  # For id_type "Uniprot_AC" the target position is the last column of `file`, overriding the ncol argument.
+  if (id_type=="Uniprot_AC"){ncol=ncol(file)}
+  if (ncol==1){ return (df) }   # already at the right position
+  # Columns after position ncol; empty when ncol is the last column, where (ncol+1):ncol(df) would count backwards and index out of range.
+  trailing = seq_len(ncol(df))[-seq_len(ncol)]
+  return (df[,c(2:ncol,1,trailing),drop=FALSE])
+}
+
+get_list_from_cp <-function(list){
+  # Split a pasted string on spaces/tabs/newlines, drop empty and literal "NA" entries, strip isoform suffixes (e.g. "-2"); returns a character vector.
+  ids = strsplit(list, "[ \t\r\n]+")[[1]]
+  ids = ids[!ids %in% c("", "NA")]   # whole-token match only: gsub("NA","") also cut "NA" out of accessions such as "A1NAB2"
+  ids = gsub("-.+", "", ids)         # remove isoform accession number (e.g. "-2")
+  return(ids)
+}
+
+get_args <- function(){
+  # Parse "--key=value" command-line arguments into a named list of character values (names are the keys without "--").
+  # With no argument, or when "--help" is present, print the usage text and quit.
+  args <- commandArgs(TRUE)
+
+  ## Default setting when no arguments passed
+  if(length(args) < 1) {
+    args <- c("--help")
+  }
+
+  ## Help section
+  if("--help" %in% args) {
+    cat("Add protein features (neXtProt)
+        Arguments:
+        --inputtype: type of input (copy_paste or file)
+        --input: input (list of ids or filename)
+        --nextprot: path to nextprot information file
+        --column: the column number which contains the ids (e.g. c1)
+        --header: true/false if your file contains a header
+        --type: the type of input IDs (Uniprot_AC/NextprotID)
+        --pc_features: IsoPoint,SeqLength,MW
+        --localization: Chr,SubcellLocations
+        --diseases_info: true/false to add the Diseases column
+        --output: text output filename \n")
+
+    q(save="no")
+  }
+
+  # Split each argument on its FIRST "=" only, so values that contain "=" (e.g. a file path) are kept whole.
+  keys <- sub("^--", "", sub("=.*$", "", args))
+  values <- as.list(sub("^[^=]*=?", "", args))   # an argument without "=" yields an empty value
+  names(values) <- keys
+
+  return(values)
+}
+
+str2bool <- function(x){
+  # Case-insensitive text to logical: "t"/"true" give TRUE, "f"/"false" give FALSE,
+  # anything else gives NULL.
+  lowered <- tolower(x)
+  if (any(c("t","true") %in% lowered)) return (TRUE)
+  if (any(c("f","false") %in% lowered)) return (FALSE)
+  # unrecognised value
+  NULL
+}
+
+# Expand one row whose ID cell holds several ";"-separated ids into one row per id (takes a data frame, returns a data frame).
+split_ids_per_line <- function(line,ncol){
+  # line: single-row data.frame (one_id_one_line passes tab[i,]); ncol: index of the ID column.
+  # Returns the expanded rows under the input's column names, or the row itself when the cell holds a single id.
+  header = colnames(line)
+  line[ncol] = gsub("[[:blank:]]|\u00A0","",line[ncol])   # strip spaces, tabs and non-breaking spaces from the ID cell
+
+  if (length(unlist(strsplit(as.character(line[ncol]),";")))>1) {   # more than one id in the cell
+    if (length(line)==1 ) {   # the ID column is the only column
+      lines = as.data.frame(unlist(strsplit(as.character(line[ncol]),";")),stringsAsFactors = F)
+    } else {   # NOTE(review): the cbind calls below presumably rely on recycling to repeat the other columns on each new row - confirm
+      if (ncol==1) {                                #first column
+        lines = suppressWarnings(cbind(unlist(strsplit(as.character(line[ncol]),";")), line[2:length(line)]))
+      } else if (ncol==length(line)) {                 #last column
+        lines = suppressWarnings(cbind(line[1:ncol-1],unlist(strsplit(as.character(line[ncol]),";"))))
+      } else {   # middle column. NOTE: here and above, 1:ncol-1 parses as (1:ncol)-1 = 0:(ncol-1); index 0 selects nothing
+        lines = suppressWarnings(cbind(line[1:ncol-1], unlist(strsplit(as.character(line[ncol]),";"),use.names = F), line[(ncol+1):length(line)]))
+      }
+    }
+    colnames(lines)=header   # put the input's column names back on the result
+    return(lines)
+  } else {
+    return(line)
+  }
+}
+
+# Create new lines when a cell of the ID column holds several ";"-separated ids, so that each line carries a single id.
+# tab: data.frame; ncol: index of the ID column. Returns a data.frame with the same column names as tab.
+one_id_one_line <-function(tab,ncol){
+  if (ncol(tab)>1){
+    tab[,ncol] = gsub("[[:blank:]]","",tab[,ncol])    # gsub is vectorised, no sapply needed
+    res=as.data.frame(matrix(ncol=ncol(tab),nrow=0))
+    for (i in seq_len(nrow(tab))) {                    # seq_len: 1:nrow(tab) would run over c(1, 0) on an empty table
+      res = rbind(res,split_ids_per_line(tab[i,],ncol))
+    }
+  }else {
+    # Single-column input: split every cell, then keep the non-empty, non-NA ids.
+    ids = unlist(strsplit(as.character(tab[,1]),";"),use.names = FALSE)
+    # Filter on the full vector: the former res[which(!is.na(res[res!=""]))] applied positions computed on the
+    # filtered vector to the unfiltered one, so "A;;B" produced "A","" instead of "A","B".
+    res = data.frame(ids[!is.na(ids) & ids != ""],stringsAsFactors = FALSE)
+    colnames(res)=colnames(tab)
+  }
+  return(res)
+}
+
+# Get information from neXtProt: the rows of `nextprot` matching `input` (NA-filled rows for unknown ids), restricted to
+# NextprotID plus the requested pc_features/localization columns and, when diseases_info is TRUE, "Diseases".
+get_nextprot_info <- function(nextprot,input,pc_features,localization,diseases_info){
+  cols = c("NextprotID",pc_features,localization)
+  if(diseases_info){
+    cols = c(cols,"Diseases")
+  }
+  cols=cols[cols!="None"]   # presumably "None" is what the Galaxy wrapper passes for an empty selection - confirm
+  # drop=FALSE keeps a data.frame when NextprotID is the only column left; without it a bare vector was returned
+  # and the caller's res$NextprotID failed.
+  nextprot[match(input,nextprot$NextprotID),cols,drop=FALSE]
+}
+
+protein_features = function() {
+  # Entry point: read the ids (pasted list, or one column of a tab-separated file), look them up in the neXtProt
+  # table given by --nextprot and write the input rows plus the selected annotation columns to --output.
+  # When none of the ids is found in neXtProt, a one-line message is written to --output instead.
+  args <- get_args()
+
+  #setting variables
+  inputtype = args$inputtype
+  if (inputtype == "copy_paste") {
+    input = get_list_from_cp(args$input)
+    file = data.frame(input,stringsAsFactors = FALSE)
+    ncol=1
+  } else if (inputtype == "file") {
+    filename = args$input
+    ncol = args$column
+    # Check ncol: drop the "c" prefix and require a positive integer (a non-numeric entry used to crash on `if (NA)`)
+    ncol = suppressWarnings(as.numeric(gsub("c", "", ncol)))
+    if (is.na(ncol) || ncol %% 1 != 0 || ncol < 1) {
+      stop("Please enter a valid column number (e.g. c1)")
+    }
+
+    header = str2bool(args$header)
+    file = read_file(filename, header)                                                    # Get file content
+    if (ncol > ncol(file)) {stop("Column ", ncol, " not found in the input file")}
+    if (any(grep(";",file[,ncol]))) {file = one_id_one_line(file,ncol)}
+    # With neXtProt ids, the ID column is named "NextprotID"; another column already using that name becomes "old_NextprotID"
+    if (args$type == "NextprotID" && ! "NextprotID" %in% colnames(file)) { colnames(file)[ncol] <- "NextprotID"
+    } else if (args$type == "NextprotID" && "NextprotID" %in% colnames(file) && match("NextprotID",colnames(file))!=ncol ) {
+      colnames(file)[match("NextprotID",colnames(file))] <- "old_NextprotID"
+      colnames(file)[ncol] = "NextprotID"
+    }
+  }
+
+  # Read reference file
+  nextprot = read_file(args$nextprot,TRUE)
+
+  # Parse arguments
+  id_type = args$type
+  pc_features = strsplit(args$pc_features, ",")[[1]]
+  localization = strsplit(args$localization, ",")[[1]]
+  diseases_info = str2bool(args$diseases_info)
+  output = args$output
+
+  # Change the sample ids if they are Uniprot_AC ids to be able to match them with
+  # Nextprot data
+  if (id_type=="Uniprot_AC"){
+    NextprotID = gsub("^NX_$","",gsub("^","NX_",file[,ncol]))   # prefix with "NX_"; empty cells stay empty
+    file = cbind(file,NextprotID)
+    if (inputtype=="copy_paste") {colnames(file)[1]="Uniprot-AC"}
+    ncol=ncol(file)   # the appended NextprotID column becomes the merge key
+  }
+  NextprotID = file[,ncol]
+
+  #Select user input protein ids in nextprot
+  if (all(!NextprotID %in% nextprot[,1])){
+    write.table("None of the input ids can be found in Nextprot",file=output,sep="\t",quote=FALSE,col.names=TRUE,row.names=FALSE)
+  } else {
+    res <- get_nextprot_info(nextprot,NextprotID,pc_features,localization,diseases_info)
+    res = res[!duplicated(res$NextprotID), , drop=FALSE]
+    # all.x keeps the input rows that have no neXtProt match; order_columns() then moves the key column back in place
+    output_content = merge(file, res,by.x=ncol,by.y="NextprotID",incomparables = NA,all.x=TRUE)
+    output_content = order_columns(output_content,ncol,id_type,file)
+    output_content <- as.data.frame(apply(output_content, c(1,2), function(x) gsub("^$|^ $", NA, x)))  #convert "" and " " to NA
+    write.table(output_content, output, row.names = FALSE, sep = "\t", quote = FALSE)
+  }
+
+}
+protein_features()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/add_protein_features.xml	Thu Dec 13 03:57:57 2018 -0500
@@ -0,0 +1,149 @@
+<tool id="prot_features" name="Add protein features" version="2018.12.12">
+<description>[neXtProt]
+</description>
+<requirements>
+  <requirement type="package" version="3.4.1">R</requirement>
+</requirements>
+<stdio>
+  <exit_code range="1:" />
+</stdio>
+<command><![CDATA[
+
+  Rscript $__tool_directory__/add_protein_features.R
+  --inputtype="$inputtype.filetype"
+  --input='$inputtype.genelist'
+
+  #if $inputtype.filetype == "file"
+    --column='$inputtype.column'
+    --header=$inputtype.header
+  #end if
+
+  --type='$idtype'
+  --pc_features='$Nextprot_params.pc_features'
+  --localization='$Nextprot_params.localization'
+  --diseases_info='$Nextprot_params.diseases_info'
+  --output='$output'
+  --nextprot=$__tool_directory__/tool-data/result_nextprot.txt
+
+]]></command>
+
+<inputs>
+  <conditional name="inputtype">
+    <param name="filetype" type="select" label="Select your type of input file">
+      <option value="file" selected="true">Input file containing your identifiers (neXtProt or Uniprot ID)</option>
+      <option value="copy_paste">Copy/paste your list of IDs</option>
+    </param>
+    <when value="copy_paste">
+      <param name="genelist" type="text" label="Enter a list of identifiers separated by tab, space or carriage return into the form field" help="for example : A0AVI2 A6NGB0">
+        <sanitizer invalid_char="">
+            <valid initial="string.printable">
+                <remove value="&apos;"/>
+            </valid>
+            <mapping initial="none">
+                <add source="&apos;" target="__sq__"/>
+                <add source="&#x20;" target=""/>
+                <add source="&#xA;" target=""/>
+                <add source="&#xD;" target=""/>
+                <add source="&#x9;" target=""/>
+            </mapping>
+        </sanitizer>
+      </param>
+    </when>
+    <when value="file">
+      <param name="genelist" type="data" format="txt,tabular" label="Choose a file that contains your list of IDs" help="This file must imperatively have 1 column filled with IDs consistent with the neXtprot database (Uniprot accession number or neXtProt ID). If this is not the case, please use the ID_Converter tool."/>
+      <param name="column" type="text" label="Please specify the column where are your IDs (e.g : Enter c1 for column n°1)" value="c1"/>
+      <param name="header" type="boolean" checked="true" truevalue="true" falsevalue="false" label="Does your input file have a header?" />
+
+    </when>
+  </conditional>
+
+      <param name="idtype" type="select" label="Type of your input ids" multiple="false" optional="false">
+ 		      <option value="Uniprot_AC" selected="true">Uniprot accession number</option>
+          <option value="NextprotID" selected="false">neXtProt IDs</option>
+      </param>
+      <section name="Nextprot_params" title="Select features of interest (compulsory step)" expanded="True">
+        <param name="pc_features" type="select" label="Physico-Chemical Features" multiple="true" help="Choose the information you want to add to your data from Nextprot" display="checkboxes" optional="true">
+          <option value="SeqLength" selected="false">Sequence Length</option>
+          <option value="MW" selected="false">Molecular Weight</option>
+          <option value="IsoPoint" selected="false">Isoelectric point</option>
+          <option value="TMDomains" selected="false">Number of transmembrane domains</option>
+          <option value="ProteinExistence" selected="false">Protein Existence (evidence score from 1 to 5)</option>
+        </param>
+
+        <param name="localization" type="select" label="Localization" multiple="true" help="Choose the information you want to add to your data from Nextprot" display="checkboxes" optional="true">
+ 		      <option value="Chr" selected="false">Chromosome</option>
+ 		      <option value="SubcellLocations" selected="false">Subcellular Location</option>
+        </param>
+
+        <param name="diseases_info" type="boolean" checked="false" truevalue="true" falsevalue="false" label="Diseases informations" />
+
+      </section>
+
+</inputs>
+
+
+<outputs>
+  <data name="output" format="tsv" label="Add_information_from_neXtProt on ${inputtype.genelist.name}">
+    <filter>inputtype=="file"</filter>
+  </data>
+  <data name="output" format="tsv" label="Add_information_from_neXtProt"/>
+</outputs>
+
+<tests>
+  <test>
+    <conditional name="inputtype">
+      <param name="filetype" value="file"/>
+      <param name="genelist" value="FKW_ID_Converter_Lacombe_et_al_2017_OK.tsv"/>
+      <param name="column" value="c1"/>
+      <param name="header" value="true"/>
+    </conditional>
+    <!-- idtype must be one of the declared option values (Uniprot_AC or NextprotID) -->
+    <param name="idtype" value="Uniprot_AC"/>
+
+    <section name="Nextprot_params">
+      <param name="pc_features" value="SeqLength,MW,IsoPoint,TMDomains,ProteinExistence"/>
+      <param name="localization" value="Chr,SubcellLocations"/>
+      <param name="diseases_info" value="true"/>
+    </section>
+
+    <output name="output" file="Add_information_from_neXtProt.tsv"/>
+  </test>
+</tests>
+
+<help><![CDATA[
+
+This tool adds annotations (protein features) from the neXtProt database (a knowledge base on human proteins) to your list of protein IDs.
+
+**Input**
+
+Input can be a file containing multiple fields but with **at least one column of Uniprot accession numbers or neXtProt IDs**. If your input file contains other types of IDs, please use the ID_Converter tool.
+
+**Databases**
+
+Annotations have been retrieved from the neXtProt released on 21/02/2018 using the latest data from peptideAtlas (release Human 2018-1)
+
+using a REST API (https://academic.oup.com/nar/article/43/D1/D764/2439066#40348985) (Gaudet et  al., 2017)
+
+**Outputs**
+
+The output is a tabular file. The initial columns are kept and new columns are added according to the annotations you have selected.
+
+-----
+
+.. class:: infomark
+
+**Authors**
+
+David Christiany, Lisa Peru, T.P. Lien Nguyen, Florence Combes, Yves Vandenbrouck CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR
+
+Sandra Dérozier, Olivier Rué, Christophe Caron, Valentin Loux INRA, Paris-Saclay University, MAIAGE Unit, Migale Bioinformatics platform
+
+This work has been partially funded through the French National Agency for Research (ANR) IFB project.
+
+Contact support@proteore.org for any questions or concerns about the Galaxy implementation of this tool.
+
+    ]]></help>
+    <citations>
+    </citations>
+
+</tool>
--- a/prot_features.xml	Fri Dec 07 05:09:57 2018 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,149 +0,0 @@
-<tool id="prot_features" name="Add human protein features" version="2018.12.07">
-<description>(neXtProt)
-</description>
-<requirements>
-  <requirement type="package" version="3.4.1">R</requirement>
-</requirements>
-<stdio>
-  <exit_code range="1:" />
-</stdio>
-<command><![CDATA[
-
-  Rscript $__tool_directory__/protein_features.R
-  --inputtype="$inputtype.filetype"
-  --input='$inputtype.genelist'
-
-  #if $inputtype.filetype == "file"
-    --column='$inputtype.column'
-    --header=$inputtype.header
-  #end if
-
-  --type='$idtype'
-  --pc_features='$Nextprot_params.pc_features'
-  --localization='$Nextprot_params.localization'
-  --diseases_info='$Nextprot_params.diseases_info'
-  --output='$output'
-  --nextprot=$__tool_directory__/tool-data/result_nextprot.txt
-
-]]></command>
-
-<inputs>
-  <conditional name="inputtype">
-    <param name="filetype" type="select" label="Select your type of input file">
-      <option value="file" selected="true">Input file containing your identifiers (neXtProt or Uniprot ID)</option>
-      <option value="copy_paste">Copy/paste your list of IDs</option>
-    </param>
-    <when value="copy_paste">
-      <param name="genelist" type="text" label="Enter a list of identifiers separated by tab,space or carriage return into the form field" help="for example : A0AVI2 A6NGB0">
-        <sanitizer invalid_char="">
-            <valid initial="string.printable">
-                <remove value="&apos;"/>
-            </valid>
-            <mapping initial="none">
-                <add source="&apos;" target="__sq__"/>
-                <add source="&#x20;" target=""/>
-                <add source="&#xA;" target=""/>
-                <add source="&#xD;" target=""/>
-                <add source="&#x9;" target=""/>
-            </mapping>
-        </sanitizer>
-      </param>
-    </when>
-    <when value="file">
-      <param name="genelist" type="data" format="txt,tabular" label="Choose a file that contains your list of IDs" help="This file must imperatively have 1 column filled with IDs consistent with the neXtprot database (Uniprot accession number or neXtProt ID). If this is not the case, please use the ID_Converter tool."/>
-      <param name="column" type="text" label="Please specify the column where are your IDs (e.g : Enter c1 for column n°1)" value="c1"/>
-      <param name="header" type="boolean" checked="true" truevalue="true" falsevalue="false" label="Does your input file have a header?" />
-
-    </when>
-  </conditional>
-
-      <param name="idtype" type="select" label="Type of your input ids" multiple="false" optional="false">
- 		      <option value="Uniprot_AC" selected="true">Uniprot accession number</option>
-          <option value="NextprotID" selected="false">neXtProt IDs</option>
-      </param>
-      <section name="Nextprot_params" title="Select features of interest (compulsory step)" expanded="True">
-        <param name="pc_features" type="select" label="Physico-Chemical Features" multiple="true" help="Choose the information you want to add to your data from Nextprot" display="checkboxes" optional="true">
-          <option value="SeqLength" selected="false">Sequence Length</option>
-          <option value="MW" selected="false">Molecular Weight</option>
-          <option value="IsoPoint" selected="false">Isoelectric point</option>
-          <option value="TMDomains" selected="false">Number of transmembrane domains</option>
-          <option value="ProteinExistence" selected="false">Protein Existence (evidence score from 1 to 5)</option>
-        </param>
-
-        <param name="localization" type="select" label="Localization" multiple="true" help="Choose the information you want to add to your data from Nextprot" display="checkboxes" optional="true">
- 		      <option value="Chr" selected="false">Chromosome</option>
- 		      <option value="SubcellLocations" selected="false">Subcellular Location</option>
-        </param>
-
-        <param name="diseases_info" type="boolean" checked="false" truevalue="true" falsevalue="false" label="Diseases informations" />
-
-      </section>
-
-</inputs>
-
-
-<outputs>
-  <data name="output" format="tsv" label="Add_information_from_neXtProt on ${inputtype.genelist.name}">
-    <filter>inputtype=="file"</filter>
-  </data>
-  <data name="output" format="tsv" label="Add_information_from_neXtProt"/>
-</outputs>
-
-<tests>
-  <test>
-    <conditional name="inputtype">
-      <param name="filetype " value="file"/>
-      <param name="genelist" value="FKW_ID_Converter_Lacombe_et_al_2017_OK.tsv"/>
-      <param name="column" value="c1"/>
-      <param name="header" value="true"/>
-    </conditional>
-
-    <param name="idtype" value="uniprot"/>
-
-    <section name="Nextprot_params">
-      <param name="pc_features" value="SeqLength,MW,IsoPoint,TMDomains,ProteinExistence"/>
-      <param name="localization" value="Chr,SubcellLocations"/>
-      <param name="diseases_info" value="true"/>
-    </section>
-
-    <output name="output" file="Add_information_from_neXtProt.tsv"/>
-  </test>
-</tests>
-
-<help><![CDATA[
-
-This tool add annotation (protein features) from neXtProt database (knowledge base on human proteins) to your protein IDs list.
-
-**Input**
-
-Input can be a file containing multiple fields but with **at least one column of Uniprot accession number or neXtProt IDs**. If your input file contains other type of IDs, please use the ID_Converter tool.
-
-**Databases**
-
-Annotations have been retrieved from the neXtProt released on 21/02/2018 using the latest data from peptideAtlas (release Human 2018-1)
-
-using a REST API (https://academic.oup.com/nar/article/43/D1/D764/2439066#40348985) (Gaudet et  al., 2017)
-
-**Outputs**
-
-The output is a tabular file. The initial columns are kept and columns are be added according to which annotation you have selected.
-
------
-
-.. class:: infomark
-
-**Authors**
-
-David Christiany, Lisa Peru, T.P. Lien Nguyen, Florence Combes, Yves Vandenbrouck CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR
-
-Sandra Dérozier, Olivier Rué, Christophe Caron, Valentin Loux INRA, Paris-Saclay University, MAIAGE Unit, Migale Bioinformatics platform
-
-This work has been partially funded through the French National Agency for Research (ANR) IFB project.
-
-Contact support@proteore.org for any questions or concerns about the Galaxy implementation of this tool.
-
-    ]]></help>
-    <citations>
-    </citations>
-
-</tool>
--- a/protein_features.R	Fri Dec 07 05:09:57 2018 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,202 +0,0 @@
-# Read file and return file content as data.frame
-read_file <- function(path,header){
-  file <- try(read.table(path,header=header, sep="\t",stringsAsFactors = FALSE, quote="", check.names = F),silent=TRUE)
-  if (inherits(file,"try-error")){
-    stop("File not found !")
-  }else{
-    file <- file[!apply(is.na(file) | file == "", 1, all), , drop=FALSE]
-    return(file)
-  }
-}
-
-order_columns <- function (df,ncol,id_type,file){
-  if (id_type=="Uniprot_AC"){ncol=ncol(file)}
-  if (ncol==1){ #already at the right position
-    return (df)
-  } else {
-    df = df[,c(2:ncol,1,(ncol+1):dim.data.frame(df)[2])]
-  }
-  return (df)
-}
-
-get_list_from_cp <-function(list){
-  list = strsplit(list, "[ \t\n]+")[[1]]
-  list = gsub("NA","",list)
-  list = list[list != ""]    #remove empty entry
-  list = gsub("-.+", "", list)  #Remove isoform accession number (e.g. "-2")
-  return(list)
-}
-
-get_args <- function(){
-
-  ## Collect arguments
-  args <- commandArgs(TRUE)
-
-  ## Default setting when no arguments passed
-  if(length(args) < 1) {
-    args <- c("--help")
-  }
-
-  ## Help section
-  if("--help" %in% args) {
-    cat("Selection and Annotation HPA
-        Arguments:
-          --inputtype: type of input (list of id or filename)
-        --input: input
-        --nextprot: path to nextprot information file
-        --column: the column number which you would like to apply...
-        --header: true/false if your file contains a header
-        --type: the type of input IDs (Uniprot_AC/EntrezID)
-        --pc_features: IsoPoint,SeqLength,MW
-        --localization: Chr,SubcellLocations
-        --diseases_info: Diseases
-        --output: text output filename \n")
-
-    q(save="no")
-  }
-
-  parseArgs <- function(x) strsplit(sub("^--", "", x), "=")
-  argsDF <- as.data.frame(do.call("rbind", parseArgs(args)))
-  args <- as.list(as.character(argsDF$V2))
-  names(args) <- argsDF$V1
-
-  return(args)
-}
-
-str2bool <- function(x){
-  if (any(is.element(c("t","true"),tolower(x)))){
-    return (TRUE)
-  }else if (any(is.element(c("f","false"),tolower(x)))){
-    return (FALSE)
-  }else{
-    return(NULL)
-  }
-}
-
-#take data frame, return  data frame
-split_ids_per_line <- function(line,ncol){
-
-  #print (line)
-  header = colnames(line)
-  line[ncol] = gsub("[[:blank:]]|\u00A0","",line[ncol])
-
-  if (length(unlist(strsplit(as.character(line[ncol]),";")))>1) {
-    if (length(line)==1 ) {
-      lines = as.data.frame(unlist(strsplit(as.character(line[ncol]),";")),stringsAsFactors = F)
-    } else {
-      if (ncol==1) {                                #first column
-        lines = suppressWarnings(cbind(unlist(strsplit(as.character(line[ncol]),";")), line[2:length(line)]))
-      } else if (ncol==length(line)) {                 #last column
-        lines = suppressWarnings(cbind(line[1:ncol-1],unlist(strsplit(as.character(line[ncol]),";"))))
-      } else {
-        lines = suppressWarnings(cbind(line[1:ncol-1], unlist(strsplit(as.character(line[ncol]),";"),use.names = F), line[(ncol+1):length(line)]))
-      }
-    }
-    colnames(lines)=header
-    return(lines)
-  } else {
-    return(line)
-  }
-}
-
-#create new lines if there's more than one id per cell in the columns in order to have only one id per line
-one_id_one_line <-function(tab,ncol){
-
-  if (ncol(tab)>1){
-
-    tab[,ncol] = sapply(tab[,ncol],function(x) gsub("[[:blank:]]","",x))
-    header=colnames(tab)
-    res=as.data.frame(matrix(ncol=ncol(tab),nrow=0))
-    for (i in 1:nrow(tab) ) {
-      lines = split_ids_per_line(tab[i,],ncol)
-      res = rbind(res,lines)
-    }
-  }else {
-    res = unlist(sapply(tab[,1],function(x) strsplit(x,";")),use.names = F)
-    res = data.frame(res[which(!is.na(res[res!=""]))],stringsAsFactors = F)
-    colnames(res)=colnames(tab)
-  }
-  return(res)
-}
-
-# Get information from neXtProt
-get_nextprot_info <- function(nextprot,input,pc_features,localization,diseases_info){
-  if(diseases_info){
-    cols = c("NextprotID",pc_features,localization,"Diseases")
-  } else {
-    cols = c("NextprotID",pc_features,localization)
-  }
-
-  cols=cols[cols!="None"]
-  info = nextprot[match(input,nextprot$NextprotID),cols]
-  return(info)
-}
-
-protein_features = function() {
-
-  args <- get_args()
-
-  #save(args,file="/home/dchristiany/proteore_project/ProteoRE/tools/add_human_protein_features/args.rda")
-  #load("/home/dchristiany/proteore_project/ProteoRE/tools/add_human_protein_features/args.rda")
-
-  #setting variables
-  inputtype = args$inputtype
-  if (inputtype == "copy_paste") {
-    input = get_list_from_cp(args$input)
-    file = data.frame(input,stringsAsFactors = F)
-    ncol=1
-  } else if (inputtype == "file") {
-    filename = args$input
-    ncol = args$column
-    # Check ncol
-    if (! as.numeric(gsub("c", "", ncol)) %% 1 == 0) {
-      stop("Please enter an integer for level")
-    } else {
-      ncol = as.numeric(gsub("c", "", ncol))
-    }
-
-    header = str2bool(args$header)
-    file = read_file(filename, header)                                                    # Get file content
-    if (any(grep(";",file[,ncol]))) {file = one_id_one_line(file,ncol)}
-    if (args$type == "NextprotID" && ! "NextprotID" %in% colnames(file)) { colnames(file)[ncol] <- "NextprotID"
-    } else if (args$type == "NextprotID" && "NextprotID" %in% colnames(file) && match("NextprotID",colnames(file))!=ncol ) {
-      colnames(file)[match("NextprotID",colnames(file))] <- "old_NextprotID"
-      colnames(file)[ncol] = "NextprotID"
-    }
-  }
-
-  # Read reference file
-  nextprot = read_file(args$nextprot,T)
-
-  # Parse arguments
-  id_type = args$type
-  pc_features = strsplit(args$pc_features, ",")[[1]]
-  localization = strsplit(args$localization, ",")[[1]]
-  diseases_info = str2bool(args$diseases_info)
-  output = args$output
-
-  # Change the sample ids if they are Uniprot_AC ids to be able to match them with
-  # Nextprot data
-  if (id_type=="Uniprot_AC"){
-    NextprotID = gsub("^NX_$","",gsub("^","NX_",file[,ncol]))
-    file = cbind(file,NextprotID)
-    if (inputtype=="copy_paste") {colnames(file)[1]="Uniprot-AC"}
-    ncol=ncol(file)
-  }
-  NextprotID = file[,ncol]
-
-  #Select user input protein ids in nextprot
-  #NextprotID = unique(NextprotID[which(!is.na(NextprotID[NextprotID!=""]))])
-  if (all(!NextprotID %in% nextprot[,1])){
-    write.table("None of the input ids can be found in Nextprot",file=output,sep="\t",quote=FALSE,col.names=TRUE,row.names=FALSE)
-  } else {
-    res <- get_nextprot_info(nextprot,NextprotID,pc_features,localization,diseases_info)
-    res = res[!duplicated(res$NextprotID),]
-    output_content = merge(file, res,by.x=ncol,by.y="NextprotID",incomparables = NA,all.x=T)
-    output_content = order_columns(output_content,ncol,id_type,file)
-    output_content <- as.data.frame(apply(output_content, c(1,2), function(x) gsub("^$|^ $", NA, x)))  #convert "" et " " to NA
-    write.table(output_content, output, row.names = FALSE, sep = "\t", quote = FALSE)
-  }
-
-}
-protein_features()