diff test-data/examples/simulation/FASTAProteinAbundanceSampling.py @ 18:6daaa75ccb99 draft

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/openms commit 6e7368b7f178fbd1f08c28eea1b538add6943a65-dirty"
author galaxyp
date Sun, 13 Dec 2020 15:03:50 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/examples/simulation/FASTAProteinAbundanceSampling.py	Sun Dec 13 15:03:50 2020 +0000
@@ -0,0 +1,176 @@
+# --------------------------------------------------------------------------
+#                   OpenMS -- Open-Source Mass Spectrometry
+# --------------------------------------------------------------------------
+# Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
+# ETH Zurich, and Freie Universitaet Berlin 2002-2020.
+#
+# This software is released under a three-clause BSD license:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of any author or any participating institution
+#    may be used to endorse or promote products derived from this software
+#    without specific prior written permission.
+# For a full list of authors, refer to the file AUTHORS.
+# --------------------------------------------------------------------------
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
+# INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# --------------------------------------------------------------------------
+# $Maintainer: Chris Bielow $
+# $Authors: Chris Bielow $
+# --------------------------------------------------------------------------
+
+import re
+import random
+import math
+import sys
+import argparse
+
+## holds FASTA header + sequence
+class FASTAEntry:
+    pass
+
+## grab entries from FASTA file
+def nextEntry(fileobj):
+        entry = FASTAEntry()
+        entry.header = fileobj.readline()
+        entry.sequence = ""
+        for line in fileobj:
+                if '>' == line[0]:
+                        yield entry
+                        entry.header = line
+                        entry.sequence = ""
+                else:  
+                        entry.sequence += line
+        yield entry
+
+
+## sample abundance from Gaussian in log space
+def sampleAbundance(mu=3, sigma=1):
+    return math.exp(random.gauss(mu, sigma))
+
+
+def main(argv):
+    ## we use ArgumentParser, which requires 2.7
+    if sys.version_info < (2, 7):
+      raise "This script requires python 2.7 or greater"
+
+    ## add weight filtering functionality if BioPython is available
+    try:
+      from Bio.SeqUtils.ProtParam import ProteinAnalysis
+      has_biopython = 1
+    except :
+      has_biopython = 0
+      
+      
+    parser = argparse.ArgumentParser(description='Add abundance to FASTA files.')
+    parser.add_argument('infile', type=argparse.FileType('r'), help='Input FASTA file')
+    parser.add_argument('outfile', type=argparse.FileType('w'), help='Output FASTA file')
+    
+    parser.add_argument('--mu', dest='mu', action='store', default=3, help='mean of gaussian in log space')
+    parser.add_argument('--sigma', dest='sigma', action='store', default=1, help='sd of gaussian in log space')
+    parser.add_argument('--sample', dest='sample', action='store', default=0, help='Number of entries to keep (for sampling a bigger FASTA file)')
+    parser.add_argument('--random', dest='random', action='store_true', help='Randomly shuffle entries before sampling (only if --sample is given). If not given, the first \'X\' samples are used.')
+    if (has_biopython):
+      parser.add_argument('--weight_low', dest='weight_low', action='store', default=0, help='minimum molecular weight of protein')
+      parser.add_argument('--weight_up', dest='weight_up', action='store', default=0, help='Maximum molecular weight of protein (use 0 for unlimited)')
+    else:
+      print ("Warning: protein weight filtering not supported, as BioPython module is not installed.")
+      
+    ## argument parsing
+    args = parser.parse_args()
+    fileobj = args.infile
+    fileoutobj = args.outfile
+    sample_size = int(args.sample)
+    sample_random = bool(args.random)
+    if (has_biopython):
+      weight_low = float(args.weight_low)
+      weight_up = float(args.weight_up)
+      if (weight_up <= 0): weight_up = sys.float_info.max
+      
+    
+    ## list of final entries
+    fasta_entries = []
+    
+    for entry in nextEntry(fileobj):
+        header = entry.header
+        ## check if it contains 'intensity'?
+        rep = re.compile(r"\[# *(.*) *#\]")
+        m = rep.search(header)
+        header_new = ""
+        other = []
+        if (m):
+          header_new = header.replace(m.group(0), "") ## delete meta
+          for element in m.group(1).split(','):
+              #print "element:", element
+              if (element.find("intensity") == -1):
+                  other.append(element)
+        else:
+          header_new = header  ## nothing to replace
+
+        ## create new metainfo array
+        i = "intensity=" + str(sampleAbundance(float(args.mu), float(args.sigma)))
+        other.append(i)
+
+        entry.header = header_new.rstrip() + "[# " + (", ").join(other) + " #]"
+        
+        if (has_biopython):
+          sequence = "".join(entry.sequence.split("\n"))
+          ##
+          ## BioPython does not like some AA letters - they need replacement
+          ##
+          ## replace "U" (Selenocystein) with "C" (Cystein)
+          sequence = sequence.replace("U","C")
+          ## replace "X" (unknown) with "P" (Proline) [arbitrary choice - but weight of 115 is very close to averagine]
+          sequence = sequence.replace("X","P")
+          ## replace "B" (Asparagine or aspartic acid) with "N" (Asparagine) 
+          sequence = sequence.replace("B","N")
+          ## replace "Z" (Glutamine or glutamic acid) with "Q" (Glutamine) 
+          sequence = sequence.replace("Z","Q")
+          ## replace "Z" (Glutamine or glutamic acid) with "Q" (Glutamine) 
+          sequence = sequence.replace("Z","Q")
+          ## replace "J" (Leucine or Isoleucine) with "L" (Leucine) 
+          sequence = sequence.replace("J","L")
+          analysed_seq = ProteinAnalysis(sequence)
+          weight = analysed_seq.molecular_weight()
+          if (not(weight_low <= weight and weight <= weight_up)):
+            continue
+        
+        
+        fasta_entries.append(entry.header + "\n" + entry.sequence)
+         
+        ## only read to sample size (the rest is thrown away anyways)
+        if (sample_size > 0 and not(sample_random)):
+          if (len(fasta_entries) >= sample_size):
+            break
+          
+        
+    ## select subset (if required)
+    if (sample_size > 0):
+      indices = range(0,len(fasta_entries))
+      ## random sampling only makes sense if we take a subset
+      if (sample_random and sample_size < len(fasta_entries)):
+        random.shuffle(indices)
+      indices = [indices[i] for i in range(0,sample_size)]
+      fasta_entries = [fasta_entries[i] for i in indices]
+      
+    ## write to file
+    for entry in fasta_entries:
+      fileoutobj.write(entry)
+
+    print ("Generated " + str(len(fasta_entries)) + " protein sequences")
+
+if __name__ == "__main__":
+    main(sys.argv)