Mercurial > repos > galaxyp > openms_idposteriorerrorprobability
comparison test-data/examples/simulation/FASTAProteinAbundanceSampling.py @ 18:6daaa75ccb99 draft
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/openms commit 6e7368b7f178fbd1f08c28eea1b538add6943a65-dirty"
| author | galaxyp |
|---|---|
| date | Sun, 13 Dec 2020 15:03:50 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 17:6f6c1cb968a0 | 18:6daaa75ccb99 |
|---|---|
| 1 # -------------------------------------------------------------------------- | |
| 2 # OpenMS -- Open-Source Mass Spectrometry | |
| 3 # -------------------------------------------------------------------------- | |
| 4 # Copyright The OpenMS Team -- Eberhard Karls University Tuebingen, | |
| 5 # ETH Zurich, and Freie Universitaet Berlin 2002-2020. | |
| 6 # | |
| 7 # This software is released under a three-clause BSD license: | |
| 8 # * Redistributions of source code must retain the above copyright | |
| 9 # notice, this list of conditions and the following disclaimer. | |
| 10 # * Redistributions in binary form must reproduce the above copyright | |
| 11 # notice, this list of conditions and the following disclaimer in the | |
| 12 # documentation and/or other materials provided with the distribution. | |
| 13 # * Neither the name of any author or any participating institution | |
| 14 # may be used to endorse or promote products derived from this software | |
| 15 # without specific prior written permission. | |
| 16 # For a full list of authors, refer to the file AUTHORS. | |
| 17 # -------------------------------------------------------------------------- | |
| 18 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
| 19 # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
| 20 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
| 21 # ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING | |
| 22 # INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | |
| 23 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |
| 24 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; | |
| 25 # OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, | |
| 26 # WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR | |
| 27 # OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF | |
| 28 # ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
| 29 # | |
| 30 # -------------------------------------------------------------------------- | |
| 31 # $Maintainer: Chris Bielow $ | |
| 32 # $Authors: Chris Bielow $ | |
| 33 # -------------------------------------------------------------------------- | |
| 34 | |
| 35 import re | |
| 36 import random | |
| 37 import math | |
| 38 import sys | |
| 39 import argparse | |
| 40 | |
| 41 ## holds FASTA header + sequence | |
| 42 class FASTAEntry: | |
| 43 pass | |
| 44 | |
| 45 ## grab entries from FASTA file | |
| 46 def nextEntry(fileobj): | |
| 47 entry = FASTAEntry() | |
| 48 entry.header = fileobj.readline() | |
| 49 entry.sequence = "" | |
| 50 for line in fileobj: | |
| 51 if '>' == line[0]: | |
| 52 yield entry | |
| 53 entry.header = line | |
| 54 entry.sequence = "" | |
| 55 else: | |
| 56 entry.sequence += line | |
| 57 yield entry | |
| 58 | |
| 59 | |
| 60 ## sample abundance from Gaussian in log space | |
| 61 def sampleAbundance(mu=3, sigma=1): | |
| 62 return math.exp(random.gauss(mu, sigma)) | |
| 63 | |
| 64 | |
| 65 def main(argv): | |
| 66 ## we use ArgumentParser, which requires 2.7 | |
| 67 if sys.version_info < (2, 7): | |
| 68 raise "This script requires python 2.7 or greater" | |
| 69 | |
| 70 ## add weight filtering functionality if BioPython is available | |
| 71 try: | |
| 72 from Bio.SeqUtils.ProtParam import ProteinAnalysis | |
| 73 has_biopython = 1 | |
| 74 except : | |
| 75 has_biopython = 0 | |
| 76 | |
| 77 | |
| 78 parser = argparse.ArgumentParser(description='Add abundance to FASTA files.') | |
| 79 parser.add_argument('infile', type=argparse.FileType('r'), help='Input FASTA file') | |
| 80 parser.add_argument('outfile', type=argparse.FileType('w'), help='Output FASTA file') | |
| 81 | |
| 82 parser.add_argument('--mu', dest='mu', action='store', default=3, help='mean of gaussian in log space') | |
| 83 parser.add_argument('--sigma', dest='sigma', action='store', default=1, help='sd of gaussian in log space') | |
| 84 parser.add_argument('--sample', dest='sample', action='store', default=0, help='Number of entries to keep (for sampling a bigger FASTA file)') | |
| 85 parser.add_argument('--random', dest='random', action='store_true', help='Randomly shuffle entries before sampling (only if --sample is given). If not given, the first \'X\' samples are used.') | |
| 86 if (has_biopython): | |
| 87 parser.add_argument('--weight_low', dest='weight_low', action='store', default=0, help='minimum molecular weight of protein') | |
| 88 parser.add_argument('--weight_up', dest='weight_up', action='store', default=0, help='Maximum molecular weight of protein (use 0 for unlimited)') | |
| 89 else: | |
| 90 print ("Warning: protein weight filtering not supported, as BioPython module is not installed.") | |
| 91 | |
| 92 ## argument parsing | |
| 93 args = parser.parse_args() | |
| 94 fileobj = args.infile | |
| 95 fileoutobj = args.outfile | |
| 96 sample_size = int(args.sample) | |
| 97 sample_random = bool(args.random) | |
| 98 if (has_biopython): | |
| 99 weight_low = float(args.weight_low) | |
| 100 weight_up = float(args.weight_up) | |
| 101 if (weight_up <= 0): weight_up = sys.float_info.max | |
| 102 | |
| 103 | |
| 104 ## list of final entries | |
| 105 fasta_entries = [] | |
| 106 | |
| 107 for entry in nextEntry(fileobj): | |
| 108 header = entry.header | |
| 109 ## check if it contains 'intensity'? | |
| 110 rep = re.compile(r"\[# *(.*) *#\]") | |
| 111 m = rep.search(header) | |
| 112 header_new = "" | |
| 113 other = [] | |
| 114 if (m): | |
| 115 header_new = header.replace(m.group(0), "") ## delete meta | |
| 116 for element in m.group(1).split(','): | |
| 117 #print "element:", element | |
| 118 if (element.find("intensity") == -1): | |
| 119 other.append(element) | |
| 120 else: | |
| 121 header_new = header ## nothing to replace | |
| 122 | |
| 123 ## create new metainfo array | |
| 124 i = "intensity=" + str(sampleAbundance(float(args.mu), float(args.sigma))) | |
| 125 other.append(i) | |
| 126 | |
| 127 entry.header = header_new.rstrip() + "[# " + (", ").join(other) + " #]" | |
| 128 | |
| 129 if (has_biopython): | |
| 130 sequence = "".join(entry.sequence.split("\n")) | |
| 131 ## | |
| 132 ## BioPython does not like some AA letters - they need replacement | |
| 133 ## | |
| 134 ## replace "U" (Selenocystein) with "C" (Cystein) | |
| 135 sequence = sequence.replace("U","C") | |
| 136 ## replace "X" (unknown) with "P" (Proline) [arbitrary choice - but weight of 115 is very close to averagine] | |
| 137 sequence = sequence.replace("X","P") | |
| 138 ## replace "B" (Asparagine or aspartic acid) with "N" (Asparagine) | |
| 139 sequence = sequence.replace("B","N") | |
| 140 ## replace "Z" (Glutamine or glutamic acid) with "Q" (Glutamine) | |
| 141 sequence = sequence.replace("Z","Q") | |
| 142 ## replace "Z" (Glutamine or glutamic acid) with "Q" (Glutamine) | |
| 143 sequence = sequence.replace("Z","Q") | |
| 144 ## replace "J" (Leucine or Isoleucine) with "L" (Leucine) | |
| 145 sequence = sequence.replace("J","L") | |
| 146 analysed_seq = ProteinAnalysis(sequence) | |
| 147 weight = analysed_seq.molecular_weight() | |
| 148 if (not(weight_low <= weight and weight <= weight_up)): | |
| 149 continue | |
| 150 | |
| 151 | |
| 152 fasta_entries.append(entry.header + "\n" + entry.sequence) | |
| 153 | |
| 154 ## only read to sample size (the rest is thrown away anyways) | |
| 155 if (sample_size > 0 and not(sample_random)): | |
| 156 if (len(fasta_entries) >= sample_size): | |
| 157 break | |
| 158 | |
| 159 | |
| 160 ## select subset (if required) | |
| 161 if (sample_size > 0): | |
| 162 indices = range(0,len(fasta_entries)) | |
| 163 ## random sampling only makes sense if we take a subset | |
| 164 if (sample_random and sample_size < len(fasta_entries)): | |
| 165 random.shuffle(indices) | |
| 166 indices = [indices[i] for i in range(0,sample_size)] | |
| 167 fasta_entries = [fasta_entries[i] for i in indices] | |
| 168 | |
| 169 ## write to file | |
| 170 for entry in fasta_entries: | |
| 171 fileoutobj.write(entry) | |
| 172 | |
| 173 print ("Generated " + str(len(fasta_entries)) + " protein sequences") | |
| 174 | |
| 175 if __name__ == "__main__": | |
| 176 main(sys.argv) |
