Mercurial > repos > galaxyp > msms_extractor
comparison msms_extractor.py @ 0:5212de03419c draft default tip
"planemo upload commit 498440b547e1feaa6a81764b55ac8208626d70a8"
| author | galaxyp |
|---|---|
| date | Tue, 29 Oct 2019 11:09:23 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:5212de03419c |
|---|---|
| 1 # | |
| 2 # Developed by Praveen Kumar | |
| 3 # Galaxy-P Team (Griffin's Lab) | |
| 4 # University of Minnesota | |
| 5 # | |
| 6 # | |
| 7 # | |
| 8 | |
| 9 | |
| 10 from pyteomics import mzml | |
| 11 import os | |
| 12 import sys | |
| 13 import shutil | |
| 14 import subprocess | |
| 15 import re | |
| 16 import pandas as pd | |
| 17 from operator import itemgetter | |
| 18 from itertools import groupby | |
| 19 import random | |
| 20 import argparse | |
| 21 | |
| 22 def main(): | |
| 23 if len(sys.argv) >= 7: | |
| 24 parser = argparse.ArgumentParser() | |
| 25 parser.add_argument("msms", help="mzML File") | |
| 26 parser.add_argument("psm", help="PSM Report File") | |
| 27 parser.add_argument("out", help="Output filename") | |
| 28 parser.add_argument("filestring", help="MSMS File string as identifier") | |
| 29 parser.add_argument("remove_retain", help="Remove scans reported in the PSM report") | |
| 30 parser.add_argument("random", help="Random MSMS scans used with to use with --retain (default=0)", default=0) | |
| 31 args = parser.parse_args() | |
| 32 # Start of Reading Scans from PSM file | |
| 33 # Creating dictionary of PSM file: key = filename key = list of scan numbers | |
| 34 | |
| 35 # removeORretain = sys.argv[5].strip() | |
| 36 # randomScans = int(sys.argv[6].strip()) | |
| 37 | |
| 38 removeORretain = args.remove_retain | |
| 39 randomScans = int(args.random) | |
| 40 | |
| 41 | |
| 42 # ScanFile = sys.argv[2] | |
| 43 ScanFile = args.psm | |
| 44 spectrumTitleList = list(pd.read_csv(ScanFile, "\t")['Spectrum Title']) | |
| 45 scanFileNumber = [[".".join(each.split(".")[:-3]), int(each.split(".")[-2:-1][0])] for each in spectrumTitleList] | |
| 46 scanDict = {} | |
| 47 for each in scanFileNumber: | |
| 48 if each[0] in scanDict.keys(): | |
| 49 scanDict[each[0]].append(int(each[1])) | |
| 50 else: | |
| 51 scanDict[each[0]] = [int(each[1])] | |
| 52 # End of Reading Scans from PSM file | |
| 53 | |
| 54 # inputPath = sys.argv[1] | |
| 55 inputPath = args.msms | |
| 56 ##outPath = "/".join(sys.argv[3].split("/")[:-1]) | |
| 57 # outPath = sys.argv[3] | |
| 58 outPath = args.out | |
| 59 ##outFile = sys.argv[3].split("/")[-1] | |
| 60 allScanList = [] | |
| 61 # Read all scan numbers using indexedmzML/indexList/index/offset tags | |
| 62 for k in mzml.read(inputPath).iterfind('indexedmzML/indexList/index/offset'): | |
| 63 if re.search("scan=(\d+)", k['idRef']): | |
| 64 a = re.search("scan=(\d+)", k['idRef']) | |
| 65 allScanList.append(int(a.group(1))) | |
| 66 allScanList = list(set(allScanList)) | |
| 67 # End of Reading mzML file | |
| 68 # fraction_name = sys.argv[4] | |
| 69 fraction_name = args.filestring | |
| 70 if fraction_name in scanDict.keys(): | |
| 71 scansInList = scanDict[fraction_name] | |
| 72 else: | |
| 73 scansInList = [] | |
| 74 scansNotInList = list(set(allScanList) - set(scansInList)) | |
| 75 flag = 0 | |
| 76 if removeORretain == "remove": | |
| 77 scan2retain = scansNotInList | |
| 78 scan2retain = list(set(scan2retain)) | |
| 79 scan2retain.sort() | |
| 80 scansRemoved = scansInList | |
| 81 # scan2retain contains scans that is to be retained | |
| 82 elif removeORretain == "retain" and randomScans < len(scansNotInList): | |
| 83 # Randomly select spectra | |
| 84 random_scans = random.sample(scansNotInList, randomScans) | |
| 85 | |
| 86 scan2retain = random_scans + scansInList | |
| 87 scan2retain = list(set(scan2retain)) | |
| 88 scan2retain.sort() | |
| 89 scansRemoved = list(set(allScanList) - set(scan2retain)) | |
| 90 # scan2retain contains scans that is to be retained | |
| 91 else: | |
| 92 flag = 1 | |
| 93 | |
| 94 if flag == 1: | |
| 95 scan2retain = scansInList | |
| 96 scan2retain = list(set(scan2retain)) | |
| 97 scan2retain.sort() | |
| 98 scansRemoved = list(set(allScanList) - set(scan2retain)) | |
| 99 | |
| 100 # scan2retain contains scans that is to be retained | |
| 101 print("ERROR: Number of Random Scans queried is more than available. The result has provided zero random scans.", file=sys.stdout) | |
| 102 print("Number of available scans for random selection: " + str(len(scansNotInList)), file=sys.stdout) | |
| 103 print("Try a number less than the available number. Thanks!!", file=sys.stdout) | |
| 104 print("Number of Scans retained: " + str(len(scan2retain)), file=sys.stdout) | |
| 105 else: | |
| 106 # Print Stats | |
| 107 print("Total number of Scan Numbers: " + str(len(list(set(allScanList)))), file=sys.stdout) | |
| 108 print("Number of Scans retained: " + str(len(scan2retain)), file=sys.stdout) | |
| 109 print("Number of Scans removed: " + str(len(scansRemoved)), file=sys.stdout) | |
| 110 | |
| 111 | |
| 112 # Identifying groups of continuous numbers in the scan2retain and creating scanString | |
| 113 scanString = "" | |
| 114 for a, b in groupby(enumerate(scan2retain), lambda x:x[1]-x[0]): | |
| 115 x = list(map(itemgetter(1), b)) | |
| 116 scanString = scanString + "["+str(x[0])+","+str(x[-1])+"] " | |
| 117 # end identifying | |
| 118 # start create filter file | |
| 119 filter_file = open("filter.txt", "w") | |
| 120 filter_file.write("filter=scanNumber %s\n" % scanString) | |
| 121 filter_file.close() | |
| 122 # end create filter file | |
| 123 | |
| 124 # Prepare command for msconvert | |
| 125 inputFile = fraction_name+".mzML" | |
| 126 os.symlink(inputPath,inputFile) | |
| 127 outFile = "filtered_"+fraction_name+".mzML" | |
| 128 # msconvert_command = "msconvert " + inputFile + " --filter " + "\"scanNumber " + scanString + " \" " + " --outfile " + outFile + " --mzML --zlib" | |
| 129 msconvert_command = "msconvert " + inputFile + " -c filter.txt " + " --outfile " + outFile + " --mzML --zlib" | |
| 130 | |
| 131 | |
| 132 # Run msconvert | |
| 133 try: | |
| 134 subprocess.check_output(msconvert_command, stderr=subprocess.STDOUT, shell=True) | |
| 135 except subprocess.CalledProcessError as e: | |
| 136 sys.stderr.write( "msconvert resulted in error: %s: %s" % ( e.returncode, e.output )) | |
| 137 sys.exit(e.returncode) | |
| 138 # Copy output to | |
| 139 shutil.copyfile(outFile, outPath) | |
| 140 else: | |
| 141 print("Please contact the admin. Number of inputs are not sufficient to run the program.\n") | |
| 142 | |
| 143 if __name__ == "__main__": | |
| 144 main() | |
| 145 | |
| 146 | |
| 147 | |
| 148 |
