Mercurial > repos > peterjc > predictnls
comparison tools/protein_analysis/predictnls.py @ 0:beaa52cd2954 draft
Uploaded v0.0.4, first public release
| author | peterjc |
|---|---|
| date | Mon, 23 Sep 2013 10:03:02 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:beaa52cd2954 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 #Copyright 2011-2013 by Peter Cock, James Hutton Institute (formerly SCRI), UK | |
| 4 # | |
| 5 #Licenced under the GPL (GNU General Public Licence) version 3. | |
| 6 # | |
| 7 #Based on Perl script predictNLS v1.3, copyright 2001-2005 and the later | |
| 8 #versions up to predictnls v1.0.20 (copright 2012), by Rajesh Nair | |
| 9 #(nair@rostlab.org) and Burkhard Rost (rost@rostlab.org), Rost Lab, | |
| 10 #Columbia University http://rostlab.org/ | |
| 11 | |
| 12 """Batch mode predictNLS, for finding nuclear localization signals | |
| 13 | |
| 14 This is a Python script re-implementing the predictNLS method, originally | |
| 15 written in Perl, described here: | |
| 16 | |
| 17 Murat Cokol, Rajesh Nair, and Burkhard Rost. | |
| 18 Finding nuclear localization signals. | |
| 19 EMBO reports 1(5), 411-415, 2000 | |
| 20 | |
| 21 http://dx.doi.org/10.1093/embo-reports/kvd092 | |
| 22 | |
| 23 The original Perl script was designed to work on a single sequence at a time, | |
| 24 but offers quite detailed output, including HTML (webpage). | |
| 25 | |
| 26 This Python version is designed to work on a single FASTA file containing | |
| 27 multiple sequences, and produces a single tabular output file, with one line | |
| 28 per NLS found (i.e. zero or more rows per query sequence). | |
| 29 | |
| 30 It takes either two or three command line arguments: | |
| 31 | |
| 32 predictNLS_batch input_file output_file [nls_motif_file] | |
| 33 | |
| 34 The input file should be protein sequences in FASTA format, the output file | |
| 35 is tab separated plain text, and the NLS motif file defaults to using the | |
| 36 plain text My_NLS_list file located next to the script file, or in a data | |
| 37 subdirectory. | |
| 38 | |
| 39 For convience if using this outside Galaxy, the input filename can be '-' | |
| 40 to mean stdin, and likewise the output filename can be '-' to mean stdout. | |
| 41 | |
| 42 Tested with the My_NLS_list file included with predictnls-1.0.7.tar.gz to | |
| 43 predictnls-1.0.20.tar.gz inclusive (the list was extended in v1.0.7 in | |
| 44 August 2010, see the change log included in those tar-balls). | |
| 45 | |
| 46 The Rost Lab provide source code tar balls for predictNLS on the FTP site | |
| 47 ftp://rostlab.org/predictnls/ but for Debian or RedHat based Linux they | |
| 48 recommend their package repository instead, | |
| 49 https://rostlab.org/owiki/index.php/Packages | |
| 50 """ | |
| 51 | |
| 52 import os | |
| 53 import sys | |
| 54 import re | |
| 55 | |
| 56 def stop_err(msg, return_code=1): | |
| 57 sys.stderr.write(msg.rstrip() + "\n") | |
| 58 sys.exit(return_code) | |
| 59 | |
| 60 if len(sys.argv) == 4: | |
| 61 fasta_filename, tabular_filename, re_filename = sys.argv[1:] | |
| 62 elif len(sys.argv) == 3: | |
| 63 fasta_filename, tabular_filename = sys.argv[1:] | |
| 64 #Use os.path.realpath(...) to handle being called via a symlink | |
| 65 #Try under subdirectory data: | |
| 66 re_filename = os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])), | |
| 67 "data", "My_NLS_list") | |
| 68 if not os.path.isfile(re_filename): | |
| 69 #Try in same directory as this script: | |
| 70 re_filename = os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])), | |
| 71 "My_NLS_list") | |
| 72 else: | |
| 73 stop_err("Expect 2 or 3 arguments: input FASTA file, output tabular file, and NLS motif file") | |
| 74 | |
| 75 if not os.path.isfile(fasta_filename): | |
| 76 stop_err("Could not find FASTA input file: %s" % fasta_filename) | |
| 77 | |
| 78 if not os.path.isfile(re_filename): | |
| 79 stop_err("Could not find NLS motif file: %s" % re_filename) | |
| 80 | |
| 81 def load_re(filename): | |
| 82 """Parse the 5+ column tabular NLS motif file.""" | |
| 83 handle = open(filename, "rU") | |
| 84 for line in handle: | |
| 85 line = line.rstrip("\n") | |
| 86 if not line: | |
| 87 continue | |
| 88 parts = line.split("\t") | |
| 89 assert 5 <= len(parts), parts | |
| 90 regex, evidence, p_count, percent_nuc, precent_non_nuc = parts[0:5] | |
| 91 try: | |
| 92 regex = re.compile(regex) | |
| 93 p_count = int(p_count) | |
| 94 except ValueError: | |
| 95 stop_err("Bad data in line: %s" % line) | |
| 96 if 6 <= len(parts): | |
| 97 proteins = parts[5] | |
| 98 assert p_count == len(proteins.split(",")), line | |
| 99 else: | |
| 100 proteins = "" | |
| 101 assert p_count == 0 | |
| 102 if 7 <= len(parts): | |
| 103 domains = parts[6] | |
| 104 assert int(p_count) == len(domains.split(",")), line | |
| 105 else: | |
| 106 domains = "" | |
| 107 assert p_count == 0 | |
| 108 #There can be further columns (DNA binding?), but we don't use them. | |
| 109 yield regex, evidence, p_count, percent_nuc, proteins, domains | |
| 110 handle.close() | |
| 111 | |
| 112 def fasta_iterator(filename): | |
| 113 """Simple FASTA parser yielding tuples of (name, upper case sequence).""" | |
| 114 if filename == "-": | |
| 115 handle = sys.stdin | |
| 116 else: | |
| 117 handle = open(filename) | |
| 118 name, seq = "", "" | |
| 119 for line in handle: | |
| 120 if line.startswith(">"): | |
| 121 if name: | |
| 122 yield name, seq | |
| 123 #Take the first word only as the name: | |
| 124 name = line[1:].rstrip().split(None,1)[0] | |
| 125 seq = "" | |
| 126 elif name: | |
| 127 #Simple way would leave in any internal white space, | |
| 128 #seq += line.strip().upper() | |
| 129 seq += "".join(line.strip().upper().split()) | |
| 130 elif not line.strip(): | |
| 131 #Ignore blank lines before first record | |
| 132 pass | |
| 133 else: | |
| 134 raise ValueError("Bad FASTA line %r" % line) | |
| 135 if filename != "-": | |
| 136 handle.close() | |
| 137 if name: | |
| 138 yield name, seq | |
| 139 raise StopIteration | |
| 140 | |
| 141 motifs = list(load_re(re_filename)) | |
| 142 print "Looking for %i NLS motifs" % len(motifs) | |
| 143 | |
| 144 if tabular_filename == "-": | |
| 145 out_handle = sys.stdout | |
| 146 else: | |
| 147 out_handle = open(tabular_filename, "w") | |
| 148 out_handle.write("#ID\tNLS start\tNLS seq\tNLS pattern\tType\tProtCount\t%NucProt\tProtList\tProtLoci\n") | |
| 149 count = 0 | |
| 150 nls = 0 | |
| 151 for idn, seq in fasta_iterator(fasta_filename): | |
| 152 for regex, evidence, p_count, percent_nuc_prot, proteins, domains in motifs: | |
| 153 #Perl predictnls v1.0.17 (and older) take right most hit only, Bug #40 | |
| 154 #This has been fixed (v1.0.18 onwards, June 2011), so we return all the matches | |
| 155 for match in regex.finditer(seq): | |
| 156 #Perl predictnls v1.0.17 (and older) return NLS start position with zero | |
| 157 #but changed to one based counting in v1.0.18 (June 2011) onwards, Bug #38 | |
| 158 #We therefore also use one based couting, hence the start+1 here: | |
| 159 out_handle.write("%s\t%i\t%s\t%s\t%s\t%i\t%s\t%s\t%s\n" \ | |
| 160 % (idn, match.start()+1, match.group(), | |
| 161 regex.pattern, evidence, p_count, | |
| 162 percent_nuc_prot, proteins, domains)) | |
| 163 nls += 1 | |
| 164 count += 1 | |
| 165 if tabular_filename != "-": | |
| 166 out_handle.close() | |
| 167 print "Found %i NLS motifs in %i sequences" % (nls, count) |
