| 0 | 1 #!/usr/bin/env python | 
|  | 2 | 
|  | 3 #Copyright 2011-2013 by Peter Cock, James Hutton Institute (formerly SCRI), UK | 
|  | 4 # | 
|  | 5 #Licenced under the GPL (GNU General Public Licence) version 3. | 
|  | 6 # | 
|  | 7 #Based on Perl script predictNLS v1.3, copyright 2001-2005 and the later | 
|  | 8 #versions up to predictnls v1.0.20 (copright 2012), by Rajesh Nair | 
|  | 9 #(nair@rostlab.org) and Burkhard Rost (rost@rostlab.org), Rost Lab, | 
|  | 10 #Columbia University http://rostlab.org/ | 
|  | 11 | 
|  | 12 """Batch mode predictNLS, for finding nuclear localization signals | 
|  | 13 | 
|  | 14 This is a Python script re-implementing the predictNLS method, originally | 
|  | 15 written in Perl, described here: | 
|  | 16 | 
|  | 17 Murat Cokol, Rajesh Nair, and Burkhard Rost. | 
|  | 18 Finding nuclear localization signals. | 
|  | 19 EMBO reports 1(5), 411-415, 2000 | 
|  | 20 | 
|  | 21 http://dx.doi.org/10.1093/embo-reports/kvd092 | 
|  | 22 | 
|  | 23 The original Perl script was designed to work on a single sequence at a time, | 
|  | 24 but offers quite detailed output, including HTML (webpage). | 
|  | 25 | 
|  | 26 This Python version is designed to work on a single FASTA file containing | 
|  | 27 multiple sequences, and produces a single tabular output file, with one line | 
|  | 28 per NLS found (i.e. zero or more rows per query sequence). | 
|  | 29 | 
|  | 30 It takes either two or three command line arguments: | 
|  | 31 | 
|  | 32 predictNLS_batch input_file output_file [nls_motif_file] | 
|  | 33 | 
|  | 34 The input file should be protein sequences in FASTA format, the output file | 
|  | 35 is tab separated plain text, and the NLS motif file defaults to using the | 
|  | 36 plain text My_NLS_list file located next to the script file, or in a data | 
|  | 37 subdirectory. | 
|  | 38 | 
|  | 39 For convience if using this outside Galaxy, the input filename can be '-' | 
|  | 40 to mean stdin, and likewise the output filename can be '-' to mean stdout. | 
|  | 41 | 
|  | 42 Tested with the My_NLS_list file included with predictnls-1.0.7.tar.gz to | 
|  | 43 predictnls-1.0.20.tar.gz inclusive (the list was extended in v1.0.7 in | 
|  | 44 August 2010, see the change log included in those tar-balls). | 
|  | 45 | 
|  | 46 The Rost Lab provide source code tar balls for predictNLS on the FTP site | 
|  | 47 ftp://rostlab.org/predictnls/ but for Debian or RedHat based Linux they | 
|  | 48 recommend their package repository instead, | 
|  | 49 https://rostlab.org/owiki/index.php/Packages | 
|  | 50 """ | 
|  | 51 | 
|  | 52 import os | 
|  | 53 import sys | 
|  | 54 import re | 
|  | 55 | 
|  | 56 def stop_err(msg, return_code=1): | 
|  | 57     sys.stderr.write(msg.rstrip() + "\n") | 
|  | 58     sys.exit(return_code) | 
|  | 59 | 
|  | 60 if len(sys.argv) == 4: | 
|  | 61     fasta_filename, tabular_filename, re_filename = sys.argv[1:] | 
|  | 62 elif len(sys.argv) == 3: | 
|  | 63     fasta_filename, tabular_filename = sys.argv[1:] | 
|  | 64     #Use os.path.realpath(...) to handle being called via a symlink | 
|  | 65     #Try under subdirectory data: | 
|  | 66     re_filename = os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])), | 
|  | 67                                "data", "My_NLS_list") | 
|  | 68     if not os.path.isfile(re_filename): | 
|  | 69         #Try in same directory as this script: | 
|  | 70         re_filename = os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])), | 
|  | 71                                                    "My_NLS_list") | 
|  | 72 else: | 
|  | 73     stop_err("Expect 2 or 3 arguments: input FASTA file, output tabular file, and NLS motif file") | 
|  | 74 | 
|  | 75 if not os.path.isfile(fasta_filename): | 
|  | 76     stop_err("Could not find FASTA input file: %s" % fasta_filename) | 
|  | 77 | 
|  | 78 if not os.path.isfile(re_filename): | 
|  | 79     stop_err("Could not find NLS motif file: %s" % re_filename) | 
|  | 80 | 
|  | 81 def load_re(filename): | 
|  | 82     """Parse the 5+ column tabular NLS motif file.""" | 
|  | 83     handle = open(filename, "rU") | 
|  | 84     for line in handle: | 
|  | 85         line = line.rstrip("\n") | 
|  | 86         if not line: | 
|  | 87             continue | 
|  | 88         parts = line.split("\t") | 
|  | 89         assert 5 <= len(parts), parts | 
|  | 90         regex, evidence, p_count, percent_nuc, precent_non_nuc = parts[0:5] | 
|  | 91         try: | 
|  | 92             regex = re.compile(regex) | 
|  | 93             p_count = int(p_count) | 
|  | 94         except ValueError: | 
|  | 95             stop_err("Bad data in line: %s" % line) | 
|  | 96         if 6 <= len(parts): | 
|  | 97             proteins = parts[5] | 
|  | 98             assert p_count == len(proteins.split(",")), line | 
|  | 99         else: | 
|  | 100             proteins = "" | 
|  | 101             assert p_count == 0 | 
|  | 102         if 7 <= len(parts): | 
|  | 103             domains = parts[6] | 
|  | 104             assert int(p_count) == len(domains.split(",")), line | 
|  | 105         else: | 
|  | 106             domains = "" | 
|  | 107             assert p_count == 0 | 
|  | 108         #There can be further columns (DNA binding?), but we don't use them. | 
|  | 109         yield regex, evidence, p_count, percent_nuc, proteins, domains | 
|  | 110     handle.close() | 
|  | 111 | 
|  | 112 def fasta_iterator(filename): | 
|  | 113     """Simple FASTA parser yielding tuples of (name, upper case sequence).""" | 
|  | 114     if filename == "-": | 
|  | 115         handle = sys.stdin | 
|  | 116     else: | 
|  | 117         handle = open(filename) | 
|  | 118     name, seq = "", "" | 
|  | 119     for line in handle: | 
|  | 120         if line.startswith(">"): | 
|  | 121             if name: | 
|  | 122                 yield name, seq | 
|  | 123             #Take the first word only as the name: | 
|  | 124             name = line[1:].rstrip().split(None,1)[0] | 
|  | 125             seq = "" | 
|  | 126         elif name: | 
|  | 127             #Simple way would leave in any internal white space, | 
|  | 128             #seq += line.strip().upper() | 
|  | 129             seq += "".join(line.strip().upper().split()) | 
|  | 130         elif not line.strip(): | 
|  | 131             #Ignore blank lines before first record | 
|  | 132             pass | 
|  | 133         else: | 
|  | 134             raise ValueError("Bad FASTA line %r" % line) | 
|  | 135     if filename != "-": | 
|  | 136         handle.close() | 
|  | 137     if name: | 
|  | 138         yield name, seq | 
|  | 139     raise StopIteration | 
|  | 140 | 
|  | 141 motifs = list(load_re(re_filename)) | 
|  | 142 print "Looking for %i NLS motifs" % len(motifs) | 
|  | 143 | 
|  | 144 if tabular_filename == "-": | 
|  | 145     out_handle = sys.stdout | 
|  | 146 else: | 
|  | 147     out_handle = open(tabular_filename, "w") | 
|  | 148 out_handle.write("#ID\tNLS start\tNLS seq\tNLS pattern\tType\tProtCount\t%NucProt\tProtList\tProtLoci\n") | 
|  | 149 count = 0 | 
|  | 150 nls = 0 | 
|  | 151 for idn, seq in fasta_iterator(fasta_filename): | 
|  | 152     for regex, evidence, p_count, percent_nuc_prot, proteins, domains in motifs: | 
|  | 153         #Perl predictnls v1.0.17 (and older) take right most hit only, Bug #40 | 
|  | 154         #This has been fixed (v1.0.18 onwards, June 2011), so we return all the matches | 
|  | 155         for match in regex.finditer(seq): | 
|  | 156             #Perl predictnls v1.0.17 (and older) return NLS start position with zero | 
|  | 157             #but changed to one based counting in v1.0.18 (June 2011) onwards, Bug #38 | 
|  | 158             #We therefore also use one based couting, hence the start+1 here: | 
|  | 159             out_handle.write("%s\t%i\t%s\t%s\t%s\t%i\t%s\t%s\t%s\n" \ | 
|  | 160                              % (idn, match.start()+1, match.group(), | 
|  | 161                                 regex.pattern, evidence, p_count, | 
|  | 162                                 percent_nuc_prot, proteins, domains)) | 
|  | 163             nls += 1 | 
|  | 164     count += 1 | 
|  | 165 if tabular_filename != "-": | 
|  | 166     out_handle.close() | 
|  | 167 print "Found %i NLS motifs in %i sequences" % (nls, count) |