comparison assign_clades.py @ 0:2e262bbd193f draft

planemo upload for repository https://github.com/dfornika/clade_assignment commit 03a6fa631a506f446f52a7d9bfba649edac8a212
author dfornika
date Wed, 25 Oct 2017 20:54:06 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:2e262bbd193f
1 '''Accepts fasta files containing amino acid sequence, reading them in as
2 amino acid sequence objects. Reads influenza clade defintions (i.e. amino
3 acids at certain positions) from .json file into dictionary structure. Searches
4 each of the amino acid sequence objects for the more general clades (i.e. 3C.2a or
5 3C.3a and then 3c.2a1 variants if applicable) and appends and underscore to the
6 Sequence name, followed by either the name of the clade or "undetermined". '''
7
8 '''Author: Diane Eisler, Molecular Microbiology & Genomics, BCCDC Public Health Laboratory, Oct 2017'''
9
10 import sys,string,os, time, Bio, json
11 from Bio import Seq, SeqIO, SeqUtils, Alphabet, SeqRecord
12 from Bio.SeqRecord import SeqRecord
13 from Bio.Alphabet import IUPAC
14 from Bio.Seq import Seq
15
16 localtime = time.asctime(time.localtime(time.time())) #date and time of analysis
17 inFileHandle1 = sys.argv[1] #batch fasta file with sequences to be parsed
18 inFileHandle2 = sys.argv[2] # .csv file containing clade definitions and "depth"
19 outFileHandle = sys.argv[3] #user-specified name for output file of aa seq's with clade suffixes
20 outFile= open(outFileHandle,'w') #open a writable, appendable output file
21 seqList = [] #list of aa sequence objects to parse for clade definitions
22 cladeList = [] #empty list to hold clade tuples i.e. ("3C.3a", 1 ,{"3":"I", "9":"V"..})
23
24 '''Searches record for required amino acids at defined positions. If found, assigns
25 clade name to sequence name by appending underscore and clade name to record id.'''
26 def call_clade(record):
27 print "---------------------------------------------------------------------"
28 print "Parsing %s for matching flu clade definitions..." % (record.id)
29 matchList = [] #empty list to hold clades that match 100%
30 #iterate over each tuple in the clade list
31 for clade in cladeList:
32 cladeName = clade[0] #temp variable for name
33 depth = clade[1] #temp variable for depth
34 sites = clade[2] #temp variable for aa def dictionary
35 shouldFind = len(sites) #number of sites that should match
36 found = 0 #a counter to hold matches to antigenic sites
37 #iterate over each position in sites dictionary
38 for pos, aa in sites.iteritems():
39 #translate pos to corresponding index in target sequence
40 index = int(pos) - 1
41 #if record at index has same amino acid as 'aa', increment 'found'
42 if record[index] == aa:
43 found += 1
44 if (found == shouldFind):
45 #print "\n%s contains %i of %i clade \"%s\" antigenic sites!" % (record.id,found, shouldFind, cladeName)
46 #add the matching clade name to a list
47 #matchList.append(cladeName)
48 #add the matching clade tuple to the list of matches
49 matchList.append(clade)
50 return matchList
51
52 '''Compares depth level of clades in a list and returns the most granular one'''
53 def decide_clade_by_depth(matchList):
54 #empty variable for maximum depth encountered
55 max_depth = 0
56 best_match_name = '' #variable to hold most granular clade
57 #for each matching clade, check depth of the corresponding tuple
58 for clade in matchList:
59 #if the current clade is 'deeper' than the one before it
60 if clade[1] > max_depth:
61 #store this depth
62 max_depth = clade[1]
63 #store name of the clade
64 best_match_name = clade[0]
65 return best_match_name
66
67 '''opens the .csv file of clade definitions and clade "depth" '''
68 with open (inFileHandle2, 'r') as clade_file:
69 #remove whitespace from the end of each line and split elements at commas
70 for line in clade_file:
71 #print "Current Line in File:" + line
72 sites={} #initialize a dictionary for clade
73 elementList = line.rstrip().split(',')
74 new_list = [] #start a new list to put non-empty strings into
75 #remove empty stings in list
76 for item in elementList:
77 if item != '':
78 new_list.append(item)
79 name = new_list.pop(0) #move 1st element to name field
80 depth = int(new_list.pop(0)) #move 2nd element to depth field
81 #read remaining pairs of non-null elements into clade def dictionary
82 for i in range(0, len(new_list), 2):
83 #move next 2 items from the list into the dict
84 pos = new_list[i]
85 aa = new_list[i + 1]
86 sites[pos] = aa
87 #add the clade info as a tuple to the cladeList[]
88 oneClade =(name, depth, sites)
89 cladeList.append(oneClade)
90 print "The List of Clades:"
91 for clade in cladeList:
92 print "Clade Name: %s Depth: %i Antigenic Sites: %i" % (clade[0], clade[1], len(clade[2]))
93 for pos, aa in clade[2].iteritems():
94 print "Pos: %s\tAA: %s" % (pos,aa)
95
96 '''opens readable input file of sequences to parse using filename from cmd line,
97 instantiates as AA Sequence objects, with ppercase sequences'''
98 with open(inFileHandle1,'r') as inFile:
99 #read in Sequences from fasta file, uppercase and add to seqList
100 for record in SeqIO.parse(inFile, "fasta", alphabet=IUPAC.protein):
101 record = record.upper()
102 seqList.append(record) #add Seq to list of Sequences
103 print "\n%i flu HA sequences will be compared to current clade definitions..." % len(seqList)
104 #parse each target sequence object
105 for record in seqList:
106 clade_call = '' #empty variale for final clade call on sequence
107 matchingCladeList = call_clade(record) #holds matching clade tuples
108 #if there is more than one clade match
109 if len(matchingCladeList) > 1:
110 #choose the most granular clade based on depth
111 clade_call = decide_clade_by_depth(matchingCladeList)
112 #if there is only one clade call
113 elif len(matchingCladeList) > 0:
114 clade = matchingCladeList[0] #take the first tuple in the list
115 clade_call = clade[0] #clade name is the first item in the tuple
116 #empty list return, no matches
117 else:
118 clade_call = "No_Match"
119 print clade_call
120 seq_name = record.id
121 mod_name = seq_name + "_" + clade_call
122 print "New Sequence Name: " + mod_name
123 record.id = mod_name
124
125
126 #output fasta file with clade calls appended to sequence names
127 SeqIO.write(seqList,outFile,"fasta")
128
129 #print("\n%i Sequences Extracted to Output file: %s" % ((len(extractedSeqList),outFileHandle)))
130 inFile.close()
131 clade_file.close()
132 outFile.close()
133