Mercurial > repos > dfornika > assign_clades

--- a/README.md	Wed Oct 25 21:09:09 2017 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,3 +0,0 @@
-# clade_assignment
-Python scripts to assign clade designations to influenza amino acid fasta files.
-Other functions will be added as needed.
--- a/assign_clades.py	Wed Oct 25 21:09:09 2017 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,133 +0,0 @@
-'''Accepts fasta files containing amino acid sequence, reading them in as
-amino acid sequence objects.  Reads influenza clade defintions (i.e. amino
-acids at certain positions) from .json file into dictionary structure. Searches
-each of the amino acid sequence objects for the more general clades (i.e. 3C.2a or
-3C.3a and then 3c.2a1 variants if applicable) and appends and underscore to the
-Sequence name, followed by either the name of the clade or "undetermined". '''
-
-'''Author: Diane Eisler, Molecular Microbiology & Genomics, BCCDC Public Health Laboratory, Oct 2017'''
-
-import sys,string,os, time, Bio, json
-from Bio import Seq, SeqIO, SeqUtils, Alphabet, SeqRecord
-from Bio.SeqRecord import SeqRecord
-from Bio.Alphabet import IUPAC
-from Bio.Seq import Seq
-
-localtime = time.asctime(time.localtime(time.time())) #date and time of analysis
-inFileHandle1 = sys.argv[1] #batch fasta file with sequences to be parsed
-inFileHandle2 = sys.argv[2] # .csv file containing clade definitions and "depth"
-outFileHandle = sys.argv[3] #user-specified name for output file of aa seq's with clade suffixes
-outFile= open(outFileHandle,'w') #open a writable, appendable output file
-seqList = [] #list of aa sequence objects to parse for clade definitions
-cladeList = [] #empty list to hold clade tuples i.e. ("3C.3a", 1 ,{"3":"I", "9":"V"..})
-
-'''Searches record for required amino acids at defined positions. If found, assigns
-clade name to sequence name by appending underscore and clade name to record id.'''
-def call_clade(record):
-    print "---------------------------------------------------------------------"
-    print "Parsing %s for matching flu clade definitions..." % (record.id)
-    matchList = [] #empty list to hold clades that match 100%
-    #iterate over each tuple in the clade list
-    for clade in cladeList:
-        cladeName = clade[0] #temp variable for name
-        depth = clade[1] #temp variable for depth
-        sites = clade[2] #temp variable for aa def dictionary
-        shouldFind = len(sites) #number of sites that should match
-        found = 0 #a counter to hold matches to antigenic sites
-        #iterate over each position in sites dictionary
-        for pos, aa in sites.iteritems():
-            #translate pos to corresponding index in target sequence
-            index = int(pos) - 1
-            #if record at index has same amino acid as 'aa', increment 'found'
-            if record[index] == aa:
-                found += 1
-        if (found == shouldFind):
-            #print "\n%s contains %i of %i clade \"%s\" antigenic sites!" % (record.id,found, shouldFind, cladeName)
-            #add the matching clade name to a list
-            #matchList.append(cladeName)
-            #add the matching clade tuple to the list of matches
-            matchList.append(clade)
-    return matchList
-
-'''Compares depth level of clades in a list and returns the most granular one'''
-def decide_clade_by_depth(matchList):
-    #empty variable for maximum depth encountered
-    max_depth = 0
-    best_match_name = '' #variable to hold most granular clade
-    #for each matching clade, check depth of the corresponding tuple
-    for clade in matchList:
-        #if the current clade is 'deeper' than the one before it
-        if clade[1] > max_depth:
-            #store this depth
-            max_depth = clade[1]
-            #store name of the clade
-            best_match_name = clade[0]
-    return best_match_name
-
-'''opens the .csv file of clade definitions and clade "depth" '''
-with open (inFileHandle2, 'r') as clade_file:
-    #remove whitespace from the end of each line and split elements at commas
-    for line in clade_file:
-        #print "Current Line in File:" + line
-        sites={} #initialize a dictionary for clade
-        elementList = line.rstrip().split(',')
-        new_list = [] #start a new list to put non-empty strings into
-        #remove empty stings in list
-        for item in elementList:
-            if item != '':
-                new_list.append(item)
-        name = new_list.pop(0) #move 1st element to name field
-        depth = int(new_list.pop(0)) #move 2nd element to depth field
-        #read remaining pairs of non-null elements into clade def dictionary
-        for i in range(0, len(new_list), 2):
-            #move next 2 items from the list into the dict
-            pos = new_list[i]
-            aa = new_list[i + 1]
-            sites[pos] = aa
-        #add the clade info as a tuple to the cladeList[]
-        oneClade =(name, depth, sites)
-        cladeList.append(oneClade)
-    print "The List of Clades:"
-    for clade in cladeList:
-        print "Clade Name: %s Depth: %i Antigenic Sites: %i" % (clade[0], clade[1], len(clade[2]))
-        for pos, aa in clade[2].iteritems():
-            print "Pos: %s\tAA: %s" % (pos,aa)
-
-'''opens readable input file of sequences to parse using filename from cmd line,
-    instantiates as AA Sequence objects, with ppercase sequences'''
-with open(inFileHandle1,'r') as inFile:
-    #read in Sequences from fasta file, uppercase and add to seqList
-    for record in SeqIO.parse(inFile, "fasta", alphabet=IUPAC.protein):
-        record = record.upper()
-        seqList.append(record) #add Seq to list of Sequences
-    print "\n%i flu HA sequences will be compared to current clade definitions..." % len(seqList)
-    #parse each target sequence object
-    for record in seqList:
-        clade_call = '' #empty variale for final clade call on sequence
-        matchingCladeList = call_clade(record) #holds matching clade tuples
-        #if there is more than one clade match
-        if len(matchingCladeList) > 1:
-            #choose the most granular clade based on depth
-            clade_call = decide_clade_by_depth(matchingCladeList)
-        #if there is only one clade call
-        elif len(matchingCladeList) > 0:
-            clade = matchingCladeList[0] #take the first tuple in the list
-            clade_call = clade[0] #clade name is the first item in the tuple
-        #empty list return, no matches
-        else:
-            clade_call = "No_Match"
-        print clade_call
-        seq_name = record.id
-        mod_name = seq_name + "_" + clade_call
-        print "New Sequence Name: " + mod_name
-        record.id = mod_name
-
-
-#output fasta file with clade calls appended to sequence names
-SeqIO.write(seqList,outFile,"fasta")
-
-#print("\n%i Sequences Extracted to Output file: %s"  % ((len(extractedSeqList),outFileHandle)))
-inFile.close()
-clade_file.close()
-outFile.close()
-
--- a/assign_clades.xml	Wed Oct 25 21:09:09 2017 -0400
+++ b/assign_clades.xml	Wed Oct 25 21:53:38 2017 -0400
@@ -1,10 +1,9 @@
 <tool id="assign_clades" name="Assign Clades" version="0.1.0">
   <requirements>
-    <requirement type="package">package_biopython_1_61</requirement>
+    <requirement type="package" version="0.1.0">assign_clades</requirement>
   </requirements>
   <command detect_errors="exit_code"><![CDATA[
-    python
-    $__tool_directory__/assign_clades.py
+    assign_clades.py
     '$input_fasta'
     '$clade_definitions'
     '$output_file'