Mercurial > repos > drosofff > msp_sr_signature
comparison signature.py @ 0:d613dbee3ce4
Imported from capsule None
| author | drosofff |
|---|---|
| date | Mon, 03 Nov 2014 10:29:28 -0500 |
| parents | |
| children | 2b30861d95f4 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:d613dbee3ce4 |
|---|---|
| 1 #!/usr/bin/python | |
| 2 # script for computing overlap signatures from a bowtie output | |
| 3 # Christophe Antoniewski <drosofff@gmail.com> | |
| 4 # Usage signature.py <1:input> <2:format of input> <3:minsize query> <4:maxsize query> <5:minsize target> <6:maxsize target> | |
| 5 # <7:minscope> <8:maxscope> <9:output> <10:bowtie index> <11:procedure option> <12: graph (global or lattice)> | |
| 6 # <13: R code> | |
| 7 # version 2.0.0 | |
| 8 | |
| 9 import sys, subprocess, argparse | |
| 10 from smRtools import * | |
| 11 from collections import defaultdict # test whether it is required | |
| 12 | |
| 13 def Parser(): | |
| 14 the_parser = argparse.ArgumentParser() | |
| 15 the_parser.add_argument('--input', action="store", type=str, help="input alignment file") | |
| 16 the_parser.add_argument('--inputFormat', action="store", type=str, choices=["tabular", "bam", "sam"], help="format of alignment file (tabular/bam/sam)") | |
| 17 the_parser.add_argument('--minquery', type=int, help="Minimum readsize of query reads (nt) - must be an integer") | |
| 18 the_parser.add_argument('--maxquery', type=int, help="Maximum readsize of query reads (nt) - must be an integer") | |
| 19 the_parser.add_argument('--mintarget', type=int, help="Minimum readsize of target reads (nt) - must be an integer") | |
| 20 the_parser.add_argument('--maxtarget', type=int, help="Maximum readsize of target reads (nt) - must be an integer") | |
| 21 the_parser.add_argument('--minscope', type=int, help="Minimum overlap analyzed (nt) - must be an integer") | |
| 22 the_parser.add_argument('--maxscope', type=int, help="Maximum overlap analyzed (nt) - must be an integer") | |
| 23 the_parser.add_argument('--outputOverlapDataframe', action="store", type=str, help="Overlap dataframe") | |
| 24 the_parser.add_argument('--referenceGenome', action='store', help="path to the bowtie-indexed or fasta reference") | |
| 25 the_parser.add_argument('--extract_index', action='store_true', help="specify if the reference is an indexed Bowtie reference") | |
| 26 the_parser.add_argument('--graph', action='store', choices=["global", "lattice"], help="small RNA signature is computed either globally or by item (global-lattice)") | |
| 27 the_parser.add_argument('--rcode', type=str, help="R code to be passed to the python script") | |
| 28 args = the_parser.parse_args() | |
| 29 return args | |
| 30 | |
| 31 args = Parser() | |
| 32 | |
| 33 if args.extract_index: | |
| 34 GenomeFormat = "bowtieIndex" | |
| 35 else: | |
| 36 GenomeFormat = "fastaSource" | |
| 37 | |
| 38 if args.inputFormat == "tabular": | |
| 39 Genome = HandleSmRNAwindows (args.input, args.inputFormat, args.referenceGenome, GenomeFormat) | |
| 40 elif args.inputFormat == "sam": | |
| 41 Genome = HandleSmRNAwindows (args.input, args.inputFormat, args.referenceGenome, GenomeFormat) | |
| 42 else: | |
| 43 Genome = HandleSmRNAwindows (args.input, args.inputFormat, args.referenceGenome, GenomeFormat) | |
| 44 | |
| 45 # replace objDic by Genome.instanceDict or... objDic = Genome.instanceDict | |
| 46 objDic = Genome.instanceDict | |
| 47 | |
| 48 args.maxscope += 1 | |
| 49 | |
| 50 general_frequency_table = dict ([(i,0) for i in range(args.minscope, args.maxscope)]) | |
| 51 general_percent_table = dict ([(i,0) for i in range(args.minscope, args.maxscope)]) | |
| 52 OUT = open (args.outputOverlapDataframe, "w") | |
| 53 | |
| 54 if args.graph == "global": | |
| 55 ###### for normalized summing of local_percent_table(s) | |
| 56 readcount_dic = {} | |
| 57 Total_read_in_objDic = 0 | |
| 58 for item in objDic: | |
| 59 readcount_dic[item] = objDic[item].readcount(args.minquery, args.maxquery) | |
| 60 Total_read_in_objDic += readcount_dic[item] | |
| 61 ###### | |
| 62 for x in (objDic): | |
| 63 local_frequency_table = objDic[x].signature( args.minquery, args.maxquery, args.mintarget, args.maxtarget, range(args.minscope, args.maxscope) ) | |
| 64 local_percent_table = objDic[x].hannon_signature( args.minquery, args.maxquery, args.mintarget, args.maxtarget, range(args.minscope, args.maxscope) ) | |
| 65 try: | |
| 66 for overlap in local_frequency_table.keys(): | |
| 67 general_frequency_table[overlap] = general_frequency_table.get(overlap, 0) + local_frequency_table[overlap] | |
| 68 except: | |
| 69 pass | |
| 70 try: | |
| 71 for overlap in local_percent_table.keys(): | |
| 72 general_percent_table[overlap] = general_percent_table.get(overlap, 0) + (1./Total_read_in_objDic*readcount_dic[x]*local_percent_table[overlap]) | |
| 73 except: | |
| 74 pass | |
| 75 print >> OUT, "overlap\tnum of pairs\tprobability" | |
| 76 for classe in sorted(general_frequency_table): | |
| 77 print >> OUT, "%i\t%i\t%f" % (classe, general_frequency_table[classe], general_percent_table[classe]) | |
| 78 | |
| 79 else: | |
| 80 print >> OUT, "overlap\tnum of pairs\tprobability\titem" | |
| 81 for x in (objDic): | |
| 82 local_frequency_table = objDic[x].signature( args.minquery, args.maxquery, args.mintarget, args.maxtarget, range(args.minscope, args.maxscope) ) | |
| 83 local_percent_table = objDic[x].hannon_signature( args.minquery, args.maxquery, args.mintarget, args.maxtarget, range(args.minscope, args.maxscope) ) | |
| 84 for classe in range(args.minscope, args.maxscope): | |
| 85 print >> OUT, "%i\t%i\t%f\t%s" % (classe, local_frequency_table[classe], local_percent_table[classe], x) | |
| 86 | |
| 87 OUT.close() | |
| 88 | |
| 89 ## Run the R script that is defined in the xml using the Rscript binary provided with R. | |
| 90 R_command="Rscript "+ args.rcode | |
| 91 process = subprocess.Popen(R_command.split()) |
