Mercurial > repos > devteam > cummerbund_to_tabular
comparison cummerbund_to_tabular.py @ 0:21d03c46f286 draft default tip
Uploaded
| author | devteam |
|---|---|
| date | Thu, 02 Apr 2015 16:13:37 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:21d03c46f286 |
|---|---|
| 1 import os | |
| 2 import argparse | |
| 3 import sys | |
| 4 import string | |
| 5 import sqlite3 | |
| 6 | |
| 7 import logging | |
| 8 | |
| 9 | |
| 10 class CummerbundParser(object): | |
| 11 | |
| 12 def __init__(self, opts): | |
| 13 self.cummerbund_db = opts.filename | |
| 14 self.session = sqlite3.connect( os.path.abspath( self.cummerbund_db ) ) | |
| 15 | |
| 16 def generate_file( self, table ): | |
| 17 if hasattr( self, table ): | |
| 18 with open( '%s.tabular' % table, 'w' ) as self.fh: | |
| 19 getattr( self, table )() | |
| 20 else: | |
| 21 print 'Table %s is not supported or does not exist.' % table | |
| 22 | |
| 23 def __write_line(self, line): | |
| 24 columns = [] | |
| 25 for col in line: | |
| 26 if isinstance( col, float ): | |
| 27 if str( col ) in [ '-inf', 'inf' ]: | |
| 28 columns.append( str( col ) ) | |
| 29 elif col == int(col): | |
| 30 columns.append( str( int( col ) ) ) | |
| 31 else: | |
| 32 columns.append( str( col ) ) | |
| 33 elif col is None: | |
| 34 columns.append( '-' ) | |
| 35 else: | |
| 36 columns.append( str( col ) ) | |
| 37 print >>self.fh, '\t'.join( columns ) | |
| 38 | |
| 39 def __get_diff_from_table( self, table, identifier ): | |
| 40 columns = [ '${table}.${identifier}', '${table}.gene_id', 'genes.gene_short_name', 'genes.locus', | |
| 41 '${table}.sample_1', '${table}.sample_2', '${table}.status', | |
| 42 '${table}.value_1', '${table}.value_2', '${table}.JS_dist', | |
| 43 '${table}.test_stat', '${table}.p_value', '${table}.q_value', | |
| 44 '${table}.significant' ] | |
| 45 query = string.Template( 'SELECT %s FROM ${table} JOIN genes on ${table}.gene_id = genes.gene_id' % ', '.join(columns) ) | |
| 46 result = self.session.execute( query.safe_substitute( table=table, identifier=identifier ) ) | |
| 47 self.__write_line( [ 'test_id', 'gene_id', 'gene', 'locus', 'sample_1', | |
| 48 'sample_2', 'status', 'value_1', 'value_2', 'sqrt(JS)', | |
| 49 'test_stat', 'p_value', 'q_value', 'significant' ] ) | |
| 50 for row in result: | |
| 51 self.__write_line( row ) | |
| 52 | |
| 53 def __get_read_group_data( self, table, identifier ): | |
| 54 header = [ 'tracking_id', 'condition', 'replicate', 'raw_frags', | |
| 55 'internal_scaled_frags', 'external_scaled_frags', 'FPKM', | |
| 56 'effective_length', 'status' ] | |
| 57 columns = [ identifier, 'sample_name', 'replicate', 'raw_frags', | |
| 58 'internal_scaled_frags', 'external_scaled_frags', 'fpkm', | |
| 59 'effective_length', 'status' ] | |
| 60 self.__write_line( header ) | |
| 61 for row in self.session.execute( 'SELECT %s FROM %s' % ( ', '.join( columns ), table ) ): | |
| 62 self.__write_line( row ) | |
| 63 | |
| 64 | |
| 65 def __get_exp_diff( self, table, data_table, data_table_as, column ): | |
| 66 header = [ 'test_id', 'gene_id', 'gene', 'locus', 'sample_1', 'sample_2', | |
| 67 'status', 'value_1', 'value_2', 'log2(fold_change)', 'test_stat', | |
| 68 'p_value', 'q_value', 'significant' ] | |
| 69 columns = [ '${dtas}.${column}', '${table}.gene_id', '${table}.gene_short_name', '${table}.locus', | |
| 70 '${dtas}.sample_1', '${dtas}.sample_2', '${dtas}.status', | |
| 71 '${dtas}.value_1', '${dtas}.value_2', '${dtas}.log2_fold_change', | |
| 72 '${dtas}.test_stat', '${dtas}.p_value', '${dtas}.q_value', | |
| 73 '${dtas}.significant' ] | |
| 74 query = string.Template( 'SELECT %s FROM ${dtab} as ${dtas} JOIN ${table} on ${dtas}.${column} = ${table}.${column}' % ', '.join( columns ) ) | |
| 75 self.__write_line( header ) | |
| 76 for row in self.session.execute( query.safe_substitute( dtas=data_table_as, dtab=data_table, table=table, column=column ) ): | |
| 77 self.__write_line( row ) | |
| 78 | |
| 79 def __get_per_sample_fpkm( self, identifiers, table, column ): | |
| 80 columns = [] | |
| 81 for identifier in identifiers: | |
| 82 samples = self.session.execute( "SELECT sample_name FROM %s WHERE %s = '%s' ORDER BY sample_name ASC" % ( table, column, identifier[0] ) ) | |
| 83 for sample in samples: | |
| 84 sample_name = sample[0] | |
| 85 columns.extend( [ '%s_FPKM' % sample_name, | |
| 86 '%s_conf_lo' % sample_name, | |
| 87 '%s_conf_hi' % sample_name, | |
| 88 '%s_status' % sample_name ] ) | |
| 89 return columns | |
| 90 | |
| 91 def __get_fpkms( self, table, data_table, column ): | |
| 92 tss_columns = [ column, 'class_code', 'nearest_ref_id', 'gene_id', | |
| 93 'gene_short_name', column, 'locus', 'length', 'coverage' ] | |
| 94 output_cols = [ 'tracking_id', 'class_code', 'nearest_ref_id', 'gene_id', 'gene_short_name', | |
| 95 'tss_id', 'locus', 'length', 'coverage' ] | |
| 96 tss_groups = self.session.execute( 'SELECT %s FROM %s LIMIT 1' % ( ', '.join( tss_columns ), table ) ) | |
| 97 output_cols.extend( self.__get_per_sample_fpkm( identifiers=tss_groups, column=column, table=data_table ) ) | |
| 98 self.__write_line( output_cols ) | |
| 99 tss_groups = self.session.execute( 'SELECT %s FROM %s' % ( ', '.join( tss_columns ), table ) ) | |
| 100 for tss_group in tss_groups: | |
| 101 out_data = list( tss_group ) | |
| 102 samples = self.session.execute( "SELECT fpkm, conf_hi, conf_lo, quant_status FROM %s WHERE %s = '%s' ORDER BY sample_name ASC" % ( data_table, column, tss_group[0] ) ) | |
| 103 for sample in samples: | |
| 104 out_data.extend( list( sample ) ) | |
| 105 self.__write_line( out_data ) | |
| 106 | |
| 107 def __get_count_data( self, table, column ): | |
| 108 output_cols = [ 'tracking_id' ] | |
| 109 tss_groups = self.session.execute( 'SELECT %s FROM %s LIMIT 1' % ( column, table ) ) | |
| 110 output_cols.extend( self.__get_per_sample_count_cols( identifiers=tss_groups, table=table, column=column ) ) | |
| 111 self.__write_line( output_cols ) | |
| 112 self.__get_per_sample_count_data( table=table, column=column ) | |
| 113 | |
| 114 def __get_per_sample_count_data( self, table, column ): | |
| 115 result = self.session.execute( 'SELECT DISTINCT(%s) FROM %s' % ( column, table ) ) | |
| 116 for row in result: | |
| 117 isoform_id = row[0] | |
| 118 output_data = [ isoform_id ] | |
| 119 per_sample = self.session.execute( "SELECT count, variance, uncertainty, dispersion, status FROM %s WHERE %s = '%s' ORDER BY sample_name ASC" % ( table, column, isoform_id ) ) | |
| 120 for samplerow in per_sample: | |
| 121 output_data.extend( list( samplerow ) ) | |
| 122 self.__write_line( output_data ) | |
| 123 | |
| 124 def __get_per_sample_count_cols( self, identifiers, table, column ): | |
| 125 columns = [] | |
| 126 for identifier in identifiers: | |
| 127 samples = self.session.execute( "SELECT sample_name FROM %s WHERE %s = '%s' ORDER BY sample_name ASC" % ( table, column, identifier[0] ) ) | |
| 128 for sample in samples: | |
| 129 sample_name = sample[0] | |
| 130 columns.extend( [ '%s_count' % sample_name, | |
| 131 '%s_count_variance' % sample_name, | |
| 132 '%s_count_uncertainty_var' % sample_name, | |
| 133 '%s_count_dispersion_var' % sample_name, | |
| 134 '%s_status' % sample_name ] ) | |
| 135 return columns | |
| 136 | |
| 137 def splicing_diff( self ): | |
| 138 self.__get_diff_from_table( 'splicingDiffData', 'TSS_group_id' ) | |
| 139 | |
| 140 def promoters_diff( self ): | |
| 141 self.__get_diff_from_table( 'promoterDiffData', 'gene_id' ) | |
| 142 | |
| 143 def cds_diff( self ): | |
| 144 self.__get_diff_from_table( 'CDSDiffData', 'gene_id' ) | |
| 145 | |
| 146 def tss_fpkm( self ): | |
| 147 data_table = 'TSSData' | |
| 148 table = 'TSS' | |
| 149 column = 'TSS_group_id' | |
| 150 self.__get_fpkms( data_table=data_table, table=table, column=column ) | |
| 151 | |
| 152 def isoform_fpkm( self ): | |
| 153 data_table = 'isoformData' | |
| 154 table = 'isoforms' | |
| 155 column = 'isoform_id' | |
| 156 self.__get_fpkms( data_table=data_table, table=table, column=column ) | |
| 157 | |
| 158 def genes_fpkm( self ): | |
| 159 output_cols = [ 'tracking_id', 'class_code', 'nearest_ref_id', 'gene_id', 'gene_short_name', | |
| 160 'tss_id', 'locus', 'length', 'coverage' ] | |
| 161 iso_groups = self.session.execute( 'SELECT gene_id FROM genes LIMIT 1' ) | |
| 162 output_cols.extend( self.__get_per_sample_fpkm( identifiers=iso_groups, column='gene_id', table='geneData' ) ) | |
| 163 self.__write_line( output_cols ) | |
| 164 data_columns = [ 'genes.gene_id', 'genes.class_code', 'genes.nearest_ref_id', 'genes.gene_id', 'genes.gene_short_name', | |
| 165 'GROUP_CONCAT(TSS.TSS_group_id)', 'genes.locus', 'genes.length', 'genes.coverage' ] | |
| 166 query = 'SELECT %s FROM genes JOIN TSS on TSS.gene_id = genes.gene_id GROUP BY genes.gene_id' % ', '.join( data_columns ) | |
| 167 result = self.session.execute( query ) | |
| 168 for row in result: | |
| 169 gene_id = row[0] | |
| 170 output_data = list( row ) | |
| 171 per_sample = self.session.execute( "SELECT fpkm, conf_lo, conf_hi, quant_status FROM geneData WHERE gene_id = '%s' ORDER BY sample_name ASC" % gene_id ) | |
| 172 for samplerow in per_sample: | |
| 173 output_data.extend( list( samplerow ) ) | |
| 174 self.__write_line( output_data ) | |
| 175 | |
| 176 def cds_fpkm( self ): | |
| 177 output_cols = [ 'tracking_id', 'class_code', 'nearest_ref_id', 'gene_id', 'gene_short_name', | |
| 178 'tss_id', 'locus', 'length', 'coverage' ] | |
| 179 iso_groups = self.session.execute( 'SELECT CDS_id FROM CDS LIMIT 1' ) | |
| 180 output_cols.extend( self.__get_per_sample_fpkm( identifiers=iso_groups, column='CDS_id', table='CDSData' ) ) | |
| 181 self.__write_line( output_cols ) | |
| 182 data_columns = [ 'CDS_id', 'class_code', 'nearest_ref_id', 'gene_id', 'gene_short_name', | |
| 183 'GROUP_CONCAT(TSS_group_id)', 'locus', 'length', 'coverage' ] | |
| 184 query = 'SELECT %s FROM CDS GROUP BY CDS_id' % ', '.join( data_columns ) | |
| 185 result = self.session.execute( query ) | |
| 186 for row in result: | |
| 187 CDS_id = row[0] | |
| 188 output_data = list( row ) | |
| 189 per_sample = self.session.execute( "SELECT fpkm, conf_lo, conf_hi, quant_status FROM CDSData WHERE CDS_id = '%s' ORDER BY sample_name ASC" % CDS_id ) | |
| 190 for samplerow in per_sample: | |
| 191 output_data.extend( list( samplerow ) ) | |
| 192 self.__write_line( output_data ) | |
| 193 | |
| 194 def tss_count_tracking( self ): | |
| 195 self.__get_count_data( table='TSSCount', column='TSS_group_id' ) | |
| 196 | |
| 197 def isoform_count( self ): | |
| 198 self.__get_count_data( table='isoformCount', column='isoform_id' ) | |
| 199 | |
| 200 def genes_count( self ): | |
| 201 self.__get_count_data( table='geneCount', column='gene_id' ) | |
| 202 | |
| 203 def cds_count( self ): | |
| 204 self.__get_count_data( table='CDSCount', column='CDS_id' ) | |
| 205 | |
| 206 def tss_group_exp( self ): | |
| 207 columns = [ 'TEDD.TSS_group_id', 'TSS.gene_id', 'TSS.gene_short_name', 'TSS.locus', | |
| 208 'TEDD.sample_1', 'TEDD.sample_2', 'TEDD.status', | |
| 209 'TEDD.value_1', 'TEDD.value_2', 'TEDD.log2_fold_change', | |
| 210 'TEDD.test_stat', 'TEDD.p_value', 'TEDD.q_value', 'TEDD.significant' ] | |
| 211 query = [ 'SELECT %s FROM TSSExpDiffData AS TEDD' % ', '.join(columns), | |
| 212 'JOIN TSS on TEDD.TSS_group_id = TSS.TSS_group_id' ] | |
| 213 self.__write_line( [ 'test_id', 'gene_id', 'gene', 'locus', | |
| 214 'sample_1', 'sample_2', 'status', 'value_1', | |
| 215 'value_2', 'log2(fold_change)', 'test_stat', | |
| 216 'p_value', 'q_value', 'significant' ] ) | |
| 217 for row in self.session.execute( ' '.join( query ) ): | |
| 218 self.__write_line( row ) | |
| 219 | |
| 220 def run_info( self ): | |
| 221 self.__write_line( [ 'param', 'value' ] ) | |
| 222 for row in self.session.execute( 'SELECT param, value FROM runInfo' ): | |
| 223 self.__write_line( row ) | |
| 224 | |
| 225 def read_groups( self ): | |
| 226 self.__write_line( [ 'file', 'condition', 'replicate_num', 'total_mass', 'norm_mass', 'internal_scale', 'external_scale' ] ) | |
| 227 for row in self.session.execute( 'SELECT file, sample_name, replicate, total_mass, norm_mass, internal_scale, external_scale FROM replicates' ): | |
| 228 self.__write_line( row ) | |
| 229 | |
| 230 def isoform_exp_diff( self ): | |
| 231 self.__get_exp_diff( table='isoforms', data_table='isoformExpDiffData', data_table_as='iED', column='isoform_id' ) | |
| 232 | |
| 233 def gene_exp_diff( self ): | |
| 234 self.__get_exp_diff( table='genes', data_table='geneExpDiffData', data_table_as='gEDD', column='gene_id' ) | |
| 235 | |
| 236 def cds_exp_diff( self ): | |
| 237 self.__get_exp_diff( table='CDS', data_table='CDSExpDiffData', data_table_as='CED', column='CDS_id' ) | |
| 238 | |
| 239 def tss_rg( self ): | |
| 240 self.__get_read_group_data( table='TSSReplicateData', identifier='TSS_group_id' ) | |
| 241 | |
| 242 def isoform_rg( self ): | |
| 243 self.__get_read_group_data( table='isoformReplicateData', identifier='isoform_id' ) | |
| 244 | |
| 245 def gene_rg( self ): | |
| 246 self.__get_read_group_data( table='geneReplicateData', identifier='gene_id' ) | |
| 247 | |
| 248 def cds_rg( self ): | |
| 249 self.__get_read_group_data( table='CDSReplicateData', identifier='CDS_id' ) | |
| 250 | |
| 251 def var_model( self ): | |
| 252 header = [ 'condition', 'locus', 'compatible_count_mean', 'compatible_count_var', 'total_count_mean', 'total_count_var', 'fitted_var' ] | |
| 253 self.__write_line( header ) | |
| 254 for row in self.session.execute( 'SELECT %s FROM varModel' % ', '.join( header ) ): | |
| 255 self.__write_line( row ) | |
| 256 | |
| 257 if __name__ == '__main__': | |
| 258 parser = argparse.ArgumentParser() | |
| 259 parser.add_argument( '--file', dest='filename' ) | |
| 260 parser.add_argument( '--tables', dest='tables', action='append' ) | |
| 261 opts = parser.parse_args() | |
| 262 cb = CummerbundParser( opts ) | |
| 263 for table in opts.tables: | |
| 264 cb.generate_file( table ) |
