Mercurial > repos > p.lucas > taxid_genusexpand_taxid2acc
comparison TAXID_genusexpand_taxid2acc.py @ 12:19e175a84d0e draft
Uploaded updated python script
| author | p.lucas |
|---|---|
| date | Thu, 21 Sep 2023 15:19:55 +0000 |
| parents | 3d116861e380 |
| children | 932ba9e04f3a |
comparison
equal
deleted
inserted
replaced
| 11:3d116861e380 | 12:19e175a84d0e |
|---|---|
| 8 # - file with all acc numbers that are excluded in taxid(s) provided by user (extended to genus level) | 8 # - file with all acc numbers that are excluded in taxid(s) provided by user (extended to genus level) |
| 9 ### | 9 ### |
| 10 | 10 |
| 11 ### Libraries to import: | 11 ### Libraries to import: |
| 12 import argparse, os, sys, csv, warnings, re, itertools, operator | 12 import argparse, os, sys, csv, warnings, re, itertools, operator |
| 13 #from subprocess import Popen,PIPE | |
| 14 import subprocess | |
| 13 from os import path | 15 from os import path |
| 14 import ncbi_genome_download as ngd | 16 import ncbi_genome_download as ngd |
| 15 # to find all lineage and in case of no complete genome, the deduction of closests complete genomes (same genus, order...) | 17 # to find all lineage and in case of no complete genome, the deduction of closests complete genomes (same genus, order...) |
| 16 from ete3 import NCBITaxa | 18 from ete3 import NCBITaxa |
| 17 | 19 |
| 18 # to be able to report line number in error messages | 20 # to be able to report line number in error messages |
| 19 import inspect | 21 import inspect |
| 20 frame = inspect.currentframe() | 22 def lineno(): |
| 23 """Returns the current line number in our program.""" | |
| 24 return str(inspect.currentframe().f_back.f_lineno) | |
| 25 | |
| 21 | 26 |
| 22 # debug | 27 # debug |
| 23 b_test_load_taxids = False # ok 2023 03 07 | 28 test_dir = 'test_TAXID_genusexpand_taxid2acc/' |
| 24 b_test_add_host_chr_taxids_accnr_from_ori_list = False # ok 2023 03 07 | 29 b_test_load_taxids = False # ok 2023 08 25 |
| 30 b_test_add_host_chr_taxids_accnr_from_ori_list = False # ok 2023 08 25 | |
| 25 | 31 |
| 26 prog_tag = '[' + os.path.basename(__file__) + ']' | 32 prog_tag = '[' + os.path.basename(__file__) + ']' |
| 27 | 33 |
| 28 # boolean to know if we dowload ncbi taxonomy file in current env | 34 # boolean to know if we dowload ncbi taxonomy file in current env |
| 29 b_load_ncbi_tax_f = False | 35 b_load_ncbi_tax_f = False |
| 34 accnrlist = [] | 40 accnrlist = [] |
| 35 | 41 |
| 36 # order = -4 | 42 # order = -4 |
| 37 # family or clade = -3 | 43 # family or clade = -3 |
| 38 # subtribe or genus = -2 | 44 # subtribe or genus = -2 |
| 39 curr_index_in_lineage = -2 | 45 curr_index_in_lineage = -1 |
| 40 min_index_in_lineage = -4 | 46 min_index_in_lineage = -4 |
| 41 | 47 |
| 42 # boolean to know if we download ncbi taxonomy file in current env | 48 # boolean to know if we download ncbi taxonomy file in current env |
| 43 b_load_ncbi_tax_f = False | 49 b_load_ncbi_tax_f = False |
| 44 b_test_all = False | 50 b_test_all = False |
| 45 b_test = False | 51 b_test = False |
| 46 b_acc_in_f = False | 52 b_acc_in_f = False |
| 47 b_acc_out_f = False | 53 b_acc_out_f = False |
| 48 | 54 |
| 49 b_verbose = False | 55 b_verbose = False |
| 56 | |
| 57 # variables for ncbi-genome-download | |
| 58 ncbigenomedownload_section = 'refseq' # genbank | |
| 59 organisms_to_search_in = 'vertebrate_other,vertebrate_mammalian,plant,invertebrate' | |
| 60 assembly_levels = 'complete,chromosome' | |
| 50 | 61 |
| 51 # rank = '' # rank level retained by user | 62 # rank = '' # rank level retained by user |
| 52 # rank_num = index of rank retained by user | 63 # rank_num = index of rank retained by user |
| 53 | 64 |
| 54 # # set to check that provided rank exist to be sure to be able to use it | 65 # # set to check that provided rank exist to be sure to be able to use it |
| 133 ' '.join(['./TAXID_genusexpand_taxid2acc.py', | 144 ' '.join(['./TAXID_genusexpand_taxid2acc.py', |
| 134 '-i taxid_accnr_list.tsv', | 145 '-i taxid_accnr_list.tsv', |
| 135 '-o accnr_out_list.txt']),"\n\n" ])) | 146 '-o accnr_out_list.txt']),"\n\n" ])) |
| 136 parser.print_help() | 147 parser.print_help() |
| 137 print(prog_tag + "[Error] we found "+str(len(sys.argv)) + | 148 print(prog_tag + "[Error] we found "+str(len(sys.argv)) + |
| 138 " arguments, exit line "+str(frame.f_lineno)) | 149 " arguments, exit line "+lineno()) |
| 139 sys.exit(0) | 150 sys.exit(0) |
| 140 | 151 |
| 141 # print('args:', args) | 152 # print('args:', args) |
| 142 # if(not b_test): | 153 # if(not b_test): |
| 143 if args.ncbi_tax_f is not None: | 154 if args.ncbi_tax_f is not None: |
| 144 ncbi_tax_f = os.path.abspath(args.ncbi_tax_f) | 155 ncbi_tax_f = os.path.abspath(args.ncbi_tax_f) |
| 145 else: | 156 else: |
| 146 # ncbi_tax_f = "/nfs/data/db/ete3/taxa.sqlite" | 157 # ncbi_tax_f = "/nfs/data/db/ete3/taxa.sqlite" |
| 147 ncbi_tax_f = os.path.expanduser("~/.etetoolkit/taxa.sqlite") | 158 ncbi_tax_f = os.path.expanduser("~/.etetoolkit/taxa.sqlite") |
| 148 if args.taxid_acc_in_f is not None: | 159 if args.taxid_acc_in_f is not None: |
| 149 taxid_acc_in_f = os.path.abspath(args.taxid_acc_in_f) | 160 taxid_acc_in_f = os.path.abspath(args.taxid_acc_f) |
| 150 b_acc_in_f = True | 161 b_acc_in_f = True |
| 151 elif(not b_test): | 162 elif(not b_test): |
| 152 sys.exit("[Error] You must provide taxid_acc_in_f") | 163 sys.exit("[Error] You must provide taxid_acc_in_f") |
| 153 if args.acc_out_f is not None: | 164 if args.acc_out_f is not None: |
| 154 acc_out_f = os.path.abspath(args.acc_out_f) | 165 acc_out_f = os.path.abspath(args.acc_out_f) |
| 172 | 183 |
| 173 # -------------------------------------------------------------------------- | 184 # -------------------------------------------------------------------------- |
| 174 # to sort uniq, for a list, only need to add list conversion | 185 # to sort uniq, for a list, only need to add list conversion |
| 175 # -------------------------------------------------------------------------- | 186 # -------------------------------------------------------------------------- |
| 176 mapper= map # Python ≥ 3 | 187 mapper= map # Python ≥ 3 |
| 177 def sort_uniq(sequence): | 188 def sort_uniq(sequence: list): |
| 178 return mapper( | 189 return mapper( |
| 179 operator.itemgetter(0), | 190 operator.itemgetter(0), |
| 180 itertools.groupby(sorted(sequence))) | 191 itertools.groupby(sorted(sequence))) |
| 181 # -------------------------------------------------------------------------- | 192 # -------------------------------------------------------------------------- |
| 182 | 193 |
| 183 # -------------------------------------------------------------------------- | 194 # -------------------------------------------------------------------------- |
| 184 # load taxid acc list, return taxidlist | 195 # Procedure: load taxid acc list, return taxidlist |
| 185 # -------------------------------------------------------------------------- | 196 # -------------------------------------------------------------------------- |
| 186 def load_taxids(taxid_acc_tabular_f): | 197 def load_taxids(taxid_acc_tabular_f: str): |
| 187 | 198 |
| 188 if not path.exists(taxid_acc_tabular_f): | 199 if not path.exists(taxid_acc_tabular_f): |
| 189 sys.exit("Error " + taxid_acc_tabular_f + | 200 sys.exit("Error " + taxid_acc_tabular_f + |
| 190 " file does not exist, line "+ str(sys._getframe().f_lineno) ) | 201 " file does not exist, line "+ lineno() ) |
| 191 | 202 |
| 192 cmd = "cut -f 1,2 "+taxid_acc_tabular_f+" | sort | uniq " | 203 cmd = "cut -f 1,2 "+taxid_acc_tabular_f+" | sort -u " |
| 193 | 204 |
| 194 for line in os.popen(cmd).readlines(): | 205 for line in os.popen(cmd).readlines(): |
| 195 if line.rstrip() != "": | 206 if line.rstrip() != "": |
| 196 k, v = line.rstrip().split() | 207 k, v = line.rstrip().split() |
| 197 taxidlist.append(k) | 208 taxidlist.append(k) |
| 198 accnrlist.append(v) | 209 accnrlist.append(v) |
| 199 | 210 # print(f"last item added to accnrlist:{accnrlist[-1]}, line {lineno()}") |
| 200 return taxidlist | 211 |
| 201 # -------------------------------------------------------------------------- | 212 # -------------------------------------------------------------------------- |
| 202 | 213 |
| 203 # test load_taxids function | 214 # test load_taxids function |
| 204 # display taxidlist, then exit | 215 # display taxidlist, then exit |
| 205 if b_test_load_taxids: | 216 if b_test_load_taxids: |
| 206 taxid_acc_tabular_f = 'megablast_out_f_taxid_acc_host.tsv' | 217 taxid_acc_tabular_f = test_dir + 'megablast_out_f_taxid_acc_host.tsv' |
| 207 print("START b_test_load_taxids") | 218 print("START b_test_load_taxids") |
| 208 print("loading "+taxid_acc_tabular_f+" file") | 219 print("loading "+taxid_acc_tabular_f+" file") |
| 209 taxidlist = load_taxids(taxid_acc_tabular_f) | 220 load_taxids(taxid_acc_tabular_f) |
| 210 for i in range(len(taxidlist)): | 221 for i in range(len(taxidlist)): |
| 211 print(f"{taxidlist[i]}\t{accnrlist[i]}") | 222 print(f"{taxidlist[i]}\t{accnrlist[i]}") |
| 212 print("END b_test_load_taxids") | 223 print("END b_test_load_taxids") |
| 213 if not b_test_add_host_chr_taxids_accnr_from_ori_list: | 224 if not b_test_add_host_chr_taxids_accnr_from_ori_list: |
| 214 sys.exit() | 225 sys.exit() |
| 231 # function to retain the most recent acc nr for host complete genome found: | 242 # function to retain the most recent acc nr for host complete genome found: |
| 232 # - return acc nr of most recent complete genome | 243 # - return acc nr of most recent complete genome |
| 233 # - print accnr species and name retained | 244 # - print accnr species and name retained |
| 234 # - reinitiate tmp lists of accnrlisttmp speciestmp and nametmp | 245 # - reinitiate tmp lists of accnrlisttmp speciestmp and nametmp |
| 235 # -------------------------------------------------------------------------- | 246 # -------------------------------------------------------------------------- |
| 236 def retain_1accnr(accnrlisttmp, speciestmp, nametmp): | 247 def retain_1accnr(accnrlisttmp: list, speciestmp: list, nametmp: list) -> str: |
| 237 | 248 |
| 238 max_accnr_version = 0 | 249 max_accnr_version = 0 |
| 239 curr_accnr_version = 0 | 250 curr_accnr_version = 0 |
| 240 max_accnr_nr = 0 | 251 max_accnr_nr = 0 |
| 241 curr_accnr_nr = 0 | 252 curr_accnr_nr = 0 |
| 268 | 279 |
| 269 return kept_accn | 280 return kept_accn |
| 270 # -------------------------------------------------------------------------- | 281 # -------------------------------------------------------------------------- |
| 271 | 282 |
| 272 # -------------------------------------------------------------------------- | 283 # -------------------------------------------------------------------------- |
| 273 # procedure to find complete genome closely related to current taxid | 284 # Function to find complete genome closely related to current taxid |
| 274 # goes upper in taxonomy if nothing found until order | 285 # goes upper in taxonomy if nothing found until order |
| 275 # -------------------------------------------------------------------------- | 286 # -------------------------------------------------------------------------- |
| 276 def ngd_upper_lineage(curr_index_in_lineage, | 287 def ngd_upper_lineage(curr_index_in_lineage: int, |
| 277 lineage, | 288 lineage: list, |
| 278 ncbi, | 289 ncbi: NCBITaxa, |
| 279 accnrlisttmp, # current working list | 290 accnrlisttmp: list, # current working list |
| 280 accnrlist, # final list, if something added (or min index reached), recursivity stop | 291 accnrlist: list, # final list, if something added (or min index reached), recursivity stop |
| 281 speciestmp, | 292 speciestmp: list, |
| 282 nametmp | 293 nametmp: list |
| 283 ): | 294 ) -> str: |
| 284 print(f"{prog_tag} ngd_upper_lineage with curr_index_in_lineage:{curr_index_in_lineage}") | 295 print(f"{prog_tag} ngd_upper_lineage with curr_index_in_lineage:{curr_index_in_lineage}") |
| 285 | 296 |
| 286 # deduce up rank, search complet genome/chr in | 297 # deduce up rank, search complet genome/chr in |
| 287 upper_taxid=str(lineage[curr_index_in_lineage]) # order when last is species | 298 upper_taxid=str(lineage[curr_index_in_lineage]) # order when last is species |
| 288 rank = ncbi.get_rank([lineage[curr_index_in_lineage]]) | 299 rank = ncbi.get_rank([lineage[curr_index_in_lineage]]) |
| 291 leaves_taxids = ncbi.get_descendant_taxa(upper_taxid, | 302 leaves_taxids = ncbi.get_descendant_taxa(upper_taxid, |
| 292 intermediate_nodes=False, | 303 intermediate_nodes=False, |
| 293 collapse_subspecies=False, | 304 collapse_subspecies=False, |
| 294 return_tree=False | 305 return_tree=False |
| 295 ) | 306 ) |
| 307 | |
| 296 # int conversion to strings | 308 # int conversion to strings |
| 297 leaves_taxids = list(map(str, leaves_taxids)) | 309 leaves_taxids = list(map(str, leaves_taxids)) |
| 298 leaves_taxids_list = ','.join(leaves_taxids) | 310 leaves_taxids_list = ','.join(leaves_taxids) |
| 299 cmd = f"ncbi-genome-download -s genbank --taxids {leaves_taxids_list} --assembly-level complete,chromosome --dry-run vertebrate_other,vertebrate_mammalian,plant,invertebrate 2>&1" | 311 |
| 300 # print(f"{prog_tag} cmd:{cmd}") | 312 if b_verbose: |
| 313 print(f"{prog_tag} leaves_taxids for taxid {upper_taxid}:{leaves_taxids_list}") | |
| 314 cmd = f"ncbi-genome-download -s {ncbigenomedownload_section} --taxids {leaves_taxids_list} --assembly-levels {assembly_levels} --dry-run {organisms_to_search_in} 2>&1" | |
| 315 if b_verbose: | |
| 316 print(f"{prog_tag} cmd:{cmd}") | |
| 301 | 317 |
| 302 # specific to retain_1accn to avoid lists are crashed by other ngd call | 318 # specific to retain_1accn to avoid lists are crashed by other ngd call |
| 303 accnrlisttmp_r = [] | 319 accnrlisttmp_r = [] |
| 304 speciestmp_r = [] | 320 speciestmp_r = [] |
| 305 nametmp_r = [] | 321 nametmp_r = [] |
| 322 | |
| 306 for line in os.popen(cmd).readlines(): | 323 for line in os.popen(cmd).readlines(): |
| 307 | 324 # print(f"line 314:{line.rstrip()}") |
| 308 if not re.match("^Considering", line): | 325 if not re.match("^Considering", line): |
| 309 # print(f"line:{line.rstrip()}") | 326 # print(f"line 316:{line.rstrip()}") |
| 310 if re.match("^(?:ERROR|Error): No downloads", line): | 327 if re.match("^(?:ERROR|Error): No downloads", line): |
| 311 print(f"{prog_tag} No chr/complete genome for taxid:{upper_taxid} rank:{rank} (expanding name:{name})") | 328 print(f"{prog_tag} No chr/complete genome for taxid:{upper_taxid} rank:{rank} (expanding name:{name})") |
| 312 if curr_index_in_lineage > min_index_in_lineage: # need to go on with upper lineage if not last accepted | 329 if curr_index_in_lineage > min_index_in_lineage: # need to go on with upper lineage if not last accepted |
| 313 curr_index_in_lineage = curr_index_in_lineage - 1 | 330 curr_index_in_lineage = curr_index_in_lineage - 1 |
| 314 print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {str(sys._getframe().f_lineno)}") | 331 print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {lineno()}") |
| 315 return ngd_upper_lineage(curr_index_in_lineage, | 332 return ngd_upper_lineage(curr_index_in_lineage, |
| 316 lineage, | 333 lineage, |
| 317 ncbi, | 334 ncbi, |
| 318 accnrlisttmp, # current working list | 335 accnrlisttmp, # current working list |
| 319 accnrlist, # final list, if something added (or min index reached), recursivity stop | 336 accnrlist, # final list, if something added (or min index reached), recursivity stop |
| 320 speciestmp, | 337 speciestmp, |
| 321 nametmp | 338 nametmp |
| 322 ) | 339 ) |
| 323 else: | 340 else: |
| 324 acc_nr, species, name = line.rstrip().split("\t") | 341 # print(f"line 331:{line}") |
| 342 try: | |
| 343 acc_nr, species, name = line.rstrip().split("\t") | |
| 344 except ValueError as ve: | |
| 345 sys.exit(f"ValueError {ve}: for split of line '{line}'") | |
| 325 accnrlisttmp_r.append(acc_nr) | 346 accnrlisttmp_r.append(acc_nr) |
| 326 speciestmp_r.append(species) | 347 speciestmp_r.append(species) |
| 327 nametmp_r.append(name) | 348 nametmp_r.append(name) |
| 328 if b_verbose: | 349 if b_verbose: |
| 329 print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr} (name:{name})") | 350 print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr} (name:{name})") |
| 337 | 358 |
| 338 # -------------------------------------------------------------------------- | 359 # -------------------------------------------------------------------------- |
| 339 # read taxids, deduce complete genomes available in genblank, provides in output file | 360 # read taxids, deduce complete genomes available in genblank, provides in output file |
| 340 # the acc number in addition to those already listed | 361 # the acc number in addition to those already listed |
| 341 # -------------------------------------------------------------------------- | 362 # -------------------------------------------------------------------------- |
| 342 def add_host_chr_taxids_accnr_from_ori_list(taxidlist, | 363 def add_host_chr_taxids_accnr_from_ori_list(taxidlist: list, |
| 343 accnrlist, | 364 accnrlist: list, |
| 344 acc_out_f): | 365 acc_out_f: str): |
| 345 | 366 |
| 346 # store all accnr found for complete genome of current taxid (or from same family/order) | 367 # store all accnr found for complete genome of current taxid (or from same family/order) |
| 347 # the aim is to keep only the most recent/complete | 368 # the aim is to keep only the most recent/complete |
| 348 accnrlisttmp = [] | 369 accnrlisttmp = [] |
| 349 speciestmp = [] | 370 speciestmp = [] |
| 353 taxids_list=','.join(taxidlist) | 374 taxids_list=','.join(taxidlist) |
| 354 | 375 |
| 355 # # ------------------------------------------ | 376 # # ------------------------------------------ |
| 356 # # ncbi-genome-download as a library | 377 # # ncbi-genome-download as a library |
| 357 # ngd_out_f= os.getcwd()+'/accnr_sp_accnr.tsv' | 378 # ngd_out_f= os.getcwd()+'/accnr_sp_accnr.tsv' |
| 358 # ngd.download(section='genbank', | 379 # ngd.download(section=ncbigenomedownload_section, |
| 359 # taxids=taxids_list, | 380 # taxids=taxids_list, |
| 360 # assembly_levels='chromosome', | 381 # assembly_levels=assembly_levels, |
| 361 # flat_output=True, | 382 # flat_output=True, |
| 362 # output=ngd_out_f, | 383 # output=ngd_out_f, |
| 363 # groups='vertebrate_other,vertebrate_mammalian,plant,invertebrate', | 384 # groups=organisms_to_search_in, |
| 364 # dry_run=True | 385 # dry_run=True |
| 365 # ) | 386 # ) |
| 366 | 387 |
| 367 | 388 |
| 368 # cmd = "cut -f 1,2 "+ngd_out_f | 389 # cmd = "cut -f 1,2 "+ngd_out_f |
| 369 | 390 |
| 388 ncbi.update_taxonomy_database() | 409 ncbi.update_taxonomy_database() |
| 389 except: | 410 except: |
| 390 warnings.warn(prog_tag+"[SQLite Integrity error/warning] due to redundant IDs") | 411 warnings.warn(prog_tag+"[SQLite Integrity error/warning] due to redundant IDs") |
| 391 | 412 |
| 392 for taxid_u in taxidlist: | 413 for taxid_u in taxidlist: |
| 393 cmd = f"ncbi-genome-download -s genbank --taxids {taxid_u} --assembly-level complete,chromosome --dry-run vertebrate_other,vertebrate_mammalian,plant,invertebrate 2>&1" | 414 print(f"{prog_tag} treating global taxid:{taxid_u}") |
| 415 cmd = f"ncbi-genome-download -s {ncbigenomedownload_section} --taxids {taxid_u} --assembly-levels {assembly_levels} --dry-run {organisms_to_search_in} 2>&1" | |
| 394 for line in os.popen(cmd).readlines(): | 416 for line in os.popen(cmd).readlines(): |
| 417 if b_verbose: | |
| 418 print(f"{prog_tag} cmd:{cmd} ran, read output") | |
| 395 # ERROR: No downloads matched your filter. Please check your options. | 419 # ERROR: No downloads matched your filter. Please check your options. |
| 396 if re.match("^(?:ERROR|Error): No downloads", line): | 420 if re.match("^(?:ERROR|Error): No downloads", line): |
| 397 # get complete lineage: accept ONLY leave taxid? (species) | 421 # get complete lineage: accept ONLY leave taxid? (species) |
| 398 lineage = ncbi.get_lineage(int(taxid_u)) | 422 lineage = ncbi.get_lineage(int(taxid_u)) |
| 399 name = ncbi.translate_to_names(lineage) | 423 name = ncbi.translate_to_names(lineage) |
| 400 if b_verbose: | 424 if b_verbose: |
| 401 print(f"taxid:{taxid_u}\tlineage:{lineage}\tname:{name}") | 425 print(f"taxid:{taxid_u}\tlineage:{lineage}\tname:{name}") |
| 402 | 426 |
| 403 # same search but going upper in taxonomy, finding leaves taxid to find new closeley related complete genome | 427 # same search but going upper in taxonomy, finding leaves taxid to find new closeley related complete genome |
| 404 # print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {str(sys._getframe().f_lineno)}") | 428 # print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {lineno()}") |
| 405 new_acc_nr = ngd_upper_lineage(curr_index_in_lineage, | 429 new_acc_nr = ngd_upper_lineage(curr_index_in_lineage, |
| 406 lineage, | 430 lineage, |
| 407 ncbi, | 431 ncbi, |
| 408 accnrlisttmp, # current working list | 432 accnrlisttmp, # current working list |
| 409 accnrlist, # final list, if something added (or min index reached), recursivity stop | 433 accnrlist, # final list, if something added (or min index reached), recursivity stop |
| 410 speciestmp, | 434 speciestmp, |
| 411 nametmp | 435 nametmp |
| 412 ) | 436 ) |
| 413 accnrlist.append( new_acc_nr ) | 437 if new_acc_nr is None: |
| 438 print(f"No acc_nr found after going up in taxonomy, line {lineno()}") | |
| 439 else: | |
| 440 accnrlist.append( new_acc_nr ) | |
| 441 # print(f"last item added to accnrlist:{accnrlist[-1]}, line {lineno()}") | |
| 414 | 442 |
| 415 # initialize for next search | 443 # initialize for next search |
| 416 accnrlisttmp = [] | 444 accnrlisttmp = [] |
| 417 speciestmp = [] | 445 speciestmp = [] |
| 418 nametmp = [] | 446 nametmp = [] |
| 419 | 447 |
| 420 elif not re.match("^Considering", line): | 448 elif not re.match("^Considering", line): |
| 421 # print(f"line:{line.rstrip()}") | 449 # print(f"line:{line.rstrip()}") |
| 422 acc_nr, species, name = line.rstrip().split("\t") | 450 acc_nr, species, name = line.rstrip().split("\t") |
| 423 accnrlisttmp.append(acc_nr) | 451 accnrlisttmp.append(acc_nr) |
| 452 # print(f"last item added to accnrlist:{accnrlist[-1]}, line {lineno()}") | |
| 453 | |
| 424 speciestmp.append(species) | 454 speciestmp.append(species) |
| 425 nametmp.append(name) | 455 nametmp.append(name) |
| 426 if b_verbose: | 456 if b_verbose: |
| 427 print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr} (name:{name})") | 457 print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr} (name:{name})") |
| 428 | 458 |
| 429 # retain only the most recent complete genome for current treated taxid | 459 # retain only the most recent complete genome for current treated taxid |
| 430 if len(accnrlisttmp): | 460 if len(accnrlisttmp): |
| 431 accnrlist.append( retain_1accnr(accnrlisttmp, speciestmp, nametmp) ) | 461 accnrlist.append( retain_1accnr(accnrlisttmp, speciestmp, nametmp) ) |
| 462 # print(f"last item added to accnrlist:{accnrlist[-1]}, line {lineno()}") | |
| 432 | 463 |
| 433 # remove redundant accnr | 464 # remove redundant accnr |
| 465 print(f"accnrlist to sort:{accnrlist}") | |
| 434 accnrlist = list(sort_uniq(accnrlist)) | 466 accnrlist = list(sort_uniq(accnrlist)) |
| 435 with open(acc_out_f, "w") as record_file: | 467 with open(acc_out_f, "w") as record_file: |
| 436 for accnr in accnrlist: | 468 for accnr in accnrlist: |
| 437 record_file.write("%s\n" % (accnr)) | 469 record_file.write("%s\n" % (accnr)) |
| 438 # ------------------------------------------ | 470 # ------------------------------------------ |
| 440 print(f"{prog_tag} {acc_out_f} file created") | 472 print(f"{prog_tag} {acc_out_f} file created") |
| 441 | 473 |
| 442 # -------------------------------------------------------------------------- | 474 # -------------------------------------------------------------------------- |
| 443 # test | 475 # test |
| 444 if b_test_add_host_chr_taxids_accnr_from_ori_list: | 476 if b_test_add_host_chr_taxids_accnr_from_ori_list: |
| 445 taxid_acc_tabular_f = 'megablast_out_f_taxid_acc_host.tsv' | 477 taxid_acc_in_f = test_dir + 'megablast_out_f_taxid_acc_host.tsv' |
| 446 taxid_acc_in_f = 'megablast_out_f_taxid_acc_host.tsv' | 478 acc_out_f = test_dir + 'megablast_out_f_taxid_acc_hostexpanded.tsv' |
| 447 acc_out_f = 'megablast_out_f_taxid_acc_hostexpanded.tsv' | |
| 448 print(f"{prog_tag} START b_test_add_host_chr_taxids_accnr_from_ori_list") | 479 print(f"{prog_tag} START b_test_add_host_chr_taxids_accnr_from_ori_list") |
| 449 print(f"{prog_tag} loading {taxid_acc_tabular_f} file") | 480 print(f"{prog_tag} loading {taxid_acc_in_f} file") |
| 450 taxidlist = load_taxids(taxid_acc_in_f) | 481 load_taxids(taxid_acc_in_f) |
| 482 for i in range(len(taxidlist)): | |
| 483 print(f"{taxidlist[i]}\t{accnrlist[i]}") | |
| 451 print(f"{prog_tag} end loading") | 484 print(f"{prog_tag} end loading") |
| 452 | 485 |
| 453 add_host_chr_taxids_accnr_from_ori_list(taxidlist, | 486 add_host_chr_taxids_accnr_from_ori_list(taxidlist, |
| 454 accnrlist, | 487 accnrlist, |
| 455 acc_out_f) | 488 acc_out_f) |
| 461 # MAIN | 494 # MAIN |
| 462 # -------------------------------------------------------------------------- | 495 # -------------------------------------------------------------------------- |
| 463 ##### MAIN | 496 ##### MAIN |
| 464 def __main__(): | 497 def __main__(): |
| 465 # load taxid_acc file | 498 # load taxid_acc file |
| 466 load_taxids(taxid_acc_in_f) | 499 load_taxids(taxid_acc_tabular_f) |
| 467 # check in ncbi taxonomy which acc number are in and out of given taxid | 500 # check in ncbi taxonomy which acc number are in and out of given taxid |
| 468 add_host_chr_taxids_accnr_from_ori_list(taxidlist, | 501 add_host_chr_taxids_accnr_from_ori_list(taxidlist, |
| 469 accnrlist, | 502 accnrlist, |
| 470 acc_out_f) | 503 acc_out_f) |
| 471 # -------------------------------------------------------------------------- | 504 # -------------------------------------------------------------------------- |
