| 0 | 1 #!/usr/bin/env python3 | 
|  | 2 # -*- coding: utf-8 -*- | 
|  | 3 ### | 
|  | 4 # USE PYTHON3 | 
|  | 5 # From a file of taxid and accession numbers (tsv), deduce species taxids, get ref genome acc nr list (all chr). (it will allow to have complete genomes when aligning with host to remove host reads) | 
|  | 6 # provide 2 files: | 
|  | 7 # - file with all acc numbers that are included in taxid(s) provided by user (extended to genus level) | 
|  | 8 # - file with all acc numbers that are excluded in taxid(s) provided by user (extended to genus level) | 
|  | 9 ### | 
|  | 10 | 
|  | 11 ### Libraries to import: | 
|  | 12 import argparse, os, sys, csv, warnings, re, itertools, operator | 
|  | 13 from os import path | 
|  | 14 import ncbi_genome_download as ngd | 
|  | 15 # to find all lineage and in case of no complete genome, the deduction of closests complete genomes (same genus, order...) | 
|  | 16 from ete3 import NCBITaxa | 
|  | 17 | 
|  | 18 # to be able to report line number in error messages | 
|  | 19 import inspect | 
|  | 20 frame = inspect.currentframe() | 
|  | 21 | 
|  | 22 # debug | 
|  | 23 b_test_load_taxids = False                            # ok 2023 03 07 | 
|  | 24 b_test_add_host_chr_taxids_accnr_from_ori_list = False # ok 2023 03 07 | 
|  | 25 | 
|  | 26 prog_tag = '[' + os.path.basename(__file__) + ']' | 
|  | 27 | 
|  | 28 # boolean to know if we dowload ncbi taxonomy file in current env | 
|  | 29 b_load_ncbi_tax_f = False | 
|  | 30 | 
|  | 31 # list of interesting taxid (fathers) | 
|  | 32 taxidlist_f = '' | 
|  | 33 taxidlist = [] | 
|  | 34 accnrlist = [] | 
|  | 35 | 
|  | 36 # order = -4 | 
|  | 37 # family or clade = -3 | 
|  | 38 # subtribe or genus = -2 | 
|  | 39 curr_index_in_lineage = -2 | 
|  | 40 min_index_in_lineage = -4 | 
|  | 41 | 
|  | 42 # boolean to know if we download ncbi taxonomy file in current env | 
|  | 43 b_load_ncbi_tax_f = False | 
|  | 44 b_test_all        = False | 
|  | 45 b_test            = False | 
|  | 46 b_acc_in_f        = False | 
|  | 47 b_acc_out_f       = False | 
|  | 48 | 
|  | 49 b_verbose         = False | 
|  | 50 | 
|  | 51 # rank = '' # rank level retained by user | 
|  | 52 # rank_num = index of rank retained by user | 
|  | 53 | 
|  | 54 # # set to check that provided rank exist to be sure to be able to use it | 
|  | 55 # ranks = { | 
|  | 56 #     'superkingdom' => 0, | 
|  | 57 #     # 'clade', # no, several clade, name is too generic | 
|  | 58 #     'kingdom'      => 1, | 
|  | 59 #     'phylum'       => 2, | 
|  | 60 #     'subphylum'    => 3, | 
|  | 61 #     'superclass'   => 4, | 
|  | 62 #     'class'        => 5, | 
|  | 63 #     'superorder'   => 6, | 
|  | 64 #     'order'        => 7, | 
|  | 65 #     'suborder'     => 8, | 
|  | 66 #     'infraorder'   => 9, | 
|  | 67 #     'parvorder'    => 10, | 
|  | 68 #     'superfamily'  => 11, | 
|  | 69 #     'family'       => 12, | 
|  | 70 #     'subfamily'    => 13, | 
|  | 71 #     'genus'        => 14, | 
|  | 72 #     'species'      => 15, | 
|  | 73 #     'subspecies'   => 16 | 
|  | 74 #     } | 
|  | 75 | 
|  | 76 parser = argparse.ArgumentParser() | 
|  | 77 parser.add_argument("-i", "--taxid_acc_in_f", dest='taxid_acc_in_f', | 
|  | 78                     help="taxid acc_number list in tsv (tabular separated at each line)", | 
|  | 79                     metavar="FILE") | 
|  | 80 parser.add_argument("-o", "--acc_out_f", dest='acc_out_f', | 
|  | 81                     help="[optional if --taxid_acc_in_f provided] Output text file with accession numbers from krona_taxid_acc_f NOT found under taxid in ncbi taxonomy tree", | 
|  | 82                     metavar="FILE") | 
|  | 83 # parser.add_argument("-r", "--rank", dest='rank', | 
|  | 84 #                     help="[Optional] default: genus, rank to retain for each acc number provided. We will retain all the acc number descendant from this 'rank' (genus) taxid list", | 
|  | 85 #                     action="store_const") | 
|  | 86 parser.add_argument("-n", "--ncbi_tax_f", dest='ncbi_tax_f', | 
|  | 87                     help="[Optional] ncbi tabular file with taxonomy organized to represent a tree", | 
|  | 88                     metavar="FILE") | 
|  | 89 parser.add_argument("-l", "--load_ncbi_tax_f", dest='b_load_ncbi_tax_f', | 
|  | 90                     help="[Optional] load ncbi tabular file with taxonomy organized to represent a tree in current env at default location (~/.etetoolkit/taxa.sqlite). Only needed for first run", | 
|  | 91                     action='store_true') | 
|  | 92 parser.add_argument("-z", "--test_all", dest='b_test_all', | 
|  | 93                     help="[Optional] run all tests. Additionally, with --load_ncbi_tax_f, allow to download ncbi ete3 tax db the first time you use the script", | 
|  | 94                     action='store_true') | 
|  | 95 parser.add_argument("-v", "--verbose", dest='b_verbose', | 
|  | 96                     help="[Optional] To have details on records when running", | 
|  | 97                     action='store_true') | 
|  | 98 parser.set_defaults(b_load_ncbi_tax_f=False) | 
|  | 99 parser.set_defaults(b_test_all=False) | 
|  | 100 parser.set_defaults(b_verbose=False) | 
|  | 101 | 
|  | 102 # get absolute path in case of files | 
|  | 103 args = parser.parse_args() | 
|  | 104 | 
|  | 105 # ------------------------------------------- | 
|  | 106 # check arguments | 
|  | 107 b_test_all = args.b_test_all | 
|  | 108 | 
|  | 109 if b_test_all: | 
|  | 110     b_test_load_taxids = False | 
|  | 111     b_test_add_host_chr_taxids_accnr_from_ori_list = True | 
|  | 112     b_test = True | 
|  | 113     b_acc_in_f = True | 
|  | 114     b_acc_out_f = True | 
|  | 115 else: | 
|  | 116     b_test = (b_test_load_taxids or | 
|  | 117               b_test_add_host_chr_taxids_accnr_from_ori_list) | 
|  | 118 | 
|  | 119 if ((not b_test)and | 
|  | 120     ((len(sys.argv) < 1) or (len(sys.argv) > 5))): | 
|  | 121     print("\n".join(["To use this scripts, run:", | 
|  | 122                                 "conda activate TAXID_genusexpand_taxid2acc", | 
|  | 123                                 "./TAXID_genusexpand_taxid2acc.py --test_all --load_ncbi_tax_f", | 
|  | 124                                 " ", | 
|  | 125                                 "Then you won't need --test_all --load_ncbi_tax_f options\n\n", | 
|  | 126                                 "Then, as an example:\n\n", | 
|  | 127                      ' '.join(['./TAXID_genusexpand_taxid2acc.py', | 
|  | 128                                                   '-i taxid_accnr_list.tsv', | 
|  | 129                                                   '-o accnr_out_list.txt']),"\n\n" ])) | 
|  | 130     parser.print_help() | 
|  | 131     print(prog_tag + "[Error] we found "+str(len(sys.argv)) + | 
|  | 132           " arguments, exit line "+str(frame.f_lineno)) | 
|  | 133     sys.exit(0) | 
|  | 134 | 
|  | 135 # print('args:', args) | 
|  | 136 # if(not b_test): | 
|  | 137 if args.ncbi_tax_f is not None: | 
|  | 138     ncbi_tax_f = os.path.abspath(args.ncbi_tax_f) | 
|  | 139 else: | 
|  | 140     # ncbi_tax_f = "/nfs/data/db/ete3/taxa.sqlite" | 
|  | 141     ncbi_tax_f = os.path.expanduser("~/.etetoolkit/taxa.sqlite") | 
|  | 142 if args.taxid_acc_in_f is not None: | 
|  | 143     taxid_acc_in_f = os.path.abspath(args.taxid_acc_f) | 
|  | 144     b_acc_in_f = True | 
|  | 145 elif(not b_test): | 
|  | 146     sys.exit("[Error] You must provide taxid_acc_in_f") | 
|  | 147 if args.acc_out_f is not None: | 
|  | 148     acc_out_f = os.path.abspath(args.acc_out_f) | 
|  | 149     b_acc_out_f = True | 
|  | 150 elif(not b_test): | 
|  | 151     sys.exit("-acc_out_f <accnr_file>n must be provided\n") | 
|  | 152 # if args.rank is not None: | 
|  | 153 #     rank = 'genus' | 
|  | 154 # else: | 
|  | 155 #     rank = args.rank | 
|  | 156 | 
|  | 157 if args.b_verbose is not None: | 
|  | 158     b_verbose = int(args.b_verbose) | 
|  | 159 | 
|  | 160 if(not b_test): | 
|  | 161     if (not b_acc_in_f) and (not b_acc_out_f): | 
|  | 162         sys.exit(prog_tag + "[Error] You must provide either --acc_f <file> and -acc_out_f <file>") | 
|  | 163 | 
|  | 164 # # store index of the rank expected by user | 
|  | 165 # rank_num = ranks{ rank } | 
|  | 166 | 
|  | 167 # -------------------------------------------------------------------------- | 
|  | 168 # to sort uniq, for a list, only need to add list conversion | 
|  | 169 # -------------------------------------------------------------------------- | 
|  | 170 mapper= map # Python ≥ 3 | 
|  | 171 def sort_uniq(sequence): | 
|  | 172     return mapper( | 
|  | 173         operator.itemgetter(0), | 
|  | 174         itertools.groupby(sorted(sequence))) | 
|  | 175 # -------------------------------------------------------------------------- | 
|  | 176 | 
|  | 177 # -------------------------------------------------------------------------- | 
|  | 178 # load taxid acc list, return taxidlist | 
|  | 179 # -------------------------------------------------------------------------- | 
|  | 180 def load_taxids(taxid_acc_tabular_f): | 
|  | 181 | 
|  | 182     if not path.exists(taxid_acc_tabular_f): | 
|  | 183         sys.exit("Error " + taxid_acc_tabular_f + | 
|  | 184                  " file does not exist, line "+ str(sys._getframe().f_lineno) ) | 
|  | 185 | 
|  | 186     cmd = "cut -f 1,2 "+taxid_acc_tabular_f+" | sort | uniq " | 
|  | 187     # cmd = "cut -f 1 "+taxid_acc_tabular_f+" | sort | uniq " | 
|  | 188 | 
|  | 189     for line in os.popen(cmd).readlines(): | 
|  | 190         k, v = line.rstrip().split() | 
|  | 191         taxidlist.append(k) | 
|  | 192         accnrlist.append(v) | 
|  | 193 | 
|  | 194     return taxidlist | 
|  | 195 # -------------------------------------------------------------------------- | 
|  | 196 | 
|  | 197 # test load_taxids function | 
|  | 198 # display taxidlist, then exit | 
|  | 199 if b_test_load_taxids: | 
|  | 200     taxid_acc_tabular_f = 'megablast_out_f_taxid_acc_host.tsv' | 
|  | 201     print("START b_test_load_taxids") | 
|  | 202     print("loading "+taxid_acc_tabular_f+" file") | 
|  | 203     taxidlist = load_taxids(taxid_acc_tabular_f) | 
|  | 204     for i in range(len(taxidlist)): | 
|  | 205         print(f"{taxidlist[i]}\t{accnrlist[i]}") | 
|  | 206     print("END b_test_load_taxids") | 
|  | 207     if not b_test_add_host_chr_taxids_accnr_from_ori_list: | 
|  | 208         sys.exit() | 
|  | 209 # -------------------------------------------------------------------------- | 
|  | 210 | 
|  | 211 # # -------------------------------------------------------------------------- | 
|  | 212 # # needs internet connexion, not possible | 
|  | 213 # # -------------------------------------------------------------------------- | 
|  | 214 # def get_leave_taxid_from_acc_nr(accnrlist): | 
|  | 215 | 
|  | 216 #     # deduce a list of taxid from a list of accession numbers | 
|  | 217 #     cmd = "cat megablast_out_f_acc_out_taxid.tsv | epost -db nuccore | esummary | xtract -pattern DocumentSummary -element TaxId | sort -u" | 
|  | 218 #     for line in os.popen(cmd).readlines(): | 
|  | 219 #         taxidlist.append(line.rstrip()) | 
|  | 220 | 
|  | 221 #     return taxidlist | 
|  | 222 # # -------------------------------------------------------------------------- | 
|  | 223 | 
|  | 224 # -------------------------------------------------------------------------- | 
|  | 225 # function to retain the most recent acc nr for host complete genome found: | 
|  | 226 # - return acc nr of most recent complete genome | 
|  | 227 # - print accnr species and name retained | 
|  | 228 # - reinitiate tmp lists of accnrlisttmp speciestmp and nametmp | 
|  | 229 # -------------------------------------------------------------------------- | 
|  | 230 def retain_1accnr(accnrlisttmp, speciestmp, nametmp): | 
|  | 231 | 
|  | 232     max_accnr_version  = 0 | 
|  | 233     curr_accnr_version = 0 | 
|  | 234     max_accnr_nr       = 0 | 
|  | 235     curr_accnr_nr      = 0 | 
|  | 236     kept_accnr_i       = 0 | 
|  | 237     p = re.compile(".*?(\d+)\.(\d+)$") | 
|  | 238 | 
|  | 239     # print(f"{prog_tag} retain_1accnr({accnrlisttmp}, {speciestmp}, {nametmp}") | 
|  | 240 | 
|  | 241     for i in range(len(accnrlisttmp)): | 
|  | 242         m = p.match( accnrlisttmp[i] ) | 
|  | 243         if m: | 
|  | 244             # print('Match found: ', m.group(2)) | 
|  | 245             curr_accnr_version = int(m.group(2)) | 
|  | 246             accnr_nr = int(m.group(1)) | 
|  | 247             if curr_accnr_version > max_accnr_version: | 
|  | 248                 max_accnr_version = curr_accnr_version | 
|  | 249                 kept_accnr_i = i | 
|  | 250                 # print(f"record kept_accnr_i:{kept_accnr_i}") | 
|  | 251             elif  curr_accnr_nr > max_accnr_nr: | 
|  | 252                 max_accnr_nr = curr_accnr_nr | 
|  | 253                 kept_accnr_i = i | 
|  | 254                 # print(f"record kept_accnr_i:{kept_accnr_i}") | 
|  | 255 | 
|  | 256         else: | 
|  | 257             sys.exit(f"{prog_tag} No version found for accnr:{accnrlisttmp[i]}") | 
|  | 258 | 
|  | 259     print(f"retained accnr:{accnrlisttmp[kept_accnr_i]}\tspecies:{speciestmp[kept_accnr_i]}\tname:{nametmp[kept_accnr_i]}") | 
|  | 260     kept_accn = accnrlisttmp[kept_accnr_i] | 
|  | 261 | 
|  | 262     return kept_accn | 
|  | 263 # -------------------------------------------------------------------------- | 
|  | 264 | 
|  | 265 # -------------------------------------------------------------------------- | 
|  | 266 # procedure to find complete genome closely related to current taxid | 
|  | 267 # goes upper in taxonomy if nothing found until order | 
|  | 268 # -------------------------------------------------------------------------- | 
|  | 269 def ngd_upper_lineage(curr_index_in_lineage, | 
|  | 270                       lineage, | 
|  | 271                       ncbi, | 
|  | 272                       accnrlisttmp, # current working list | 
|  | 273                       accnrlist,    # final list, if something added (or min index reached), recursivity stop | 
|  | 274                       speciestmp, | 
|  | 275                       nametmp | 
|  | 276                       ): | 
|  | 277     print(f"{prog_tag} ngd_upper_lineage with curr_index_in_lineage:{curr_index_in_lineage}") | 
|  | 278 | 
|  | 279     # deduce up rank, search complet genome/chr in | 
|  | 280     upper_taxid=str(lineage[curr_index_in_lineage]) # order when last is species | 
|  | 281     rank = ncbi.get_rank([lineage[curr_index_in_lineage]]) | 
|  | 282     name = ncbi.get_taxid_translator([lineage[curr_index_in_lineage]]) | 
|  | 283     print(f"{prog_tag} test with taxid:{upper_taxid} corresponding to rank:{rank}") | 
|  | 284     leaves_taxids = ncbi.get_descendant_taxa(upper_taxid, | 
|  | 285                                              intermediate_nodes=False, | 
|  | 286                                              collapse_subspecies=False, | 
|  | 287                                              return_tree=False | 
|  | 288                                              ) | 
|  | 289     # int conversion to strings | 
|  | 290     leaves_taxids = list(map(str, leaves_taxids)) | 
|  | 291     leaves_taxids_list = ','.join(leaves_taxids) | 
|  | 292     cmd = f"ncbi-genome-download -s genbank --taxids {leaves_taxids_list} --assembly-level complete,chromosome --dry-run vertebrate_other,vertebrate_mammalian,plant,invertebrate 2>&1" | 
|  | 293     # print(f"{prog_tag} cmd:{cmd}") | 
|  | 294 | 
|  | 295     # specific to retain_1accn to avoid lists are crashed by other ngd call | 
|  | 296     accnrlisttmp_r = [] | 
|  | 297     speciestmp_r   = [] | 
|  | 298     nametmp_r      = [] | 
|  | 299     for line in os.popen(cmd).readlines(): | 
|  | 300 | 
|  | 301         if not re.match("^Considering", line): | 
|  | 302             # print(f"line:{line.rstrip()}") | 
|  | 303             if re.match("^(?:ERROR|Error): No downloads", line): | 
|  | 304                 print(f"{prog_tag} No chr/complete genome for taxid:{upper_taxid} rank:{rank} (expanding name:{name})") | 
|  | 305                 if curr_index_in_lineage > min_index_in_lineage: # need to go on with upper lineage if not last accepted | 
|  | 306                     curr_index_in_lineage = curr_index_in_lineage - 1 | 
|  | 307                     print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {str(sys._getframe().f_lineno)}") | 
|  | 308                     return ngd_upper_lineage(curr_index_in_lineage, | 
|  | 309                                              lineage, | 
|  | 310                                              ncbi, | 
|  | 311                                              accnrlisttmp, # current working list | 
|  | 312                                              accnrlist,    # final list, if something added (or min index reached), recursivity stop | 
|  | 313                                              speciestmp, | 
|  | 314                                              nametmp | 
|  | 315                                              ) | 
|  | 316             else: | 
|  | 317                 acc_nr, species, name = line.rstrip().split("\t") | 
|  | 318                 accnrlisttmp_r.append(acc_nr) | 
|  | 319                 speciestmp_r.append(species) | 
|  | 320                 nametmp_r.append(name) | 
|  | 321                 if b_verbose: | 
|  | 322                     print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr} (name:{name})") | 
|  | 323 | 
|  | 324                 # retain only the most recent complete genome for current treated taxid | 
|  | 325                 return retain_1accnr(accnrlisttmp_r, speciestmp_r, nametmp_r) | 
|  | 326 | 
|  | 327         # else: | 
|  | 328         #     print(f"line matching Considering:{line}") | 
|  | 329 # -------------------------------------------------------------------------- | 
|  | 330 | 
|  | 331 # -------------------------------------------------------------------------- | 
|  | 332 # read taxids, deduce complete genomes available in genblank, provides in output file | 
|  | 333 # the acc number in addition  to those already listed | 
|  | 334 # -------------------------------------------------------------------------- | 
|  | 335 def add_host_chr_taxids_accnr_from_ori_list(taxidlist, | 
|  | 336                                             accnrlist, | 
|  | 337                                             acc_out_f): | 
|  | 338 | 
|  | 339     # store all accnr found for complete genome of current taxid (or from same family/order) | 
|  | 340     # the aim is to keep only the most recent/complete | 
|  | 341     accnrlisttmp = [] | 
|  | 342     speciestmp   = [] | 
|  | 343     nametmp      = [] | 
|  | 344 | 
|  | 345     # get host complete genome when found using ncbi_genome_download | 
|  | 346     taxids_list=','.join(taxidlist) | 
|  | 347 | 
|  | 348     # # ------------------------------------------ | 
|  | 349     # # ncbi-genome-download as a library | 
|  | 350     # ngd_out_f= os.getcwd()+'/accnr_sp_accnr.tsv' | 
|  | 351     # ngd.download(section='genbank', | 
|  | 352     #              taxids=taxids_list, | 
|  | 353     #              assembly_levels='chromosome', | 
|  | 354     #              flat_output=True, | 
|  | 355     #              output=ngd_out_f, | 
|  | 356     #              groups='vertebrate_other,vertebrate_mammalian,plant,invertebrate', | 
|  | 357     #              dry_run=True | 
|  | 358     #              ) | 
|  | 359 | 
|  | 360 | 
|  | 361     # cmd = "cut -f 1,2 "+ngd_out_f | 
|  | 362 | 
|  | 363     # for line in os.popen(cmd).readlines(): | 
|  | 364     #     acc_nr, species = line.rstrip().split() | 
|  | 365     #     accnrlist.append(acc_nr) | 
|  | 366 | 
|  | 367     #     if b_verbose: | 
|  | 368     #         print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr}") | 
|  | 369     # # ------------------------------------------ | 
|  | 370 | 
|  | 371     # ------------------------------------------ | 
|  | 372     # ncbi-genome-download as executable script | 
|  | 373     # ------------------------------------------ | 
|  | 374 | 
|  | 375     # load NCBITaxa | 
|  | 376     ncbi = NCBITaxa()   # Install ete3 db in local user file (.ete_toolkit/ directory) | 
|  | 377     print(prog_tag + " Try to load ncbi tax db file:"+ncbi_tax_f) | 
|  | 378     ncbi = NCBITaxa(dbfile=ncbi_tax_f) | 
|  | 379     if (not os.path.isfile(ncbi_tax_f)) or b_load_ncbi_tax_f: | 
|  | 380         try: | 
|  | 381             ncbi.update_taxonomy_database() | 
|  | 382         except: | 
|  | 383             warnings.warn(prog_tag+"[SQLite Integrity error/warning] due to redundant IDs") | 
|  | 384 | 
|  | 385     for taxid_u in taxidlist: | 
|  | 386         cmd = f"ncbi-genome-download -s genbank --taxids {taxid_u} --assembly-level complete,chromosome --dry-run vertebrate_other,vertebrate_mammalian,plant,invertebrate 2>&1" | 
|  | 387         for line in os.popen(cmd).readlines(): | 
|  | 388             # ERROR: No downloads matched your filter. Please check your options. | 
|  | 389             if re.match("^(?:ERROR|Error): No downloads", line): | 
|  | 390                 # get complete lineage: accept ONLY leave taxid? (species) | 
|  | 391                 lineage = ncbi.get_lineage(int(taxid_u)) | 
|  | 392                 name = ncbi.translate_to_names(lineage) | 
|  | 393                 if b_verbose: | 
|  | 394                     print(f"taxid:{taxid_u}\tlineage:{lineage}\tname:{name}") | 
|  | 395 | 
|  | 396                 # same search but going upper in taxonomy, finding leaves taxid to find new closeley related complete genome | 
|  | 397                 # print(f"{prog_tag} ngd_upper_lineage call {curr_index_in_lineage} line {str(sys._getframe().f_lineno)}") | 
|  | 398                 new_acc_nr = ngd_upper_lineage(curr_index_in_lineage, | 
|  | 399                                                lineage, | 
|  | 400                                                ncbi, | 
|  | 401                                                accnrlisttmp, # current working list | 
|  | 402                                                accnrlist, # final list, if something added (or min index reached), recursivity stop | 
|  | 403                                                speciestmp, | 
|  | 404                                                nametmp | 
|  | 405                                                ) | 
|  | 406                 accnrlist.append( new_acc_nr ) | 
|  | 407 | 
|  | 408                 # initialize for next search | 
|  | 409                 accnrlisttmp = [] | 
|  | 410                 speciestmp   = [] | 
|  | 411                 nametmp      = [] | 
|  | 412 | 
|  | 413             elif not re.match("^Considering", line): | 
|  | 414                 # print(f"line:{line.rstrip()}") | 
|  | 415                 acc_nr, species, name = line.rstrip().split("\t") | 
|  | 416                 accnrlisttmp.append(acc_nr) | 
|  | 417                 speciestmp.append(species) | 
|  | 418                 nametmp.append(name) | 
|  | 419                 if b_verbose: | 
|  | 420                     print(f"{prog_tag} we found for {species} chr fasta for host genome with accnr {acc_nr} (name:{name})") | 
|  | 421 | 
|  | 422         # retain only the most recent complete genome for current treated taxid | 
|  | 423         if len(accnrlisttmp): | 
|  | 424             accnrlist.append( retain_1accnr(accnrlisttmp, speciestmp, nametmp) ) | 
|  | 425 | 
|  | 426     # remove redundant accnr | 
|  | 427     accnrlist = list(sort_uniq(accnrlist)) | 
|  | 428     with open(acc_out_f, "w") as record_file: | 
|  | 429         for accnr in accnrlist: | 
|  | 430             record_file.write("%s\n" % (accnr)) | 
|  | 431     # ------------------------------------------ | 
|  | 432 | 
|  | 433     print(f"{prog_tag} {acc_out_f} file created") | 
|  | 434 | 
|  | 435 # -------------------------------------------------------------------------- | 
|  | 436 # test | 
|  | 437 if b_test_add_host_chr_taxids_accnr_from_ori_list: | 
|  | 438    taxid_acc_tabular_f = 'megablast_out_f_taxid_acc_host.tsv' | 
|  | 439    taxid_acc_in_f = 'megablast_out_f_taxid_acc_host.tsv' | 
|  | 440    acc_out_f = 'megablast_out_f_taxid_acc_hostexpanded.tsv' | 
|  | 441    print(f"{prog_tag} START b_test_add_host_chr_taxids_accnr_from_ori_list") | 
|  | 442    print(f"{prog_tag} loading {taxid_acc_tabular_f} file") | 
|  | 443    taxidlist = load_taxids(taxid_acc_in_f) | 
|  | 444    print(f"{prog_tag} end loading") | 
|  | 445 | 
|  | 446    add_host_chr_taxids_accnr_from_ori_list(taxidlist, | 
|  | 447                                            accnrlist, | 
|  | 448                                            acc_out_f) | 
|  | 449    print(f"{prog_tag} END b_test_add_host_chr_taxids_accnr_from_ori_list") | 
|  | 450    sys.exit() | 
|  | 451 # -------------------------------------------------------------------------- | 
|  | 452 | 
|  | 453 # -------------------------------------------------------------------------- | 
|  | 454 # MAIN | 
|  | 455 # -------------------------------------------------------------------------- | 
|  | 456 ##### MAIN | 
|  | 457 def __main__(): | 
|  | 458     # load taxid_acc file | 
|  | 459     load_taxids(taxid_acc_tabular_f) | 
|  | 460     # check in ncbi taxonomy which acc number are in and out of given taxid | 
|  | 461     add_host_chr_taxids_accnr_from_ori_list(taxidlist, | 
|  | 462                                             accnrlist, | 
|  | 463                                             acc_out_f) | 
|  | 464     # -------------------------------------------------------------------------- | 
|  | 465 #### MAIN END | 
|  | 466 if __name__ == "__main__": __main__() | 
|  | 467 |