changeset 4:826ddf832bef draft default tip

"planemo upload for repository https://github.com/dfornika/galaxy/tree/master/tools/match_plasmid_to_reference commit dcdac86bce5c44043516fbd472ab7c19d7bf4d50-dirty"
author dfornika
date Wed, 06 Nov 2019 13:52:40 -0500
parents d56b4f743779
children
files match_plasmid_to_reference.py match_plasmid_to_reference.xml test-data/CP008719.fa test-data/JQ739157.fa test-data/outdir/reference_plasmid.fasta
diffstat 5 files changed, 291 insertions(+), 22 deletions(-) [+]
line wrap: on
line diff
--- a/match_plasmid_to_reference.py	Wed Nov 06 01:20:36 2019 -0500
+++ b/match_plasmid_to_reference.py	Wed Nov 06 13:52:40 2019 -0500
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-from __future__ import print_function
+from __future__ import print_function, division
 
 import argparse
 import csv
@@ -56,34 +56,53 @@
             mob_typer_report.append(row)
     return mob_typer_report
 
-def parse_genbank_accession(genbank_file_path):
-    with open(genbank_file_path, 'r') as f:
+def parse_genbank_accession(genbank_path):
+    with open(genbank_path, 'r') as f:
         while True:
             line = f.readline()
-            # break while statement if it is not a comment line
-            # i.e. does not startwith #
             if line.startswith('ACCESSION'):
                 return line.strip().split()[1]
 
+def parse_fasta_accession(fasta_path):
+    """Return the ID of the first FASTA header (text after '>', up to whitespace), or '' if none."""
+    with open(fasta_path, 'r') as f:
+        # Stop at EOF so that a file without any header line cannot loop forever.
+        header = next((line for line in f if line.startswith('>')), '>')
+    return header.strip().split()[0][1:]
 
-def count_contigs(plasmid_fasta_path):
+def count_fasta_contigs(fasta_path):
     contigs = 0
-    with open(plasmid_fasta_path, 'r') as f:
+    with open(fasta_path, 'r') as f:
         for line in f:
             if line.startswith('>'):
                 contigs += 1
     return contigs
 
-def count_bases(plasmid_fasta_path):
+def count_fasta_bases(fasta_path):
     bases = 0
-    with open(plasmid_fasta_path, 'r') as f:
+    with open(fasta_path, 'r') as f:
         for line in f:
             line = line.strip()
             if not line.startswith('>'):
                 bases += len(line)
     return bases
 
+def compute_fasta_gc_percent(fasta_path):
+    """Return the G/C content of all sequence lines in a FASTA file as a percentage (0.0 if no bases)."""
+    gc_count = 0
+    total_bases_count = 0
+    with open(fasta_path, 'r') as f:
+        for line in f:
+            if not line.startswith('>'):
+                line = line.strip()
+                # Count both cases so that lowercase (soft-masked) bases are included.
+                gc_count += sum(line.count(base) for base in 'cCgG')
+                total_bases_count += len(line)
+    # An empty or header-only file has no bases; avoid a ZeroDivisionError.
+    return 100 * (gc_count / total_bases_count) if total_bases_count else 0.0
+
 def main(args):
+
     # create output directory
     try:
         os.mkdir(args.outdir)
@@ -95,18 +114,29 @@
 
     # parse mob_typer report
     mob_typer_report = parse_mob_typer_report(args.mob_typer_report)
-    num_plasmid_contigs = count_contigs(args.plasmid)
-    num_plasmid_bases = count_bases(args.plasmid)
-
+    num_plasmid_contigs = count_fasta_contigs(args.plasmid)
+    num_plasmid_bases = count_fasta_bases(args.plasmid)
+    plasmid_gc_percent = compute_fasta_gc_percent(args.plasmid)
+    
     with open(os.path.join(args.outdir, 'mob_typer_record.tsv'), 'w') as f:
         mob_typer_record_writer = csv.DictWriter(f, delimiter="\t", quotechar='"', fieldnames=MOB_TYPER_FIELDNAMES)
         mob_typer_record_writer.writeheader()
         for record in mob_typer_report:
-            if num_plasmid_contigs == int(record['num_contigs']) and num_plasmid_bases == int(record['total_length']):
-                for reference_plasmid in args.reference_plasmids:
+            # match the plasmid against three properties in the MOB-Typer report:
+            # 1. number of contigs
+            # 2. total length of all contigs
+            # 3. G/C percent (within +/-0.1%)
+            if num_plasmid_contigs == int(record['num_contigs']) and \
+               num_plasmid_bases == int(record['total_length']) and \
+               abs(plasmid_gc_percent - float(record['gc'])) < 0.1: 
+                for reference_plasmid in args.reference_plasmids_genbank:
                     if parse_genbank_accession(reference_plasmid) == record['mash_nearest_neighbor']:
                         shutil.copy2(reference_plasmid, os.path.join(args.outdir, "reference_plasmid.gbk"))
-                        mob_typer_record_writer.writerow(record)
+
+                for reference_plasmid in args.reference_plasmids_fasta:
+                    if re.match(record['mash_nearest_neighbor'], parse_fasta_accession(reference_plasmid)) is not None:
+                        shutil.copy2(reference_plasmid, os.path.join(args.outdir, "reference_plasmid.fasta"))
+                mob_typer_record_writer.writerow(record)
 
     shutil.copy2(args.plasmid, os.path.join(args.outdir, "plasmid.fasta"))
 
@@ -114,7 +144,8 @@
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument("--plasmid", help="plasmid assembly (fasta)")
-    parser.add_argument("--reference_plasmids", nargs='+', help="reference plasmids (genbank)")    
+    parser.add_argument("--reference_plasmids_genbank", nargs='+', help="reference plasmids (genbank)")
+    parser.add_argument("--reference_plasmids_fasta", nargs='+', help="reference plasmids (fasta)")
     parser.add_argument("--mob_typer_report", help="mob_typer reports (tsv)")
     parser.add_argument("--outdir", dest="outdir", default=".", help="Output directory")
     args = parser.parse_args()
--- a/match_plasmid_to_reference.xml	Wed Nov 06 01:20:36 2019 -0500
+++ b/match_plasmid_to_reference.xml	Wed Nov 06 13:52:40 2019 -0500
@@ -5,8 +5,12 @@
     <command detect_errors="exit_code"><![CDATA[
         '$__tool_directory__/match_plasmid_to_reference.py'
         --plasmid '${plasmid}'
-        --reference_plasmids
-        #for $reference_plasmid in $reference_plasmids:
+        --reference_plasmids_genbank
+        #for $reference_plasmid in $reference_plasmids_genbank:
+          '${reference_plasmid}'
+        #end for
+        --reference_plasmids_fasta
+        #for $reference_plasmid in $reference_plasmids_fasta:
           '${reference_plasmid}'
         #end for
         --mob_typer_report '${concatenated_mob_typer_reports}'
@@ -14,26 +18,35 @@
     ]]></command>
     <inputs>
         <param name="plasmid" type="data" format="fasta" />
-        <param name="reference_plasmids" type="data_collection" collection_type="list" format="genbank" />
+        <param name="reference_plasmids_genbank" type="data_collection" collection_type="list" format="genbank" />
+	<param name="reference_plasmids_fasta" type="data_collection" collection_type="list" format="fasta" />
         <param name="concatenated_mob_typer_reports" type="data" format="tabular" />
     </inputs>
     <outputs>
         <data name="output_plasmid" from_work_dir="outdir/plasmid.fasta" label="Plasmid" format="fasta"/>
-        <data name="matched_reference_plasmid" from_work_dir="outdir/reference_plasmid.gbk" label="Reference Plasmid" format="genbank"/>
+        <data name="matched_reference_plasmid_genbank" from_work_dir="outdir/reference_plasmid.gbk" label="Reference Plasmid (genbank)" format="genbank"/>
+	<data name="matched_reference_plasmid_fasta" from_work_dir="outdir/reference_plasmid.fasta" label="Reference Plasmid (fasta)" format="fasta"/>
 	<data name="matched_mob_typer_record" from_work_dir="outdir/mob_typer_record.tsv" label="Matched MOB-Typer Record" format="tabular"/>
     </outputs>
     <tests>
         <test>
             <param name="plasmid" value="SRR9113487_plasmid_2719.fasta"/>
-            <param name="reference_plasmids">
+            <param name="reference_plasmids_genbank">
                 <collection type="list">
                     <element name="CP008719" value="CP008719.gbk" ftype="genbank" />
                     <element name="JQ739157" value="JQ739157.gbk" ftype="genbank" />
                 </collection>
             </param>
+            <param name="reference_plasmids_fasta">
+                <collection type="list">
+                    <element name="CP008719" value="CP008719.fa" ftype="fasta" />
+                    <element name="JQ739157" value="JQ739157.fa" ftype="fasta" />
+                </collection>
+            </param>
             <param name="concatenated_mob_typer_reports" value="concatenated_mob_typer_reports.tsv" />
             <output name="output_plasmid" file="outdir/plasmid.fasta" />
-            <output name="matched_reference_plasmid" file="outdir/reference_plasmid.gbk" />
+            <output name="matched_reference_plasmid_genbank" file="outdir/reference_plasmid.gbk" />
+            <output name="matched_reference_plasmid_fasta" file="outdir/reference_plasmid.fasta" />
             <output name="matched_mob_typer_record" file="outdir/mob_typer_record.tsv" />
         </test>
     </tests>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/CP008719.fa	Wed Nov 06 13:52:40 2019 -0500
@@ -0,0 +1,33 @@
+>CP008719.1 Escherichia coli strain ST648 plasmid pEC648_5, complete sequence
+TAGATTTAAACGGTATCAAGTTTGGATTTTTAAGAACGCATTCTTAGTTCTGGAAAAGAGCCAGCGGCAG
+GCTGAGGTGATAGGTACGAGATTGCATGCAATCTCTAGTGCTCTGTCTATCCTGCATTATCCTCAGCATT
+ATCCTCAGCATTATCCTCAGCCTTGCCAACTCGACACCAATGCAGGATAGACAATCCGATGTCAAATGTT
+AACACTCTGCGAGTGGTACATTTTCCCCGGATTATCGTCCTGAGCCTGCCGCTGGCTCTCTTTCTACCGC
+CTCGCTTTGCTCGTTGCTCAACGCCTCACAGACACGGATTAAAATCCGCATCCGTTCACCGTTTTTTAAA
+GTCCGTTAAAAGCATGATGCCATCTCCGAGAGTTAATCTCGTCAAATGCTAAATCGTGGGGGTCCCCTTT
+GGGGTTCCGATTTAGTGATTGACGACACCACCGATTAAAAAACTTATGCGGGGTGGATGGTTTCACGAAG
+TGAGGCCATCCACCTGTAAGACAGGGTTTTGTTTTTATTCCCTGTTTTGGTGATCGGGTGTGTGGAAAAG
+GTTGGGGTAAGCCGTTCGGGGGTGCTTGTTTTGGGGGGTTAAAATTGTGGTTATTTTTTGCGCAATTCTC
+GCGCGTGATCCTTGTATTTATACTTAAGGGATAAATGGCGGATATGAAATAGTGGTTTAGCCCAGTAATG
+ACGAGGCTTTGAGTGGGTTTTGACAGGTCAAAGAAAATGGAGCAGAATTGAGGCGTTTTTAATCGGCGTT
+GGGGAGTGCGTCAACACTCCCCAACATTTCGAATGTGTCACCTCAGCGGCAAACTCTGGTGACATGTACT
+GGCTCGCAATGCACAGGTACGTGATGAATATACCACATCAAATCACAGCCTGCCCAGATCGGAGCAGGCT
+TAATGTCAGAAGATAAATTCCTTTCGGACTACAGCCCCCGTGATGCAGTTTGGGATACCCAGCGCACGCT
+TACCGATTCTGTCGGGGGTATCTACCAGACTGCTGCTGAATTCGAGCGCTATGCACTCCGTATGGCCTCC
+TGTAGCGGTTTGTTACGTTTTGGTTGGTCTACCATCATGGAAACCGGAGAAACGCGCCTACGGCTTCGTA
+GTGCGCAATTTTGCCGTGTCCGTCATTGCCCTGTCTGCCAGTGGAGAAGAACCCTCATGTGGCAAGCCCG
+TTTTTATCAGGCTCTACCGAAAATCGTTGTGGATTACCCGTCTTCCCGATGGTTGTTTCTGACGTTAACT
+GTCAGGAACTGCGAGATAGGTGAACTTGGAACAGTCCTTACAGCAATGAATGCGGCGTTTAAGCGAATGG
+AAAAGCGAAAGGAGCTATCACCTGTTCAGGGGTGGATCAGGGCTACGGAGGTGACGCGAGGTAAGGATGG
+CAGCGCACATCCGCATTTTCACTGTCTGCTGATGGTGCAACCTTCTTGGTTTAAAGGGAAGAACTACGTT
+AAGCACGAACGTTGGGTAGAACTCTGGCGCGATTGCTTGCGGGTGAACTATGAGCCGAATATCGATATTC
+GGGCAGTAAAAACTAAGACAGGTGAGGTTGTGGCCAACGTTGCCGAGCAACTGCAAAGCGCGGTTGCTGA
+AACGCTGAAATACTCCGTTAAACCGGAAGATATGGCAAACGATCCTGAGTGGTTTCTTGAGCTGACGCGG
+CAGCTTCACAAGCGCCGTTTTATCTCGACCGGTGGGGCGCTAAAAAACGTCCTCCAGTTGGATCGAGAAA
+CCAATGAGGATCTTGTCATTGCCGACGATGTAGGGGATGGCACTGATGACGGGAAGCGGACGGCGTTTGT
+CTGGGATTCAGGTAAACGGCGTTACAAACGCGCCCCTGAGAAGGATAAATCGGATTAACGTATGAATATT
+AATATTGAATACCTGAATGGAAATAAGACTATTGGTTTATTTTTTTTAAGAAGTGAAGCGGTGATTCCTG
+ACAGGTTTAAAAACCTTATTTTGCTTATTGATGGATTAAGTTTTGGCACATTTGGTTTTCATCCGCACGA
+AGGTTTTGAGGATGAATTAATTTTATATATTCAGAAAACAAACGAGAGGGTAAAAACTCTTTTTGTGAAA
+A
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/JQ739157.fa	Wed Nov 06 13:52:40 2019 -0500
@@ -0,0 +1,96 @@
+>JQ739157.2 Acinetobacter pittii strain ABCA95 plasmid pABCA95, complete sequence
+AAGCTTATAGCAGTGTCACAGATGCGAAAAAGCAATTAAGTGCATATTTTGAGTTTTATAATTTGAAACG
+ACCTCATTCGAGTCTAGACAAAATGACACCAAATGAGTTTTACTATGATCAGCTACCCCAACAAAACAAG
+GTGGCTTAACTAGAGCGGAATATCACTTATAAATACGCTTTTAGTTGTTCAAACAAGTGGGACCACCTCT
+CTCGCATTTGCGGGGTTTTTAATGCTGAATAAAAGGAAAACTTGATGGAATTGCCCAATATTATTCAACA
+ATTTATTGGAAACAGCGTTTTAGAGCCAAATAAAATTGGTCAGTCGCCATCGGATGTTTATTCTTTTAAT
+CGAAATAATGAAACTTTTTTTCTTAAGCGATCTAGCACTTTATATACAGAGACCACATACAGTGTCTCTC
+GCGAAGCGAAAATGTTGAGTTGGCTCTCTGAGAAATTAAAGGTGCCTGAACTCATCATGACTTTTCAGGA
+TGAGCAGTTTGAATTAATGATCACTAAAGCGATCAATGCAAAACCAATTTCAGCGCTTTTTTTAACAGAC
+CAAGAATTGCTTGCTATCTATAAGGAGGCACTCAATCTGTTAAATTCAGTTGCTATTATTGATTGTCCAT
+TTATTTCAAACATTGATCATCGGTTAAAAGAGTCAAAATTTTTTATTGATAACCAACTCCTTGACGATAT
+AGATCAAGATGATTTTGACGCTGAATTATGGGGAGACCATAGAACTTACCTAAGTCTATGGAATGAGTTA
+ACTGAGACTCGTGTTGAAGAAAGATTGGTTTTTTCTCATGGCGATATCACGGATAGTAATATTTTTATAG
+ATAAATTCAATGAAATTTACTTTTTAGATCTTGGCCGTGCTGGGTTAGCTGATGAATTTGTAGATATATC
+CTTTGTTGAACGTTGCCTAAGAGAGGATGCCTCGGAGGAAACTGCTAAAATATTTTTAAAGCATTTAAAA
+AATGATAGACCTGACAAAAGGAATTATTTTTTAAAACTTGATGAATTGAATTGATTCTAAGCATTATCTA
+AAAATACTTAATTGTCTTTTAACGTCGCTAAATTTTAAATAAATAAGTGAAGAGTGTTAGTGGAGCCACT
+GATTTAAAGTTGGCAGAGTAAAACTTGAAGTGCGACATAAACCACCTAATTAATTTAAAGGGTTTATGGA
+GTATATAAAATTGTCATACCATCATCTTAACTTTGAAGATCGTACTGCATTAATGCTTGAGTCAAGAAAA
+GAAGGCTTTTCAGCCAGAAAATTTGCTGAACTCATTAAAAGACATCCTAGTACGATCTATCGTGAGCTTA
+AAAGAAATAGCATCAATGACGTTTATCAAGCTCGATATGCTTCTGATAACACCTTCGCTAGACGTAGACG
+TGGTCACAGAAAACTCAAAATCGATTCAATCCTCTGGAAATTTATTGTTGAAGCGATCCGTTGTTTATGG
+TCTCCTCAGCAAATAGCAAAGCGTTTAAAGACATTTCCTGATTTGGATCAAACAATGAATGTAAGCCATA
+CAACGATTTATTCAACGATACGAGCATTACCAAAGGGTGAGTTGAAAAAAGACTTATTATCCTGTCTACG
+TCATGAAAATAAAAAGCGAAAAGCTAACGGTGAACCTAAAAAAGATTCTATATTACAGGATATTAAAACT
+ATTCATGAGCGCCCAGCCGAAGTTCAAGAAAGAAAAATACCGGGTCATTGGGAAGCTGATTTAATTAAAG
+GTAAAGACAATAAAAGTTCGATAGCAACACTTATTGAACGAAATACACGGCTCTGTATCTTGGCAACATT
+ACCTGATGCAAAGGCAGAATCAGTGCGCAAGGCTTTAACTGAAGCTCTGAAATATTTACCTGCAGAACTG
+CGTAAAACGTTGACCTATGACCGTGGACGTGAGATGTCAGAACATAAAATACTCGAAGAAGATTTAGGCA
+TAGATGTATATTTCTGTGACCCACATTCACCCTGGCAAAAAGGCACATGCGAAAATATGAATGGTTTAAT
+TAGGCAATATTTACCTAAAGGGATTGATTTAAATCAGGCAGATCAGCATTATTTAAATCAAGTTGCCATG
+TCACTGAATACTCGTCCTAGAAAGGCGTTAGATTGGCTTACACCATTAGAGAAATTTGCTCAGCTTGTTG
+ATTATCATATGGCTTTTGAAACTGTCGCACCTCATGTTTGAATTCGCCCCATATTTTTGCTACAGTGAAC
+CAAATTAAGATCATCTATTTACTAGGCCTCGCATTTGCGGGGTTTTTAATGCTGAATAAAAGGAAAACTT
+GATGGAATTGCCCAATATTATGCACCCGGTCGCGAAGCTGAGCACCGCATTAGCCGCTGCATTGATGCTG
+AGCGGGTGCATGCCCGGTGAAATCCGCCCGACGATTGGCCAGCAAATGGAAACTGGCGACCAACGGTTTG
+GCGATCTGGTTTTCCGCCAGCTCGCACCGAATGTCTGGCAGCACACTTCCTATCTCGACATGCCGGGTTT
+CGGGGCAGTCGCTTCCAACGGTTTGATCGTCAGGGATGGCGGCCGCGTGCTGGTGGTCGATACCGCCTGG
+ACCGATGACCAGACCGCCCAGATCCTCAACTGGATCAAGCAGGAGATCAACCTGCCGGTCGCGCTGGCGG
+TGGTGACTCACGCGCATCAGGACAAGATGGGCGGTATGGACGCGCTGCATGCGGCGGGGATTGCGACTTA
+TGCCAATGCGTTGTCGAACCAGCTTGCCCCGCAAGAGGGGATGGTTGCGGCGCAACACAGCCTGACTTTC
+GCCGCCAATGGCTGGGTCGAACCAGCAACCGCGCCCAACTTTGGCCCGCTCAAGGTATTTTACCCCGGCC
+CCGGCCACACCAGTGACAATATCACCGTTGGGATCGACGGCACCGACATCGCTTTTGGTGGCTGCCTGAT
+CAAGGACAGCAAGGCCAAGTCGCTCGGCAATCTCGGTGATGCCGACACTGAGCACTACGCCGCGTCAGCG
+CGCGCGTTTGGTGCGGCGTTCCCCAAGGCCAGCATGATCGTGATGAGCCATTCCGCCCCCGATAGCCGCG
+CCGCAATCACTCATACGGCCCGCATGGCCGACAAGCTGCGCTGAGCCATGGCTGACCACGTCACCCCCAA
+TCTGCCATCGCGCGATTTCGATGTGACAGAGGCGTTTTATGCGAAGCTGGGCTTTGCGACGAGTTGGAAG
+GATCGCGGCTGGATGATCCTGCAGCGCGGCGGTTTGCAGCTCGAATTCTTCCCCTATCCTGACCTCGACC
+CAGCTACGAGCTCGTTCGGCTGTTGCCTGCGGTTGGATGATCTCGATGCCATGGTGGCATTGGTGAACGC
+GGCGGGAGCCGAGGAAAAAAGCACCGGCTGGCCGCGCTTCAAAGCTCCGCAACTGGAGGCGAGCGGCCTG
+AGGATCGGCTACCTGATCGATCCCGACTGCACGCTGGTGCGGCTGATCCAGAACCCCGACTGACCGCATG
+CCCGCGAAAATCAAGATTTGCGGGATCAGCACACCCGAGGCGCTCGATGCGACCATCGCGGCGCGGGCGG
+ACTATGCCGGGTTGGTGTTCTATCCAGCGTCGCCCCGTGCGGTTACGTCGAATGTCGCGGGCGCTTTGAC
+ATCGCGCGCAGCTGGCCAGATCGCCATGGTCGGTTTGTTCGTCGATGCGGATGATGCTGTCATCGCCGAC
+GCACTGGTGGCAGCCAAGCTGAACGCGCTGCAGCTGCACGGTTCGGAATCGCCCGAACGCGTGGCCCAGT
+TGCGCGCGCGGTTTGGCAAGCCGGTGTGGAAGGCGCTGCCCGTCGCCAGCGCCAGCGATGTCGCACGCGC
+CGCAGCCTATGCCGGGGCGGCGGACTTGATCTTGTTCGACGCCAAGACCCCCAAAGGCGCGCTGCCCGGC
+GGCATGGGGTTGGCGTTCGACTGGTCGCTGCTGGCCGGATATCGCGGTGCCTTGCCGTGGGGGCTGGCAG
+GCGGGCTAAATCCGACGAATGTTGCCGAGGCGATTGCGCGCACCGGAGCGCCGCTGGTCGATACCTCCAG
+CGGCGTCGAAAGCGCGCCGGGCGTCAAGGATACCGACAAGATTACCAATTTCGCCTTTGCGGTGCGCTTG
+GCCTAAATCGCGTCGATCAATAGGCGTCGTTCAGCGCAAAGATCGGCTTGCGGGTGCGCCACTGCCCTCG
+GGTGAAGTCGGGAAAATCTAACGTGCGATTGCCCTCAGCAATCGATTGTTCCGACAGAGGCGTGATCGCG
+CTCCAGGCCAGCGCGTCGTAAATGTCGATTGGCATCGGGGCCTTGGCCTTCAGCGCCTCGACAAAAGCGT
+GGATCACGAACCAGTCCATCCCGCCATGCCCGGCCCCTGCCGCCAGATCGGCGTAGCGTTTCCATAGCGG
+GTGATCGTATTTCGCAAACCAGCCCTCGGCAGGCTCCCAGCGGTGCGGCTGTGGGCTCTTGCCCTCCAGA
+TAGATCGACTTGTTGACGTCCATCCACAGCCCCTCGGTGCCTTGCACCCGAAAGCCGAGAGAATAGGGGC
+GCGGCAGCGAGGTGTCGTGGCACAGCATGATCGTTTCACCATTAGTGCAGCCGATCATGGTGTTGACCAC
+ATCACCCAGTGCGAATTTCACCTCGGCGTTGGGATGATCGGCAGAGCCGTTCTTGACGACATAATCATGC
+AGCCCGCGCGCCTTACAGCCGAAGCCGCCAGCGCCCGCTTCGCCCGGCAACGCGACCTTCAGGGTGCGGG
+TCTGCGGCGGGTAGCACACGCCGGCATCGGCGCAGCCCTGGTACTTCACGGTCAGGGTGGTCGCGCTCGC
+GCCGGCCGCGGGCGTGCCGGTGAGGGTGCCGAGCAATTCCTTGCGGTAGGTTTCGACGTCGCCGAAGAAT
+TCGTCGCGGTAGGCCTTGCCCTTCGGCAGCGCCATGGTCGCGCCGGTGAAGGCGGCATCGGCCTTGACCG
+AGGTGCGGTGCCGGTACAGGTAATAGCCGTCGGCGATCCGCCAGCGCACCTCGATGCGGTCCGGCGCGGT
+GGCCTGCGCGGACAGGACGAAGACCTCGTCGACCGGCGGCAGTTCGAAGTCCTGGGCGACGGCCGAGGTC
+GCGGGCAGCGCAAGCAGCAGGGCGAGCCCGGCCAGCCAGCGGCGCAGGCGGATCGTGGATGCGGTCATTG
+GCTCAGTTTACCGGTCGGCTCTCGGCGGCCAGCCATTGCAGGTATTCGGGCAGGCCGGACGCGGCTTCGA
+CCGCGAGCAGCTCCGGGAGTTCGTAGGGATATAGTATTTTATTAAATTCTTATGGGAAATGACGAATGTT
+AAATTATCTTAAGAGCTTTAATAATATCAATACTTATTTGATTTTATCGATAATTCTGCTGTTAATCATA
+ATATCTCTAGATTATTTCTAAACTGAATGAATGTTTATAATGAGTGATTCATATTGCTATTGAAATCGCC
+TTCTCACTTTGAAAGAAGGCGAGGATGAGGGACTTTTATGTTGAATTATCATTTTAAAAATGCCTTATAA
+AAGAAGCTTAATGTGTTTTCTTATATAGGTTTAAACATAATTGTTGTATATCTTAAATCCAATTGATCTT
+AAAATTTTCCTTTATTTTTTGTTATGAGTGCGAGAAAATTGTCAAAAAGGTCAATCAGACTGGGCGTTAA
+TTTGTTTTGCATACTTTTTCCTATATCGAATTAAAGTCATATAACTAACACCATAATCTTTAGCTATTTG
+AGTGAAAGGGTATGAATCGTCCTTATTTTTAAGGGTATGAATTAACTCTTTTAGTTTTTCTTCTGTAATC
+GCAGGCGATCTTCCCTTGTATTTACCTTTCTTTTTTTTAGCTAATTTAATTCCCTCTGCTTGATTCTCAC
+TAATAATACCCCTTTCAAGTTCAGCTACAGCGCCTAATACATGGAGTTGAAACTTATCGAACTTGTCATC
+TGAATTGGGGGTAAAGTTCAGGTTATTTTTGACAATATGAACAGACACTCCTTTTTTATTTAGCTTTTGA
+ACAATGGTTACAAGGTCAATCAAGCTACGTGCCAATCTAAAAACATCATGAGCGTACACAATGTCCCCAC
+TACGGACATAATCGAACATTTCCTGAAGTGCAGGGCGTTTGGCAGTCTTTCCGCTAAAATGATCAATAAA
+AGTTTTATCTAGCTCAAAGGGTAGATCATGGAGCTGTCTTTCAGGGTTTTGGTCTTTAGTGGATACACGG
+ATATACCCCACTCTTTGAAAGGGTGTGTTTTTAATTTGATCTTCAATATCTAAATTTTCTTTTTCCATAA
+CCAGTATAACAAAATTAGATAACCTCAATGTTATATCACATTAGATTAACAAAACAACCCTATTGTTATA
+GGGTTTTTAGGGTGTATTATTATATAACAATAGGGTATACCCTATTGTTATATATCTTCAGGTATAAGGA
+AAAATAACGATGATTAATTTTAATGATCTAAGCGAATCTGAATTATTAAGGATTGCACAGACTGGCATAT
+CAAACCGTATAGGATTGCGTACTTCAGGACATTG
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/outdir/reference_plasmid.fasta	Wed Nov 06 13:52:40 2019 -0500
@@ -0,0 +1,96 @@
+>JQ739157.2 Acinetobacter pittii strain ABCA95 plasmid pABCA95, complete sequence
+AAGCTTATAGCAGTGTCACAGATGCGAAAAAGCAATTAAGTGCATATTTTGAGTTTTATAATTTGAAACG
+ACCTCATTCGAGTCTAGACAAAATGACACCAAATGAGTTTTACTATGATCAGCTACCCCAACAAAACAAG
+GTGGCTTAACTAGAGCGGAATATCACTTATAAATACGCTTTTAGTTGTTCAAACAAGTGGGACCACCTCT
+CTCGCATTTGCGGGGTTTTTAATGCTGAATAAAAGGAAAACTTGATGGAATTGCCCAATATTATTCAACA
+ATTTATTGGAAACAGCGTTTTAGAGCCAAATAAAATTGGTCAGTCGCCATCGGATGTTTATTCTTTTAAT
+CGAAATAATGAAACTTTTTTTCTTAAGCGATCTAGCACTTTATATACAGAGACCACATACAGTGTCTCTC
+GCGAAGCGAAAATGTTGAGTTGGCTCTCTGAGAAATTAAAGGTGCCTGAACTCATCATGACTTTTCAGGA
+TGAGCAGTTTGAATTAATGATCACTAAAGCGATCAATGCAAAACCAATTTCAGCGCTTTTTTTAACAGAC
+CAAGAATTGCTTGCTATCTATAAGGAGGCACTCAATCTGTTAAATTCAGTTGCTATTATTGATTGTCCAT
+TTATTTCAAACATTGATCATCGGTTAAAAGAGTCAAAATTTTTTATTGATAACCAACTCCTTGACGATAT
+AGATCAAGATGATTTTGACGCTGAATTATGGGGAGACCATAGAACTTACCTAAGTCTATGGAATGAGTTA
+ACTGAGACTCGTGTTGAAGAAAGATTGGTTTTTTCTCATGGCGATATCACGGATAGTAATATTTTTATAG
+ATAAATTCAATGAAATTTACTTTTTAGATCTTGGCCGTGCTGGGTTAGCTGATGAATTTGTAGATATATC
+CTTTGTTGAACGTTGCCTAAGAGAGGATGCCTCGGAGGAAACTGCTAAAATATTTTTAAAGCATTTAAAA
+AATGATAGACCTGACAAAAGGAATTATTTTTTAAAACTTGATGAATTGAATTGATTCTAAGCATTATCTA
+AAAATACTTAATTGTCTTTTAACGTCGCTAAATTTTAAATAAATAAGTGAAGAGTGTTAGTGGAGCCACT
+GATTTAAAGTTGGCAGAGTAAAACTTGAAGTGCGACATAAACCACCTAATTAATTTAAAGGGTTTATGGA
+GTATATAAAATTGTCATACCATCATCTTAACTTTGAAGATCGTACTGCATTAATGCTTGAGTCAAGAAAA
+GAAGGCTTTTCAGCCAGAAAATTTGCTGAACTCATTAAAAGACATCCTAGTACGATCTATCGTGAGCTTA
+AAAGAAATAGCATCAATGACGTTTATCAAGCTCGATATGCTTCTGATAACACCTTCGCTAGACGTAGACG
+TGGTCACAGAAAACTCAAAATCGATTCAATCCTCTGGAAATTTATTGTTGAAGCGATCCGTTGTTTATGG
+TCTCCTCAGCAAATAGCAAAGCGTTTAAAGACATTTCCTGATTTGGATCAAACAATGAATGTAAGCCATA
+CAACGATTTATTCAACGATACGAGCATTACCAAAGGGTGAGTTGAAAAAAGACTTATTATCCTGTCTACG
+TCATGAAAATAAAAAGCGAAAAGCTAACGGTGAACCTAAAAAAGATTCTATATTACAGGATATTAAAACT
+ATTCATGAGCGCCCAGCCGAAGTTCAAGAAAGAAAAATACCGGGTCATTGGGAAGCTGATTTAATTAAAG
+GTAAAGACAATAAAAGTTCGATAGCAACACTTATTGAACGAAATACACGGCTCTGTATCTTGGCAACATT
+ACCTGATGCAAAGGCAGAATCAGTGCGCAAGGCTTTAACTGAAGCTCTGAAATATTTACCTGCAGAACTG
+CGTAAAACGTTGACCTATGACCGTGGACGTGAGATGTCAGAACATAAAATACTCGAAGAAGATTTAGGCA
+TAGATGTATATTTCTGTGACCCACATTCACCCTGGCAAAAAGGCACATGCGAAAATATGAATGGTTTAAT
+TAGGCAATATTTACCTAAAGGGATTGATTTAAATCAGGCAGATCAGCATTATTTAAATCAAGTTGCCATG
+TCACTGAATACTCGTCCTAGAAAGGCGTTAGATTGGCTTACACCATTAGAGAAATTTGCTCAGCTTGTTG
+ATTATCATATGGCTTTTGAAACTGTCGCACCTCATGTTTGAATTCGCCCCATATTTTTGCTACAGTGAAC
+CAAATTAAGATCATCTATTTACTAGGCCTCGCATTTGCGGGGTTTTTAATGCTGAATAAAAGGAAAACTT
+GATGGAATTGCCCAATATTATGCACCCGGTCGCGAAGCTGAGCACCGCATTAGCCGCTGCATTGATGCTG
+AGCGGGTGCATGCCCGGTGAAATCCGCCCGACGATTGGCCAGCAAATGGAAACTGGCGACCAACGGTTTG
+GCGATCTGGTTTTCCGCCAGCTCGCACCGAATGTCTGGCAGCACACTTCCTATCTCGACATGCCGGGTTT
+CGGGGCAGTCGCTTCCAACGGTTTGATCGTCAGGGATGGCGGCCGCGTGCTGGTGGTCGATACCGCCTGG
+ACCGATGACCAGACCGCCCAGATCCTCAACTGGATCAAGCAGGAGATCAACCTGCCGGTCGCGCTGGCGG
+TGGTGACTCACGCGCATCAGGACAAGATGGGCGGTATGGACGCGCTGCATGCGGCGGGGATTGCGACTTA
+TGCCAATGCGTTGTCGAACCAGCTTGCCCCGCAAGAGGGGATGGTTGCGGCGCAACACAGCCTGACTTTC
+GCCGCCAATGGCTGGGTCGAACCAGCAACCGCGCCCAACTTTGGCCCGCTCAAGGTATTTTACCCCGGCC
+CCGGCCACACCAGTGACAATATCACCGTTGGGATCGACGGCACCGACATCGCTTTTGGTGGCTGCCTGAT
+CAAGGACAGCAAGGCCAAGTCGCTCGGCAATCTCGGTGATGCCGACACTGAGCACTACGCCGCGTCAGCG
+CGCGCGTTTGGTGCGGCGTTCCCCAAGGCCAGCATGATCGTGATGAGCCATTCCGCCCCCGATAGCCGCG
+CCGCAATCACTCATACGGCCCGCATGGCCGACAAGCTGCGCTGAGCCATGGCTGACCACGTCACCCCCAA
+TCTGCCATCGCGCGATTTCGATGTGACAGAGGCGTTTTATGCGAAGCTGGGCTTTGCGACGAGTTGGAAG
+GATCGCGGCTGGATGATCCTGCAGCGCGGCGGTTTGCAGCTCGAATTCTTCCCCTATCCTGACCTCGACC
+CAGCTACGAGCTCGTTCGGCTGTTGCCTGCGGTTGGATGATCTCGATGCCATGGTGGCATTGGTGAACGC
+GGCGGGAGCCGAGGAAAAAAGCACCGGCTGGCCGCGCTTCAAAGCTCCGCAACTGGAGGCGAGCGGCCTG
+AGGATCGGCTACCTGATCGATCCCGACTGCACGCTGGTGCGGCTGATCCAGAACCCCGACTGACCGCATG
+CCCGCGAAAATCAAGATTTGCGGGATCAGCACACCCGAGGCGCTCGATGCGACCATCGCGGCGCGGGCGG
+ACTATGCCGGGTTGGTGTTCTATCCAGCGTCGCCCCGTGCGGTTACGTCGAATGTCGCGGGCGCTTTGAC
+ATCGCGCGCAGCTGGCCAGATCGCCATGGTCGGTTTGTTCGTCGATGCGGATGATGCTGTCATCGCCGAC
+GCACTGGTGGCAGCCAAGCTGAACGCGCTGCAGCTGCACGGTTCGGAATCGCCCGAACGCGTGGCCCAGT
+TGCGCGCGCGGTTTGGCAAGCCGGTGTGGAAGGCGCTGCCCGTCGCCAGCGCCAGCGATGTCGCACGCGC
+CGCAGCCTATGCCGGGGCGGCGGACTTGATCTTGTTCGACGCCAAGACCCCCAAAGGCGCGCTGCCCGGC
+GGCATGGGGTTGGCGTTCGACTGGTCGCTGCTGGCCGGATATCGCGGTGCCTTGCCGTGGGGGCTGGCAG
+GCGGGCTAAATCCGACGAATGTTGCCGAGGCGATTGCGCGCACCGGAGCGCCGCTGGTCGATACCTCCAG
+CGGCGTCGAAAGCGCGCCGGGCGTCAAGGATACCGACAAGATTACCAATTTCGCCTTTGCGGTGCGCTTG
+GCCTAAATCGCGTCGATCAATAGGCGTCGTTCAGCGCAAAGATCGGCTTGCGGGTGCGCCACTGCCCTCG
+GGTGAAGTCGGGAAAATCTAACGTGCGATTGCCCTCAGCAATCGATTGTTCCGACAGAGGCGTGATCGCG
+CTCCAGGCCAGCGCGTCGTAAATGTCGATTGGCATCGGGGCCTTGGCCTTCAGCGCCTCGACAAAAGCGT
+GGATCACGAACCAGTCCATCCCGCCATGCCCGGCCCCTGCCGCCAGATCGGCGTAGCGTTTCCATAGCGG
+GTGATCGTATTTCGCAAACCAGCCCTCGGCAGGCTCCCAGCGGTGCGGCTGTGGGCTCTTGCCCTCCAGA
+TAGATCGACTTGTTGACGTCCATCCACAGCCCCTCGGTGCCTTGCACCCGAAAGCCGAGAGAATAGGGGC
+GCGGCAGCGAGGTGTCGTGGCACAGCATGATCGTTTCACCATTAGTGCAGCCGATCATGGTGTTGACCAC
+ATCACCCAGTGCGAATTTCACCTCGGCGTTGGGATGATCGGCAGAGCCGTTCTTGACGACATAATCATGC
+AGCCCGCGCGCCTTACAGCCGAAGCCGCCAGCGCCCGCTTCGCCCGGCAACGCGACCTTCAGGGTGCGGG
+TCTGCGGCGGGTAGCACACGCCGGCATCGGCGCAGCCCTGGTACTTCACGGTCAGGGTGGTCGCGCTCGC
+GCCGGCCGCGGGCGTGCCGGTGAGGGTGCCGAGCAATTCCTTGCGGTAGGTTTCGACGTCGCCGAAGAAT
+TCGTCGCGGTAGGCCTTGCCCTTCGGCAGCGCCATGGTCGCGCCGGTGAAGGCGGCATCGGCCTTGACCG
+AGGTGCGGTGCCGGTACAGGTAATAGCCGTCGGCGATCCGCCAGCGCACCTCGATGCGGTCCGGCGCGGT
+GGCCTGCGCGGACAGGACGAAGACCTCGTCGACCGGCGGCAGTTCGAAGTCCTGGGCGACGGCCGAGGTC
+GCGGGCAGCGCAAGCAGCAGGGCGAGCCCGGCCAGCCAGCGGCGCAGGCGGATCGTGGATGCGGTCATTG
+GCTCAGTTTACCGGTCGGCTCTCGGCGGCCAGCCATTGCAGGTATTCGGGCAGGCCGGACGCGGCTTCGA
+CCGCGAGCAGCTCCGGGAGTTCGTAGGGATATAGTATTTTATTAAATTCTTATGGGAAATGACGAATGTT
+AAATTATCTTAAGAGCTTTAATAATATCAATACTTATTTGATTTTATCGATAATTCTGCTGTTAATCATA
+ATATCTCTAGATTATTTCTAAACTGAATGAATGTTTATAATGAGTGATTCATATTGCTATTGAAATCGCC
+TTCTCACTTTGAAAGAAGGCGAGGATGAGGGACTTTTATGTTGAATTATCATTTTAAAAATGCCTTATAA
+AAGAAGCTTAATGTGTTTTCTTATATAGGTTTAAACATAATTGTTGTATATCTTAAATCCAATTGATCTT
+AAAATTTTCCTTTATTTTTTGTTATGAGTGCGAGAAAATTGTCAAAAAGGTCAATCAGACTGGGCGTTAA
+TTTGTTTTGCATACTTTTTCCTATATCGAATTAAAGTCATATAACTAACACCATAATCTTTAGCTATTTG
+AGTGAAAGGGTATGAATCGTCCTTATTTTTAAGGGTATGAATTAACTCTTTTAGTTTTTCTTCTGTAATC
+GCAGGCGATCTTCCCTTGTATTTACCTTTCTTTTTTTTAGCTAATTTAATTCCCTCTGCTTGATTCTCAC
+TAATAATACCCCTTTCAAGTTCAGCTACAGCGCCTAATACATGGAGTTGAAACTTATCGAACTTGTCATC
+TGAATTGGGGGTAAAGTTCAGGTTATTTTTGACAATATGAACAGACACTCCTTTTTTATTTAGCTTTTGA
+ACAATGGTTACAAGGTCAATCAAGCTACGTGCCAATCTAAAAACATCATGAGCGTACACAATGTCCCCAC
+TACGGACATAATCGAACATTTCCTGAAGTGCAGGGCGTTTGGCAGTCTTTCCGCTAAAATGATCAATAAA
+AGTTTTATCTAGCTCAAAGGGTAGATCATGGAGCTGTCTTTCAGGGTTTTGGTCTTTAGTGGATACACGG
+ATATACCCCACTCTTTGAAAGGGTGTGTTTTTAATTTGATCTTCAATATCTAAATTTTCTTTTTCCATAA
+CCAGTATAACAAAATTAGATAACCTCAATGTTATATCACATTAGATTAACAAAACAACCCTATTGTTATA
+GGGTTTTTAGGGTGTATTATTATATAACAATAGGGTATACCCTATTGTTATATATCTTCAGGTATAAGGA
+AAAATAACGATGATTAATTTTAATGATCTAAGCGAATCTGAATTATTAAGGATTGCACAGACTGGCATAT
+CAAACCGTATAGGATTGCGTACTTCAGGACATTG
+