Mercurial > repos > iuc > minimap2
comparison minimap2.xml @ 0:9b44afd426fd draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/minimap2 commit d467ba21d5a83a68e87625c9e9157b37eac03eda-dirty
| author | iuc |
|---|---|
| date | Fri, 03 Nov 2017 16:57:51 -0400 |
| parents | |
| children | acb6d016cabe |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:9b44afd426fd |
|---|---|
| 1 <?xml version="1.0"?> | |
| 2 <tool id="minimap2" name="Map with minimap2" version="2.3" profile="17.01"> | |
| 3 <description>- A fast pairwise aligner for genomic and spliced nucleotide sequences</description> | |
| 4 <requirements> <!-- minimap2 performs the mapping; samtools sorts/converts its output in the command below --> | |
| 5 <requirement version="2.3" type="package">minimap2</requirement> | |
| 6 <requirement version="1.6" type="package">samtools</requirement> | |
| 7 </requirements> | |
| 8 <version_command>minimap2 --version</version_command> | |
| 9 <command> | |
| 10 <![CDATA[ | |
| 11 ## A built-in genome is a data table entry: link its path column, not the select value | |
| 12 #if str($reference_source.reference_source_selector) == 'history': | |
| 13 ln -f -s '$reference_source.ref_file' reference.fa && | |
| 14 #else: | |
| 15 ln -f -s '$reference_source.ref_file.fields.path' reference.fa && | |
| 16 #end if | |
| 17 minimap2 -a -x $analysis_type_selector | |
| 18 ## Indexing options (optional numbers are compared as strings so an explicit 0 is not dropped) | |
| 19 #if str($indexing_options.k) != '': | |
| 20 -k $indexing_options.k | |
| 21 #end if | |
| 22 #if str($indexing_options.w) != '': | |
| 23 -w $indexing_options.w | |
| 24 #end if | |
| 25 #if str($indexing_options.I) != '': | |
| 26 -I $indexing_options.I | |
| 27 #end if | |
| 28 ## Mapping options (a flag is only emitted when the user supplied a value) | |
| 29 #if str($mapping_options.f) != '': | |
| 30 -f $mapping_options.f | |
| 31 #end if | |
| 32 #if str($mapping_options.g) != '': | |
| 33 -g $mapping_options.g | |
| 34 #end if | |
| 35 #if str($mapping_options.G) != '': | |
| 36 -G $mapping_options.G | |
| 37 #end if | |
| 38 #if str($mapping_options.F) != '': | |
| 39 -F $mapping_options.F | |
| 40 #end if | |
| 41 #if str($mapping_options.r) != '': | |
| 42 -r $mapping_options.r | |
| 43 #end if | |
| 44 #if str($mapping_options.n) != '': | |
| 45 -n $mapping_options.n | |
| 46 #end if | |
| 47 #if str($mapping_options.m) != '': | |
| 48 -m $mapping_options.m | |
| 49 #end if | |
| 50 $mapping_options.X | |
| 51 #if str($mapping_options.p) != '': | |
| 52 -p $mapping_options.p | |
| 53 #end if | |
| 54 #if str($mapping_options.N) != '': | |
| 55 -N $mapping_options.N | |
| 56 #end if | |
| 57 ## Alignment options (-O and -E are sanitized text such as 4,24, so plain truthiness is enough) | |
| 58 #if str($alignment_options.A) != '': | |
| 59 -A $alignment_options.A | |
| 60 #end if | |
| 61 #if str($alignment_options.B) != '': | |
| 62 -B $alignment_options.B | |
| 63 #end if | |
| 64 #if $alignment_options.O: | |
| 65 -O $alignment_options.O | |
| 66 #end if | |
| 67 #if $alignment_options.E: | |
| 68 -E $alignment_options.E | |
| 69 #end if | |
| 70 #if str($alignment_options.z) != '': | |
| 71 -z $alignment_options.z | |
| 72 #end if | |
| 73 #if str($alignment_options.s) != '': | |
| 74 -s $alignment_options.s | |
| 75 #end if | |
| 76 #if $alignment_options.u: | |
| 77 -u $alignment_options.u | |
| 78 #end if | |
| 79 ## Output options (--cs takes its value as --cs=STR; 'none' or no selection emits nothing) | |
| 80 $io_options.Q $io_options.L | |
| 81 #if str($io_options.cs) in ['short', 'long']: | |
| 82 --cs=$io_options.cs | |
| 83 #end if | |
| 84 #if str($io_options.K) != '': | |
| 85 -K $io_options.K | |
| 86 #end if | |
| 87 -t \${GALAXY_SLOTS:-4} reference.fa | |
| 88 #if $fastq_input.fastq_input_selector in ['single', 'paired_iv']: | |
| 89 '$fastq_input.fastq_input1' | |
| 90 #else if $fastq_input.fastq_input_selector == 'paired': | |
| 91 '$fastq_input.fastq_input1' '$fastq_input.fastq_input2' | |
| 92 #else if $fastq_input.fastq_input_selector == 'paired_collection': | |
| 93 '$fastq_input.fastq_input1.forward' '$fastq_input.fastq_input1.reverse' | |
| 94 #end if | |
| 95 | samtools sort -@\${GALAXY_SLOTS:-2} -O BAM | |
| 96 #if $io_options.output_format == 'CRAM': | |
| 97 -l 0| samtools view -T reference.fa -C | |
| 98 #end if | |
| 99 > '$alignment_output' | |
| 100 ]]> | |
| 101 </command> | |
| 102 <inputs> | |
| 103 <conditional name="reference_source"> <!-- where the reference FASTA comes from --> | |
| 104 <param type="select" name="reference_source_selector" help="Built-ins were indexed using default options. See `Indexes` section of help below" label="Will you select a reference genome from your history or use a built-in index?"> | |
| 105 <option value="cached">Use a built-in genome index</option> | |
| 106 <option value="history">Use a genome from history and build index</option> | |
| 107 </param> | |
| 108 <when value="cached"> <!-- choices come from the all_fasta data table, sorted on column 2 --> | |
| 109 <param type="select" name="ref_file" help="Select genome from the list" label="Using reference genome"> | |
| 110 <options from_data_table="all_fasta"> | |
| 111 <filter column="2" type="sort_by"/> | |
| 112 <validator message="No reference genomes are available" type="no_options"/> | |
| 113 </options> | |
| 114 <validator message="A built-in reference genome is not available for the build associated with the selected input file" type="no_options"/> | |
| 115 </param> | |
| 116 </when> | |
| 117 <when value="history"> <!-- any FASTA dataset from the history --> | |
| 118 <param type="data" name="ref_file" format="fasta" help="You can upload a FASTA sequence to the history and use it as reference" label="Use the following dataset as the reference sequence"/> | |
| 119 </when> | |
| 120 </conditional> | |
| 121 <section title="Indexing options" name="indexing_options"> | |
| 122 <!-- Disabled for now: the homopolymer setting does not seem to override the sr preset properly | |
| 123 <param argument="-H" name="H" type="boolean" optional="true" truevalue="-H" falsevalue="" label="Use homopolymer-compressed k-mer ?"/> | |
| 124 --> | |
| 125 <param type="integer" argument="-k" optional="true" min="4" max="28" label="k-mer size" help=""/> | |
| 126 <param type="integer" argument="-w" optional="true" min="1" label="minimizer window size" help=""/> | |
| 127 <param type="integer" argument="-I" optional="true" min="1" label="split index for every N input gigabases" help=""/> | |
| 128 </section> | |
| 129 <!-- read input selection, carried over from the bwa-mem wrapper: begin --> | |
| 130 <conditional name="fastq_input"> | |
| 131 <param type="select" name="fastq_input_selector" help="Select between paired and single end data" label="Single or Paired-end reads"> | |
| 132 <option value="single">Single</option> | |
| 133 <option value="paired">Paired</option> | |
| 134 <option value="paired_collection">Paired Collection</option> | |
| 135 <option value="paired_iv">Paired Interleaved</option> | |
| 136 </param> | |
| 137 <when value="single"> | |
| 138 <param type="data" name="fastq_input1" format="fastqsanger,fastqsanger.gz,fasta" help="Specify dataset with single reads" label="Select fastq dataset"/> | |
| 139 </when> | |
| 140 <when value="paired"> | |
| 141 <param type="data" name="fastq_input1" format="fastqsanger,fastqsanger.gz,fasta" help="Specify dataset with forward reads" label="Select first set of reads"/> | |
| 142 <param type="data" name="fastq_input2" format="fastqsanger,fastqsanger.gz,fasta" help="Specify dataset with reverse reads" label="Select second set of reads"/> | |
| 143 </when> | |
| 144 <when value="paired_collection"> | |
| 145 <param type="data_collection" name="fastq_input1" format="fastqsanger,fastqsanger.gz,fasta" collection_type="paired" help="See help section for an explanation of dataset collections" label="Select a paired collection"/> | |
| 146 </when> | |
| 147 <when value="paired_iv"> | |
| 148 <param type="data" name="fastq_input1" format="fastqsanger,fastqsanger.gz,fasta" help="Specify dataset with interleaved reads" label="Select fastq dataset"/> | |
| 149 </when> | |
| 150 </conditional> | |
| 151 <!-- read input selection, carried over from the bwa-mem wrapper: end --> | |
| 152 <param name="analysis_type_selector" type="select" label="Select analysis mode (sets default)"> <!-- the chosen value is handed to minimap2 as the -x preset --> | |
| 153 <option value="map-pb">-Hk19 (PacBio vs reference mapping)</option> | |
| 154 <option value="map-ont">-k15 (Oxford Nanopore vs reference mapping)</option> | |
| 155 <option value="asm5">-k19 -w19 -A1 -B19 -O39,81 -E3,1 -s200 -z200 (asm to ref mapping; break at 5% div.)</option> | |
| 156 <option value="asm10">-k19 -w19 -A1 -B9 -O16,41 -E2,1 -s200 -z200 (asm to ref mapping; break at 10% div.)</option> | |
| 157 <option value="ava-pb">-Hk19 -w5 -Xp0 -m100 -g10000 --max-chain-skip 25 (PacBio read overlap)</option> | |
| 158 <option value="ava-ont">-k15 -w5 -Xp0 -m100 -g10000 --max-chain-skip 25 (ONT read overlap)</option> | |
| 159 <option value="splice">long-read spliced alignment</option> | |
| 160 <option value="sr">short single-end reads without splicing</option> | |
| 161 </param> | |
| 162 <section name="mapping_options" title="Set advanced mapping options" help="Sets -f, -g, -G, -F, -r, -n, -m, -X, -p, and -N options." expanded="False"> <!-- every parameter is optional; unset ones are left off the command line --> | |
| 163 <param argument="-f" type="float" value="" min="0" optional="true" label="filter out top FLOAT fraction of repetitive minimizers" help="default=0.0002"/> | |
| 164 <param argument="-g" type="integer" value="" optional="true" label="stop chain elongation if there are no minimizers in INT-bp" help="default=5000"/> | |
| 165 <param argument="-G" type="integer" value="" optional="true" label="max intron length in thousand (effective with -xsplice; changing -r)" help="default=200"/> | |
| 166 <param argument="-F" type="integer" value="" optional="true" label="max fragment length (effective with -xsr or in the fragment mode)" help="default=800" /> | |
| 167 <param argument="-r" type="integer" value="" optional="true" label="bandwidth used in chaining and DP-based alignment" help="default=500" /> | |
| 168 <param argument="-n" type="integer" value="" optional="true" label="minimal number of minimizers on a chain" help="default=3"/> | |
| 169 <param argument="-m" type="integer" value="" optional="true" label="minimal chaining score (matching bases minus log gap penalty)" help="default=40"/> | |
| 170 <param argument="-X" type="boolean" truevalue="-X" falsevalue="" optional="true" label="skip self and dual mappings (for the all-vs-all mode)"/> | |
| 171 <param argument="-p" type="float" value="" min="0" max="1" optional="true" label="min secondary-to-primary score ratio" help="default=0.8"/> | |
| 172 <param argument="-N" type="integer" min="0" optional="true" label="retain at most INT secondary alignments" help="default=5"/> | |
| 173 </section> | |
| 174 <section name="alignment_options" title="Set advanced alignment options" help="Sets -A, -B, -O, -E, -z, -s, and -u options." expanded="False"> <!-- scoring and splice-signal settings; all optional --> | |
| 175 <param argument="-A" type="integer" optional="true" label="Score for a sequence match" help="default=2"/> | |
| 176 <param argument="-B" type="integer" optional="true" label="Penalty for a mismatch" help="-B; default=4" /> | |
| 177 <param argument="-O" type="text" optional="true" label="Gap open penalties for deletions and insertions" help="-O; default=4,24"> | |
| 178 <sanitizer invalid_char=""> <!-- only digits and commas survive, e.g. 4,24 --> | |
| 179 <valid initial="string.digits"><add value=","/> </valid> | |
| 180 </sanitizer> | |
| 181 </param> | |
| 182 <param argument="-E" type="text" optional="true" label="Gap extension penalties; a gap of size k cost '-O + -E*k'. If two numbers are specified, the first is the penalty of extending a deletion and the second for extending an insertion" help="-E; default=2,1"> | |
| 183 <sanitizer invalid_char=""> <!-- only digits and commas survive, e.g. 2,1 --> | |
| 184 <valid initial="string.digits"><add value=","/> </valid> | |
| 185 </sanitizer> | |
| 186 </param> | |
| 187 <param argument="-z" type="integer" optional="true" label="Z-drop score" help="default=400"/> | |
| 188 <param argument="-s" type="integer" optional="true" label="minimal peak DP alignment score" help="default=80"/> | |
| 189 <param argument="-u" type="select" optional="true" label="how to find GT-AG"> | |
| 190 <option value="n">don't match GT-AG</option> | |
| 191 <option value="f">transcript strand</option> | |
| 192 <option value="b">both strands</option> | |
| 193 </param> | |
| 194 </section> | |
| 195 <section name="io_options" title="Set advanced output options" help="Sets the output format and the -Q, -L, -K, and --cs options." expanded="False"> | |
| 196 <param name="output_format" type="select" label="Produce BAM or CRAM file?"> <!-- also drives the output label and the change_format rule of the output dataset --> | |
| 197 <option value="BAM">BAM</option> | |
| 198 <option value="CRAM">CRAM</option> | |
| 199 </param> | |
| 200 <param argument="-Q" type="boolean" truevalue="-Q" falsevalue="" optional="true" label="don't output base quality"/> | |
| 201 <param argument="-L" type="boolean" truevalue="-L" falsevalue="" optional="true" label="write CIGAR with >65535 ops to the CG tag" help="Useful for very long reads in SAM/BAM format"/> | |
| 202 <param argument="-K" type="integer" optional="true" label="minibatch size for mapping (in megabyte)" help="default=500M"/> | |
| 203 <param argument="--cs" type="select" optional="true" label="Output cs tag?" help="The cs tag is a more compact standalone representation of the MD tag, see help below."> | |
| 204 <option value="none">no</option> | |
| 205 <option value="short">short</option> | |
| 206 <option value="long">long</option> | |
| 207 </param> | |
| 208 </section> | |
| 209 </inputs> | |
| 210 <outputs> | |
| 211 <data name="alignment_output" format="bam" label="${tool.name} on ${on_string} (mapped reads in ${io_options.output_format} format)"> | |
| 212 <actions> <!-- copy the reference build (dbkey) onto the output --> | |
| 213 <conditional name="reference_source.reference_source_selector"> | |
| 214 <when value="history"> | |
| 215 <action name="dbkey" type="metadata"> | |
| 216 <option name="reference_source.ref_file" type="from_param" param_attribute="dbkey"/> | |
| 217 </action> | |
| 218 </when> | |
| 219 <when value="cached"> | |
| 220 <action name="dbkey" type="metadata"> | |
| 221 <option name="all_fasta" type="from_data_table" column="1" offset="0"> | |
| 222 <filter type="param_value" compare="startswith" column="0" value="#" keep="False"/> | |
| 223 <filter type="param_value" column="0" ref="reference_source.ref_file"/> | |
| 224 </option> | |
| 225 </action> | |
| 226 </when> | |
| 227 </conditional> | |
| 228 </actions> | |
| 229 <change_format> <!-- the datatype follows the requested output format --> | |
| 230 <when format="cram" input="io_options.output_format" value="CRAM"/> | |
| 231 </change_format> | |
| 232 </data> | |
| 233 </outputs> | |
| 234 <tests> | |
| 235 <test> <!-- paired FASTQ reads against a history reference, sr preset --> | |
| 236 <param name="reference_source_selector" value="history" /> | |
| 237 <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/> | |
| 238 <param name="fastq_input_selector" value="paired"/> | |
| 239 <param name="fastq_input1" ftype="fastqsanger" value="bwa-mem-fastq1.fq"/> | |
| 240 <param name="fastq_input2" ftype="fastqsanger" value="bwa-mem-fastq2.fq"/> | |
| 241 <param name="analysis_type_selector" value="sr"/> | |
| 242 <output name="alignment_output" ftype="bam" file="bwa-mem-test1.bam" lines_diff="2" /> | |
| 243 </test> | |
| 244 <test> <!-- single-end reads supplied as FASTA, so the dataset is declared as fasta --> | |
| 245 <param name="reference_source_selector" value="history" /> | |
| 246 <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/> | |
| 247 <param name="fastq_input_selector" value="single"/> | |
| 248 <param name="fastq_input1" ftype="fasta" value="bwa-mem-fasta1.fa"/> | |
| 249 <param name="analysis_type_selector" value="sr"/> | |
| 250 <output name="alignment_output" ftype="bam" file="bwa-mem-test1-fasta.bam" lines_diff="2" /> | |
| 251 </test> | |
| 252 <test> <!-- paired reads where the first mate is gzip-compressed --> | |
| 253 <param name="reference_source_selector" value="history" /> | |
| 254 <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/> | |
| 255 <param name="fastq_input_selector" value="paired"/> | |
| 256 <param name="fastq_input1" ftype="fastqsanger.gz" value="bwa-mem-fastq1.fq.gz"/> | |
| 257 <param name="fastq_input2" ftype="fastqsanger" value="bwa-mem-fastq2.fq"/> | |
| 258 <param name="analysis_type_selector" value="sr"/> | |
| 259 <output name="alignment_output" ftype="bam" file="bwa-mem-test1.bam" lines_diff="2" /> | |
| 260 </test> | |
| 261 <test> <!-- NOTE(review): inputs are identical to the first test yet the expected file differs; confirm which parameter this test was meant to vary --> | |
| 262 <param name="reference_source_selector" value="history" /> | |
| 263 <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/> | |
| 264 <param name="fastq_input_selector" value="paired"/> | |
| 265 <param name="fastq_input1" ftype="fastqsanger" value="bwa-mem-fastq1.fq"/> | |
| 266 <param name="fastq_input2" ftype="fastqsanger" value="bwa-mem-fastq2.fq"/> | |
| 267 <param name="analysis_type_selector" value="sr"/> | |
| 268 <output name="alignment_output" ftype="bam" file="bwa-mem-test2.bam" lines_diff="2" /> | |
| 269 </test> | |
| 270 </tests> | |
| 271 <help> | |
| 272 | |
| 273 Users’ Guide | |
| 274 ------------ | |
| 275 | |
| 276 Minimap2 is a versatile sequence alignment program that aligns DNA or | |
| 277 mRNA sequences against a large reference database. Typical use cases | |
| 278 include: (1) mapping PacBio or Oxford Nanopore genomic reads to the | |
| 279 human genome; (2) finding overlaps between long reads with error rate up | |
| 280 to ~15%; (3) splice-aware alignment of PacBio Iso-Seq or Nanopore cDNA | |
| 281 or Direct RNA reads against a reference genome; (4) aligning Illumina | |
| 282 single- or paired-end reads; (5) assembly-to-assembly alignment; (6) | |
| 283 full-genome alignment between two closely related species with | |
| 284 divergence below ~15%. | |
| 285 | |
| 286 For ~10kb noisy read sequences, minimap2 is tens of times faster than | |
| 287 mainstream long-read mappers such as BLASR, BWA-MEM, NGMLR and GMAP. It | |
| 288 is more accurate on simulated long reads and produces biologically | |
| 289 meaningful alignment ready for downstream analyses. For >100bp Illumina | |
| 290 short reads, minimap2 is three times as fast as BWA-MEM and Bowtie2, and | |
| 291 as accurate on simulated data. Detailed evaluations are available from | |
| 292 the minimap2 preprint (https://arxiv.org/abs/1708.01492). | |
| 293 | |
| 294 General usage | |
| 295 ~~~~~~~~~~~~~ | |
| 296 | |
| 297 Minimap2 seamlessly works with gzip’d FASTA and FASTQ formats as input. | |
| 298 You don’t need to convert between FASTA and FASTQ or decompress gzip’d | |
| 299 files first. | |
| 300 | |
| 301 For the human reference genome, minimap2 takes a few minutes to generate | |
| 302 a minimizer index for the reference before mapping. To reduce indexing | |
| 303 time, you can optionally save the index with option **-d** and replace | |
| 304 the reference sequence file with the index file on the minimap2 command | |
| 305 line. | |
| 306 | |
| 307 ***Importantly***, it should be noted that once you build the index, | |
| 308 indexing parameters such as **-k**, **-w**, **-H** and **-I** can’t be | |
| 309 changed during mapping. If you are running minimap2 for different data | |
| 310 types, you will probably need to keep multiple indexes generated with | |
| 311 different parameters. This makes minimap2 different from BWA which | |
| 312 always uses the same index regardless of query data types. | |
| 313 | |
| 314 Use cases | |
| 315 ~~~~~~~~~ | |
| 316 | |
| 317 Minimap2 uses the same base algorithm for all applications. However, due | |
| 318 to the different data types it supports (e.g. short vs long reads; DNA | |
| 319 vs mRNA reads), minimap2 needs to be tuned for optimal performance and | |
| 320 accuracy. It is usually recommended to choose a preset with option | |
| 321 **-x**, which sets multiple parameters at the same time. The default | |
| 322 setting is the same as ``map-ont``. | |
| 323 | |
| 324 Map long noisy genomic reads | |
| 325 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| 326 | |
| 327 The difference between ``map-pb`` and ``map-ont`` is that ``map-pb`` | |
| 328 uses homopolymer-compressed (HPC) minimizers as seeds, while ``map-ont`` | |
| 329 uses ordinary minimizers as seeds. Empirical evaluation suggests HPC | |
| 330 minimizers improve performance and sensitivity when aligning PacBio | |
| 331 reads, but hurt when aligning Nanopore reads. | |
| 332 | |
| 333 Map long mRNA/cDNA reads | |
| 334 ^^^^^^^^^^^^^^^^^^^^^^^^ | |
| 335 | |
| 336 | |
| 337 There are different long-read RNA-seq technologies, including | |
| 338 traditional full-length cDNA, EST, PacBio Iso-seq, Nanopore 2D cDNA-seq | |
| 339 and Direct RNA-seq. They produce data of varying quality and properties. | |
| 340 By default, ``-x splice`` assumes the read orientation relative to the | |
| 341 transcript strand is unknown. It tries two rounds of alignment to infer | |
| 342 the orientation and write the strand to the ``ts`` SAM/PAF tag if | |
| 343 possible. For Iso-seq, Direct RNA-seq and traditional full-length | |
| 344 cDNAs, it would be desired to apply ``-u f`` to force minimap2 to | |
| 345 consider the forward transcript strand only. This speeds up alignment | |
| 346 with slight improvement to accuracy. For noisy Nanopore Direct RNA-seq | |
| 347 reads, it is recommended to use a smaller k-mer size for increased | |
| 348 sensitivity to the first or the last exons. | |
| 349 | |
| 350 It is worth noting that by default ``-x splice`` prefers | |
| 351 GT[A/G]..[C/T]AG over GT[C/T]..[A/G]AG, and then over other splicing | |
| 352 signals. Considering one additional base improves the junction accuracy | |
| 353 for noisy reads, but reduces the accuracy when aligning against the | |
| 354 widely used SIRV control data. This is because SIRV does not honor the | |
| 355 evolutionarily conservative splicing signal. If you are studying SIRV, | |
| 356 you may apply ``--splice-flank=no`` to let minimap2 only model GT..AG, | |
| 357 ignoring the additional base. | |
| 358 | |
| 359 Find overlaps between long reads | |
| 360 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| 361 | |
| 362 Similarly, ``ava-pb`` uses HPC minimizers while ``ava-ont`` uses | |
| 363 ordinary minimizers. It is usually not recommended to perform base-level | |
| 364 alignment in the overlapping mode because it is slow and may produce | |
| 365 false positive overlaps. However, if performance is not a concern, you | |
| 366 may try to add ``-a`` or ``-c`` anyway. | |
| 367 | |
| 368 Map short accurate genomic reads | |
| 369 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| 370 | |
| 371 | |
| 372 When two read files are specified, minimap2 reads from each file in turn | |
| 373 and merges them into an interleaved stream internally. Two reads are | |
| 374 considered to be paired if they are adjacent in the input stream and | |
| 375 have the same name (with the ``/[0-9]`` suffix trimmed if present). | |
| 376 Single- and paired-end reads can be mixed. | |
| 377 | |
| 378 Minimap2 does not work well with short spliced reads. There are many | |
| 379 capable RNA-seq mappers for short reads. | |
| 380 | |
| 381 Full genome/assembly alignment | |
| 382 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| 383 | |
| 384 For cross-species full-genome alignment, the scoring system needs to be | |
| 385 tuned according to the sequence divergence. | |
| 386 | |
| 387 Advanced features | |
| 388 ~~~~~~~~~~~~~~~~~ | |
| 389 | |
| 390 Working with >65535 CIGAR operations | |
| 391 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| 392 | |
| 393 Due to a design flaw, BAM does not work with CIGAR strings with >65535 | |
| 394 operations (SAM and CRAM work). However, for ultra-long nanopore reads | |
| 395 minimap2 may align ~1% of read bases with long CIGARs beyond the | |
| 396 capability of BAM. If you convert such SAM/CRAM to BAM, Picard and | |
| 397 recent samtools will throw an error and abort. Older samtools and other | |
| 398 tools may create corrupted BAM. | |
| 399 | |
| 400 To avoid this issue, you can add option ``-L`` at the minimap2 command | |
| 401 line. This option moves a long CIGAR to the ``CG`` tag and leaves a | |
| 402 fully clipped CIGAR at the SAM CIGAR column. Current tools that don’t | |
| 403 read CIGAR (e.g. merging and sorting) still work with such BAM records; | |
| 404 tools that read CIGAR will effectively ignore these records. The minimap2 | |
| 405 author has submitted pull requests to the SAM spec, htslib, htsjdk, bedtools2, Rsamtools and | |
| 406 igv.js. If they are accepted, future versions of these tools will | |
| 407 seamlessly recognize long-cigar records generated by option ``-L``. | |
| 408 | |
| 409 **TL;DR**: if you work with ultra-long reads and use tools that only | |
| 410 process BAM files, please add option ``-L``. | |
| 411 | |
| 412 The cs optional tag | |
| 413 ^^^^^^^^^^^^^^^^^^^ | |
| 414 | |
| 415 The ``cs`` SAM/PAF tag encodes bases at mismatches and INDELs. It | |
| 416 matches regular expression | |
| 417 ``/(:[0-9]+|\*[a-z][a-z]|[=\+\-][A-Za-z]+)+/``. Like CIGAR, ``cs`` | |
| 418 consists of series of operations. Each leading character specifies the | |
| 419 operation; the following sequence is the one involved in the operation. | |
| 420 | |
| 421 The ``cs`` tag is enabled by command line option ``--cs``. The following | |
| 422 alignment, for example: | |
| 423 | |
| 424 .. code:: | |
| 425 | |
| 426 CGATCGATAAATAGAGTAG---GAATAGCA | |
| 427 |||||| |||||||||| |||| ||| | |
| 428 CGATCG---AATAGAGTAGGTCGAATtGCA | |
| 429 | |
| 430 is represented as ``:6-ata:10+gtc:4*at:3``, where ``:[0-9]+`` represents | |
| 431 an identical block, ``-ata`` represents a deletion, ``+gtc`` an insertion | |
| 432 and ``*at`` indicates reference base ``a`` is substituted with a query | |
| 433 base ``t``. It is similar to the ``MD`` SAM tag but is standalone and | |
| 434 easier to parse. | |
| 435 | |
| 436 If ``--cs=long`` is used, the ``cs`` string also contains identical | |
| 437 sequences in the alignment. The above example will become | |
| 438 ``=CGATCG-ata=AATAGAGTAG+gtc=GAAT*at=GCA``. The long form of ``cs`` | |
| 439 encodes both reference and query sequences in one string. | |
| 440 | |
| 441 Algorithm overview | |
| 442 ~~~~~~~~~~~~~~~~~~ | |
| 443 | |
| 444 In the following, minimap2 command line options have a dash ahead and | |
| 445 are highlighted in bold. The description may help to tune minimap2 | |
| 446 parameters. | |
| 447 | |
| 448 1. Read **-I** [=*4G*] reference bases, extract | |
| 449 (**-k**,\ **-w**)-minimizers and index them in a hash table. | |
| 450 | |
| 451 2. Read **-K** [=*200M*] query bases. For each query sequence, do step 3 | |
| 452 through 7: | |
| 453 | |
| 454 3. For each (**-k**,\ **-w**)-minimizer on the query, check against the | |
| 455 reference index. If a reference minimizer is not among the top **-f** | |
| 456 [=*2e-4*] most frequent, collect its occurrences in the | |
| 457 reference, which are called *seeds*. | |
| 458 | |
| 459 4. Sort seeds by position in the reference. Chain them with dynamic | |
| 460 programming. Each chain represents a potential mapping. For read | |
| 461 overlapping, report all chains and then go to step 8. For reference | |
| 462 mapping, do step 5 through 7: | |
| 463 | |
| 464 5. Let *P* be the set of primary mappings, which is an empty set | |
| 465 initially. For each chain from the best to the worst according to | |
| 466 their chaining scores: if on the query, the chain overlaps with a | |
| 467 chain in *P* by **--mask-level** [=*0.5*] or higher fraction of the | |
| 468 shorter chain, mark the chain as *secondary* to the chain in *P*; | |
| 469 otherwise, add the chain to *P*. | |
| 470 | |
| 471 6. Retain all primary mappings. Also retain up to **-N** [=*5*] top | |
| 472 secondary mappings if their chaining scores are higher than **-p** | |
| 473 [=*0.8*] of their corresponding primary mappings. | |
| 474 | |
| 475 7. If alignment is requested, filter out an internal seed if it | |
| 476 potentially leads to both a long insertion and a long deletion. | |
| 477 Extend from the left-most seed. Perform global alignments between | |
| 478 internal seeds. Split the chain if the accumulative score along the | |
| 479 global alignment drops by **-z** [=*400*], disregarding long gaps. | |
| 480 Extend from the right-most seed. Output chains and their alignments. | |
| 481 | |
| 482 8. If there are more query sequences in the input, go to step 2 until no | |
| 483 more queries are left. | |
| 484 | |
| 485 9. If there are more reference sequences, reopen the query file from the | |
| 486 start and go to step 1; otherwise stop. | |
| 487 | |
| 488 Limitations | |
| 489 ----------- | |
| 490 | |
| 491 - Minimap2 may produce suboptimal alignments through long | |
| 492 low-complexity regions where seed positions may be suboptimal. This | |
| 493 should not be a big concern because even the optimal alignment may be | |
| 494 wrong in such regions. | |
| 495 | |
| 496 - Minimap2 requires SSE2 instructions to compile. It is possible to add | |
| 497 non-SSE2 support, but it would make minimap2 slower by several times. | |
| 498 | |
| 499 In general, minimap2 is a young project with most code written since | |
| 500 June, 2017. It may have bugs and room for improvements. Bug reports and | |
| 501 suggestions are warmly welcomed. | |
| 502 </help> | |
| 503 <citations> | |
| 504 <!-- minimap2 journal article (Li, 2018) followed by the original arXiv preprint --> | |
| 505 <citation type="doi">10.1093/bioinformatics/bty191</citation> | |
| 506 <citation type="bibtex">@misc{1708.01492, | |
| 507 Author = {Heng Li}, | |
| 508 Title = {Minimap2: fast pairwise alignment for long nucleotide sequences}, | |
| 509 Year = {2017}, | |
| 510 Eprint = {arXiv:1708.01492}, | |
| 511 url = {https://arxiv.org/abs/1708.01492}, | |
| 512 }</citation> | |
| 513 </citations> | |
| 514 </tool> |
