# HG changeset patch # User rnateam # Date 1448888016 18000 # Node ID 570a7de9f151fb125a9e8d06703c4e318781aa84 # Parent 258b6f9e19ab6c466138e5c9976de17a513edec2 read from bam; fix header issue diff -r 258b6f9e19ab -r 570a7de9f151 extract_aln_ends.py --- a/extract_aln_ends.py Fri Nov 27 04:33:02 2015 -0500 +++ b/extract_aln_ends.py Mon Nov 30 07:53:36 2015 -0500 @@ -14,7 +14,7 @@ By default output is written to stdout. Input: -* sam file containing alignments (paired-end sequencing) +* alignments in SAM or BAM format (paired-end sequencing) Output: * bed6 file containing outer coordinates (sorted by read id) @@ -57,8 +57,8 @@ formatter_class=DefaultsRawDescriptionHelpFormatter) # positional arguments parser.add_argument( - "sam", - help="Path to sam file containing alignments.") + "infile", + help="Path to alignments in SAM or BAM format.") # optional arguments parser.add_argument( "-o", "--outfile", @@ -75,7 +75,7 @@ parser.add_argument( '--version', action='version', - version='0.1.0') + version='0.2.0') args = parser.parse_args() @@ -86,7 +86,7 @@ else: logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s") logging.info("Parsed arguments:") -logging.info(" sam: '{}'".format(args.sam)) +logging.info(" infile: '{}'".format(args.infile)) if args.outfile: logging.info(" outfile: enabled writing to file") logging.info(" outfile: '{}'".format(args.outfile)) @@ -103,7 +103,7 @@ # sort by id logging.debug("calling samtools sort") - pysam.sort(args.sam, "-n", "-o{}".format(fn_sorted), "-T sortprefix") + pysam.sort(args.infile, "-n", "-o{}".format(fn_sorted), "-T sortprefix") # fix mate information # also removes secondary and unmapped reads diff -r 258b6f9e19ab -r 570a7de9f151 extract_aln_ends.xml --- a/extract_aln_ends.xml Fri Nov 27 04:33:02 2015 -0500 +++ b/extract_aln_ends.xml Mon Nov 30 07:53:36 2015 -0500 @@ -14,7 +14,7 @@ > $default]]> - + @@ -26,7 +26,7 @@ xml $output.id +# what is folder -> xml $__new_file_path__ +# what is format? -> xml $reads.ext +my ($outFile, $id, $folder, $format, $output_prefix) = @ARGV[($#ARGV - 4) .. $#ARGV]; + +# this parses all but the last four arguments +# contains the call to the flexbar actual +my $call = join " ", @ARGV[0..($#ARGV - 5)]; + +# this calls flexbar and +# prefix for output files will be "FlexbarTargetFile" +system $call .' --target FlexbarTargetFile > '. $outFile and exit 1; + +# now we parse all output files +foreach(){ + + # determine filetype + my $fileType; + + $fileType = $1 if /\.(\w+)$/; + $fileType = $format if /\.\w*fast\w$/; + $fileType = 'fasta' if /\.fasta$/; + $fileType = 'csfasta' if /\.csfasta$/; + $fileType = 'tabular' if /\.lengthdist$/; + + # this is just the filename from the for loop + my $file = $_; + + # replace underscores by minus in $_ + s/_/-/g; + + # set new name for output files + my $name = "primary_". $id ."_". $_ ."_visible_". $fileType; + + # rename output file to a pattern recognized by flexbar? + rename $file, $name; + # best guess: this seems to move the file into a folder + # rename behavious is not specified by perl... or implementation differs. + rename $name, $folder; +} diff -r 258b6f9e19ab -r 570a7de9f151 flexbar_named_output.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/flexbar_named_output.xml Mon Nov 30 07:53:36 2015 -0500 @@ -0,0 +1,466 @@ + + + + + + + + + flexible barcode and adapter removal + + + flexbar + + + flexbar --version + + + + flexbar.pl flexbar + + --threads \${GALAXY_SLOTS:-1} + + --reads $reads + + #if $cReads2.select == "on": + #if $cReads2.reads2.ext == $reads.ext: + --reads2 $cReads2.reads2 + #end if + #end if + + #if $reads.ext == "fastqsanger": + --format sanger + #end if + #if $reads.ext == "fastqsolexa": + --format solexa + #end if + #if $reads.ext == "fastqillumina": + --format i1.3 + #end if + #if $reads.ext == "csfasta": + --color-space + #end if + #if $reads.ext == "fastqcssanger": + --color-space + #end if + + + --max-uncalled $maxUncalled + --min-read-length $minReadLen + + #if $trimEnds.select == "on": + --pre-trim-left $trimEnds.trimLeft + --pre-trim-right $trimEnds.trimRight + #end if + + #if $cTrimPhred.select == "on": + --pre-trim-phred $cTrimPhred.trimPhred + #end if + + #if $cTrimLen.select == "on": + --post-trim-length $cTrimLen.trimLen + #end if + + + #if $cBarcodes.select == "on": + --barcodes $cBarcodes.barcodes + + #if $cBarcodes.cbReads.select == "yes": + --barcode-reads $cBarcodes.cbReads.bReads + #end if + + #if $cBarcodes.cbReads.select == "no": + $cBarcodes.cbReads.bKeep + #end if + + $cBarcodes.bUnassigned + + --barcode-trim-end $cBarcodes.bTrimEnd + + #if $cBarcodes.cbTailLen.select == "yes": + --barcode-tail-length $cBarcodes.cbTailLen.bTailLen + #end if + + #if $cBarcodes.cbMinOverlap.select == "yes": + --barcode-min-overlap $cBarcodes.cbMinOverlap.bMinOverlap + #end if + + --barcode-threshold $cBarcodes.bThresh + + #if $cBarcodes.cbAlignScores.select == "yes": + --barcode-match $bMatch + --barcode-mismatch $bMismatch + --barcode-gap $bGap + #end if + #end if + + + #if $cAdapters.select == "on": + + #if $cAdapters.ccAdapters.select == "data": + --adapters $cAdapters.ccAdapters.adaptersData + #end if + + #if $cAdapters.ccAdapters.select == "seq": + --adapter-seq $cAdapters.ccAdapters.adapterSeq + #end if + + --adapter-trim-end $cAdapters.aTrimEnd + + #if $cAdapters.caTailLen.select == "yes": + --adapter-tail-length $cAdapters.caTailLen.aTailLen + #end if + + $cAdapters.aReadSet + + --adapter-min-overlap $cAdapters.aMinOverlap + --adapter-threshold $cAdapters.aThresh + + #if $cAdapters.caAlignScores.select == "yes": + --adapter-match $aMatch + --adapter-mismatch $aMismatch + --adapter-gap $aGap + #end if + #end if + + + #if $cOutput.select == "show": + $cOutput.fastaOutput + $cOutput.lenDist + $cOutput.singleReads + #end if + + #if $cLogging.select == "show": + $cLogging.logLevel + $cLogging.numTags + $cLogging.remTags + $cLogging.rndTags + #end if + + + $output $output.id $__new_file_path__ $reads.ext + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**Description** + +Flexbar preprocesses high-throughput sequencing data efficiently. It demultiplexes barcoded runs and removes adapter sequences. Moreover, trimming and filtering features are provided. Flexbar increases read mapping rates and improves genome and transcriptome assemblies. It supports next-generation sequencing data in fasta/q and csfasta/q format from Illumina, Roche 454, and the SOLiD platform. Flexbar is available on the project_ page. + +.. _project: https://github.com/seqan/flexbar + +------ + +**Trim-end modes** + +**Any:** longer side of read remains after overlap removal + +**Left:** right side remains after removal, align before or at read end + +**Right:** left part remains after removal, align after or at read start + +**Left tail:** consider first n bases of reads in alignment + +**Right tail:** use only last n bases, see tail-length options + +------ + +**Documentation** + +Further documentation is available on the `manual`__ wiki page and via the command line help screen. + +.. __: https://github.com/seqan/flexbar/wiki + +------ + +**Reference** + +Matthias Dodt, Johannes T. Roehr, Rina Ahmed, Christoph Dieterich: Flexbar — flexible barcode and adapter processing for next-generation sequencing platforms. Biology 2012, 1(3):895-905. + + + + diff -r 258b6f9e19ab -r 570a7de9f151 merge_pcr_duplicates.py --- a/merge_pcr_duplicates.py Fri Nov 27 04:33:02 2015 -0500 +++ b/merge_pcr_duplicates.py Mon Nov 30 07:53:36 2015 -0500 @@ -112,6 +112,9 @@ sep="\t", names=["chrom", "start", "stop", "read_id", "score", "strand"]) +# keep id parts up to first whitespace +alns["read_id"] = alns["read_id"].str.split(' ').str.get(0) + # combine barcode library and alignments bcalib = pd.merge( bcs, alns,