#!/usr/bin/perl
use warnings;
use strict;
use Getopt::Long;
use FindBin qw($RealBin);
use File::Copy;
use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
use File::Temp qw/ tempfile tempdir /;

our $VERSION = "0.4.2";

###########################################################################
###########################################################################
##                                                                       ##
## Copyright 2013, Simon Andrews  (simon.andrews@babraham.ac.uk)         ##
##                 Mark Fiers     (Plant & Food Research, NZ)            ##
##                 Steven Wingett (steven.wingett@babraham.ac.uk)        ##
##                                                                       ##
## This program is free software: you can redistribute it and/or modify  ##
## it under the terms of the GNU General Public License as published by  ##
## the Free Software Foundation, either version 3 of the License, or     ##
## (at your option) any later version.                                   ##
##                                                                       ##
## This program is distributed in the hope that it will be useful,       ##
## but WITHOUT ANY WARRANTY; without even the implied warranty of        ##
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         ##
## GNU General Public License for more details.                          ##
##                                                                       ##
## You should have received a copy of the GNU General Public License     ##
## along with this program.  If not, see <http://www.gnu.org/licenses/>. ##
###########################################################################
###########################################################################


# With no arguments at all there is nothing to do, so show the built-in
# help text (stored after __DATA__) and stop.
if (!@ARGV) {
  print <DATA>;
  exit;
}


## Option variables
## These are file-scoped because the subroutines further down read them.
my ($subset_count, $outdir, $illumina, $quiet, $help, $version, $paired);
my ($threads, $color, $conf, $bowtie_opts, $bowtie2_opts, $nohits, $aligner);

# Option specification => variable that receives the parsed value
my @option_spec = (
  "subset=i"    => \$subset_count,
  "outdir=s"    => \$outdir,
  "illumina1_3" => \$illumina,
  "quiet"       => \$quiet,
  "help"        => \$help,
  "version"     => \$version,
  "paired"      => \$paired,
  "conf=s"      => \$conf,
  "color"       => \$color,
  "bowtie=s"    => \$bowtie_opts,
  "bowtie2=s"   => \$bowtie2_opts,
  "threads=i"   => \$threads,
  "nohits"      => \$nohits,
  "aligner=s"   => \$aligner,
);

my $config_result = GetOptions(@option_spec);

die "Could not parse options" unless ($config_result);

# --help and --version are informational only: print and stop
if ($help) {
  print <DATA>;
  exit;
}

if ($version) {
  print "fastq_screen v$VERSION\n";
  exit;
}

# From here on $color holds the flag text that is pasted into the bowtie
# command line (empty when the reads are not in colorspace)
$color = $color ? '-C' : '';

# An explicitly chosen aligner must be a known one, and must not be
# combined with extra options intended for the other aligner
if (defined $aligner) {
  unless ( ($aligner eq 'bowtie') or ($aligner eq 'bowtie2') ) {
    die "Valid options for --aligner are 'bowtie' or 'bowtie2' only.\n";
  }
  if ( ($aligner eq 'bowtie') and (defined $bowtie2_opts) ) {
    die "Bowtie selected as the aligner yet bowtie2 options specified.\n";
  }
  if ( ($aligner eq 'bowtie2') and (defined $bowtie_opts) ) {
    die "Bowtie 2 selected as the aligner yet bowtie options specified.\n";
  }
}

# Replace unset extra options with an empty string so that interpolating
# them into the aligner command does not raise an undef warning
$bowtie_opts  ||= '';
$bowtie2_opts ||= '';

# Configuration
# These defaults are file-scoped because load_configuration() overwrites
# them from the conf file and process_file() reads them when it builds the
# aligner command lines.
my $number_of_threads = 1;
my $path_to_bowtie = 'bowtie';
my $path_to_bowtie2 = 'bowtie2';
my @libraries;                # One [name, index path] pair per usable DATABASE entry
my %library_bowtie_version;   # Records which library should be used with which version of bowtie (i.e bowtie1 or bowtie2)

load_configuration($conf);

# Override the configuration default if they've
# manually specified a number of threads to use
$number_of_threads = $threads if ($threads);

unless (@libraries) {
  die "No search libraries were configured";
}


my @files = @ARGV;

die "No files to process" unless (@files);

# In paired-end mode the files are consumed two at a time. An odd number
# would leave the last file without a mate, and it would then silently be
# screened as a single-end file, so refuse to start instead.
if ($paired and @files % 2) {
  die "Paired-end mode (--paired) requires an even number of files, but ".scalar(@files)." were specified\n";
}

my $index = 0;
while ($index <= $#files) {
  if ($paired) {
    # Consecutive files form one read pair
    process_file($files[$index],$files[$index+1]);
    $index+=2;
  }
  else {
    process_file($files[$index]);
    $index++;
  }
}



# Screens one FastQ file (or one pair of files) against every configured
# library and writes the '<name>_screen.txt' report.
#
# Arguments:
#   $file  - path to the (first) read file; may be gzip compressed (.gz)
#   $file2 - optional path to the mate file for paired-end data
#
# Steps: work out the report name, measure the read length used as the seed
# length, count the reads, optionally write an evenly spaced subset (--subset)
# or a decompressed copy of .gz input, run bowtie/bowtie2 once per library,
# classify every read by where it hit, write the report, optionally write the
# reads that hit nothing (--nohits) and finally draw the graph if GD::Graph is
# available.
#
# Returns nothing. Recoverable problems are reported with warn and the file is
# skipped; unrecoverable ones (copy/gunzip failure, aligner launch failure,
# failure to write the report) die.
sub process_file {

  my ($file,$file2) = @_;

  my %seq_genomes; # Stores the hits to each genome
  my $readsprocessed = 0; # Stores the total number of sequences searched

  # Quotes a value so that the shell sees it as one literal word, even if it
  # contains spaces or single quotes
  my $shell_quote = sub {
    my ($value) = @_;
    $value =~ s/'/'\\''/g;
    return "'$value'";
  };

  # Opens a read file on the IN handle, decompressing through zcat when the
  # name ends in .gz. The list form of open keeps the shell out of it, so
  # unusual characters in file names are harmless. Returns the result of open.
  my $open_reads = sub {
    my ($path) = @_;
    if ($path =~ /\.gz$/) {
      return open (IN,'-|','zcat',$path);
    }
    return open (IN,'<',$path);
  };

  # Formats a count as a percentage of the reads processed. Reports 0.00
  # instead of dying with a division by zero when no reads were counted.
  my $as_percent = sub {
    my ($count) = @_;
    return sprintf("%.2f",0) unless ($readsprocessed);
    return sprintf("%.2f",($count / $readsprocessed) * 100);
  };


  if ($file2) {
    warn "Processing $file and $file2\n" unless ($quiet);
  }
  else {
    warn "Processing $file\n" unless ($quiet);
  }

  # Check that we can actually find the files we're working with
  unless (-e $file) {
    warn "Couldn't locate file $file - skipping\n";
    return;
  }
  if ($file2) {
    unless (-e $file2) {
      warn "Couldn't locate file $file2 - skipping\n";
      return;
    }
  }

  # Work out the output file name we're going to use
  my $outfile = $file;
  $outfile =~ s/\.(txt|seq|fastq)$//i;

  # We can try to remove the end specific part of the name
  # if they're using standard Illumina naming. It doesn't
  # really matter if this fails
  if ($paired) {
    $outfile =~ s/s_(\d)_1_sequence/s_${1}_sequence/;
  }

  $outfile .= "_screen.txt";

  if ($outdir) {
    $outfile = (split(/\//,$outfile))[-1];
    $outfile = $outdir."/".$outfile;
  }

  if (-e $outfile) {
    warn "Output file $outfile already exists - skipping\n";
    return;
  }

  # Work out the read length before the report file is created, so that a
  # failure here does not leave behind a header-only report which would make
  # the next run skip this file as "already exists"
  my $read_length = get_read_length($file);

  if ($read_length < 0) {
    warn "Failed to calculate read length from $file";
    return;
  }

  if ($read_length < 20) {
    warn "Ignoring reads shorter than 20bp\n";
    $read_length = 20;
  }

  # We don't use a seed of >40 even if the reads are that long
  $read_length = 40 if ($read_length > 40);

  open (OUT,'>',$outfile) or do {
    warn "Couldn't write to $outfile: $!";
    return;
  };

  #Print out the Version and other information to the output file
  if($subset_count){
    print OUT "#Fastq_screen version: $VERSION\t#Reads in subset: $subset_count\n";
  }else{
    print OUT "#Fastq_screen version: $VERSION\n";
  }

  #Print the headers to the output file
  print OUT join ("\t",('Library','#Reads_processed','#Unmapped','%Unmapped','#One_hit_one_library','%One_hit_one_library','#Multiple_hits_one_library','%Multiple_hits_one_library','#One_hit_multiple_libraries','%One_hit_multiple_libraries','Multiple_hits_multiple_libraries','%Multiple_hits_multiple_libraries')),"\n";

  # Temporary files created below; both are deleted once the searches are done
  my $temp_file;
  my $temp_file2;


  # Count the sequences in the file.
  # We need to know this to decide whether to make a subset of the sequences
  my $seqcount = 0;

  warn "Counting sequences in $file\n" unless ($quiet);

  $open_reads->($file) or do {
    warn "Can't read $file: $!";
    return;
  };

  ++$seqcount while (<IN>);
  close IN;

  # A FastQ record is four lines long
  $seqcount = int($seqcount/4);

  if ($subset_count and $seqcount > $subset_count*2) {    # We actually need to do the reduction

    $temp_file = $file;
    $temp_file .= "_temp_subset.txt";

    # Since we're writing we need to do it in
    # the directory we know we're allowed to write
    # to.
    if ($outdir) {
      $temp_file = (split(/\//,$temp_file))[-1];
      $temp_file = $outdir."/".$temp_file;
    }

    open (TEMP,'>',$temp_file) or do {
      warn "Can't write temp subset file: $!";
      return;
    };

    # Every $interval-th record is kept
    my $interval = sprintf("%.0f",($seqcount/$subset_count));

    warn "Making reduced seq file with ratio $interval:1\n" unless ($quiet);

    # The counting pass consumed the input (and a zcat pipe cannot be
    # rewound), so the file is opened again from the start
    $open_reads->($file) or do {
      warn "Can't read $file: $!";
      return;
    };

    my $current_count = 0;
    while (<IN>) {
      if (/^@/) {
        my $record = $_;
        $record .= scalar <IN>;
        $record .= scalar <IN>;
        $record .= scalar <IN>;

        if ($current_count % $interval == 0) {
          print TEMP $record;
          ++$readsprocessed;
        }
        ++$current_count;
      }
    }
    close IN;

    close (TEMP) or do {
      warn "Can't write temp subset file: $!";
      return;
    };

    $file = $temp_file;

    if ($file2) {    # We need to do the reduction in file 2 as well

      $temp_file2 = $file2;
      $temp_file2 .= "_temp_subset.txt";

      # Since we're writing we need to do it in
      # the directory we know we're allowed to write
      # to.
      if ($outdir) {
        $temp_file2 = (split(/\//,$temp_file2))[-1];
        $temp_file2 = $outdir."/".$temp_file2;
      }

      open (TEMP,'>',$temp_file2) or do {
        warn "Can't write temp2 subset file: $!";
        return;
      };

      warn "Making reduced seq file 2 with ratio $interval:1\n" unless ($quiet);

      $open_reads->($file2) or do {
        warn "Failed to read $file2: $!";
        unlink ($temp_file) or warn "Couldn't delete temp file $temp_file: $!";
        return;
      };

      # The same interval is used as for file 1, so the same record
      # positions are kept in both files
      my $current_count2 = 0;
      while (<IN>) {
        if (/^@/) {
          my $record = $_;
          $record .= scalar <IN>;
          $record .= scalar <IN>;
          $record .= scalar <IN>;

          print TEMP $record if ($current_count2 % $interval == 0);
          ++$current_count2;
        }
      }
      close IN;

      close (TEMP) or do {
        warn "Can't write temp2 subset file: $!";
        return;
      };

      $file2 = $temp_file2;
    }
  }
  else {
    $readsprocessed = $seqcount;
    if ($subset_count) {
      warn "Not making subset of $subset_count since $seqcount actual seqs is too low or close enough\n" unless ($quiet);
    }

    #If no file reduction was made and the original sequence file(s) were zipped, then they need to be unzipped before
    #sending to Bowtie.
    if ($file =~ /\.gz$/) {

      my $temp_copied_file = $file;
      $temp_copied_file .= "_temp_copied.gz";

      # Since we're writing we need to do it in
      # the directory we know we're allowed to write
      # to.
      if ($outdir) {
        $temp_copied_file = (split(/\//,$temp_copied_file))[-1];
        $temp_copied_file = $outdir."/".$temp_copied_file;
      }

      #Copy the file to the output directory then unzip
      #Copying the file first avoids unzipping problems if the filename is a symbolic link
      copy ($file,$temp_copied_file) or die "Couldn't copy $file to $temp_copied_file: $!\n";
      gunzip $temp_copied_file => "$temp_copied_file".".txt" or die "gunzip failed: $GunzipError\n";
      unlink $temp_copied_file;
      $file = $temp_copied_file.".txt";
      $temp_file = $file;    #Removes the unzipped file after Bowtie is finished mapping
    }

    if(($file2) and ($file2 =~ /\.gz$/)) {

      my $temp_copied_file2 = $file2;
      $temp_copied_file2 .= "_temp_copied.gz";

      if ($outdir) {
        $temp_copied_file2 = (split(/\//,$temp_copied_file2))[-1];
        $temp_copied_file2 = $outdir."/".$temp_copied_file2;
      }

      copy ($file2,$temp_copied_file2) or die "Couldn't copy $file2 to $temp_copied_file2: $!\n";
      gunzip $temp_copied_file2 => "$temp_copied_file2".".txt" or die "gunzip failed: $GunzipError\n";
      unlink $temp_copied_file2;
      $file2 = $temp_copied_file2.".txt";
      $temp_file2 = $file2;    #Removes the unzipped file after Bowtie is finished mapping
    }
  }


  # The read files no longer change, so quote them once for the shell
  my $quoted_file = $shell_quote->($file);
  my $quoted_file2 = $file2 ? $shell_quote->($file2) : '';

  my $library_index = -1; # Make lists in the same order as @libraries

  foreach my $library (@libraries) {
    #Write Bowtie/Bowtie2 Standard Error to a temporary output file
    #Generate a random filename and place in $outdir (if specified)
    my $error_fh;
    my $error_filename;
    if($outdir){
      ($error_fh, $error_filename) = tempfile( 'aligner_standard_error.XXXXXXXX', SUFFIX => '.txt', DIR => $outdir);
    }else{
      ($error_fh, $error_filename) = tempfile( 'aligner_standard_error.XXXXXXXX', SUFFIX => '.txt');
    }
    my $quoted_error_file = $shell_quote->($error_filename);

    if($file2){
      warn "Searching $file and $file2 against $library->[0]\n" unless ($quiet);
    }else{
      warn "Searching $file against $library->[0]\n" unless ($quiet);
    }

    my $illumina_flag = '';
    $illumina_flag = "--phred64-quals" if ($illumina);

    my $bowtie_command;

    #Count the index of the library being used
    $library_index++;

    #Determine whether to execute bowtie1 or bowtie2, whether this is a paired-end or single-end, and execute accordingly
    if($library_bowtie_version{$library->[1]} == 1 ){
      if ($file2) {
        $bowtie_command = "$path_to_bowtie $bowtie_opts $illumina_flag $color -l $read_length -k 2 --chunkmbs 512 --maxins 1000 -p $number_of_threads $library->[1] -1 $quoted_file -2 $quoted_file2 2>$quoted_error_file |";
      }
      else {
        $bowtie_command = "$path_to_bowtie $bowtie_opts $illumina_flag $color -l $read_length -k 2 --chunkmbs 512 -p $number_of_threads $library->[1] $quoted_file 2>$quoted_error_file |";
      }
    }else{
      if ($file2) {
        $bowtie_command = "$path_to_bowtie2 $bowtie2_opts $illumina_flag -k 2 --very-fast-local --no-hd --no-unal --maxins 1000 -p $number_of_threads $library->[1] -1 $quoted_file -2 $quoted_file2 2>$quoted_error_file |";
      }
      else {
        $bowtie_command = "$path_to_bowtie2 $bowtie2_opts $illumina_flag -k 2 --very-fast-local --no-hd --no-unal -p $number_of_threads $library->[1] $quoted_file 2>$quoted_error_file |";
      }
    }

    open (BOWTIE,$bowtie_command) or die "Failed to launch bowtie command '$bowtie_command': $!";

    my $seqname_suffix = 1;    #Explained below

    while (<BOWTIE>) {

      my $line = $_;
      chomp $line;

      #Check output is in native original bowtie format or SAM format
      unless( ($line =~ /^.+\t[\+\-]\t.+\t.+\t[TAGCN]+\t.+/) or  ($line =~ /^.+\t\d+\t\S+\t\d+\t\d+\t\S+\t.+\t\d+\t\S+\t[ATCGN]+\t.+/) ){
        warn "Bowtie/Bowtie2 output not in correct format: $_\n";
        next;
      }

      my ($seqname) = split(/\t/, $line);

      #Unlike the original bowtie native output format, bowtie2 removes the '/1' or '/2' from the end of a pair-end read (in fact bowtie adds the '/1' or '/2'
      #if it is missing in the FASTQ file). This will cause counting problems. To remedy this add '/1' or '/2' to alternating alignments
      if( ($library_bowtie_version{$library->[1]} == 2) and $file2 ){   #Bowtie2 used as aligner and a paired-end read
        $seqname = $seqname.'/'.$seqname_suffix;
        $seqname_suffix = abs($seqname_suffix - 3);    #seqname_suffix alternates between 1 and 2
      }

      # Initialise an array of zero values the same length as @libraries
      unless (exists $seq_genomes{$seqname}){
        for my $position(0..$#libraries){
          $seq_genomes{$seqname}->[$position] = 0;
        }
      }

      # Score 1 for the first hit of this read against this library and 2 for
      # any further hit. Testing >= 1 keeps the score at 2 if a third or later
      # alignment line turns up, instead of flipping it back to 1.
      if($seq_genomes{$seqname}->[$library_index] >= 1){
        $seq_genomes{$seqname}->[$library_index] = 2;
      }else{
        $seq_genomes{$seqname}->[$library_index] = 1;
      }
    }

    # Closing the pipe waits for the aligner to finish, so its standard error
    # file is complete before it is read, and lets us notice a failed run
    unless (close BOWTIE) {
      if ($!) {
        warn "Error closing pipe from bowtie: $!\n";
      }else{
        warn "Bowtie/Bowtie2 exited with status ".($? >> 8)." when searching against $library->[0]\n";
      }
    }

    #Check the Standard Error output file and report any errors
    #Bowtie reports the alignment summary to standard error, so ignore the alignment summary
    while(<$error_fh>){
      unless (/^\#/ or /^Reported/ or /^No alignments/ or /reads; of these/ or /were unpaired; of these/ or /aligned/ or /overall alignment rate/ or /mates make up the pairs; of these/ or /were paired; of these/ or /----/) {
        warn "Bowtie/Bowtie2 warning: $_";
      }
    }
    close $error_fh;
    unlink $error_filename or die "Could not delete temporary Standard Error file '$error_filename' : $!";
  }


  # Collate the hit results from the Bowtie searches.
  #Each read/read pair can be classified as 1) maps to no libraries;
  #2) maps uniquely to a specified library; 3) maps multiple times to a specified library, but to no others;
  #4) maps once to a specified library and to at least one other library; 5) maps multiple
  #times to a specified library and to at least one other library.

  my @one_hit_one_library;
  my @one_hit_multiple_libraries;
  my @multiple_hits_one_library;
  my @multiple_hits_multiple_libraries;

  # Initialise those arrays with zero values, making them the same length as @libraries
  for my $position(0..$#libraries){
    $one_hit_one_library[$position] = 0;
    $one_hit_multiple_libraries[$position] = 0;
    $multiple_hits_one_library[$position] = 0;
    $multiple_hits_multiple_libraries[$position] = 0;
  }

  foreach my $seq_id(keys %seq_genomes){
    my $numlib = 0;

    # Count up the number of libraries where this sequence
    # had a hit
    foreach (@{$seq_genomes{$seq_id}}) {
      $numlib++ if ($_);
    }

    # Classify the type of hit
    if($numlib == 1){
      foreach my $index (0..$#{$seq_genomes{$seq_id}}){
        if($seq_genomes{$seq_id}->[$index] == 2){
          $multiple_hits_one_library[$index]++;    #Maps multiple times to this library and to no others
        }else{
          $one_hit_one_library[$index] += $seq_genomes{$seq_id}->[$index];    #Maps uniquely / does not map (i.e. add 0)
        }
      }
    }else{
      foreach my $index (0..$#{$seq_genomes{$seq_id}}){
        if($seq_genomes{$seq_id}->[$index] == 2){
          $multiple_hits_multiple_libraries[$index]++;    #Maps multiple times to this library and also to at least one other
        }else{
          $one_hit_multiple_libraries[$index] += $seq_genomes{$seq_id}->[$index];    #Maps once to this library and also to another / does not map (i.e. add 0)
        }
      }
    }
  }

  # If we're doing a paired end search then all our counts will
  # be double what they should be since we'll get hits for both
  # of the sequences in the pair with different ids (bowtie1 will
  # make them different even if they weren't originally).  We
  # therefore need to halve the counts before calculating percentages.
  # The easiest way to do this is to double the expected total
  # count

  $readsprocessed *= 2 if ($file2);

  # Summarise the counts and write the text report
  foreach my $index (0..$#libraries){
    my $library = $libraries[$index];

    my $percent_one_hit_one_library = $as_percent->($one_hit_one_library[$index]);
    my $percent_one_hit_multiple_libraries = $as_percent->($one_hit_multiple_libraries[$index]);
    my $percent_multiple_hits_one_library = $as_percent->($multiple_hits_one_library[$index]);
    my $percent_multiple_hits_multiple_libraries = $as_percent->($multiple_hits_multiple_libraries[$index]);

    # Derived from the rounded values so that the printed percentages add up to 100
    my $percent_unmapped = 100 - $percent_one_hit_one_library - $percent_one_hit_multiple_libraries - $percent_multiple_hits_one_library - $percent_multiple_hits_multiple_libraries;
    $percent_unmapped = sprintf("%.2f", $percent_unmapped);

    my $reads_unmapped = $readsprocessed - $one_hit_one_library[$index] - $one_hit_multiple_libraries[$index] - $multiple_hits_one_library[$index] - $multiple_hits_multiple_libraries[$index];

    print OUT join("\t",($library->[0], $readsprocessed, $reads_unmapped, $percent_unmapped, $one_hit_one_library[$index], $percent_one_hit_one_library, $multiple_hits_one_library[$index], $percent_multiple_hits_one_library, $one_hit_multiple_libraries[$index], $percent_one_hit_multiple_libraries, $multiple_hits_multiple_libraries[$index], $percent_multiple_hits_multiple_libraries)),"\n";
  }

  #Calculate the number of reads that mapped to none of the libraries
  #(%seq_genomes only holds reads with at least one hit)
  my $percent_hit_no_libraries = $as_percent->($readsprocessed - (scalar keys %seq_genomes));
  print OUT "\n\%Hit_no_libraries: $percent_hit_no_libraries\n";

  close OUT or die "Couldn't write to $outfile: $!";


  if($nohits){
    if($file2){
      find_no_hits(\%seq_genomes, $file, $file2, $outfile);
    }else{
      find_no_hits(\%seq_genomes, $file, $outfile);
    }
  }

  # Remove the subset / decompressed copies now that nothing reads them any more
  if ($temp_file) {
    unlink($temp_file) or warn "Unable to delete temp file '$temp_file'";
  }
  if ($temp_file2) {
    unlink($temp_file2) or warn "Unable to delete temp file '$temp_file2'";
  }


  #Check whether the module GD::Graph is installed
  #The trailing 1 makes the eval true only if the module really loaded
  my $can_draw = eval {
    require GD::Graph::bars;
    GD::Graph::bars->import();
    1;
  };

  if ($can_draw) {
    make_graph($outfile);
  }else{
    warn "Perl module GD::Graph::bars not installed, skipping charts\n";
  }
}

# Final status message. The sub definition above is only compiled when perl
# passes it, so this statement runs after the file-processing loop earlier in
# the file has finished. It goes to the default output handle and is not
# guarded by $quiet.
print "Processing complete\n";

  

#####################################################################################################
#Subroutines
#####################################################################################################


# Reads the fastq_screen configuration file and fills in the file-scoped
# settings: $path_to_bowtie / $path_to_bowtie2, $number_of_threads, @libraries
# and %library_bowtie_version. When no aligner was chosen with --aligner it
# also sets $aligner.
#
# Arguments:
#   $conf_file - path to the configuration file; when empty,
#                'fastq_screen.conf' in the directory of the program is used
#
# Recognised configuration lines (blank lines and lines starting with '#'
# are ignored):
#   BOWTIE   <path>
#   BOWTIE2  <path>
#   THREADS  <number>
#   DATABASE <name> <index path> [BOWTIE|BOWTIE2]
#
# Dies if the file is missing or unreadable, if no usable aligner/database
# combination can be found, or if a value is invalid. DATABASE entries whose
# index cannot be found, or which belong to the aligner not in use, are
# skipped.
sub load_configuration {

  my ($conf_file) = @_;

  # If they haven't specified a conf file then look
  # in the directory containing the program.
  $conf_file = "$RealBin/fastq_screen.conf" unless ($conf_file);

  unless (-e $conf_file) {
    die "Couldn't find fastq_screen.conf at '$conf_file'";
  }

  warn "Reading configuration from '$conf_file'\n" unless ($quiet);

  open (my $conf_fh,'<',$conf_file) or die "Can't read $conf_file : $!";

  #Determine aligner to use from 1) paths specified and 2) databases if no aligner specified
  if(!defined $aligner){
    my $bowtie1_path_exists = 0;
    my $bowtie2_path_exists = 0;
    my $bowtie1_database_exists = 0;
    my $bowtie2_database_exists = 0;

    while (<$conf_fh>) {
      chomp;
      s/^\s+//;
      s/\s+$//;

      next if (/^\#/);
      next unless ($_);

      my ($name) = split(/\s+/);

      if ($name eq 'BOWTIE'){
        if( defined ((split(/\s+/,$_,2))[1]) ){
          $bowtie1_path_exists = 1;
        }
      }elsif($name eq 'BOWTIE2'){
        if( defined ((split(/\s+/,$_,2))[1]) ){
          $bowtie2_path_exists = 1;
        }
      }elsif ($name eq 'DATABASE') {
        my (undef, undef, undef, $bowtie_version) = split(/\s+/,$_,4);

        #Determine whether an index should be used with bowtie1 or bowtie2.
        #No label means bowtie1. An explicit 'bowtie' label also means
        #bowtie1, matching how the second pass below reads the same field.
        if(!defined $bowtie_version){
          $bowtie1_database_exists = 1;
        }elsif($bowtie_version =~ /bowtie2/i){
          $bowtie2_database_exists = 1;
        }elsif($bowtie_version =~ /bowtie/i){
          $bowtie1_database_exists = 1;
        }
      }
    }

    #Is bowtie or bowtie2 in path? (system returns 0 when 'which' finds it)
    unless(system "which bowtie >/dev/null 2>&1"){
      $bowtie1_path_exists = 1;
    }
    unless(system "which bowtie2 >/dev/null 2>&1"){
      $bowtie2_path_exists = 1;
    }

    #Decide whether to proceed and display message
    if($bowtie1_path_exists and $bowtie2_path_exists and $bowtie1_database_exists and $bowtie2_database_exists){
      $aligner = 'bowtie';
      warn "Aligner not specified (--aligner). Path to bowtie and bowtie2 both specified, bowtie and bowtie2 database both present: defaulting to bowtie\n";
    }elsif($bowtie1_path_exists and $bowtie1_database_exists){
      $aligner = 'bowtie';
      warn "Aligner not specified (--aligner). Path to bowtie specified and bowtie database(s) present: mapping with bowtie\n";
    }elsif($bowtie2_path_exists and $bowtie2_database_exists){
      $aligner = 'bowtie2';
      warn "Aligner not specified (--aligner). Path to bowtie2 specified and bowtie2 database(s) present: mapping with bowtie2\n";
    }else{
      die "Aligner not specified (--aligner). No path to bowtie or bowtie2, or, no bowtie or bowtie2 databases specified: please adjust configuration file.\n";
    }
    seek ($conf_fh,0,0);    #Return to start of conf file
  }


  # Second pass: now that the aligner is known, read the actual settings
  while (<$conf_fh>) {
    chomp;
    s/^\s+//;
    s/\s+$//;

    next if (/^\#/);
    next unless ($_);

    my ($name) = split(/\s+/);

    if ( ($name eq 'BOWTIE') and ($aligner eq 'bowtie') ) {

      $path_to_bowtie = (split(/\s+/,$_,2))[1];

      die "Empty value set for BOWTIE config parameter" unless ($path_to_bowtie);

      warn "Using '$path_to_bowtie' as bowtie path\n" unless ($quiet);
    }

    elsif( ($name eq 'BOWTIE2') and ($aligner eq 'bowtie2') ) {
      $path_to_bowtie2 = (split(/\s+/,$_,2))[1];

      die "Empty value set for BOWTIE2 config parameter" unless ($path_to_bowtie2);

      warn "Using '$path_to_bowtie2' as bowtie2 path\n" unless ($quiet);
    }

    elsif ($name eq 'THREADS') {
      $number_of_threads = (split(/\s+/))[1];

      # A THREADS line without a value leaves this undefined
      unless ( (defined $number_of_threads) and ($number_of_threads =~ /^\d+$/) ) {
        my $shown_value = defined $number_of_threads ? $number_of_threads : '';
        die "Invalid number of threads '$shown_value' set in conf file";
      }
      warn "Using $number_of_threads threads for searches\n" unless ($quiet);
    }

    elsif ($name eq 'DATABASE') {
      my (undef,$db_name,$db_path,$bowtie_version) = split(/\s+/,$_,4);

      if(!defined $bowtie_version){    #Determine whether an index should be used with bowtie1 or bowtie2
        $bowtie_version = 1;
      }elsif($bowtie_version =~ /bowtie2/i){
        $bowtie_version = 2;
        #Bowtie2 cannot process colorspace reads
        if($color){
          die "Bowtie2 cannot process colorspace reads, in the configuration file please specify original Bowtie indices for '$db_name'\n";
        }
      }elsif($bowtie_version =~ /bowtie/i){
        $bowtie_version = 1;
      }else{
        die "Could not determine which bowtie version to use for: '$db_name\t$db_path\t$bowtie_version'\nPlease modify the configuration file by entering after the path to the desired sequence aligner either i)'Bowtie2' or ii)leave blank to default to the original Bowtie\n";
      }

      #Only keep indices built for the aligner that will be used
      if( ($bowtie_version == 1) and ($aligner eq 'bowtie2') ){
        next;
      }elsif( ($bowtie_version == 2) and ($aligner eq 'bowtie') ){
        next;
      }

      # Check to see that there's a bowtie or bowtie2 index at that location
      if($bowtie_version == 1){
        unless (-e "$db_path.1.ebwt") {
          warn "Skipping DATABASE '$db_name' since no bowtie index was found at '$db_path'\n";
          next;
        }
      }else{
        unless (-e "$db_path.1.bt2") {
          warn "Skipping DATABASE '$db_name' since no bowtie2 index was found at '$db_path'\n";
          next;
        }
      }

      warn "Adding database $db_name\n" unless ($quiet);
      $library_bowtie_version{$db_path} = $bowtie_version;
      push @libraries, [$db_name,$db_path];
    }
  }
  close $conf_fh;
}



# Finds the length of the shortest read near the start of a FastQ file.
#
# Arguments:
#   $file - path to the FastQ file; a name ending in .gz is read through zcat
#
# Returns the length of the shortest sequence among the first records
# examined (the scan stops once more than 1000 records have been seen), or
# -1 when the file cannot be opened or contains no records. The caller treats
# any negative value as failure.
sub get_read_length {

  my ($file) = @_;

  #Check if the read file is compressed and open accordingly.
  #The list form of open runs zcat without a shell, so unusual characters
  #in the file name cannot be misread as shell syntax.
  my $fh;
  my $opened = ($file =~ /\.gz$/)
    ? open ($fh,'-|','zcat',$file)
    : open ($fh,'<',$file);

  unless ($opened) {
    warn "Failed to read $file: $!";
    return -1;
  }

  my $shortest_length = -1;

  my $current_count = 0;
  while (my $header = <$fh>) {
    next unless ($header =~ /^@/);

    my $seq = <$fh>;
    last unless (defined $seq);    # File ends in the middle of a record
    chomp $seq;

    if ($shortest_length < 0 or length $seq < $shortest_length) {
      $shortest_length = length $seq;
    }

    # Skip the rest of the record (separator and quality lines)
    scalar(<$fh>) for (1..2);

    ++$current_count;
    last if ($current_count > 1000);
  }

  # The result of close is deliberately ignored: stopping early on a zcat
  # pipe makes zcat exit abnormally, which is expected here
  close $fh;

  return $shortest_length;
}



# Draws a stacked bar chart from a fastq_screen text report.
#
# Arguments:
#   $file - path to the '..._screen.txt' report written by process_file
#
# The image is written next to the report, with the '.txt' ending replaced
# by '.png'. There is one bar per library, stacked from the four percentage
# columns, plus a final 'No hits' bar taken from the '%Hit_no_libraries' line.
# Dies if the report cannot be read, the chart cannot be plotted or the image
# cannot be written. GD::Graph::bars must already be loaded by the caller.
sub make_graph {

  my ($file) = @_;

  my $outfile = $file;
  $outfile =~ s/\.txt$//;

  open (my $in_fh,'<',$file) or die "Can't open data file '$file' to create graph: $!";

  # $data[0] holds the bar labels, $data[1..4] one data series each
  my @data;
  my $percent_no_hits;

  # The first two lines are the version line and the column headers
  scalar(<$in_fh>) for (1..2);

  while (my $line = <$in_fh>) {
    chomp $line;

    next if ($line =~ /^\s*$/);

    # The summary line is the last thing of interest in the report
    if ($line =~ /^\%Hit_no_libraries: (\S+)/) {
      $percent_no_hits = $1;
      last;
    }

    # Only the library name and the four percentage columns are plotted
    my ($name, $percent_one_hit_one_library, $percent_multiple_hits_one_library, $percent_one_hit_multiple_libraries, $percent_multiple_hits_multiple_libraries) = (split(/\t/,$line))[0,5,7,9,11];

    push @{$data[0]},$name;
    push @{$data[1]},$percent_one_hit_one_library;
    push @{$data[2]},$percent_multiple_hits_one_library;
    push @{$data[3]},$percent_one_hit_multiple_libraries;
    push @{$data[4]},$percent_multiple_hits_multiple_libraries;
  }

  close $in_fh;

  #Add a separate bar for the reads that hit none of the libraries
  push @{$data[0]},'No hits';
  push @{$data[1]},$percent_no_hits;
  push @{$data[2]},'0';
  push @{$data[3]},'0';
  push @{$data[4]},'0';

  # Fixed canvas size; an earlier version sized it as 75 pixels per bar by
  # 350, which turned out to be too small
  my $graph = GD::Graph::bars->new(1024,768);

  $graph->set_legend('One hit\one library', 'Multiple hits\one library', 'One hit\multiple libraries','Multiple hits\multiple libraries');

  $graph -> set (
         y_label => '% Mapped',
         title => ((split(/\//,$outfile))[-1]),
         y_max_value => 100,
         bar_spacing => 15,
         transparent => 0,
         overwrite => 1,
         cumulate => 1,
         dclrs => [qw(lblue dblue lred dred)],
        );

  # plot returns a false value on failure; report the reason GD::Graph gives
  # rather than failing later with a method call on an undefined value
  my $gd = $graph -> plot (\@data) or die "Can't plot graph for '$file': ".$graph->error;

  $outfile .= ".png";

  open (my $img_fh,'>',$outfile) or die "Can't write graph to $outfile: $!";
  binmode $img_fh;
  print {$img_fh} $gd->png();
  close $img_fh or die "Can't write graph to $outfile: $!";
}



# Writes the reads that did not hit any library to '<report name>_no_hits.txt'.
#
# Arguments (three for single-end data, four for paired-end data):
#   $seq_genomes      - reference to the hash keyed by the names of the reads
#                       that had at least one hit
#   $file             - FastQ file that was searched
#   $file2            - (paired-end only) the mate file that was searched
#   $no_hits_filename - name of the report file; '_no_hits.txt' is appended
#
# Records are appended to the output file in FastQ format. Dies if an input
# file cannot be read or the output file cannot be written.
sub find_no_hits{
  my $seq_genomes;
  my $file;
  my $file2;
  my $no_hits_filename;
  my @filenames;

  #Determine if we have a paired read
  if(scalar @_ == 4){
    ($seq_genomes, $file, $file2, $no_hits_filename) = @_;
    @filenames = ($file, $file2);
  }else{
    ($seq_genomes, $file, $no_hits_filename) = @_;
    @filenames = ($file);
  }

  $no_hits_filename .= "_no_hits.txt";

  my $count = 0;    # 1 while reading the first file, 2 while reading the mate file

  foreach my $filename (@filenames){
    $count++;

    # Three-argument open: the names are used exactly as given
    open(my $in_fh, '<', $filename) or die "Can't read $filename : $!";

    open(my $out_fh, '>>', $no_hits_filename) or die "Could not write to $no_hits_filename: $!";

    while(my $header = <$in_fh>){
      next unless ($header =~ /^@/);

      my $seq_id = $header;
      chomp $seq_id;
      $seq_id = substr($seq_id,1); #Remove the '@' symbol to match Bowtie output

      # Sequence, separator and quality lines; a line missing because the
      # file ends early is simply left out
      my $rest_of_record = join('', grep { defined } map { scalar <$in_fh> } (1..3));

      #Read files containing paired-end reads will sometimes name the reads according to whether they are the #1 or #2 mates
      # by appending a /1 or /2 suffix to the read name. If no such suffix is present in Bowtie's input, the suffix will be added
      #when Bowtie prints read names in alignments.
      #Paired files without the /1, /2 convention will cause problems for fastq_screen, since the Bowtie output read name will not match
      #that in the input file.  The code below corrects for this.
      if($file2){
        my $mate = substr($seq_id, -2);
        unless(($mate eq '/1') or ($mate eq '/2')){
          $seq_id .= '/'.$count;
        }
      }

      unless(exists $$seq_genomes{$seq_id}){
        print {$out_fh} '@', $seq_id, "\n", $rest_of_record;
      }
    }
    close $in_fh;
    close $out_fh or die "Couldn't write to $no_hits_filename: $!";
  }
}



__DATA__

Fastq Screen - Screen sequences against a panel of databases

Synopsis

  fastq_screen [OPTION]... [FastQ FILE]...

Function

  Fastq Screen is intended to be used as part of a QC pipeline.
  It allows you to take a sequence dataset and search it
  against a set of bowtie databases.  It will then generate
  both a text and a graphical summary of the results to see if
  the sequence dataset contains the kind of sequences you expect
  or not.

Options

  --help -h      Print program help and exit

  --subset       Don't use the whole sequence file to search, but
                 create a temporary dataset of this size. The
                 dataset created will be approximately (within
                 a factor of 2) this size. If the real dataset
                 is smaller than twice the specified size then the
                 whole dataset will be used. Subsets will be taken
                 evenly from throughout the whole original dataset

  --paired       Files are paired end. Files must be specified in
                 the correct order with pairs of files coming
                 immediately after one another. Results files will
                 be named after the first file in the pair if the
                 names differ between the two files.

  --outdir       Specify a directory in which to save output files.
                 If no directory is specified then output files
                 are saved into the same directory as the input
                 file.

  --illumina1_3  Assume that the quality values are encoded in
                 Illumina v1.3 format. Defaults to Sanger format
                 if this flag is not specified

  --quiet        Suppress all progress reports on stderr and only
                 report errors

  --version      Print the program version and exit

  --threads      Specify across how many threads bowtie will be
                 allowed to run. Overrides the default value set
                 in the conf file

  --conf         Manually specify a location for the configuration
                 file to be used for this run. If not specified 
                 then the file will be taken from the same directory 
                 as the fastq_screen program

  --color        FastQ files are in colorspace. This requires that 
                 the libraries configured in the config file are 
                 colorspace indices.

  --bowtie       Specify extra parameters to be passed to bowtie. 
                 These parameters should be quoted to clearly 
                 delimit bowtie parameters from fastq_screen 
                 parameters. You should not try to use this option 
                 to override the normal search or reporting options 
                 for bowtie which are set automatically but it might 
                 be useful to allow reads to be trimmed before
                 alignment etc.

  --bowtie2      Specify extra parameters to be passed to bowtie 2. 
                 These parameters should be quoted to clearly 
                 delimit bowtie2 parameters from fastq_screen 
                 parameters. You should not try to use this option 
                 to override the normal search or reporting options 
                 for bowtie 2 which are set automatically but it might 
                 be useful to allow reads to be trimmed before
                 alignment etc.

  --nohits       Writes to a file the sequences that did not map to 
                 any of the specified genome libraries. If the 
                 subset option is also specified, only reads from 
                 the temporary dataset that failed to align to the
                 reference genomes will be written to the output file.

  --aligner      Specify the aligner to use for the mapping. Valid 
                 arguments are 'bowtie' or 'bowtie2'.
                 
