Mercurial > repos > elixir-it > corgat_multifc
comparison multifasta/align.pl @ 0:3f6d4e4340e8 draft
Uploaded
| author | elixir-it |
|---|---|
| date | Fri, 30 Oct 2020 13:32:35 +0000 |
| parents | |
| children | 2a5485758ae7 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:3f6d4e4340e8 |
|---|---|
| 1 use strict; | |
| 2 use Cwd; | |
| 3 | |
| 4 my %arguments= | |
| 5 | |
| 6 ( | |
| 7 "--multi"=>"F", #F==FALSE, --multi <file> used to pass a multifasta input file | |
| 8 "--filelist"=>"F", #F==FALSE, --filelist <file> used to pass a file of file names. | |
| 9 "--suffix"=>"F", #F==FALSE, --suffix <value> specifies a file name suffix. All files with that suffix will be used | |
| 10 "--clean"=>"T", #BOLEAN: T: remove temporary directory of results. F: keep it. Defaule T | |
| 11 "--tmpdir"=>"align.tmp", #Name of the temporary directory. Defaults to align.tmp. | |
| 12 "--refile"=>"GCF_009858895.2_ASM985889v3_genomic.fna", #Name of the reference genome | |
| 13 #####OUTPUT file############################################# | |
| 14 "--out"=>"ALIGN_out.tsv" #file #OUTPUT #tabulare | |
| 15 ); | |
| 16 | |
| 17 my $dir=getcwd();#"/export/covid_wrapper/process_multifasta";#getcwd(); | |
| 18 | |
| 19 ############################################################ | |
| 20 #Process input arguments and check if valid | |
| 21 check_arguments(); | |
| 22 check_input_arg_valid(); | |
| 23 | |
| 24 ########################################################### | |
| 25 # download the ref genome. | |
| 26 my $refile=$arguments{"--refile"}; | |
| 27 unless (-e $refile) | |
| 28 { | |
| 29 download_ref(); | |
| 30 } | |
| 31 | |
| 32 ########################################################### | |
| 33 #create temporary dir for storing intermediate files | |
| 34 check_exists_command('mkdir') or die "$0 requires mkdir to create a temporary directory\n"; | |
| 35 check_exists_command('cp') or die "$0 requires cp to copy files to temporary directory\n"; | |
| 36 my $TGdir=$arguments{"--tmpdir"}; | |
| 37 if (-e $TGdir) | |
| 38 { | |
| 39 warn ("Temporary directory $TGdir does already exist!. Please be aware that all the alignment files contained in that directory will be incorporated in the output of CorGAT!\n"); | |
| 40 }else{ | |
| 41 system("mkdir $TGdir")==0||die ("can not create temporary directory $TGdir\n"); | |
| 42 } | |
| 43 | |
| 44 ########################################################### | |
| 45 # Compile the list of files to processed. | |
| 46 my @target_files=(); | |
| 47 if ($arguments{"--filelist"} ne "F") | |
| 48 { | |
| 49 # if filelist, read name. Copy files to tmpdir | |
| 50 my $lfile=$arguments{"--filelist"}; | |
| 51 open(IN,$lfile); | |
| 52 while(my $file=<IN>) | |
| 53 { | |
| 54 chomp($file); | |
| 55 push(@target_files,"$TGdir/$file"); | |
| 56 system("cp $file $TGdir")==0||die("could not copy file $file to $TGdir\n"); | |
| 57 } | |
| 58 }elsif ($arguments{"--suffix"} ne "F"){ | |
| 59 # if suffix. use all files in the present folder with suffix. Copy files to tmpdir | |
| 60 my $suffix=$arguments{"--suffix"}; | |
| 61 check_exists_command('cp') or die "$0 requires cp to copy files to temporary directory\n"; | |
| 62 system("cp *.$suffix $TGdir")==0||die "$! could not copy files with the .$suffix extension to the target dir $TGdir\n"; | |
| 63 @target_files=<$TGdir/*.$suffix>; | |
| 64 }elsif ($arguments{"--multi"} ne "F"){ | |
| 65 #if multifasta, split the files. Directly in the target folder. Compile arguments files. | |
| 66 my $multifile=$arguments{"--multi"}; | |
| 67 @target_files=@{split_fasta($multifile,$TGdir)}; | |
| 68 } | |
| 69 | |
| 70 ########################################################### | |
| 71 # Align | |
| 72 align(\@target_files,$TGdir); | |
| 73 my @alignments=<$TGdir/*_ref_qry.snps>; | |
| 74 my $out_file=$arguments{"--out"}; | |
| 75 | |
| 76 ############################################################ | |
| 77 # Consolidate the alignments and write output | |
| 78 consolidate(\@alignments,$out_file,$TGdir); | |
| 79 | |
| 80 | |
| 81 ############################################################ | |
| 82 # check if temp_files need to be removed | |
| 83 if ($arguments{"--clean"} eq "T") | |
| 84 { | |
| 85 print "--clean set to T=TRUE. I am going to delete the temporary file folder $TGdir\n"; | |
| 86 system ("rm -rf $TGdir")==0||warn("For some reason, the temporary directory $TGdir could not be removed. Please check\n"); | |
| 87 } | |
| 88 | |
| 89 | |
| 90 ###################################################################################################################################################################### | |
| 91 sub check_arguments | |
| 92 { | |
| 93 my @arguments=@ARGV; | |
| 94 for (my $i=0;$i<=$#ARGV;$i+=2) | |
| 95 { | |
| 96 my $act=$ARGV[$i]; | |
| 97 my $val=$ARGV[$i+1]; | |
| 98 if (exists $arguments{$act}) | |
| 99 { | |
| 100 $arguments{$act}=$val; | |
| 101 }else{ | |
| 102 warn("$act: unknown argument\n"); | |
| 103 my @valid=keys %arguments; | |
| 104 warn("Valid arguments are @valid\n"); | |
| 105 warn("All those moments will be lost in time, like tears in rain.\n Time to die!\n"); #HELP.txt | |
| 106 print_help(); | |
| 107 } | |
| 108 } | |
| 109 } | |
| 110 | |
| 111 | |
| 112 sub download_ref | |
| 113 { | |
| 114 print "Reference genome file, not in the current folder\n"; | |
| 115 print "CorGAT will try to Download the reference genome from Genbank\n"; | |
| 116 print "Please download this file manually, if this fails\n"; | |
| 117 check_exists_command('wget') or die "$0 requires wget to download the genome\nHit <<which wget>> on the terminal to check if you have wget\n"; | |
| 118 check_exists_command('gunzip') or die "$0 requires gunzip to unzip the genome\n"; | |
| 119 system("wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/009/858/895/GCF_009858895.2_ASM985889v3/GCF_009858895.2_ASM985889v3_genomic.fna.gz")==0||die("Could not retrieve the reference genome\n"); | |
| 120 system("gunzip GCF_009858895.2_ASM985889v3_genomic.fna.gz")==0 ||die("Could not unzip the reference genome"); | |
| 121 | |
| 122 } | |
| 123 | |
| 124 sub check_exists_command { | |
| 125 my $check = `sh -c 'command -v $_[0]'`; | |
| 126 return $check; | |
| 127 } | |
| 128 | |
| 129 sub check_input_arg_valid | |
| 130 { | |
| 131 if ($arguments{"--filelist"} eq "F" && $arguments{"--suffix"} eq "F" && $arguments{"--multi"} eq "F") | |
| 132 { | |
| 133 print_help(); | |
| 134 die("No valid input mode provided. One of --filelist, --suffix or --multi needs to be provided. You set none!"); | |
| 135 } | |
| 136 unless ($arguments{"--clean"} eq "T" || $arguments{"--clean"} eq "F") | |
| 137 { | |
| 138 print_help(); | |
| 139 die("invalid value for --clean, valid options are either T or F\n"); | |
| 140 } | |
| 141 if ($arguments{"--multi"} ne "F") | |
| 142 { | |
| 143 if ($arguments{"--filelist"} ne "F" || $arguments{"--suffix"} ne "F") | |
| 144 { | |
| 145 print_help(); | |
| 146 print "Invalid options provided: --multi --filelist and --suffix are mutually exclusive\nOnly one can be T. You provided: multi->". $arguments{"--multi"} . "\tfilelist->". $arguments{"--filelist"}. "\tsuffix->". $arguments{"--suffix"}. "\n"; | |
| 147 die ("Please check and revise\n"); | |
| 148 } | |
| 149 }elsif ($arguments{"--filelist"} ne "F" && $arguments{"--suffix"} ne "F"){ | |
| 150 print_help(); | |
| 151 print "Invalid options provided: --multi --filelist and --suffix are mutually exclusive\nOnly one can be T. You provided: multi->". $arguments{"--multi"} . "\tfilelist->". $arguments{"--filelist"}. "\tsuffix->". $arguments{"--suffix"}. "\n"; | |
| 152 die ("Please check and revise\n"); | |
| 153 } | |
| 154 } | |
| 155 | |
| 156 sub split_fasta | |
| 157 { | |
| 158 my $multiF=$_[0]; | |
| 159 die("multifasta input file does not exist $multiF\n") unless -e $multiF; | |
| 160 my $tgdir=$_[1]; | |
| 161 my @list_files=(); | |
| 162 open(IN,$multiF); | |
| 163 while(<IN>) | |
| 164 { | |
| 165 if ($_=~/^>(.*)/) | |
| 166 { | |
| 167 my $id=$1; | |
| 168 $id=(split(/\s+/,$id))[0]; | |
| 169 $id=~s/\-//g; | |
| 170 open(OUT,">$tgdir/$id.fasta"); | |
| 171 print OUT ">$id\n"; | |
| 172 push(@list_files,"$tgdir/$id.fasta"); | |
| 173 }else{ | |
| 174 chomp(); | |
| 175 print OUT; | |
| 176 } | |
| 177 } | |
| 178 return(\@list_files); | |
| 179 } | |
| 180 | |
| 181 sub align | |
| 182 { | |
| 183 my @target_files=@{$_[0]}; | |
| 184 my $TGdir=$_[1]; | |
| 185 die("Target directory does not exist\n") unless -e $TGdir; | |
| 186 check_exists_command('nucmer') or die "$0 requires nucmer to align genomes. Please check that nucmer is installed and can be executed. Hit <<which nucmer>> on\n your terminal to understand if the program is correctly installed"; | |
| 187 check_exists_command('show-snps') or die "$0 requires show-snps from the mummer package to compute polymorphic sites. Please check that show-snps is installed and can be executed. Hit <<which show-snps>> on\n your terminal to understand if the program is correctly installed"; | |
| 188 foreach my $tg (@target_files) | |
| 189 { | |
| 190 my $name=$tg; | |
| 191 chomp($name); | |
| 192 $name=~s/\.fasta//; | |
| 193 $name=~s/\.fna//; | |
| 194 $name=~s/\.fa//; | |
| 195 if (-e "$TGdir/$name\_ref_qry.snps") | |
| 196 { | |
| 197 print "output file $name\_ref_qry.snps already in folder. Alignment skipped\n" | |
| 198 }else{ | |
| 199 system("nucmer --prefix=ref_qry $refile $tg")==0||die("no nucmer alignment\n"); | |
| 200 system("show-snps -Clr ref_qry.delta > $name\_ref_qry.snps")==0||warn("no nucmer snps $tg\n"); | |
| 201 } | |
| 202 } | |
| 203 } | |
| 204 | |
| 205 sub consolidate | |
| 206 { | |
| 207 my @files=@{$_[0]}; | |
| 208 my $out_file=$_[1]; | |
| 209 my $dir_prefix=$_[2];; | |
| 210 my @genomes=(); | |
| 211 my %dat_final=(); | |
| 212 foreach my $f (@files) | |
| 213 { | |
| 214 my $name=$f; | |
| 215 $name=~s/_ref_qry.snps//; | |
| 216 $name=~s/$dir_prefix\///; | |
| 217 push(@genomes,$name); | |
| 218 open(IN,$f); | |
| 219 my %ldata=(); | |
| 220 while(<IN>) | |
| 221 { | |
| 222 next unless $_=~/NC_045512.2/; | |
| 223 my ($pos,$b1,$b2)=(split(/\s+/,$_))[1,2,3]; | |
| 224 $ldata{$pos}=[$b1,$b2]; | |
| 225 } | |
| 226 my $prev_pos=0; | |
| 227 my $pos_append=0; | |
| 228 my $prev_ref="na"; | |
| 229 my $prev_alt="na"; | |
| 230 foreach my $pos (sort{$a<=>$b} keys %ldata) | |
| 231 { | |
| 232 my $dist=$pos-$prev_pos; | |
| 233 if ($dist>1) | |
| 234 { | |
| 235 $pos_append=$prev_pos-length($prev_alt)+1; | |
| 236 $dat_final{"$pos_append\_$prev_ref|$prev_alt"}{$name}=1 unless $prev_ref eq "na"; | |
| 237 $prev_ref=$ldata{$pos}[0]; | |
| 238 $prev_alt=$ldata{$pos}[1]; | |
| 239 }else{ | |
| 240 $prev_ref.=$ldata{$pos}[0]; | |
| 241 $prev_alt.=$ldata{$pos}[1]; | |
| 242 } | |
| 243 $prev_pos=$pos; | |
| 244 } | |
| 245 $pos_append=$prev_pos-length($prev_alt)+1; | |
| 246 $dat_final{"$pos_append\_$prev_ref|$prev_alt"}{$name}=1 if $prev_ref ne "na"; | |
| 247 } | |
| 248 open(OUT,">$out_file"); | |
| 249 my $TOT=$#genomes+1; | |
| 250 my %AF=(); | |
| 251 print OUT " @genomes\n"; | |
| 252 foreach my $pos (sort{$a<=>$b} keys %dat_final) | |
| 253 { | |
| 254 my $line="$pos "; | |
| 255 my $sum=0; | |
| 256 foreach my $g (@genomes) | |
| 257 { | |
| 258 my $val=$dat_final{$pos}{$g} ? 1 : 0; | |
| 259 $sum+=$val; | |
| 260 $line.="$val "; | |
| 261 } | |
| 262 chop($line); | |
| 263 print OUT "$line\n"; | |
| 264 } | |
| 265 close(OUT); | |
| 266 } | |
| 267 | |
| 268 | |
| 269 sub print_help | |
| 270 { | |
| 271 print "This utility can be used to 1) download the reference SARS-CoV-2 genome from Genbank and 2) align it with a collection\n"; | |
| 272 print "of SARS-CoV-2 genomes. And finally 3)Call/identify genomic variants. On any *nix based system the script should be\n"; | |
| 273 print "completely capable to download the reference genome by itself. Please download the genome yourself if this fails.\n"; | |
| 274 print "Input genomes, to be aligned to the reference, can be provided by means of 3 mutually exclusive (as in only one should be set)\n"; | |
| 275 print "parameters:\n"; | |
| 276 print "##INPUT PARAMETERS\n\n"; | |
| 277 print "--multi <<filename>>\tprovides a multifasta of genome sequences\n"; | |
| 278 print "--suffix <<text>>\tspecifies an extension. All the files with that extension in the current folder will be uses\n"; | |
| 279 print "--listfile <<filename>>\tspecifies a file containing a list of file names. All files need to be in the current folder\n"; | |
| 280 print "\nTo run the program you MUST provide one of the above options. Please notice that for --suffix and --listfile ,all\n"; | |
| 281 print "files need to be in the current folder.\n\nAdditional (not strictly required) options are as follows:\n\n"; | |
| 282 print "--tmpdir <<name>>\tname of a temporary directory. All intermediate files are saved there. Defaults to align.tmp\n"; | |
| 283 print "--clean <<T/F>>\t\tif T, tmpdir is delete. Otherwise it is not.\n"; | |
| 284 print "--refile <<file>>\tname of the reference genome file. Defaults to the name of the reference assembly of the SARS-CoV-2 genome\n"; | |
| 285 print " in Genbank. Do not change uless you have a very valid reason\n"; | |
| 286 print "\n##OUTPUT PARAMETERS\n\n"; | |
| 287 print "--out <<name>>\tName of the output file. Defaults to ALIGN_out.tsv\n"; | |
| 288 print "\n##EXAMPLES:\n\n"; | |
| 289 print "1# input is multi-fasta (apollo.fa):\nperl align.pl --multi apollo.fa\n\n"; | |
| 290 print "2# use all .fasta files from the current folder:\nperl align.pl --suffix fasta\n\n"; | |
| 291 print "3# use a file of file names (lfile):\n perl align.pl --filelist lfile\n\n"; | |
| 292 } |
