diff snpEff_2_1a/scripts/proteinFasta2NM.pl @ 0:f8eaa3f8194b default tip

Uploaded snpEff_v2_1a_core.tgz from Pablo Cingolani
author greg
date Fri, 20 Apr 2012 14:47:09 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/snpEff_2_1a/scripts/proteinFasta2NM.pl	Fri Apr 20 14:47:09 2012 -0400
@@ -0,0 +1,45 @@
+#!/usr/bin/perl
+
+#-------------------------------------------------------------------------------
+#
+# Convert fasta headers from protein ID (NP_XXXX) into transcript ID NM_XXXX
+#
+#
+#															Pablo Cingolani
+#-------------------------------------------------------------------------------
+
+# Parse command line arguments
+$refLink = $ARGV[0];
+die "Usage: cat file.fasta | ./proteinFasta2NM.pl refLink.txt > protein_NM.fasta" if $refLink eq '';
+
+# Read refLink file
+open RL, $refLink or die "Cannot opne file '$refLink'\n";
+while( <RL> ) {
+	chomp;
+	@t = split /\t/;
+	($nm, $np) = ($t[2], $t[3]);
+
+	if( $np ne '' ) {
+		if( $trId{$np} ne '' )	{ print STDERR "Error: Non empty entry '$np' = $trId{$np}\n"; }
+		else					{ $trId{$np} = $nm; }
+	}
+}
+
+# Read fasta file
+while( <STDIN> ) {
+	chomp;
+
+	if( /^>(.*)/ ) {	# Header? => change form protein NP_XXX to transcript NM_XXXX
+		# Get NM_ field
+		@t = split /\|/;
+		$np = $t[3];
+
+		# Remove anything after the dot
+		if( $np =~ /(.*)\./ )	{ $np = $1; }
+
+		# Found a transcript ID? 
+		if( $trId{$np} ne '' )	{ print ">$trId{$np}\n"; }
+		else					{ print "$l\n"; }
+	} else { print "$_\n"; }	# Show line
+}
+