Previous changeset 18:f7bfe1fbd543 (2018-06-14) Next changeset 20:72aa8e0ad523 (2018-06-18) |
Commit message:
Uploaded |
modified:
gene_family_scaffold_updater.pl |
b |
diff -r f7bfe1fbd543 -r 813f7ef1b1b5 gene_family_scaffold_updater.pl --- a/gene_family_scaffold_updater.pl Thu Jun 14 14:32:13 2018 -0400 +++ b/gene_family_scaffold_updater.pl Thu Jun 14 14:32:21 2018 -0400 |
[ |
b'@@ -2,7 +2,7 @@\n # Author: Eric Wafula\n # Email: ekw10@psu.edu\n # Institution: Penn State University, Biology Dept, Claude dePamphilis Lab\n-# Date: 01-31-2015\n+# Date: June 2018\n \n use strict;\n use warnings;\n@@ -24,30 +24,32 @@\n # Required Options:\n #\n #\n-# --database_connection_string <string> : Postgres database connection string using format\n-# postgresql://<user>:<password>@<host>/<database name>\n+# --database_connection_string <string> : Postgres database connection string using format\n+# postgresql://<user>:<password>@<host>/<database name>\n #\n-# --proteins <string> : Amino acids (proteins) sequences fasta file (proteins.fasta)\n-# This can either be an absolute path or just the file name\n+# --proteins <string> : Amino acids (proteins) sequences fasta file (proteins.fasta)\n+# This can either be an absolute path or just the file name\n #\n-# --coding_sequences <string> : Corresponding coding sequences (CDS) fasta file (cds.fasta)\n+# --coding_sequences <string> : Corresponding coding sequences (CDS) fasta file (cds.fasta)\n #\n-# --scaffold <string> : Orthogroups or gene families proteins scaffold. This can either be an absolute\n-# path to the directory containing the scaffolds (e.g., /home/scaffolds/22Gv1.1)\n-# or just the scaffold (e.g., 22Gv1.1). If the latter, ~home/data is prepended to\n-# the scaffold to create the absolute path.\n-# the scaffold to create the absolute path.\n-# If Monocots clusters (version 1.0): 12Gv1.0\n-# If Angiosperms clusters (version 1.0): 22Gv1.0\n-# If Angiosperms clusters (version 1.1): 22Gv1.1\n-# If Green plants clusters (version 1.0): 31Gv1.0\n-# If Other non PlantTribes clusters: XGvY.Z, where "X" is the number species in the scaffold,\n-# and "Y.Z" version number such as 12Gv1.0. Please look at one of the PlantTribes scaffold\n-# data on how data files and directories are named, formated, and organized.\n+# --scaffold <string> : Orthogroups or gene families proteins scaffold. This can either be an absolute\n+# path to the directory containing the scaffolds (e.g., /home/scaffolds/22Gv1.1)\n+# or just the scaffold (e.g., 22Gv1.1). If the latter, ~home/data is prepended to\n+# the scaffold to create the absolute path.\n+# the scaffold to create the absolute path.\n+# If Monocots clusters (version 1.0): 12Gv1.0\n+# If Angiosperms clusters (version 1.0): 22Gv1.0\n+# If Angiosperms clusters (version 1.1): 22Gv1.1\n+# If Green plants clusters (version 1.0): 31Gv1.0\n+# If Other non PlantTribes clusters: XGvY.Z, where "X" is the number species in the scaffold,\n+# and "Y.Z" version number such as 12Gv1.0. Please look at one of the PlantTribes scaffold\n+# data on how data files and directories are named, formated, and organized.\n #\n #\n # --species_n'..b'tabase table\\n\\n";\n- open(IN, "$dirname/gene_scaffold_orthogroup_taxon_association.tsv") or die "can\'t open $dirname/gene_scaffold_orthogroup_taxon_association.tsv file\\n";\n+ log_msg("Inserting records into the gene_scaffold_orthogroup_taxon_association database table.");\n+ open(IN, "$gsot_association_prep_file") or die "Can\'t open $gsot_association_prep_file file\\n";\n+ $num_recs = 0;\n+ my ( $stmt, $sth, $rv, $scaffold_id, $clustering_method, $orthogroup_id, $taxon_id, $gene_id );\n+ my ( $gene_id_db, $scaffold_id_db, $orthogroup_id_db, $taxon_id_db );\n while(<IN>){\n chomp;\n- if (/^gene_id/){ next; }\n+ if (/^gene_id/) {\n+ # gene_id scaffold_id clustering_method orthogroup_id species_name\n+ next;\n+ }\n my @fields = split(/\\t/, $_);\n- my ( $stmt, $sth, $rv, $scaffold_id, $orthogroup_id, $taxon_id, $gene_id );\n- $stmt = qq(SELECT id FROM plant_tribes_scaffold WHERE scaffold_id = \'$fields[1]\' AND clustering_method = \'$fields[2]\';);\n+ # gnl_Fakge_v1.0_AT1G03390.1 22Gv1.1 orthomcl 3 Fake genome\n+ $gene_id = $fields[0];\n+ $scaffold_id = $fields[1];\n+ $clustering_method = $fields[2];\n+ $orthogroup_id = $fields[3];\n+ $species_name = $fields[4];\n+ $stmt = qq(SELECT id FROM plant_tribes_scaffold WHERE scaffold_id = \'$scaffold_id\' AND clustering_method = \'$clustering_method\';);\n $sth = $dbh->prepare( $stmt );\n $rv = $sth->execute() or die $DBI::errstr;\n if ($rv < 0) { print $DBI::errstr; }\n while (my @row = $sth->fetchrow_array()) {\n- $scaffold_id = $row[0];\n+ $scaffold_id_db = $row[0];\n }\n- $stmt = qq(SELECT id FROM plant_tribes_orthogroup WHERE orthogroup_id = $fields[3] AND scaffold_id = \'$scaffold_id\';);\n+ $stmt = qq(SELECT id FROM plant_tribes_orthogroup WHERE orthogroup_id = \'$orthogroup_id\' AND scaffold_id = \'$scaffold_id_db\';);\n $sth = $dbh->prepare( $stmt );\n $rv = $sth->execute() or die $DBI::errstr;\n if ($rv < 0) { print $DBI::errstr; }\n while (my @row = $sth->fetchrow_array()) {\n- $orthogroup_id = $row[0];\n+ $orthogroup_id_db = $row[0];\n }\n- $stmt = qq(SELECT id FROM plant_tribes_taxon WHERE species_name = \'$species_name\' AND scaffold_id = \'$scaffold_id\';);\n+ $stmt = qq(SELECT id FROM plant_tribes_taxon WHERE species_name = \'$species_name\' AND scaffold_id = \'$scaffold_id_db\';);\n $sth = $dbh->prepare( $stmt );\n $rv = $sth->execute() or die $DBI::errstr;\n if ($rv < 0) { print $DBI::errstr; }\n while (my @row = $sth->fetchrow_array()) {\n- $taxon_id = $row[0];\n+ $taxon_id_db = $row[0];\n }\n- $stmt = qq(SELECT id FROM plant_tribes_gene WHERE gene_id = \'$fields[0]\' );\n+ $stmt = qq(SELECT id FROM plant_tribes_gene WHERE gene_id = \'$gene_id\' );\n $sth = $dbh->prepare( $stmt );\n $rv = $sth->execute() or die $DBI::errstr;\n if ($rv < 0) { print $DBI::errstr; }\n while (my @row = $sth->fetchrow_array()) {\n- $gene_id = $row[0];\n+ $gene_id_db = $row[0];\n }\n- $stmt = qq(INSERT INTO gene_scaffold_orthogroup_taxon_association (gene_id, scaffold_id, orthogroup_id, taxon_id) VALUES ($gene_id, $scaffold_id, $orthogroup_id, $taxon_id));\n+ $stmt = qq(INSERT INTO gene_scaffold_orthogroup_taxon_association (gene_id, scaffold_id, orthogroup_id, taxon_id) VALUES ($gene_id_db, $scaffold_id_db, $orthogroup_id_db, $taxon_id_db));\n $rv = $dbh->do($stmt) or die $DBI::errstr;\n-\n+ $num_recs = $num_recs + 1;\n }\n close IN;\n- print "$species_name $scaffold records successfully created in the data association table\\n\\n";\n+ log_msg("$num_recs records for $scaffold $clustering_method were successfully inserted into the gene_scaffold_orthogroup_taxon_association table.");\n $dbh->disconnect();\n }\n-\n' |