Repository 'plant_tribes_gene_family_scaffold_updater'
hg clone https://eddie.galaxyproject.org/repos/greg/plant_tribes_gene_family_scaffold_updater

Changeset 19:813f7ef1b1b5 (2018-06-14)
Previous changeset 18:f7bfe1fbd543 (2018-06-14) Next changeset 20:72aa8e0ad523 (2018-06-18)
Commit message:
Uploaded
modified:
gene_family_scaffold_updater.pl
b
diff -r f7bfe1fbd543 -r 813f7ef1b1b5 gene_family_scaffold_updater.pl
--- a/gene_family_scaffold_updater.pl Thu Jun 14 14:32:13 2018 -0400
+++ b/gene_family_scaffold_updater.pl Thu Jun 14 14:32:21 2018 -0400
[
b'@@ -2,7 +2,7 @@\n # Author: Eric Wafula\n # Email: ekw10@psu.edu\n # Institution: Penn State University, Biology Dept, Claude dePamphilis Lab\n-# Date: 01-31-2015\n+# Date: June 2018\n \n use strict;\n use warnings;\n@@ -24,30 +24,32 @@\n #  Required Options:\n #\n #\n-#  --database_connection_string <string>     : Postgres database connection string using format\n-#                                              postgresql://<user>:<password>@<host>/<database name>\n+#  --database_connection_string <string>    : Postgres database connection string using format\n+#                                             postgresql://<user>:<password>@<host>/<database name>\n #\n-#  --proteins <string>                       : Amino acids (proteins) sequences fasta file (proteins.fasta)\n-#                                              This can either be an absolute path or just the file name\n+#  --proteins <string>                      : Amino acids (proteins) sequences fasta file (proteins.fasta)\n+#                                             This can either be an absolute path or just the file name\n #\n-#  --coding_sequences <string>               : Corresponding coding sequences (CDS) fasta file (cds.fasta)\n+#  --coding_sequences <string>              : Corresponding coding sequences (CDS) fasta file (cds.fasta)\n #\n-#  --scaffold <string>                       : Orthogroups or gene families proteins scaffold.  This can either be an absolute\n-#                                              path to the directory containing the scaffolds (e.g., /home/scaffolds/22Gv1.1)\n-#                                              or just the scaffold (e.g., 22Gv1.1).  If the latter, ~home/data is prepended to\n-#                                              the scaffold to create the absolute path.\n-#                                              the scaffold to create the absolute path.\n-#                                              If Monocots clusters (version 1.0): 12Gv1.0\n-#                                              If Angiosperms clusters (version 1.0): 22Gv1.0\n-#                                              If Angiosperms clusters (version 1.1): 22Gv1.1\n-#                                              If Green plants clusters (version 1.0): 31Gv1.0\n-#                                              If Other non PlantTribes clusters: XGvY.Z, where "X" is the number species in the scaffold,\n-#                                              and "Y.Z" version number such as 12Gv1.0. Please look at one of the PlantTribes scaffold\n-#                                              data on how data files and directories are named, formated, and organized.\n+#  --scaffold <string>                      : Orthogroups or gene families proteins scaffold.  This can either be an absolute\n+#                                             path to the directory containing the scaffolds (e.g., /home/scaffolds/22Gv1.1)\n+#                                             or just the scaffold (e.g., 22Gv1.1).  If the latter, ~home/data is prepended to\n+#                                             the scaffold to create the absolute path.\n+#                                             the scaffold to create the absolute path.\n+#                                             If Monocots clusters (version 1.0): 12Gv1.0\n+#                                             If Angiosperms clusters (version 1.0): 22Gv1.0\n+#                                             If Angiosperms clusters (version 1.1): 22Gv1.1\n+#                                             If Green plants clusters (version 1.0): 31Gv1.0\n+#                                             If Other non PlantTribes clusters: XGvY.Z, where "X" is the number species in the scaffold,\n+#                                             and "Y.Z" version number such as 12Gv1.0. Please look at one of the PlantTribes scaffold\n+#                                               data on how data files and directories are named, formated, and organized.\n #\n #\n #  --species_n'..b'tabase table\\n\\n";\n-    open(IN, "$dirname/gene_scaffold_orthogroup_taxon_association.tsv") or die "can\'t open $dirname/gene_scaffold_orthogroup_taxon_association.tsv file\\n";\n+    log_msg("Inserting  records into the gene_scaffold_orthogroup_taxon_association database table.");\n+    open(IN, "$gsot_association_prep_file") or die "Can\'t open $gsot_association_prep_file file\\n";\n+    $num_recs = 0;\n+    my ( $stmt, $sth, $rv, $scaffold_id, $clustering_method, $orthogroup_id, $taxon_id, $gene_id );\n+    my ( $gene_id_db, $scaffold_id_db, $orthogroup_id_db, $taxon_id_db );\n     while(<IN>){\n         chomp;\n-        if (/^gene_id/){ next; }\n+        if (/^gene_id/) {\n+            # gene_id scaffold_id clustering_method orthogroup_id species_name\n+            next;\n+        }\n         my @fields = split(/\\t/, $_);\n-        my ( $stmt, $sth, $rv, $scaffold_id, $orthogroup_id, $taxon_id, $gene_id );\n-        $stmt = qq(SELECT id FROM plant_tribes_scaffold WHERE scaffold_id = \'$fields[1]\' AND clustering_method = \'$fields[2]\';);\n+        # gnl_Fakge_v1.0_AT1G03390.1 22Gv1.1 orthomcl 3 Fake genome\n+        $gene_id = $fields[0];\n+        $scaffold_id = $fields[1];\n+        $clustering_method = $fields[2];\n+        $orthogroup_id = $fields[3];\n+        $species_name = $fields[4];\n+        $stmt = qq(SELECT id FROM plant_tribes_scaffold WHERE scaffold_id = \'$scaffold_id\' AND clustering_method = \'$clustering_method\';);\n         $sth = $dbh->prepare( $stmt );\n         $rv = $sth->execute() or die $DBI::errstr;\n         if ($rv < 0) { print $DBI::errstr; }\n         while (my @row = $sth->fetchrow_array()) {\n-            $scaffold_id = $row[0];\n+            $scaffold_id_db = $row[0];\n         }\n-        $stmt = qq(SELECT id FROM plant_tribes_orthogroup WHERE orthogroup_id = $fields[3] AND scaffold_id = \'$scaffold_id\';);\n+        $stmt = qq(SELECT id FROM plant_tribes_orthogroup WHERE orthogroup_id = \'$orthogroup_id\' AND scaffold_id = \'$scaffold_id_db\';);\n         $sth = $dbh->prepare( $stmt );\n         $rv = $sth->execute() or die $DBI::errstr;\n         if ($rv < 0) { print $DBI::errstr; }\n         while (my @row = $sth->fetchrow_array()) {\n-            $orthogroup_id = $row[0];\n+            $orthogroup_id_db = $row[0];\n         }\n-        $stmt = qq(SELECT id FROM plant_tribes_taxon WHERE species_name = \'$species_name\' AND scaffold_id = \'$scaffold_id\';);\n+        $stmt = qq(SELECT id FROM plant_tribes_taxon WHERE species_name = \'$species_name\' AND scaffold_id = \'$scaffold_id_db\';);\n         $sth = $dbh->prepare( $stmt );\n         $rv = $sth->execute() or die $DBI::errstr;\n         if ($rv < 0) { print $DBI::errstr; }\n         while (my @row = $sth->fetchrow_array()) {\n-            $taxon_id = $row[0];\n+            $taxon_id_db = $row[0];\n         }\n-        $stmt = qq(SELECT id FROM plant_tribes_gene WHERE gene_id = \'$fields[0]\' );\n+        $stmt = qq(SELECT id FROM plant_tribes_gene WHERE gene_id = \'$gene_id\' );\n         $sth = $dbh->prepare( $stmt );\n         $rv = $sth->execute() or die $DBI::errstr;\n         if ($rv < 0) { print $DBI::errstr; }\n         while (my @row = $sth->fetchrow_array()) {\n-            $gene_id = $row[0];\n+            $gene_id_db = $row[0];\n         }\n-        $stmt = qq(INSERT INTO gene_scaffold_orthogroup_taxon_association (gene_id, scaffold_id, orthogroup_id, taxon_id) VALUES ($gene_id, $scaffold_id, $orthogroup_id, $taxon_id));\n+        $stmt = qq(INSERT INTO gene_scaffold_orthogroup_taxon_association (gene_id, scaffold_id, orthogroup_id, taxon_id) VALUES ($gene_id_db, $scaffold_id_db, $orthogroup_id_db, $taxon_id_db));\n         $rv = $dbh->do($stmt) or die $DBI::errstr;\n-\n+        $num_recs = $num_recs + 1;\n     }\n     close IN;\n-    print "$species_name $scaffold records successfully created in the data association table\\n\\n";\n+    log_msg("$num_recs records for $scaffold $clustering_method were successfully inserted into the gene_scaffold_orthogroup_taxon_association table.");\n     $dbh->disconnect();\n }\n-\n'