# HG changeset patch
# User iuc
# Date 1672928851 0
# Node ID 341bcf4d4fcd77d8e7f8fa45c5dc4fe693272c3e
# Parent 6052fcc0d1136fd4f0d970b8e35a5453163d34f2
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit c3685ed6a70b47012b62b95a2a3db062bd3b7475
diff -r 6052fcc0d113 -r 341bcf4d4fcd macros.xml
--- a/macros.xml Wed Nov 09 12:18:36 2022 +0000
+++ b/macros.xml Thu Jan 05 14:27:31 2023 +0000
@@ -1,6 +1,6 @@
1.0.0
- 0
+ 2
numpy
@@ -14,7 +14,7 @@
- 10.1038/s41467-019-12528-4
+ 10.3389/fbinf.2022.867111
-
\ No newline at end of file
+
diff -r 6052fcc0d113 -r 341bcf4d4fcd predict.py
--- a/predict.py Wed Nov 09 12:18:36 2022 +0000
+++ b/predict.py Thu Jan 05 14:27:31 2023 +0000
@@ -9,7 +9,7 @@
import pandas as pd
from Bio import SeqIO
from joblib import load
-from models import model_5, model_7
+from models import model_10, model_5, model_7
from utils import preprocess as pp
os.environ["CUDA_VISIBLE_DEVICES"] = ""
@@ -18,7 +18,7 @@
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
-def predict_nn(ds_path, nn_weights_path, length, batch_size=256):
+def predict_nn(ds_path, nn_weights_path, length, use_10, batch_size=256):
"""
Breaks down contigs into fragments
and uses pretrained neural networks to give predictions for fragments
@@ -37,10 +37,14 @@
"pred_plant_7": [],
"pred_vir_7": [],
"pred_bact_7": [],
- # "pred_plant_10": [],
- # "pred_vir_10": [],
- # "pred_bact_10": [],
}
+ if use_10:
+ out_table_ = {
+ "pred_plant_10": [],
+ "pred_vir_10": [],
+ "pred_bact_10": [],
+ }
+ out_table.update(out_table_)
if not seqs_:
raise ValueError("All sequences were smaller than length of the model")
test_fragments = []
@@ -56,24 +60,32 @@
out_table["fragment"].append(j)
test_encoded = pp.one_hot_encode(test_fragments)
test_encoded_rc = pp.one_hot_encode(test_fragments_rc)
- # for model, s in zip([model_5.model(length), model_7.model(length), model_10.model(length)], [5, 7, 10]):
- for model, s in zip([model_5.model(length), model_7.model(length)], [5, 7]):
+ if use_10:
+ zipped_models = zip([model_5.model(length), model_7.model(length), model_10.model(length)], [5, 7, 10])
+ else:
+ zipped_models = zip([model_5.model(length), model_7.model(length)], [5, 7])
+ for model, s in zipped_models:
model.load_weights(Path(nn_weights_path, f"model_{s}_{length}.h5"))
prediction = model.predict([test_encoded, test_encoded_rc], batch_size)
out_table[f"pred_plant_{s}"].extend(list(prediction[..., 0]))
out_table[f"pred_vir_{s}"].extend(list(prediction[..., 1]))
out_table[f"pred_bact_{s}"].extend(list(prediction[..., 2]))
+
return pd.DataFrame(out_table)
-def predict_rf(df, rf_weights_path, length):
+def predict_rf(df, rf_weights_path, length, use_10):
"""
Using predictions by predict_nn and weights of a trained RF classifier gives a single prediction for a fragment
"""
clf = load(Path(rf_weights_path, f"RF_{length}.joblib"))
- X = df[["pred_plant_5", "pred_vir_5", "pred_plant_7", "pred_vir_7"]]
- # X = ["pred_plant_5", "pred_vir_5", "pred_plant_7", "pred_vir_7", "pred_plant_10", "pred_vir_10", ]]
+ if use_10:
+ X = df[
+ ["pred_plant_5", "pred_vir_5", "pred_plant_7", "pred_vir_7", "pred_plant_10", "pred_vir_10", ]]
+ else:
+ X = df[
+ ["pred_plant_5", "pred_vir_5", "pred_plant_7", "pred_vir_7", ]]
y_pred = clf.predict(X)
mapping = {0: "plant", 1: "virus", 2: "bacteria"}
df["RF_decision"] = np.vectorize(mapping.get)(y_pred)
@@ -89,12 +101,10 @@
Based on predictions of predict_rf for fragments gives a final prediction for the whole contig
"""
df = (
- df.groupby(["id", "length", 'RF_decision'], sort=False)
- .size()
- .unstack(fill_value=0)
+ df.groupby(["id", "length", 'RF_decision'], sort=False).size().unstack(fill_value=0)
)
df = df.reset_index()
- df = df.reindex(['length', 'id', 'virus', 'plant', 'bacteria'], axis=1)
+ df = df.reindex(['length', 'id', 'virus', 'plant', 'bacteria'], axis=1).fillna(value=0)
conditions = [
(df['virus'] > df['plant']) & (df['virus'] > df['bacteria']),
(df['plant'] > df['virus']) & (df['plant'] > df['bacteria']),
@@ -131,7 +141,7 @@
assert Path(weights).exists(), f'{weights} does not exist'
assert isinstance(limit, int), 'limit should be an integer'
Path(out_path).mkdir(parents=True, exist_ok=True)
-
+ use_10 = Path(weights, 'model_10_500.h5').exists()
for ts in test_ds:
dfs_fr = []
dfs_cont = []
@@ -141,12 +151,14 @@
ds_path=ts,
nn_weights_path=weights,
length=l_,
+ use_10=use_10
)
print(df)
df = predict_rf(
df=df,
rf_weights_path=weights,
length=l_,
+ use_10=use_10
)
df = df.round(3)
dfs_fr.append(df)
@@ -178,7 +190,7 @@
parser.add_argument("--weights", help="path to the folder containing weights for NN and RF modules trained on 500 and 1000 fragment lengths (str)")
parser.add_argument("--out_path", help="path to the folder to store predictions (str)")
parser.add_argument("--return_viral", help="whether to return contigs annotated as viral in separate fasta file (True/False)")
- parser.add_argument("--limit", help="Do predictions only for contigs > l. We suggest l=750. (int)", type=int)
+ parser.add_argument("--limit", help="Do predictions only for contigs > l. We suggest l=750. (int)", type=int, default=750)
args = parser.parse_args()
if args.test_ds:
diff -r 6052fcc0d113 -r 341bcf4d4fcd tool-data/virhunter.loc.sample
--- a/tool-data/virhunter.loc.sample Wed Nov 09 12:18:36 2022 +0000
+++ b/tool-data/virhunter.loc.sample Thu Jan 05 14:27:31 2023 +0000
@@ -1,29 +1,27 @@
#This is a sample file distributed with Galaxy that enables tools
-#to use a directory of Samtools indexed sequences data files. You will need
-#to create these data files and then create a fasta_indexes.loc file
+#to use a directory of virhunter hdf5 model files. You will need
+#to create these data files and then create a virhunter.loc file
#similar to this one (store it in this directory) that points to
-#the directories in which those files are stored. The fasta_indexes.loc
+#the directories in which those files are stored. The virhunter.loc
#file has this format (white space characters are TAB characters):
#
-#
+#<value> <name> <path>
#
-#So, for example, if you had hg19 Canonical indexed stored in
+#So, for example, if you had peach hdf5 model files stored in
#
-# /depot/data2/galaxy/hg19/sam/,
+# /tool-data/weights/peach/,
#
-#then the fasta_indexes.loc entry would look like this:
+#then the virhunter.loc entry would look like this:
#
-#hg19canon hg19 Human (Homo sapiens): hg19 Canonical /depot/data2/galaxy/hg19/sam/hg19canon.fa
+#peach peach /tool-data/weights/peach
#
-#and your /depot/data2/galaxy/hg19/sam/ directory
-#would contain hg19canon.fa and hg19canon.fa.fai files.
+#and your /tool-data/weights/peach/ directory
+#would contain model_5_500.h5, model_7_500.h5, model_10_500.h5 and model_5_1000.h5, model_7_1000.h5, model_10_1000.h5 files.
#
-#Your fasta_indexes.loc file should include an entry per line for
+#Your virhunter.loc file should include an entry per line for
#each index set you have stored. The file in the path does actually
#exist, but it should never be directly used. Instead, the name serves
#as a prefix for the index file. For example:
#
-#hg18canon hg18 Human (Homo sapiens): hg18 Canonical /depot/data2/galaxy/hg18/sam/hg18canon.fa
-#hg18full hg18 Human (Homo sapiens): hg18 Full /depot/data2/galaxy/hg18/sam/hg18full.fa
-#hg19canon hg19 Human (Homo sapiens): hg19 Canonical /depot/data2/galaxy/hg19/sam/hg19canon.fa
-#hg19full hg19 Human (Homo sapiens): hg19 Full /depot/data2/galaxy/hg19/sam/hg19full.fa
\ No newline at end of file
+#peach peach /data/databases/path/weights/peach
+#grapevine grapevine /data/databases/path/weights/grapevine
diff -r 6052fcc0d113 -r 341bcf4d4fcd virhunter.xml
--- a/virhunter.xml Wed Nov 09 12:18:36 2022 +0000
+++ b/virhunter.xml Thu Jan 05 14:27:31 2023 +0000
@@ -1,6 +1,6 @@
- Deep learning method to identify viruses in sequencing datasets..
+ Deep learning method to identify viruses in sequencing datasets
macros.xml
@@ -24,7 +24,7 @@
]]>
-
+
@@ -49,7 +49,8 @@
\ No newline at end of file