Mercurial > repos > iuc > scanpy_filter
diff filter.xml @ 12:d600e0947468 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/scanpy/ commit c21958f44b81d740191999fb6015d5ae69538ee0
| author | iuc |
|---|---|
| date | Wed, 31 Jul 2024 18:06:35 +0000 |
| parents | 97b82bb0bb7e |
| children | e299752da98e |
line wrap: on
line diff
--- a/filter.xml Wed Sep 22 21:04:52 2021 +0000 +++ b/filter.xml Wed Jul 31 18:06:35 2024 +0000 @@ -1,9 +1,9 @@ -<tool id="scanpy_filter" name="Filter" version="@galaxy_version@" profile="@profile@"> +<tool id="scanpy_filter" name="Filter" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@profile@"> <description>with scanpy</description> - <expand macro="bio_tools"/> <macros> <import>macros.xml</import> </macros> + <expand macro="bio_tools"/> <expand macro="requirements"/> <expand macro="version_command"/> <command detect_errors="exit_code"><![CDATA[ @@ -45,14 +45,13 @@ #else if $method.method == 'tl.filter_rank_genes_groups' sc.tl.filter_rank_genes_groups( adata, - #if str($method.key) != '' + #if $method.key key='$method.key', #end if - #if str($method.groupby) != '' + #if $method.groupby groupby='$method.groupby', #end if use_raw=$method.use_raw, - log=$method.log, key_added='$method.key_added', min_in_group_fraction=$method.min_in_group_fraction, max_out_group_fraction=$method.max_out_group_fraction, @@ -63,17 +62,11 @@ adata=adata, flavor='$method.flavor.flavor', #if $method.flavor.flavor == 'seurat' - #if str($method.flavor.min_mean) != '' - min_mean=$method.flavor.min_mean, - #end if - #if str($method.flavor.max_mean) != '' - max_mean=$method.flavor.max_mean, - #end if - #if str($method.flavor.min_disp) != '' - min_disp=$method.flavor.min_disp, - #end if + min_mean=$method.flavor.min_mean, + max_mean=$method.flavor.max_mean, + min_disp=$method.flavor.min_disp, #if str($method.flavor.max_disp) != '' - max_disp=$method.flavor.max_disp, + max_disp=$method.flavor.max_disp, #end if #else if $method.flavor.flavor == 'cell_ranger' n_top_genes=$method.flavor.n_top_genes, @@ -105,6 +98,41 @@ random_state=$method.random_state, replace=$method.replace, copy=False) + +#else if $method.method == "filter_marker" + +#if $method.layer_selection.use_raw == 'False': + adata.X = adata.layers['$method.layer_selection.layer'] +#end if + +def check_marker(adata, group, 
gene, thresh_mean, thresh_frac, groupby): + filtered_data = adata[adata.obs[groupby] == group, adata.var_names == gene] + mean_expression = np.mean(filtered_data.X) + frac_cell_mean_expression = len(filtered_data.X[filtered_data.X > mean_expression]) / filtered_data.n_obs + if ( mean_expression > thresh_mean and frac_cell_mean_expression >= thresh_frac ): + return(True) + return(False) + +header='infer' + +#if $method.header == 'not_included': + header=None +#end if + +marker_list={key: list(value.values()) for key, value in pd.read_csv('$method.markerfile', sep='\t', index_col=0, header=header).to_dict(orient='index').items()} + +for key, value in marker_list.items(): + marker_list[key] = [x for x in value if check_marker(adata, key, x, $method.thresh_mean, $method.thresh_frac, '$method.groupby')] + +# Find the maximum length of lists +max_len = max(len(lst) for lst in marker_list.values()) + +# Fill smaller lists with empty values +for key, value in marker_list.items(): + marker_list[key] = value + [''] * (max_len - len(value)) + +df = pd.DataFrame(marker_list).T +df.to_csv('marker.tsv', sep='\t', index=True) #end if @CMD_anndata_write_outputs@ @@ -120,6 +148,7 @@ <option value="pp.highly_variable_genes">Annotate (and filter) highly variable genes, using 'pp.highly_variable_genes'</option> <option value="pp.subsample">Subsample to a fraction of the number of observations, using 'pp.subsample'</option> <option value="pp.downsample_counts">Downsample counts from count matrix, using 'pp.downsample_counts'</option> + <option value="filter_marker">Filter markers from count matrix and marker list</option> </param> <when value="pp.filter_cells"> <conditional name="filter"> @@ -173,7 +202,6 @@ <expand macro="sanitize_query" /> </param> <expand macro="param_use_raw"/> - <expand macro="param_log"/> <param argument="key_added" type="text" value="rank_genes_groups_filtered" label="Key that will contain new values"> <expand macro="sanitize_query" /> </param> @@ -221,15 
+249,40 @@ <param argument="random_state" type="integer" value="0" label="Random seed to change subsampling"/> <param argument="replace" type="boolean" truevalue="True" falsevalue="False" checked="false" label="Sample the counts with replacement?"/> </when> + <when value="filter_marker"> + <param argument="markerfile" type="data" format="tabular" label="List of markers" help="This should be a tsv where row = group (e.g. celltypes) and columns = markers."></param> + <param name="header" type="select" label="Header in the list of markers?"> + <option value="included">Header included</option> + <option value="not_included">Header not included</option> + </param> + <param argument="thresh_mean" type="float" min="0.0" value="1.0" label="Minimal average count of all cells of a group (e.g., celltype) for a particular marker" help="Increasing the threshold will result in a smaller marker set."/> + <param argument="thresh_frac" type="float" min="0.0" max="1.0" value="0.1" label="Minimal fraction of cells that have a higher count than the average count of all cells of the group for the marker" help="Increasing this threshold might remove marker outliers."/> + <conditional name="layer_selection"> + <param name="use_raw" type="select" label="Use .X of adata to perform the filtering" help=""> + <option value="True">Yes</option> + <option value="False">No</option> + </param> + <when value="False"> + <param argument="layer" type="text" value="" label="Key from adata.layers whose value will be used to filter" help="If layers specified then use adata.layers[layer]."/> + </when> + <when value="True"/> + </conditional> + <param argument="groupby" type="text" value="" label="The key of the observation grouping to consider (e.g., celltype)" help=""> + <expand macro="sanitize_query" /> + </param> + </when> </conditional> <expand macro="inputs_common_advanced"/> </inputs> <outputs> <expand macro="anndata_outputs"/> + <data name="marker_out" format="tabular" from_work_dir="marker.tsv" 
label="${tool.name} on ${on_string}: Markers"> + <filter>method['method'] == 'filter_marker'</filter> + </data> </outputs> <tests> - <test> - <!-- test 0 --> + <test expect_num_outputs="2"> + <!-- test 1 --> <param name="adata" value="krumsiek11.h5ad" /> <conditional name="method"> <param name="method" value="pp.filter_cells"/> @@ -252,8 +305,8 @@ </output> <output name="anndata_out" file="pp.filter_cells.krumsiek11-min_counts.h5ad" ftype="h5ad" compare="sim_size"/> </test> - <test> - <!-- test 1 --> + <test expect_num_outputs="2"> + <!-- test 2 --> <param name="adata" value="krumsiek11.h5ad" /> <conditional name="method"> <param name="method" value="pp.filter_cells"/> @@ -274,8 +327,8 @@ </output> <output name="anndata_out" file="pp.filter_cells.krumsiek11-max_genes.h5ad" ftype="h5ad" compare="sim_size"/> </test> - <test> - <!-- test 2 --> + <test expect_num_outputs="2"> + <!-- test 3 --> <param name="adata" value="krumsiek11.h5ad" /> <conditional name="method"> <param name="method" value="pp.filter_genes"/> @@ -295,36 +348,41 @@ </output> <output name="anndata_out" file="pp.filter_genes.krumsiek11-min_counts.h5ad" ftype="h5ad" compare="sim_size"/> </test> - <!-- <test> --> - <!-- <!-\- test 3 -\-> --> - <!-- <!-\- Input dataset appears to be missing rank_genes_groups key... 
-\-> --> - <!-- <param name="adata" value="tl.rank_genes_groups.krumsiek11.h5ad" /> --> - <!-- <conditional name="method"> --> - <!-- <param name="method" value="tl.filter_rank_genes_groups"/> --> - <!-- <param name="key" value="rank_genes_groups"/> --> - <!-- <param name="use_raw" value="False"/> --> - <!-- <param name="log" value="False"/> --> - <!-- <param name="key_added" value="rank_genes_groups_filtered"/> --> - <!-- <param name="min_in_group_fraction" value="0.25"/> --> - <!-- <param name="max_out_group_fraction" value="0.5"/> --> - <!-- <param name="min_fold_change" value="3"/> --> - <!-- </conditional> --> - <!-- <output name="hidden_output"> --> - <!-- <assert_contents> --> - <!-- <has_text_matching expression="tl.filter_rank_genes_groups"/> --> - <!-- <has_text_matching expression="key='rank_genes_groups'"/> --> - <!-- <has_text_matching expression="use_raw=False"/> --> - <!-- <has_text_matching expression="log=False"/> --> - <!-- <has_text_matching expression="key_added='rank_genes_groups_filtered'"/> --> - <!-- <has_text_matching expression="min_in_group_fraction=0.25"/> --> - <!-- <has_text_matching expression="max_out_group_fraction=0.5"/> --> - <!-- <has_text_matching expression="min_fold_change=3"/> --> - <!-- </assert_contents> --> - <!-- </output> --> - <!-- <output name="anndata_out" file="pp.filter_rank_genes_groups.h5ad" ftype="h5ad" compare="sim_size"/> --> - <!-- </test> --> - <test> - <!-- test 4 --> + + <!-- test 4 --> + <!-- Fails to write to anndata after tl.filter_rank_genes_groups + Issue has been reported here: https://github.com/scverse/anndata/issues/726 + The current fix is: del adata.uns['rank_genes_groups_filtered'] --> + <!--<test expect_num_outputs="2"> + <param name="adata" value="tl.rank_genes_groups.krumsiek11.h5ad" /> + <conditional name="method"> + <param name="method" value="tl.filter_rank_genes_groups"/> + <param name="key" value="rank_genes_groups"/> + <param name="use_raw" value="False"/> + <param name="key_added" 
value="rank_genes_groups_filtered"/> + <param name="min_in_group_fraction" value="0.25"/> + <param name="max_out_group_fraction" value="0.5"/> + <param name="min_fold_change" value="3"/> + </conditional> + <section name="advanced_common"> + <param name="show_log" value="true" /> + </section> + <output name="hidden_output"> + <assert_contents> + <has_text_matching expression="tl.filter_rank_genes_groups"/> + <has_text_matching expression="key='rank_genes_groups'"/> + <has_text_matching expression="use_raw=False"/> + <has_text_matching expression="log=False"/> + <has_text_matching expression="key_added='rank_genes_groups_filtered'"/> + <has_text_matching expression="min_in_group_fraction=0.25"/> + <has_text_matching expression="max_out_group_fraction=0.5"/> + <has_text_matching expression="min_fold_change=3"/> + </assert_contents> + </output> + <output name="anndata_out" file="pp.filter_rank_genes_groups.h5ad" ftype="h5ad" compare="sim_size"/> + </test>--> + <test expect_num_outputs="2"> + <!-- test 5 --> <param name="adata" value="blobs.h5ad"/> <conditional name="method"> <param name="method" value="pp.highly_variable_genes"/> @@ -353,8 +411,8 @@ </output> <output name="anndata_out" file="pp.highly_variable_genes.seurat.blobs.h5ad" ftype="h5ad" compare="sim_size" delta="100000" delta_frac="0.2"/> </test> - <test> - <!-- test 5 --> + <test expect_num_outputs="2"> + <!-- test 6 --> <param name="adata" value="krumsiek11.h5ad" /> <conditional name="method"> <param name="method" value="pp.highly_variable_genes"/> @@ -379,8 +437,8 @@ </output> <output name="anndata_out" file="pp.highly_variable_genes.krumsiek11-cell_ranger.h5ad" ftype="h5ad" compare="sim_size" delta="100000" delta_frac="0.9"/> </test> - <test> - <!-- test 6 --> + <test expect_num_outputs="2"> + <!-- test 7 --> <param name="adata" value="krumsiek11.h5ad" /> <conditional name="method"> <param name="method" value="pp.subsample"/> @@ -402,8 +460,8 @@ </output> <output name="anndata_out" 
file="pp.subsample.krumsiek11_fraction.h5ad" ftype="h5ad" compare="sim_size"/> </test> - <test> - <!-- test 7 --> + <test expect_num_outputs="2"> + <!-- test 8 --> <param name="adata" value="krumsiek11.h5ad" /> <conditional name="method"> <param name="method" value="pp.subsample"/> @@ -425,8 +483,8 @@ </output> <output name="anndata_out" file="pp.subsample.krumsiek11_n_obs.h5ad" ftype="h5ad" compare="sim_size"/> </test> - <test> - <!-- test 8 --> + <test expect_num_outputs="2"> + <!-- test 9 --> <param name="adata" value="random-randint.h5ad" /> <conditional name="method"> <param name="method" value="pp.downsample_counts"/> @@ -447,6 +505,32 @@ </output> <output name="anndata_out" file="pp.downsample_counts.random-randint.h5ad" ftype="h5ad" compare="sim_size" delta="10000000" delta_frac="0.5"/> </test> + <test expect_num_outputs="3"> + <!-- test 10 --> + <param name="adata" value="cosg.rank_genes_groups.newton-cg.pbmc68k_highly_reduced_1.h5ad" /> + <conditional name="method"> + <param name="method" value="filter_marker"/> + <param name="markerfile" value="tl.rank_genes_groups.newton-cg.pbmc68k_highly_reduced_marker_1.tsv"/> + <param name="thresh_mean" value="1.0"/> + <param name="thresh_frac" value="0.2"/> + <param name="layer_selection" value="True"/> + <param name="groupby" value="bulk_labels"/> + </conditional> + <section name="advanced_common"> + <param name="show_log" value="true" /> + </section> + <output name="hidden_output"> + <assert_contents> + <has_text_matching expression="adata, key, x, 1.0, 0.2, 'bulk_labels'"/> + </assert_contents> + </output> + <output name="anndata_out" file="cosg.rank_genes_groups.newton-cg.pbmc68k_highly_reduced_1.h5ad" ftype="h5ad"> + <assert_contents> + <has_h5_keys keys="obs, var, uns" /> + </assert_contents> + </output> + <output name="marker_out" file="tl.rank_genes_groups.newton-cg.pbmc68k_highly_reduced_marker_filtered_1.tsv" ftype="tabular" compare="sim_size"/> + </test> </tests> <help><![CDATA[ @@ -461,7 +545,7 @@ 
`max_counts`, `max_genes` per call. More details on the `scanpy documentation -<https://icb-scanpy.readthedocs-hosted.com/en/@version@/api/scanpy.pp.filter_cells.html>`__ +<https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.filter_cells.html>`__ Filter genes based on number of cells or counts (`pp.filter_genes`) @@ -475,14 +559,14 @@ `max_counts`, `max_cells` per call. More details on the `scanpy documentation -<https://icb-scanpy.readthedocs-hosted.com/en/@version@/api/scanpy.pp.filter_genes.html>`__ +<https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.filter_genes.html>`__ Filters out genes based on fold change and fraction of genes expressing the gene within and outside the groupby categories (`tl.filter_rank_genes_groups`) ========================================================================================================================================================== More details on the `scanpy documentation -<https://icb-scanpy.readthedocs-hosted.com/en/@version@/api/scanpy.tl.filter_rank_genes_groups.html>`__ +<https://scanpy.readthedocs.io/en/stable/api/scanpy.tl.filter_rank_genes_groups.html>`__ Annotate highly variable genes (`pp.highly_variable_genes`) @@ -497,7 +581,7 @@ ====================================================================== More details on the `scanpy documentation -<https://icb-scanpy.readthedocs-hosted.com/en/@version@/api/scanpy.pp.subsample.html>`__ +<https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.subsample.html>`__ Downsample counts (`pp.downsample_counts`) ========================================== @@ -505,8 +589,18 @@ Downsample counts so that each cell has no more than `target_counts`. Cells with fewer counts than `target_counts` are unaffected by this. This has been implemented by M. D. Luecken. + +Filter marker genes (`filter_marker`) +====================================================================== + +This option is specific for celltype marker gene detection. 
You can generate a celltype marker gene file (tsv) with **COSG**, which is available in Galaxy. + +The marker gene file should have celltypes as rows and marker genes as columns. Each celltype can have a varying number of marker genes. + +A marker gene is returned (retained in the list) if its mean expression across the cells of the group is greater than the mean expression threshold (thresh_mean) and if the fraction of cells in the group whose expression of the marker gene exceeds that mean is equal to or higher than the cell fraction threshold (thresh_frac). + More details on the `scanpy documentation -<https://icb-scanpy.readthedocs-hosted.com/en/@version@/api/scanpy.pp.downsample_counts.html>`__ +<https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.downsample_counts.html>`__ ]]></help>
