scanpy_filter: filter.xml comparison

comparison filter.xml @ 12:d600e0947468 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/scanpy/ commit c21958f44b81d740191999fb6015d5ae69538ee0

author	iuc
date	Wed, 31 Jul 2024 18:06:35 +0000
parents	97b82bb0bb7e
children	e299752da98e

comparison

equal deleted inserted replaced

-:c7ccb6ba94fb
+:d600e0947468
-<tool id="scanpy_filter" name="Filter" version="@galaxy_version@" profile="@profile@">
+<tool id="scanpy_filter" name="Filter" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@profile@">
 <description>with scanpy</description>
-<expand macro="bio_tools"/>
 <macros>
 <import>macros.xml</import>
 </macros>
+<expand macro="bio_tools"/>
 <expand macro="requirements"/>
 <expand macro="version_command"/>
 <command detect_errors="exit_code"><![CDATA[
 @CMD@
 ]]></command>
 copy=False)
 #else if $method.method == 'tl.filter_rank_genes_groups'
 sc.tl.filter_rank_genes_groups(
 adata,
-#if str($method.key) != ''
+#if $method.key
 key='$method.key',
 #end if
-#if str($method.groupby) != ''
+#if $method.groupby
 groupby='$method.groupby',
 #end if
 use_raw=$method.use_raw,
-log=$method.log,
 key_added='$method.key_added',
 min_in_group_fraction=$method.min_in_group_fraction,
 max_out_group_fraction=$method.max_out_group_fraction,
 min_fold_change=$method.min_fold_change)
 #else if $method.method == "pp.highly_variable_genes"
 sc.pp.highly_variable_genes(
 adata=adata,
 flavor='$method.flavor.flavor',
 #if $method.flavor.flavor == 'seurat'
-#if str($method.flavor.min_mean) != ''
+min_mean=$method.flavor.min_mean,
-min_mean=$method.flavor.min_mean,
+max_mean=$method.flavor.max_mean,
-#end if
+min_disp=$method.flavor.min_disp,
-#if str($method.flavor.max_mean) != ''
-max_mean=$method.flavor.max_mean,
-#end if
-#if str($method.flavor.min_disp) != ''
-min_disp=$method.flavor.min_disp,
-#end if
 #if str($method.flavor.max_disp) != ''
 max_disp=$method.flavor.max_disp,
 #end if
 #else if $method.flavor.flavor == 'cell_ranger'
 n_top_genes=$method.flavor.n_top_genes,
 #end if
 n_bins=$method.n_bins,
 total_counts=$method.total_counts,
 #end if
 random_state=$method.random_state,
 replace=$method.replace,
 copy=False)
+#else if $method.method == "filter_marker"
+#if $method.layer_selection.use_raw == 'False':
+adata.X = adata.layers['$method.layer_selection.layer']
+#end if
+def check_marker(adata, group, gene, thresh_mean, thresh_frac, groupby):
+filtered_data = adata[adata.obs[groupby] == group, adata.var_names == gene]
+mean_expression = np.mean(filtered_data.X)
+frac_cell_mean_expression = len(filtered_data.X[filtered_data.X > mean_expression]) / filtered_data.n_obs
+if ( mean_expression > thresh_mean and frac_cell_mean_expression >= thresh_frac ):
+return(True)
+return(False)
+header='infer'
+#if $method.header == 'not_included':
+header=None
+#end if
+marker_list={key: list(value.values()) for key, value in pd.read_csv('$method.markerfile', sep='\t', index_col=0, header=header).to_dict(orient='index').items()}
+for key, value in marker_list.items():
+marker_list[key] = [x for x in value if check_marker(adata, key, x, $method.thresh_mean, $method.thresh_frac, '$method.groupby')]
+# Find the maximum length of lists
+max_len = max(len(lst) for lst in marker_list.values())
+# Fill smaller lists with empty values
+for key, value in marker_list.items():
+marker_list[key] = value + [''] * (max_len - len(value))
+df = pd.DataFrame(marker_list).T
+df.to_csv('marker.tsv', sep='\t', index=True)
 #end if
 @CMD_anndata_write_outputs@
 ]]></configfile>
 </configfiles>
 <option value="pp.filter_genes">Filter genes based on number of cells or counts, using 'pp.filter_genes'</option>
 <option value="tl.filter_rank_genes_groups">Filters out genes based on fold change and fraction of genes expressing the gene within and outside the groupby categories, using 'tl.filter_rank_genes_groups'</option>
 <option value="pp.highly_variable_genes">Annotate (and filter) highly variable genes, using 'pp.highly_variable_genes'</option>
 <option value="pp.subsample">Subsample to a fraction of the number of observations, using 'pp.subsample'</option>
 <option value="pp.downsample_counts">Downsample counts from count matrix, using 'pp.downsample_counts'</option>
+<option value="filter_marker">Filter markers from count matrix and marker list</option>
 </param>
 <when value="pp.filter_cells">
 <conditional name="filter">
 <param argument="filter" type="select" label="Filter">
 <option value="min_counts">Minimum number of counts</option>
 </param>
 <param argument="groupby" type="text" optional="true" label="The key of the observations grouping to consider">
 <expand macro="sanitize_query" />
 </param>
 <expand macro="param_use_raw"/>
-<expand macro="param_log"/>
 <param argument="key_added" type="text" value="rank_genes_groups_filtered" label="Key that will contain new values">
 <expand macro="sanitize_query" />
 </param>
 <param argument="min_in_group_fraction" type="float" min="0" max="1" value="0.25" label="Minimum fraction of genes expressing the gene within the categories"/>
 <param argument="max_out_group_fraction" type="float" min="0" max="1" value="0.5" label="Maximum fraction of genes expressing the gene outside the categories"/>
 <param argument="counts_per_cell" type="integer" min="0" optional="true" label="Target total counts per cell" help="If a cell has more than ‘counts_per_cell’, it will be downsampled to this number. Resulting counts can be specified on a per cell basis by passing an array."/>
 <param argument="total_counts" type="integer" min="0" optional="true" label="Target total counts" help="If the count matrix has more than total_counts it will be downsampled to have this number."/>
 <param argument="random_state" type="integer" value="0" label="Random seed to change subsampling"/>
 <param argument="replace" type="boolean" truevalue="True" falsevalue="False" checked="false" label="Sample the counts with replacement?"/>
 </when>
+<when value="filter_marker">
+<!-- Inputs of the 'filter_marker' method: a tabular marker list, the two thresholds handed to the marker check, the matrix to filter on (.X or a named layer) and the observation grouping key. -->
+<param argument="markerfile" type="data" format="tabular" label="List of markers" help="This should be a tsv where row = group (e.g. celltypes) and columns = markers."></param>
+<!-- 'not_included' makes the script read the marker list with header=None; otherwise the header is inferred. -->
+<param name="header" type="select" label="Header in the list of markers?">
+<option value="included">Header included</option>
+<option value="not_included">Header not included</option>
+</param>
+<param argument="thresh_mean" type="float" min="0.0" value="1.0" label="Minimal average count of all cells of a group (e.g., celltype) for a particular marker" help="Increasing the threshold will result in a smaller marker set."/>
+<param argument="thresh_frac" type="float" min="0.0" max="1.0" value="0.1" label="Minimal fraction of cells that have a higher count than the average count of all cells of the group for the marker" help="Increasing this threshold might remove marker outliers."/>
+<!-- When 'False' is selected the script replaces adata.X with adata.layers[layer] before filtering. -->
+<conditional name="layer_selection">
+<param name="use_raw" type="select" label="Use .X of adata to perform the filtering" help="">
+<option value="True">Yes</option>
+<option value="False">No</option>
+</param>
+<when value="False">
+<param argument="layer" type="text" value="" label="Key from adata.layers whose value will be used to filter" help="If layers specified then use adata.layers[layer]."/>
+</when>
+<when value="True"/>
+</conditional>
+<param argument="groupby" type="text" value="" label="The key of the observation grouping to consider (e.g., celltype)" help="">
+<expand macro="sanitize_query" />
+</param>
+</when>
 </conditional>
 <expand macro="inputs_common_advanced"/>
 </inputs>
 <outputs>
 <expand macro="anndata_outputs"/>
+<data name="marker_out" format="tabular" from_work_dir="marker.tsv" label="${tool.name} on ${on_string}: Markers">
+<filter>method['method'] == 'filter_marker'</filter>
+</data>
 </outputs>
 <tests>
-<test>
+<test expect_num_outputs="2">
-<!-- test 0 -->
+<!-- test 1 -->
 <param name="adata" value="krumsiek11.h5ad" />
 <conditional name="method">
 <param name="method" value="pp.filter_cells"/>
 <conditional name="filter">
 <param name="filter" value="min_counts"/>
 <has_text_matching expression="min_counts=3"/>
 </assert_contents>
 </output>
 <output name="anndata_out" file="pp.filter_cells.krumsiek11-min_counts.h5ad" ftype="h5ad" compare="sim_size"/>
 </test>
-<test>
+<test expect_num_outputs="2">
-<!-- test 1 -->
+<!-- test 2 -->
 <param name="adata" value="krumsiek11.h5ad" />
 <conditional name="method">
 <param name="method" value="pp.filter_cells"/>
 <conditional name="filter">
 <param name="filter" value="max_genes"/>
 <has_text_matching expression="max_genes=100"/>
 </assert_contents>
 </output>
 <output name="anndata_out" file="pp.filter_cells.krumsiek11-max_genes.h5ad" ftype="h5ad" compare="sim_size"/>
 </test>
-<test>
+<test expect_num_outputs="2">
-<!-- test 2 -->
+<!-- test 3 -->
 <param name="adata" value="krumsiek11.h5ad" />
 <conditional name="method">
 <param name="method" value="pp.filter_genes"/>
 <conditional name="filter">
 <param name="filter" value="min_counts"/>
 <has_text_matching expression="min_counts=3"/>
 </assert_contents>
 </output>
 <output name="anndata_out" file="pp.filter_genes.krumsiek11-min_counts.h5ad" ftype="h5ad" compare="sim_size"/>
 </test>
-<!-- <test> -->
-<!--     <!-\- test 3 -\-> -->
+<!--  test 4 -->
-<!--     <!-\- Input dataset appears to be missing rank_genes_groups key... -\-> -->
+<!-- Fails to write to anndata after tl.filter_rank_genes_groups
-<!--     <param name="adata" value="tl.rank_genes_groups.krumsiek11.h5ad" /> -->
+Issue has been reported here: https://github.com/scverse/anndata/issues/726
-<!--     <conditional name="method"> -->
+The current fix is: del adata.uns['rank_genes_groups_filtered']  -->
-<!--         <param name="method" value="tl.filter_rank_genes_groups"/> -->
+<!--<test expect_num_outputs="2">
-<!--         <param name="key" value="rank_genes_groups"/> -->
+<param name="adata" value="tl.rank_genes_groups.krumsiek11.h5ad" />
-<!--         <param name="use_raw" value="False"/> -->
+<conditional name="method">
-<!--         <param name="log" value="False"/> -->
+<param name="method" value="tl.filter_rank_genes_groups"/>
-<!--         <param name="key_added" value="rank_genes_groups_filtered"/> -->
+<param name="key" value="rank_genes_groups"/>
-<!--         <param name="min_in_group_fraction" value="0.25"/> -->
+<param name="use_raw" value="False"/>
-<!--         <param name="max_out_group_fraction" value="0.5"/> -->
+<param name="key_added" value="rank_genes_groups_filtered"/>
-<!--         <param name="min_fold_change" value="3"/> -->
+<param name="min_in_group_fraction" value="0.25"/>
-<!--     </conditional> -->
+<param name="max_out_group_fraction" value="0.5"/>
-<!--     <output name="hidden_output"> -->
+<param name="min_fold_change" value="3"/>
-<!--         <assert_contents> -->
+</conditional>
-<!--             <has_text_matching expression="tl.filter_rank_genes_groups"/> -->
+<section name="advanced_common">
-<!--             <has_text_matching expression="key='rank_genes_groups'"/> -->
+<param name="show_log" value="true" />
-<!--             <has_text_matching expression="use_raw=False"/> -->
+</section>
-<!--             <has_text_matching expression="log=False"/> -->
+<output name="hidden_output">
-<!--             <has_text_matching expression="key_added='rank_genes_groups_filtered'"/> -->
+<assert_contents>
-<!--             <has_text_matching expression="min_in_group_fraction=0.25"/> -->
+<has_text_matching expression="tl.filter_rank_genes_groups"/>
-<!--             <has_text_matching expression="max_out_group_fraction=0.5"/> -->
+<has_text_matching expression="key='rank_genes_groups'"/>
-<!--             <has_text_matching expression="min_fold_change=3"/> -->
+<has_text_matching expression="use_raw=False"/>
-<!--         </assert_contents> -->
+<has_text_matching expression="log=False"/>
-<!--     </output> -->
+<has_text_matching expression="key_added='rank_genes_groups_filtered'"/>
-<!--     <output name="anndata_out" file="pp.filter_rank_genes_groups.h5ad" ftype="h5ad" compare="sim_size"/> -->
+<has_text_matching expression="min_in_group_fraction=0.25"/>
-<!-- </test> -->
+<has_text_matching expression="max_out_group_fraction=0.5"/>
-<test>
+<has_text_matching expression="min_fold_change=3"/>
-<!-- test 4 -->
+</assert_contents>
+</output>
+<output name="anndata_out" file="pp.filter_rank_genes_groups.h5ad" ftype="h5ad" compare="sim_size"/>
+</test>-->
+<test expect_num_outputs="2">
+<!-- test 5 -->
 <param name="adata" value="blobs.h5ad"/>
 <conditional name="method">
 <param name="method" value="pp.highly_variable_genes"/>
 <conditional name="flavor">
 <param name="flavor" value="seurat"/>
 <has_text_matching expression="subset=False"/>
 </assert_contents>
 </output>
 <output name="anndata_out" file="pp.highly_variable_genes.seurat.blobs.h5ad" ftype="h5ad" compare="sim_size" delta="100000" delta_frac="0.2"/>
 </test>
-<test>
+<test expect_num_outputs="2">
-<!-- test 5 -->
+<!-- test 6 -->
 <param name="adata" value="krumsiek11.h5ad" />
 <conditional name="method">
 <param name="method" value="pp.highly_variable_genes"/>
 <conditional name="flavor">
 <param name="flavor" value="cell_ranger"/>
 <has_text_matching expression="subset=True"/>
 </assert_contents>
 </output>
 <output name="anndata_out" file="pp.highly_variable_genes.krumsiek11-cell_ranger.h5ad" ftype="h5ad" compare="sim_size" delta="100000" delta_frac="0.9"/>
 </test>
-<test>
+<test expect_num_outputs="2">
-<!-- test 6 -->
+<!-- test 7 -->
 <param name="adata" value="krumsiek11.h5ad" />
 <conditional name="method">
 <param name="method" value="pp.subsample"/>
 <conditional name="type">
 <param name="type" value="fraction" />
 <has_text_matching expression="random_state=0"/>
 </assert_contents>
 </output>
 <output name="anndata_out" file="pp.subsample.krumsiek11_fraction.h5ad" ftype="h5ad" compare="sim_size"/>
 </test>
-<test>
+<test expect_num_outputs="2">
-<!-- test 7 -->
+<!-- test 8 -->
 <param name="adata" value="krumsiek11.h5ad" />
 <conditional name="method">
 <param name="method" value="pp.subsample"/>
 <conditional name="type">
 <param name="type" value="n_obs" />
 <has_text_matching expression="random_state=0"/>
 </assert_contents>
 </output>
 <output name="anndata_out" file="pp.subsample.krumsiek11_n_obs.h5ad" ftype="h5ad" compare="sim_size"/>
 </test>
-<test>
+<test expect_num_outputs="2">
-<!-- test 8 -->
+<!-- test 9 -->
 <param name="adata" value="random-randint.h5ad" />
 <conditional name="method">
 <param name="method" value="pp.downsample_counts"/>
 <param name="total_counts" value="20000"/>
 <param name="random_state" value="0"/>
 <has_text_matching expression="replace=False"/>
 </assert_contents>
 </output>
 <output name="anndata_out" file="pp.downsample_counts.random-randint.h5ad" ftype="h5ad" compare="sim_size" delta="10000000" delta_frac="0.5"/>
 </test>
+<test expect_num_outputs="3">
+<!-- test 10 -->
+<param name="adata" value="cosg.rank_genes_groups.newton-cg.pbmc68k_highly_reduced_1.h5ad" />
+<conditional name="method">
+<param name="method" value="filter_marker"/>
+<param name="markerfile" value="tl.rank_genes_groups.newton-cg.pbmc68k_highly_reduced_marker_1.tsv"/>
+<param name="thresh_mean" value="1.0"/>
+<param name="thresh_frac" value="0.2"/>
+<param name="layer_selection" value="True"/>
+<param name="groupby" value="bulk_labels"/>
+</conditional>
+<section name="advanced_common">
+<param name="show_log" value="true" />
+</section>
+<output name="hidden_output">
+<assert_contents>
+<has_text_matching expression="adata, key, x, 1.0, 0.2, 'bulk_labels'"/>
+</assert_contents>
+</output>
+<output name="anndata_out" file="cosg.rank_genes_groups.newton-cg.pbmc68k_highly_reduced_1.h5ad" ftype="h5ad">
+<assert_contents>
+<has_h5_keys keys="obs, var, uns" />
+</assert_contents>
+</output>
+<output name="marker_out" file="tl.rank_genes_groups.newton-cg.pbmc68k_highly_reduced_marker_filtered_1.tsv" ftype="tabular" compare="sim_size"/>
+</test>
 </tests>
 <help><![CDATA[
 Filter cells outliers based on counts and numbers of genes expressed (`pp.filter_cells`)
 ========================================================================================
 Only provide one of the optional parameters `min_counts`, `min_genes`,
 `max_counts`, `max_genes` per call.
 More details on the `scanpy documentation
-<https://icb-scanpy.readthedocs-hosted.com/en/@version@/api/scanpy.pp.filter_cells.html>`__
+<https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.filter_cells.html>`__
 Filter genes based on number of cells or counts (`pp.filter_genes`)
 ===================================================================
 Only provide one of the optional parameters `min_counts`, `min_cells`,
 `max_counts`, `max_cells` per call.
 More details on the `scanpy documentation
-<https://icb-scanpy.readthedocs-hosted.com/en/@version@/api/scanpy.pp.filter_genes.html>`__
+<https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.filter_genes.html>`__
 Filters out genes based on fold change and fraction of genes expressing the gene within and outside the groupby categories (`tl.filter_rank_genes_groups`)
 ==========================================================================================================================================================
 More details on the `scanpy documentation
-<https://icb-scanpy.readthedocs-hosted.com/en/@version@/api/scanpy.tl.filter_rank_genes_groups.html>`__
+<https://scanpy.readthedocs.io/en/stable/api/scanpy.tl.filter_rank_genes_groups.html>`__
 Annotate highly variable genes (`pp.highly_variable_genes`)
 ===========================================================
 Subsample to a fraction of the number of observations (`pp.subsample`)
 ======================================================================
 More details on the `scanpy documentation
-<https://icb-scanpy.readthedocs-hosted.com/en/@version@/api/scanpy.pp.subsample.html>`__
+<https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.subsample.html>`__
 Downsample counts (`pp.downsample_counts`)
 ==========================================
 Downsample counts so that each cell has no more than `target_counts`. Cells with fewer counts than `target_counts` are unaffected by this. This
 has been implemented by M. D. Luecken.
+Filter marker genes (`filter_marker`)
+======================================================================
+This option is specific to celltype marker gene detection. You can generate a celltype marker gene file (tsv) with **COSG**, which is available in Galaxy.
+The marker gene file should have celltypes as rows and marker genes as columns. Each celltype can have a varying number of marker genes.
+A marker gene is retained in the list if its mean expression across the cells of the celltype is higher than the mean expression threshold (thresh_mean) and if the fraction of those cells whose expression of the marker gene exceeds that mean is equal to or higher than the cell fraction threshold (thresh_frac).
 More details on the `scanpy documentation
-<https://icb-scanpy.readthedocs-hosted.com/en/@version@/api/scanpy.pp.downsample_counts.html>`__
+<https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.downsample_counts.html>`__
 ]]></help>
 <expand macro="citations"/>
 </tool>

Mercurial > repos > iuc > scanpy_filter

comparison filter.xml @ 12:d600e0947468 draft