Mercurial > repos > recetox > query

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/query.py	Wed Dec 09 02:04:20 2020 +0000
@@ -0,0 +1,56 @@
+import json
+from typing import Tuple
+
+import click
+import pandas
+import pandasql
+from pandas import DataFrame
+
+
+def read(path: str, filetype: str, name: str) -> Tuple[str, DataFrame]:
+    if filetype == 'csv':
+        return name, pandas.read_csv(path)
+    elif filetype in ('tsv', 'tabular'):
+        return name, pandas.read_table(path)
+    elif filetype in ('h5', 'hdf'):
+        return name, pandas.read_hdf(path, name)
+    elif filetype == 'feather':
+        return name, pandas.read_feather(path)
+    elif filetype == 'parquet':
+        return name, pandas.read_parquet(path)
+    elif filetype == 'sqlite':
+        return pandas.read_sql(name, f'sqlite:///{path}')
+    else:
+        raise NotImplementedError(f'Unknown filetype {filetype}')
+
+
+def write(df: DataFrame, path: str, filetype: str, name: str) -> None:
+    if filetype == 'csv':
+        df.to_csv(path)
+    elif filetype in ('tsv', 'tabular'):
+        df.to_csv(path, sep='\t')
+    elif filetype in ('h5', 'hdf'):
+        with pandas.HDFStore(path) as file:
+            file.append(name, df, data_columns=list(df.columns))
+    elif filetype == 'feather':
+        df.to_feather(path)
+    elif filetype == 'parquet':
+        df.to_parquet(path)
+    elif filetype == 'sqlite':
+        df.to_sql(name, f'sqlite:///{path}')
+    else:
+        raise NotImplementedError(f'Unknown filetype {filetype}')
+
+
+@click.command()
+@click.argument('config', type=click.File())
+def main(config) -> None:
+    config = json.load(config)
+
+    tables = dict(read(table['path'], table['format'], table['name']) for table in config['tables'])
+    result = pandasql.sqldf(config['query'], tables)
+    write(result, config['result']['path'], config['result']['format'], config['result']['name'])
+
+
+if __name__ == '__main__':
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/query.xml	Wed Dec 09 02:04:20 2020 +0000
@@ -0,0 +1,134 @@
+<tool id="query" name="Query" version="@TOOL_VERSION@">
+    <macros>
+        <token name="@TOOL_VERSION@">0.1</token>
+        <xml name="formats">
+            <option value="csv">csv</option>
+            <option value="tsv">tsv</option>
+            <option value="h5">h5</option>
+            <option value="feather">feather</option>
+            <option value="parquet">parquet</option>
+            <option value="sqlite">sqlite</option>
+        </xml>
+        <xml name="internal_name" token_format="">
+            <when value="@FORMAT@">
+                <param name="name" type="text" help="Specify the format's internal name of the table."/>
+            </when>
+        </xml>
+    </macros>
+
+    <requirements>
+        <requirement type="package">click</requirement>
+        <requirement type="package">pyarrow</requirement>
+        <requirement type="package">pytables</requirement>
+        <requirement type="package">pandas</requirement>
+        <requirement type="package">pandasql</requirement>
+        <requirement type="package" version="3.7">python</requirement>
+    </requirements>
+
+    <command><![CDATA[
+        python '$__tool_directory__/query.py'
+            $config
+    ]]></command>
+
+    <configfiles>
+        <configfile name="config"><![CDATA[
+            #def filetype(data):
+                #if $data.is_of_type('h5')
+                    #return 'hdf'
+                #elif $data.is_of_type('csv')
+                    #return 'csv'
+                #elif $data.is_of_type('tsv', 'tabular')
+                    #return 'tsv'
+                #elif $data.is_of_type('feather', 'arrow')
+                    #return 'feather'
+                #elif $data.is_of_type('parquet')
+                    #return 'parquet'
+                #elif $data.is_of_type('sqlite')
+                    #return 'sqlite'
+                #end if
+            #end def
+
+            #set $config = {
+                'tables': [
+                    {
+                        'path': str($table.file),
+                        'name': str($table.name) if $table.name else 't{}'.format(str($index)),
+                        'format': str($table.format) if $table.format else $filetype($table.file)
+                    }
+                    for $index, $table in enumerate($tables)
+                ],
+                'query': str($query),
+                'result': {
+                    'path': str($output),
+                    'name': str($result.name) if str($result.format) in ('h5', 'hdf', 'sqlite') and $result.name else None,
+                    'format': str($result.format)
+                }
+            }
+
+            #import json
+            #echo $json.dumps($config)
+        ]]></configfile>
+    </configfiles>
+
+    <inputs>
+        <repeat name="tables" title="Table">
+            <param name="file" type="data" format="csv,tsv,h5,feather,parquet,sqlite" label="File"/>
+            <param name="format" type="select" optional="true">
+                <label>Force the file format</label>
+                <help>Force a particular file format to get around formats unknown to Galaxy, ie. Parquet.</help>
+                <expand macro="formats"/>
+            </param>
+            <param name="name" type="text" optional="true">
+                <label>Table name</label>
+                <help>Choose an unique name for the table which then you may use in the SQL Query. By default tables will be named as: t0, t1, ...</help>
+            </param>
+        </repeat>
+        <param name="query" type="text" area="true">
+            <label>SQL Query</label>
+            <help>
+                Type an arbitrary SQL SELECT to perform on the input tables. The input tables may be referred by their given names.
+                An example query is SELECT employees.name AS employee, emails.value AS email FROM one JOIN b ON employees.id = emails.employee, provided that the input tables are named employees and emails.
+            </help>
+        </param>
+        <conditional name="result">
+            <param name="format" type="select">
+                <label>SQL Query</label>
+                <help>Specify the file format of the query result.</help>
+                <expand macro="formats"/>
+            </param>
+            <expand macro="internal_name" format="h5"/>
+            <expand macro="internal_name" format="sqlite"/>
+        </conditional>
+    </inputs>
+
+    <outputs>
+        <data name="output">
+            <change_format>
+                <when input="result.format" value="h5" format="h5"/>
+                <when input="result.format" value="csv" format="csv"/>
+                <when input="result.format" value="tsv" format="tsv"/>
+                <when input="result.format" value="feather" format="feather"/>
+                <when input="result.format" value="parquet" format="parquet"/>
+                <when input="result.format" value="sqlite" format="sqlite"/>
+            </change_format>
+        </data>
+    </outputs>
+
+    <tests>
+        <test>
+            <repeat name="tables">
+                <param name="file" value="a.csv" ftype="csv"/>
+                <param name="name" value="a"/>
+            </repeat>
+            <repeat name="tables">
+                <param name="file" value="b.csv" ftype="csv"/>
+                <param name="name" value="b"/>
+            </repeat>
+            <param name="query" value="SELECT a.x AS ax, a.y AS ay, a.z AS az, b.x AS bx, b.y AS by, b.z AS bz FROM a JOIN b ON a.x = b.x"/>
+            <conditional name="result">
+                <param name="format" value="csv"/>
+            </conditional>
+            <output name="output" file="ab.csv" ftype="csv"/>
+        </test>
+    </tests>
+</tool>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/a.csv	Wed Dec 09 02:04:20 2020 +0000
@@ -0,0 +1,4 @@
+x,y,z
+1,1,a
+2,2,b
+3,3,c
Binary file test-data/a.parquet has changed
Binary file test-data/a.sqlite has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ab.csv	Wed Dec 09 02:04:20 2020 +0000
@@ -0,0 +1,4 @@
+,ax,ay,az,bx,by,bz
+0,1,1,a,1,2,3
+1,2,2,b,2,3,4
+2,3,3,c,3,4,5
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/b.csv	Wed Dec 09 02:04:20 2020 +0000
@@ -0,0 +1,4 @@
+x,y,z
+1,2,3
+2,3,4
+3,4,5