comparison testing_cgatools-982e19c29ec0/cgatools/tools/cgatools_1.6/join.xml @ 0:ef23f9cd599b draft default tip

Uploaded
author devteam
date Thu, 27 Sep 2012 13:37:59 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:ef23f9cd599b
1 <tool id="cg_join" name="join(beta) 1.6" version="1.0.1">
2 <!--
3 This tool creates a GUI for the join function of cgatools from Complete Genomics, Inc.
4 written 6-18-2012 by bcrain@completegenomics.com
5 updated 8-14-2012 by bcrain@completegenomics.com
6 -->
7
8 <description>two tsv files based on equal fields or overlapping regions.</description>
9
10 <command>
11 <!-- Step 1: write the first line of the cgatools banner to STDOUT (intended to record the cgatools version with the job) -->
12 cgatools | head -1;
13
14 <!-- Step 2: echo the assembled command line to STDOUT; this text mirrors the real invocation in step 3 and must be kept in sync with it -->
15 echo "cgatools join --beta
16 --input $inputA
17 --input $inputB
18 --output $output
19 --output-mode $outmode
20 $dump
21 --select $col
22 #for $m in $matches <!-- one match option per entry of the matches repeat -->
23 --match ${m.match}
24 #end for
25 #if $range_overlap.range == 'yes'
26 #for $o in $range_overlap.overlaps <!-- one overlap option per entry of the overlaps repeat; only emitted when range matching is switched on -->
27 --overlap ${o.overlap}
28 #end for
29 --overlap-mode $range_overlap.overlapmode
30 --overlap-fraction-A $range_overlap.fractionA
31 --boundary-uncertainty-A $range_overlap.boundaryA
32 --overlap-fraction-B $range_overlap.fractionB
33 --boundary-uncertainty-B $range_overlap.boundaryB
34 #end if
35 ";
36
37 <!-- Step 3: run cgatools join with the same options that were echoed above; the dump variable expands either to the always-dump flag or to nothing -->
38 cgatools join --beta
39 --input $inputA
40 --input $inputB
41 --output $output
42 --output-mode $outmode
43 $dump
44 --select $col
45 #for $m in $matches <!-- one match option per entry of the matches repeat -->
46 --match ${m.match}
47 #end for
48 #if $range_overlap.range == 'yes'
49 #for $o in $range_overlap.overlaps <!-- one overlap option per entry of the overlaps repeat; only emitted when range matching is switched on -->
50 --overlap ${o.overlap}
51 #end for
52 --overlap-mode $range_overlap.overlapmode
53 --overlap-fraction-A $range_overlap.fractionA
54 --boundary-uncertainty-A $range_overlap.boundaryA
55 --overlap-fraction-B $range_overlap.fractionB
56 --boundary-uncertainty-B $range_overlap.boundaryB
57 #end if
58 </command>
59
60 <outputs> <!-- the single result dataset; the command template writes to it through its output option -->
61 <data name="output" format="tabular" label="${tool.name} output"/>
62 </outputs>
63
64 <inputs>
65 <!-- file A: the first of the two tabular datasets handed to the join command -->
66 <param type="data" name="inputA" format="tabular" label="Select input file A ">
67 <validator type="dataset_ok_validator"/>
68 <validator type="dataset_metadata_in_file" metadata_name="dbkey"
69 metadata_column="0" filename="cg_crr_files.loc"
70 message="cgatools is not currently available for this build."/>
71 </param>
72
73 <!-- file B: the second of the two tabular datasets handed to the join command -->
74 <param type="data" name="inputB" format="tabular" label="Select input file B ">
75 <validator type="dataset_ok_validator"/>
76 <validator type="dataset_metadata_in_file" metadata_name="dbkey"
77 metadata_column="0" filename="cg_crr_files.loc"
78 message="cgatools is not currently available for this build."/>
79 </param>
80
81 <!-- comma-separated list of output columns; the value is passed to the select option of cgatools join and may not be empty -->
82 <param name="col" type="text" value="A.*,B.*" size="40" label="Specify columns for output" help="The default value A.*,B.* prints all columns from both files; for other selections, enter them in the format A.col_name1,A.col_name3,B.col_name1">
83 <validator type="empty_field" message="You must specify columns to print, the default is A.*,B.*"/>
84 </param>
85
86 <!-- output layout; the chosen value is passed to the output-mode option -->
87 <param type="select" name="outmode" label="Select output mode">
88 <option selected="true" value="full">full (1 line for each match of records in A and B)</option>
89 <option value="compact">compact (1 line for each record in A, joining multiple records in B by semicolon)</option>
90 <option value="compact-pct">compact-pct (same as compact, annotated with % overlap)</option>
91 </param>
92
93 <!-- whether unmatched records of A are printed; the option value is inserted verbatim into the command line, so the empty value adds no flag -->
94 <param type="select" name="dump" label="Select records to print">
95 <option selected="true" value="--always-dump">print all records of A even if not matched in B</option>
96 <option value="">print only records of A that are matched in B</option>
97 </param>
98
99 <!-- exact-match column pairs; at least one entry is required and each entry becomes one match option on the command line -->
100 <repeat name="matches" title="Exact match column" min="1">
101 <param name="match" type="text" size="40" label="Enter column:column" help="Enter column_from_A:column_from_B, e.g. chromosome:chromosome">
102 <validator type="empty_field" message="You must specify columns to match"/>
103 </param>
104 </repeat>
105
106 <!-- optional matching by overlapping ranges; the command template only emits the overlap options when "yes" is selected -->
107 <conditional name="range_overlap">
108 <param name="range" type="select" label="Do you want to match columns by overlapping range?">
109 <option value="no">no</option>
110 <option value="yes">yes</option>
111 </param>
112 <when value="no"/> <!-- explicit empty branch: no additional inputs when overlap matching is switched off -->
113 <when value="yes">
114 <!-- range column pairs; each entry becomes one overlap option on the command line -->
115 <repeat name="overlaps" title="Range column">
116 <param name="overlap" type="text" size="40" label="Enter column&#91;,column&#93;:column&#91;,column&#93;" help="Enter range_start_from_A&#91;,range_stop_from_A&#93;:range_start_from_B&#91;,range_stop_from_B&#93;, e.g. begin,end:begin,end (overlapping range of positions) or begin,end:position"/>
117 </repeat>
118
119 <!-- overlap mode; the labels restate the definitions from the command line reference (strict: A.begin < B.end and B.begin < A.end) -->
120 <param name="overlapmode" type="select" label="Select overlap mode">
121 <option value="strict" selected="true">strict (overlap if A.begin&lt;B.end and B.begin&lt;A.end)</option>
122 <option value="allow-abutting-points">allow-abutting-points (overlap if A.begin&lt;B.end and B.begin&lt;A.end, or if A.begin&lt;=B.end and B.begin&lt;=A.end and either A or B has zero length.)</option>
123 </param>
124
125 <!-- overlap filtering options; the fractions are float parameters so that fractional values can be entered, the boundary uncertainties stay integers -->
126 <param name="fractionA" type="float" value="0" label="Minimum fraction of A region overlap (default 0)" />
127 <param name="boundaryA" type="integer" value="0" label="Boundary uncertainty for A for overlap filtering (default 0)" help="Records failing the following boundary-uncertainty calculation are not included in the output: overlap length >= overlap-fraction-A * (A-range-length - boundary-uncertainty-A)"/>
128
129 <param name="fractionB" type="float" value="0" label="Minimum fraction of B region overlap (default 0)" />
130 <param name="boundaryB" type="integer" value="0" label="Boundary uncertainty for B for overlap filtering (default 0)" help="Records failing the following boundary-uncertainty calculation are not included in the output: overlap length >= overlap-fraction-B * (B-range-length - boundary-uncertainty-B)"/>
131 </when>
132 </conditional>
133 </inputs>
134
135 <help>
136
137 **What it does**
138
139 This tool joins two tab-delimited files based on equal fields or overlapping regions.
140
141 **cgatools 1.6.0 Documentation**
142
143 Userguide: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-user-guide.pdf
144
145 Release notes: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-release-notes.pdf
146
147 **Command line reference**::
148
149 COMMAND NAME
150 join - Joins two tab-delimited files based on equal fields or overlapping regions.
151
152 DESCRIPTION
153 Joins two tab-delimited files based on equal fields or overlapping regions.
154 By default, an output record is produced for each match found between file
155 A and file B, but output format can be controlled by the --output-mode
156 parameter.
157
158 OPTIONS
159 -h [ --help ]
160 Print this help message.
161
162 --beta
163 This is a beta command. To run this command, you must pass the --beta
164 flag.
165
166 --input arg
167 File name to use as input (may be passed in as arguments at the end of
168 the command, or omitted for stdin). There must be exactly two input
169 files to join. If only one file is specified by name, file A is taken
170 to be stdin and file B is the named file. File B is read fully into
171 memory, and file A is streamed. File A's columns appear first in the
172 output.
173
174 --output arg (=STDOUT)
175 The output file name (may be omitted for stdout).
176
177 --match arg
178 A match specification, which is a column from A and a column from B
179 separated by a colon.
180
181 --overlap arg
182 Overlap specification. An overlap specification consists of a range
183 definition for files A and B, separated by a colon. A range definition
184 may be two columns, in which case they are interpreted as the beginning
185 and end of the range. Or it may be one column, in which case the range
186 is defined as the 1-base range starting at the given value. The records
187 from the two files must overlap in order to be considered for output.
188 Two ranges are considered to overlap if the overlap is at least one
189 base long, or if one of the ranges is length 0 and the ranges overlap
190 or abut. For example, "begin,end:offset" will match wherever end-begin
191 &gt; 0, begin&lt;offset+1, and end&gt;offset, or wherever end-begin = 0,
192 begin&lt;=offset+1, and end&gt;=offset.
193
194
195 -m [ --output-mode ] arg (=full)
196 Output mode, one of the following:
197 full Print an output record for each match found between
198 file A and file B.
199 compact Print at most one record for each record of file A,
200 joining the file B values by a semicolon and
201 suppressing repeated B values and empty B values.
202 compact-pct Same as compact, but for each distinct B value,
203 annotate with the percentage of the A record that is
204 overlapped by B records with that B value. Percentage
205 is rounded up to nearest integer.
206
207 --overlap-mode arg (=strict)
208 Overlap mode, one of the following:
209 strict Range A and B overlap if A.begin &lt; B.end and
210 B.begin &lt; A.end.
211 allow-abutting-points Range A and B overlap if they meet the strict
212 requirements, or if A.begin &lt;= B.end and
213 B.begin &lt;= A.end and either A or B has zero
214 length.
215
216 --select arg (=A.*,B.*)
217 Set of fields to select for output.
218
219 -a [ --always-dump ]
220 Dump every record of A, even if there are no matches with file B.
221
222 --overlap-fraction-A arg (=0)
223 Minimum fraction of A region overlap for filtering output.
224
225 --boundary-uncertainty-A arg (=0)
226 Boundary uncertainty for overlap filtering. Specifically, records
227 failing the following predicate are filtered away: overlap &gt;=
228 overlap-fraction-A * ( A-range-length - boundary-uncertainty-A )
229
230 --overlap-fraction-B arg (=0)
231 Minimum fraction of B region overlap for filtering output.
232
233 --boundary-uncertainty-B arg (=0)
234 Boundary uncertainty for overlap filtering. Specifically, records
235 failing the following predicate are filtered away: overlap &gt;=
236 overlap-fraction-B * ( B-range-length - boundary-uncertainty-B )
237
238 SUPPORTED FORMAT_VERSION
239 Any
240 </help>
241 </tool>