0
|
1 <tool id="cg_join" name="join(beta) 1.6" version="1.0.1">
|
|
2 <!--
|
|
3 This tool creates a GUI for the join function of cgatools from Complete Genomics, Inc.
|
|
4 written 6-18-2012 by bcrain@completegenomics.com
|
|
5 updated 8-14-2012 by bcrain@completegenomics.com
|
|
6 -->
|
|
7
|
|
8 <description>two tsv files based on equal fields or overlapping regions.</description>
|
|
9
|
|
10 <command>
|
|
11 <!-- print version of cgatools to STDOUT-->
|
|
12 cgatools | head -1;
|
|
13
|
|
14 <!-- print command lines to STDOUT-->
|
|
15 echo "cgatools join --beta
|
|
16 --input $inputA
|
|
17 --input $inputB
|
|
18 --output $output
|
|
19 --output-mode $outmode
|
|
20 $dump
|
|
21 --select $col
|
|
22 #for $m in $matches <!--get all matched columns-->
|
|
23 --match ${m.match}
|
|
24 #end for
|
|
25 #if $range_overlap.range == 'yes'
|
|
26 #for $o in $range_overlap.overlaps <!--get all overlapped columns-->
|
|
27 --overlap ${o.overlap}
|
|
28 #end for
|
|
29 --overlap-mode $range_overlap.overlapmode
|
|
30 --overlap-fraction-A $range_overlap.fractionA
|
|
31 --boundary-uncertainty-A $range_overlap.boundaryA
|
|
32 --overlap-fraction-B $range_overlap.fractionB
|
|
33 --boundary-uncertainty-B $range_overlap.boundaryB
|
|
34 #end if
|
|
35 ";
|
|
36
|
|
37 <!-- execute cgatools-->
|
|
38 cgatools join --beta
|
|
39 --input $inputA
|
|
40 --input $inputB
|
|
41 --output $output
|
|
42 --output-mode $outmode
|
|
43 $dump
|
|
44 --select $col
|
|
45 #for $m in $matches <!--get all matched columns-->
|
|
46 --match ${m.match}
|
|
47 #end for
|
|
48 #if $range_overlap.range == 'yes'
|
|
49 #for $o in $range_overlap.overlaps <!--get all overlapped columns-->
|
|
50 --overlap ${o.overlap}
|
|
51 #end for
|
|
52 --overlap-mode $range_overlap.overlapmode
|
|
53 --overlap-fraction-A $range_overlap.fractionA
|
|
54 --boundary-uncertainty-A $range_overlap.boundaryA
|
|
55 --overlap-fraction-B $range_overlap.fractionB
|
|
56 --boundary-uncertainty-B $range_overlap.boundaryB
|
|
57 #end if
|
|
58 </command>
|
|
59
|
|
60 <outputs>
|
|
61 <data format="tabular" name="output" label="${tool.name} output"/>
|
|
62 </outputs>
|
|
63
|
|
64 <inputs>
|
|
65 <!--form field to select input file A-->
|
|
66 <param name="inputA" type="data" format="tabular" label="Select input file A ">
|
|
67 <validator type="dataset_ok_validator" />
|
|
68 <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc"
|
|
69 metadata_name="dbkey" metadata_column="0"
|
|
70 message="cgatools is not currently available for this build."/>
|
|
71 </param>
|
|
72
|
|
73 <!--form field to select input file B-->
|
|
74 <param name="inputB" type="data" format="tabular" label="Select input file B ">
|
|
75 <validator type="dataset_ok_validator" />
|
|
76 <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc"
|
|
77 metadata_name="dbkey" metadata_column="0"
|
|
78 message="cgatools is not currently available for this build."/>
|
|
79 </param>
|
|
80
|
|
81 <!--form field to specify columns to print-->
|
|
82 <param name="col" type="text" value="A.*,B.*" size="40" label="Specify columns for output" help="The default value A.*,B.* prints all columns from both files, other selections enter in the format A.col_name1,A.col_name3,B.col_name1">
|
|
83 <validator type="empty_field" message="You must specify columns to print, the default is A.*,B.*"/>
|
|
84 </param>
|
|
85
|
|
86 <!--form field to select output-mode-->
|
|
87 <param name="outmode" type="select" label="Select output mode">
|
|
88 <option value="full" selected="true">full (1 line for each match of records in A and B)</option>
|
|
89 <option value="compact">compact (1 line for each record in A, joining multiple records in B by semicolon)</option>
|
|
90 <option value="compact-pct">compact-pct (same as compact, annotated with % overlap)</option>
|
|
91 </param>
|
|
92
|
|
93 <!--form field to select dumping mode-->
|
|
94 <param name="dump" type="select" label="Select records to print">
|
|
95 <option value="--always-dump" selected="true">print all records of A even if not matched in B</option>
|
|
96 <option value="">print only records of A that are matched in B</option>
|
|
97 </param>
|
|
98
|
|
99 <!--form field to specify columns to match-->
|
|
100 <repeat name="matches" title="Exact match column" min="1">
|
|
101 <param name="match" type="text" size="40" label="Enter column:column" help="Enter column_from_A:column_from_B, e.g. chromosome:chromosome">
|
|
102 <validator type="empty_field" message="You must specify columns to match"/>
|
|
103 </param>
|
|
104 </repeat>
|
|
105
|
|
106 <!--form field to select range overlaps-->
|
|
107 <conditional name="range_overlap">
|
|
108 <param name="range" type="select" label="Do you want to match columns by overlapping range?">
|
|
109 <option value="no">no</option>
|
|
110 <option value="yes">yes</option>
|
|
111 </param>
|
|
112
|
|
113 <when value="yes">
|
|
114 <!--form field to specify columns to overlap-->
|
|
115 <repeat name="overlaps" title="Range column">
|
|
116 <param name="overlap" type="text" size="40" label="Enter column[,column]:column[,column]" help="Enter range_start_from_A[,range_stop_from_A]:range_start_from_B[,range_stop_from_B], e.g. begin,end:begin,end (overlapping range of positions) or begin,end:position"/>
|
|
117 </repeat>
|
|
118
|
|
119 <!--form field to select overlap-mode-->
|
|
120 <param name="overlapmode" type="select" label="Select overlap mode">
|
|
121 <option value="strict" selected="true">strict (overlap if A.begin&lt;B.end and B.begin&lt;A.end)</option> <!-- '<' must be written as &lt; in element text; condition matches the cgatools reference for strict mode -->
|
|
122 <option value="allow-abutting-points">allow-abutting-points (overlap if A.begin&lt;B.end and B.begin&lt;A.end, or if A.begin&lt;=B.end and B.begin&lt;=A.end and either A or B has zero length.)</option> <!-- strict condition, extended to abutting ranges when A or B has zero length -->
|
|
123 </param>
|
|
124
|
|
125 <!--form fields to enter overlap options-->
|
|
126 <param name="fractionA" type="float" value="0" label="Minimum fraction of A region overlap (default 0)" /> <!-- float: an integer field cannot hold a fractional value such as 0.5; passed to overlap-fraction-A -->
|
|
127 <param name="boundaryA" type="integer" value="0" label="Boundary uncertainty for A for overlap filtering (default 0)" help="Records failing the following boundary-uncertainty calculation are not included in the output: overlap length >= overlap-fraction-A * (A-range-length - boundary-uncertainty-A)"/>
|
|
128
|
|
129 <param name="fractionB" type="float" value="0" label="Minimum fraction of B region overlap (default 0)" /> <!-- float: an integer field cannot hold a fractional value such as 0.5; passed to overlap-fraction-B -->
|
|
130 <param name="boundaryB" type="integer" value="0" label="Boundary uncertainty for B for overlap filtering (default 0)" help="Records failing the following boundary-uncertainty calculation are not included in the output: overlap length >= overlap-fraction-B * (B-range-length - boundary-uncertainty-B)"/>
|
|
131 </when>
|
|
132 </conditional>
|
|
133 </inputs>
|
|
134
|
|
135 <help>
|
|
136
|
|
137 **What it does**
|
|
138
|
|
139 This tool joins two tab-delimited files based on equal fields or overlapping regions.
|
|
140
|
|
141 **cgatools 1.6.0 Documentation**
|
|
142
|
|
143 Userguide: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-user-guide.pdf
|
|
144
|
|
145 Release notes: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-release-notes.pdf
|
|
146
|
|
147 **Command line reference**::
|
|
148
|
|
149 COMMAND NAME
|
|
150 join - Joins two tab-delimited files based on equal fields or overlapping regions.
|
|
151
|
|
152 DESCRIPTION
|
|
153 Joins two tab-delimited files based on equal fields or overlapping regions.
|
|
154 By default, an output record is produced for each match found between file
|
|
155 A and file B, but output format can be controlled by the --output-mode
|
|
156 parameter.
|
|
157
|
|
158 OPTIONS
|
|
159 -h [ --help ]
|
|
160 Print this help message.
|
|
161
|
|
162 --beta
|
|
163 This is a beta command. To run this command, you must pass the --beta
|
|
164 flag.
|
|
165
|
|
166 --input arg
|
|
167 File name to use as input (may be passed in as arguments at the end of
|
|
168 the command, or omitted for stdin). There must be exactly two input
|
|
169 files to join. If only one file is specified by name, file A is taken
|
|
170 to be stdin and file B is the named file. File B is read fully into
|
|
171 memory, and file A is streamed. File A's columns appear first in the
|
|
172 output.
|
|
173
|
|
174 --output arg (=STDOUT)
|
|
175 The output file name (may be omitted for stdout).
|
|
176
|
|
177 --match arg
|
|
178 A match specification, which is a column from A and a column from B
|
|
179 separated by a colon.
|
|
180
|
|
181 --overlap arg
|
|
182 Overlap specification. An overlap specification consists of a range
|
|
183 definition for files A and B, separated by a colon. A range definition
|
|
184 may be two columns, in which case they are interpreted as the beginning
|
|
185 and end of the range. Or it may be one column, in which case the range
|
|
186 is defined as the 1-base range starting at the given value. The records
|
|
187 from the two files must overlap in order to be considered for output.
|
|
188 Two ranges are considered to overlap if the overlap is at least one
|
|
189 base long, or if one of the ranges is length 0 and the ranges overlap
|
|
190 or abut. For example, "begin,end:offset" will match wherever end-begin
|
|
191 > 0, begin&lt;offset+1, and end>offset, or wherever end-begin = 0,
|
|
192 begin&lt;=offset+1, and end>=offset.
|
|
193
|
|
194
|
|
195 -m [ --output-mode ] arg (=full)
|
|
196 Output mode, one of the following:
|
|
197 full Print an output record for each match found between
|
|
198 file A and file B.
|
|
199 compact Print at most one record for each record of file A,
|
|
200 joining the file B values by a semicolon and
|
|
201 suppressing repeated B values and empty B values.
|
|
202 compact-pct Same as compact, but for each distinct B value,
|
|
203 annotate with the percentage of the A record that is
|
|
204 overlapped by B records with that B value. Percentage
|
|
205 is rounded up to nearest integer.
|
|
206
|
|
207 --overlap-mode arg (=strict)
|
|
208 Overlap mode, one of the following:
|
|
209 strict Range A and B overlap if A.begin < B.end and
|
|
210 B.begin < A.end.
|
|
211 allow-abutting-points Range A and B overlap if they meet the strict
|
|
212 requirements, or if A.begin <= B.end and
|
|
213 B.begin <= A.end and either A or B has zero
|
|
214 length.
|
|
215
|
|
216 --select arg (=A.*,B.*)
|
|
217 Set of fields to select for output.
|
|
218
|
|
219 -a [ --always-dump ]
|
|
220 Dump every record of A, even if there are no matches with file B.
|
|
221
|
|
222 --overlap-fraction-A arg (=0)
|
|
223 Minimum fraction of A region overlap for filtering output.
|
|
224
|
|
225 --boundary-uncertainty-A arg (=0)
|
|
226 Boundary uncertainty for overlap filtering. Specifically, records
|
|
227 failing the following predicate are filtered away: overlap >=
|
|
228 overlap-fraction-A * ( A-range-length - boundary-uncertainty-A )
|
|
229
|
|
230 --overlap-fraction-B arg (=0)
|
|
231 Minimum fraction of B region overlap for filtering output.
|
|
232
|
|
233 --boundary-uncertainty-B arg (=0)
|
|
234 Boundary uncertainty for overlap filtering. Specifically, records
|
|
235 failing the following predicate are filtered away: overlap >=
|
|
236 overlap-fraction-B * ( B-range-length - boundary-uncertainty-B )
|
|
237
|
|
238 SUPPORTED FORMAT_VERSION
|
|
239 Any
|
|
240 </help>
|
|
241 </tool>
|