Mercurial > repos > devteam > testing_cgatools
comparison testing_cgatools-982e19c29ec0/cgatools/tools/cgatools_1.6/join.xml @ 0:ef23f9cd599b draft default tip
Uploaded
| author | devteam |
|---|---|
| date | Thu, 27 Sep 2012 13:37:59 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:ef23f9cd599b |
|---|---|
| 1 <tool id="cg_join" name="join(beta) 1.6" version="1.0.1"> | |
| 2 <!-- | |
| 3 This tool creates a GUI for the join function of cgatools from Complete Genomics, Inc. | |
| 4 written 6-18-2012 by bcrain@completegenomics.com | |
| 5 updated 8-14-2012 by bcrain@completegenomics.com | |
| 6 --> | |
| 7 | |
| 8 <description>two tsv files based on equal fields or overlapping regions.</description> | |
| 9 | |
| 10 <command> | |
| 11 <!-- print version of cgatools to STDOUT--> | |
| 12 cgatools | head -1; | |
| 13 | |
| 14 <!-- print command lines to STDOUT--> | |
| 15 echo "cgatools join --beta | |
| 16 --input $inputA | |
| 17 --input $inputB | |
| 18 --output $output | |
| 19 --output-mode $outmode | |
| 20 $dump | |
| 21 --select $col | |
| 22 #for $m in $matches <!--get all matched columns--> | |
| 23 --match ${m.match} | |
| 24 #end for | |
| 25 #if $range_overlap.range == 'yes' | |
| 26 #for $o in $range_overlap.overlaps <!--get all overlapped columns--> | |
| 27 --overlap ${o.overlap} | |
| 28 #end for | |
| 29 --overlap-mode $range_overlap.overlapmode | |
| 30 --overlap-fraction-A $range_overlap.fractionA | |
| 31 --boundary-uncertainty-A $range_overlap.boundaryA | |
| 32 --overlap-fraction-B $range_overlap.fractionB | |
| 33 --boundary-uncertainty-B $range_overlap.boundaryB | |
| 34 #end if | |
| 35 "; | |
| 36 | |
| 37 <!-- execute cgatools--> | |
| 38 cgatools join --beta | |
| 39 --input $inputA | |
| 40 --input $inputB | |
| 41 --output $output | |
| 42 --output-mode $outmode | |
| 43 $dump | |
| 44 --select $col | |
| 45 #for $m in $matches <!--get all matched columns--> | |
| 46 --match ${m.match} | |
| 47 #end for | |
| 48 #if $range_overlap.range == 'yes' | |
| 49 #for $o in $range_overlap.overlaps <!--get all overlapped columns--> | |
| 50 --overlap ${o.overlap} | |
| 51 #end for | |
| 52 --overlap-mode $range_overlap.overlapmode | |
| 53 --overlap-fraction-A $range_overlap.fractionA | |
| 54 --boundary-uncertainty-A $range_overlap.boundaryA | |
| 55 --overlap-fraction-B $range_overlap.fractionB | |
| 56 --boundary-uncertainty-B $range_overlap.boundaryB | |
| 57 #end if | |
| 58 </command> | |
| 59 | |
| 60 <outputs> | |
| 61 <data format="tabular" name="output" label="${tool.name} output"/> | |
| 62 </outputs> | |
| 63 | |
| 64 <inputs> | |
| 65 <!-- Input file A: a tabular dataset. It must pass dataset_ok_validator, and its dbkey metadata must be listed in column 0 of cg_crr_files.loc; otherwise the message below is shown. --> | |
| 66 <param name="inputA" type="data" format="tabular" label="Select input file A "> | |
| 67 <validator type="dataset_ok_validator" /> | |
| 68 <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc" | |
| 69 metadata_name="dbkey" metadata_column="0" | |
| 70 message="cgatools is not currently available for this build."/> | |
| 71 </param> | |
| 72 | |
| 73 <!-- Input file B: a tabular dataset. It must pass dataset_ok_validator, and its dbkey metadata must be listed in column 0 of cg_crr_files.loc; otherwise the message below is shown. --> | |
| 74 <param name="inputB" type="data" format="tabular" label="Select input file B "> | |
| 75 <validator type="dataset_ok_validator" /> | |
| 76 <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc" | |
| 77 metadata_name="dbkey" metadata_column="0" | |
| 78 message="cgatools is not currently available for this build."/> | |
| 79 </param> | |
| 80 | |
| 81 <!-- Columns to print, passed to the select option of cgatools join. The field may not be empty; the default A.*,B.* selects every column of both files. --> | |
| 82 <param name="col" type="text" value="A.*,B.*" size="40" label="Specify columns for output" help="The default value A.*,B.* prints all columns from both files, other selections enter in the format A.col_name1,A.col_name3,B.col_name1"> | |
| 83 <validator type="empty_field" message="You must specify columns to print, the default is A.*,B.*"/> | |
| 84 </param> | |
| 85 | |
| 86 <!-- Output mode passed to the output-mode option of cgatools join: full (the default selection), compact or compact-pct. --> | |
| 87 <param name="outmode" type="select" label="Select output mode"> | |
| 88 <option value="full" selected="true">full (1 line for each match of records in A and B)</option> | |
| 89 <option value="compact">compact (1 line for each record in A, joining multiple records in B by semicolon)</option> | |
| 90 <option value="compact-pct">compact-pct (same as compact, annotated with % overlap)</option> | |
| 91 </param> | |
| 92 | |
| 93 <!-- Selects which records of A are printed. The chosen option value is substituted for $dump in the command: either the always-dump flag (default selection) or an empty string, which omits the flag. --> | |
| 94 <param name="dump" type="select" label="Select records to print"> | |
| 95 <option value="--always-dump" selected="true">print all records of A even if not matched in B</option> | |
| 96 <option value="">print only records of A that are matched in B</option> | |
| 97 </param> | |
| 98 | |
| 99 <!-- Repeatable exact-match specifications (at least one). Each entry is emitted as its own match option in the command and may not be empty. --> | |
| 100 <repeat name="matches" title="Exact match column" min="1"> | |
| 101 <param name="match" type="text" size="40" label="Enter column:column" help="Enter column_from_A:column_from_B, e.g. chromosome:chromosome"> | |
| 102 <validator type="empty_field" message="You must specify columns to match"/> | |
| 103 </param> | |
| 104 </repeat> | |
| 105 | |
| 106 <!--form field to select range overlaps--> | |
| 107 <conditional name="range_overlap"> | |
| 108 <param name="range" type="select" label="Do you want to match columns by overlapping range?"> | |
| 109 <option value="no">no</option> | |
| 110 <option value="yes">yes</option> | |
| 111 </param> | |
| 112 | |
| 113 <when value="yes"> | |
| 114 <!--form field to specify columns to overlap--> | |
| 115 <repeat name="overlaps" title="Range column"> | |
| 116 <param name="overlap" type="text" size="40" label="Enter column[,column]:column[,column]" help="Enter range_start_from_A[,range_stop_from_A]:range_start_from_B[,range_stop_from_B], e.g. begin,end:begin,end (overlapping range of positions) or begin,end:position"/> | |
| 117 </repeat> | |
| 118 | |
| 119 <!-- Overlap mode passed to the overlap-mode option. The option labels restate the overlap conditions from the cgatools join command line reference: strict requires A.begin < B.end and B.begin < A.end. --> | |
| 120 <param name="overlapmode" type="select" label="Select overlap mode"> | |
| 121 <option value="strict" selected="true">strict (overlap if A.begin<B.end and B.begin<A.end)</option> | |
| 122 <option value="allow-abutting-points">allow-abutting-points (overlap if A.begin<B.end and B.begin<A.end, or if A.begin<=B.end and B.begin<=A.end and either A or B has zero length.)</option> | |
| 123 </param> | |
| 124 | |
| 125 <!-- Overlap filtering options for A and B. The overlap fractions are floats between 0 and 1 so that partial overlaps such as 0.5 can be requested; the boundary uncertainties are integers. All default to 0. --> | |
| 126 <param name="fractionA" type="float" value="0" min="0" max="1" label="Minimum fraction of A region overlap (default 0)" /> | |
| 127 <param name="boundaryA" type="integer" value="0" label="Boundary uncertainty for A for overlap filtering (default 0)" help="Records failing the following boundary-uncertainty calculation are not included in the output: overlap length >= overlap-fraction-A * (A-range-length - boundary-uncertainty-A)"/> | |
| 128 | |
| 129 <param name="fractionB" type="float" value="0" min="0" max="1" label="Minimum fraction of B region overlap (default 0)" /> | |
| 130 <param name="boundaryB" type="integer" value="0" label="Boundary uncertainty for B for overlap filtering (default 0)" help="Records failing the following boundary-uncertainty calculation are not included in the output: overlap length >= overlap-fraction-B * (B-range-length - boundary-uncertainty-B)"/> | |
| 131 </when> | |
| 132 </conditional> | |
| 133 </inputs> | |
| 134 | |
| 135 <help> | |
| 136 | |
| 137 **What it does** | |
| 138 | |
| 139 This tool joins two tab-delimited files based on equal fields or overlapping regions. | |
| 140 | |
| 141 **cgatools 1.6.0 Documentation** | |
| 142 | |
| 143 Userguide: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-user-guide.pdf | |
| 144 | |
| 145 Release notes: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-release-notes.pdf | |
| 146 | |
| 147 **Command line reference**:: | |
| 148 | |
| 149 COMMAND NAME | |
| 150 join - Joins two tab-delimited files based on equal fields or overlapping regions. | |
| 151 | |
| 152 DESCRIPTION | |
| 153 Joins two tab-delimited files based on equal fields or overlapping regions. | |
| 154 By default, an output record is produced for each match found between file | |
| 155 A and file B, but output format can be controlled by the --output-mode | |
| 156 parameter. | |
| 157 | |
| 158 OPTIONS | |
| 159 -h [ --help ] | |
| 160 Print this help message. | |
| 161 | |
| 162 --beta | |
| 163 This is a beta command. To run this command, you must pass the --beta | |
| 164 flag. | |
| 165 | |
| 166 --input arg | |
| 167 File name to use as input (may be passed in as arguments at the end of | |
| 168 the command, or omitted for stdin). There must be exactly two input | |
| 169 files to join. If only one file is specified by name, file A is taken | |
| 170 to be stdin and file B is the named file. File B is read fully into | |
| 171 memory, and file A is streamed. File A's columns appear first in the | |
| 172 output. | |
| 173 | |
| 174 --output arg (=STDOUT) | |
| 175 The output file name (may be omitted for stdout). | |
| 176 | |
| 177 --match arg | |
| 178 A match specification, which is a column from A and a column from B | |
| 179 separated by a colon. | |
| 180 | |
| 181 --overlap arg | |
| 182 Overlap specification. An overlap specification consists of a range | |
| 183 definition for files A and B, separated by a colon. A range definition | |
| 184 may be two columns, in which case they are interpreted as the beginning | |
| 185 and end of the range. Or it may be one column, in which case the range | |
| 186 is defined as the 1-base range starting at the given value. The records | |
| 187 from the two files must overlap in order to be considered for output. | |
| 188 Two ranges are considered to overlap if the overlap is at least one | |
| 189 base long, or if one of the ranges is length 0 and the ranges overlap | |
| 190 or abut. For example, "begin,end:offset" will match wherever end-begin | |
| 191 > 0, begin<offset+1, and end>offset, or wherever end-begin = 0, | |
| 192 begin<=offset+1, and end>=offset. | |
| 193 | |
| 194 | |
| 195 -m [ --output-mode ] arg (=full) | |
| 196 Output mode, one of the following: | |
| 197 full Print an output record for each match found between | |
| 198 file A and file B. | |
| 199 compact Print at most one record for each record of file A, | |
| 200 joining the file B values by a semicolon and | |
| 201 suppressing repeated B values and empty B values. | |
| 202 compact-pct Same as compact, but for each distinct B value, | |
| 203 annotate with the percentage of the A record that is | |
| 204 overlapped by B records with that B value. Percentage | |
| 205 is rounded up to nearest integer. | |
| 206 | |
| 207 --overlap-mode arg (=strict) | |
| 208 Overlap mode, one of the following: | |
| 209 strict Range A and B overlap if A.begin < B.end and | |
| 210 B.begin < A.end. | |
| 211 allow-abutting-points Range A and B overlap if they meet the strict | |
| 212 requirements, or if A.begin <= B.end and | |
| 213 B.begin <= A.end and either A or B has zero | |
| 214 length. | |
| 215 | |
| 216 --select arg (=A.*,B.*) | |
| 217 Set of fields to select for output. | |
| 218 | |
| 219 -a [ --always-dump ] | |
| 220 Dump every record of A, even if there are no matches with file B. | |
| 221 | |
| 222 --overlap-fraction-A arg (=0) | |
| 223 Minimum fraction of A region overlap for filtering output. | |
| 224 | |
| 225 --boundary-uncertainty-A arg (=0) | |
| 226 Boundary uncertainty for overlap filtering. Specifically, records | |
| 227 failing the following predicate are filtered away: overlap >= | |
| 228 overlap-fraction-A * ( A-range-length - boundary-uncertainty-A ) | |
| 229 | |
| 230 --overlap-fraction-B arg (=0) | |
| 231 Minimum fraction of B region overlap for filtering output. | |
| 232 | |
| 233 --boundary-uncertainty-B arg (=0) | |
| 234 Boundary uncertainty for overlap filtering. Specifically, records | |
| 235 failing the following predicate are filtered away: overlap >= | |
| 236 overlap-fraction-B * ( B-range-length - boundary-uncertainty-B ) | |
| 237 | |
| 238 SUPPORTED FORMAT_VERSION | |
| 239 Any | |
| 240 </help> | |
| 241 </tool> |
