Mercurial > repos > greg > validate_affy_metadata
comparison validate_affy_metadata.py @ 0:ae030e6e838c draft
Uploaded
author | greg |
---|---|
date | Wed, 14 Nov 2018 14:47:32 -0500 |
parents | |
children | 613186952d42 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:ae030e6e838c |
---|---|
1 #!/usr/bin/env python | |
2 """ | |
3 Validate the metadata file associated with Affymetrix 96 well plate data. | |
4 """ | |
5 import argparse | |
6 import datetime | |
7 import decimal | |
8 import re | |
9 import shutil | |
10 import sys | |
11 | |
# Command-line interface: --input names the metadata file to validate and
# --output names the path the file is copied to once it passes validation.
parser = argparse.ArgumentParser()
parser.add_argument('--input', dest='input', help='Metadata file for Affymetrix 96 well plate data')
# No trailing comma here: it would turn the statement into a throwaway tuple.
parser.add_argument('--output', dest='output', help='Output dataset')
args = parser.parse_args()
16 | |
# Maximum number of characters accepted in an email address.
EMAIL_MAX_LEN = 255
# Loose shape check: something@something.something with exactly one "@".
# Written as a raw string so "\." is a regex escape rather than an invalid
# Python string escape, and anchored at the end so that .match() cannot
# accept an address on the strength of a valid-looking prefix alone
# (e.g. "a@b.c@d.e").
VALID_EMAIL_RE = re.compile(r"[^@]+@[^@]+\.[^@]+$")
19 | |
20 | |
def add_error_msg(accumulated_msgs, msg):
    """Append an error message to the accumulated, newline-separated report.

    Args:
        accumulated_msgs: The error text collected so far ("" when empty).
        msg: The new error message to add.

    Returns:
        The combined report.  When nothing has been accumulated yet the
        result is just ``msg``, so the report never starts with a blank line.
    """
    if not accumulated_msgs:
        return msg
    return "%s\n%s" % (accumulated_msgs, msg)
23 | |
24 | |
def empty_value(line_no, label, accumulated_msgs):
    """Record that the required column ``label`` is blank on line ``line_no``.

    Returns the accumulated error text with the new message added via
    add_error_msg().
    """
    missing_msg = "The required %s value is missing on line %d." % (label, line_no)
    return add_error_msg(accumulated_msgs, missing_msg)
27 | |
28 | |
def stop_error(msg):
    """Abort the program by raising SystemExit carrying ``msg``."""
    # sys.exit(msg) is defined as raising SystemExit(msg); raise it directly.
    raise SystemExit(msg)
31 | |
32 | |
def validate_date_string(line_no, date_string, accumulated_msgs):
    """Check that ``date_string`` is a date written as YYYY/MM/DD.

    Args:
        line_no: Line number used in the error message.
        date_string: The text to validate.
        accumulated_msgs: The error text collected so far.

    Returns:
        ``accumulated_msgs`` unchanged when the date parses, otherwise
        ``accumulated_msgs`` with a format error for this line added.
    """
    try:
        datetime.datetime.strptime(date_string, '%Y/%m/%d')
    except ValueError:
        # %Y requires a four-digit year, so the message must say YYYY.
        return add_error_msg(accumulated_msgs, "Line %d contains an incorrect date format (must be YYYY/MM/DD)." % line_no)
    return accumulated_msgs
39 | |
40 | |
def validate_decimal(line_no, decimal_string, accumulated_msgs):
    """Check that ``decimal_string`` is a finite decimal number.

    Args:
        line_no: Line number used in the error message.
        decimal_string: The text to validate.
        accumulated_msgs: The error text collected so far.

    Returns:
        ``accumulated_msgs`` unchanged when the value is acceptable,
        otherwise ``accumulated_msgs`` with an error for this line added.
    """
    try:
        value = decimal.Decimal(decimal_string)
    except (decimal.InvalidOperation, TypeError, ValueError):
        # Only the failures Decimal() raises for unusable input are treated
        # as validation errors; anything else should surface as a real bug.
        value = None
    # Decimal() also parses "NaN" and "Infinity", which are not usable values.
    if value is None or not value.is_finite():
        return add_error_msg(accumulated_msgs, "Line %d contains an incorrect decimal value (%s)." % (line_no, decimal_string))
    return accumulated_msgs
47 | |
48 | |
def validate_email(line_no, email, accumulated_msgs):
    """Check that ``email`` looks like an email address of acceptable length.

    The shape is tested with the module-level VALID_EMAIL_RE pattern and the
    length against EMAIL_MAX_LEN.

    Args:
        line_no: Line number used in the error message.
        email: The text to validate.
        accumulated_msgs: The error text collected so far.

    Returns:
        ``accumulated_msgs`` unchanged when the address is acceptable,
        otherwise ``accumulated_msgs`` with an error for this line added.
    """
    if not VALID_EMAIL_RE.match(email):
        return add_error_msg(accumulated_msgs, "Line %d contains an invalid email address (%s). " % (line_no, email))
    if len(email) > EMAIL_MAX_LEN:
        # The format string needs three values: line, address and the limit.
        return add_error_msg(
            accumulated_msgs,
            "Line %d contains an email address (%s) that is longer than the maximum length, %d characters."
            % (line_no, email, EMAIL_MAX_LEN))
    return accumulated_msgs
55 | |
56 | |
accumulated_msgs = ""
# Parse the input file, skipping the header, and validating
# that each data line consists of 31 tab-separated items.
with open(args.input, "r") as ih:
    # Number lines from 1 so the values in error messages match what a text
    # editor shows; line 1 is the header.
    for i, line in enumerate(ih, 1):
        if i == 1:
            # Skip the header.
            continue
        if i > 97:
            # One header line plus 96 data lines: line 98 is one too many.
            accumulated_msgs = add_error_msg(accumulated_msgs, "The input file contains more than 96 data lines.")
            stop_error(accumulated_msgs)
        items = line.rstrip("\r\n").split("\t")
        if len(items) != 31:
            # The column positions below are meaningless for a malformed
            # line, so report everything found so far and stop.
            accumulated_msgs = add_error_msg(accumulated_msgs, "Line %d contains %s columns, (must be 31)." % (i, len(items)))
            stop_error(accumulated_msgs)
        # Required.
        sample_id = items[0]
        if len(sample_id) == 0:
            accumulated_msgs = empty_value(i, "sample_id", accumulated_msgs)
        # Required and validated.
        date_entered_db = items[1]
        accumulated_msgs = validate_date_string(i, date_entered_db, accumulated_msgs)
        # Required.
        user_specimen_id = items[2]
        if len(user_specimen_id) == 0:
            accumulated_msgs = empty_value(i, "user_specimen_id", accumulated_msgs)
        # Columns 3-7 are optional and not checked: duplicate_sample,
        # matching_samples, field_call, bcoral_genet_id, bsym_genet_id.
        # Required.
        reef = items[8]
        if len(reef) == 0:
            accumulated_msgs = empty_value(i, "reef", accumulated_msgs)
        # Required.
        region = items[9]
        if len(region) == 0:
            accumulated_msgs = empty_value(i, "region", accumulated_msgs)
        # Required and validated.
        latitude = items[10]
        accumulated_msgs = validate_decimal(i, latitude, accumulated_msgs)
        # Required and validated.
        longitude = items[11]
        accumulated_msgs = validate_decimal(i, longitude, accumulated_msgs)
        # Columns 12-22 are optional and not checked: geographic_origin,
        # sample_location, latitude_outplant, longitude_outplant, depth,
        # dist_shore, disease_resist, bleach_resist, mortality, tle, spawning.
        # Required.
        collector = items[23]
        if len(collector) == 0:
            accumulated_msgs = empty_value(i, "collector", accumulated_msgs)
        # Required.
        org = items[24]
        if len(org) == 0:
            accumulated_msgs = empty_value(i, "org", accumulated_msgs)
        # Required and validated.
        collection_date = items[25]
        accumulated_msgs = validate_date_string(i, collection_date, accumulated_msgs)
        # Required and validated.
        contact_email = items[26]
        accumulated_msgs = validate_email(i, contact_email, accumulated_msgs)
        # Required.
        seq_facility = items[27]
        if len(seq_facility) == 0:
            accumulated_msgs = empty_value(i, "seq_facility", accumulated_msgs)
        # Columns 28-30 are optional and not checked: array_version,
        # data_sharing, data_hold.

# Any accumulated message means validation failed: report and exit.
if len(accumulated_msgs) > 0:
    stop_error(accumulated_msgs)

# The file is valid, so pass it through unchanged as the output dataset.
shutil.copyfile(args.input, args.output)