annotate validate_affy_metadata.py @ 1:613186952d42 draft

Uploaded
author greg
date Wed, 14 Nov 2018 14:50:08 -0500
parents ae030e6e838c
children f3ca1cd544fa
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
ae030e6e838c Uploaded
greg
parents:
diff changeset
1 #!/usr/bin/env python
ae030e6e838c Uploaded
greg
parents:
diff changeset
2 """
ae030e6e838c Uploaded
greg
parents:
diff changeset
3 Validate the metadata file associated with Affymetrix 96 well plate data.
ae030e6e838c Uploaded
greg
parents:
diff changeset
4 """
ae030e6e838c Uploaded
greg
parents:
diff changeset
5 import argparse
ae030e6e838c Uploaded
greg
parents:
diff changeset
6 import datetime
ae030e6e838c Uploaded
greg
parents:
diff changeset
7 import decimal
ae030e6e838c Uploaded
greg
parents:
diff changeset
8 import re
ae030e6e838c Uploaded
greg
parents:
diff changeset
9 import shutil
ae030e6e838c Uploaded
greg
parents:
diff changeset
10 import sys
ae030e6e838c Uploaded
greg
parents:
diff changeset
11
ae030e6e838c Uploaded
greg
parents:
diff changeset
# Command line interface: the metadata file to validate and the
# dataset path that receives a copy of it once validation passes.
parser = argparse.ArgumentParser()
parser.add_argument('--input', dest='input', help='Metadata file for Affymetrix 96 well plate data')
# The stray trailing comma that used to follow this call turned the
# statement into a discarded one-element tuple; it has been removed.
parser.add_argument('--output', dest='output', help='Output dataset')
args = parser.parse_args()
ae030e6e838c Uploaded
greg
parents:
diff changeset
16
ae030e6e838c Uploaded
greg
parents:
diff changeset
# Longest email address (in characters) that validate_email accepts.
EMAIL_MAX_LEN = 255
# Loose shape check for an email address: text, "@", text, ".", text,
# where none of the text parts contains "@".  Written as a raw string so
# that "\." reaches the regex engine as an escaped dot instead of being
# an invalid Python string escape.
VALID_EMAIL_RE = re.compile(r"[^@]+@[^@]+\.[^@]+")
ae030e6e838c Uploaded
greg
parents:
diff changeset
19
ae030e6e838c Uploaded
greg
parents:
diff changeset
20
ae030e6e838c Uploaded
greg
parents:
diff changeset
def add_error_msg(accumulated_msgs, msg):
    """
    Return the accumulated error text with msg appended on a new line.

    The accumulated text always comes first and is followed by a newline,
    so an empty accumulator yields a result that starts with a newline.
    """
    joined_template = "%s\n%s"
    pieces = (accumulated_msgs, msg)
    return joined_template % pieces
ae030e6e838c Uploaded
greg
parents:
diff changeset
23
ae030e6e838c Uploaded
greg
parents:
diff changeset
24
ae030e6e838c Uploaded
greg
parents:
diff changeset
def empty_value(line_no, label, accumulated_msgs):
    """
    Record that the required column named label is blank on line line_no.

    Returns accumulated_msgs extended (via add_error_msg) with the
    "required value is missing" message.
    """
    missing_msg = "The required %s value is missing on line %d." % (label, line_no)
    return add_error_msg(accumulated_msgs, missing_msg)
ae030e6e838c Uploaded
greg
parents:
diff changeset
27
ae030e6e838c Uploaded
greg
parents:
diff changeset
28
ae030e6e838c Uploaded
greg
parents:
diff changeset
def stop_error(msg):
    """
    Abort the program by raising SystemExit with msg as its payload.

    This is what sys.exit(msg) does; it never returns to the caller.
    """
    raise SystemExit(msg)
ae030e6e838c Uploaded
greg
parents:
diff changeset
31
ae030e6e838c Uploaded
greg
parents:
diff changeset
32
ae030e6e838c Uploaded
greg
parents:
diff changeset
def validate_date_string(line_no, date_string, accumulated_msgs):
    """
    Check that date_string is a date written as YYYY/MM/DD.

    line_no is only used in the error message.  Returns accumulated_msgs
    unchanged when the date parses, otherwise accumulated_msgs with an
    error message appended.  An empty string does not parse, so a
    missing value is reported as an incorrect date.
    """
    try:
        # Only the parse can raise; the parsed value itself is not needed.
        datetime.datetime.strptime(date_string, '%Y/%m/%d')
    except ValueError:
        # %Y is a four-digit year, so the message says YYYY (it used to
        # say YY, which contradicted the format being enforced).
        return add_error_msg(accumulated_msgs, "Line %d contains an incorrect date format (must be YYYY/MM/DD)." % line_no)
    return accumulated_msgs
ae030e6e838c Uploaded
greg
parents:
diff changeset
39
ae030e6e838c Uploaded
greg
parents:
diff changeset
40
ae030e6e838c Uploaded
greg
parents:
diff changeset
def validate_decimal(line_no, decimal_string, accumulated_msgs):
    """
    Check that decimal_string is a finite decimal number.

    line_no is only used in the error message.  Returns accumulated_msgs
    unchanged when the value is acceptable, otherwise accumulated_msgs
    with an error message appended.  An empty string is not a valid
    decimal, so a missing value is reported as incorrect.
    """
    try:
        value = decimal.Decimal(decimal_string)
    except (decimal.InvalidOperation, TypeError, ValueError):
        # Malformed text (InvalidOperation) or an argument of an
        # unsupported type/shape (TypeError, ValueError).
        value = None
    # Decimal() happily parses "NaN" and "Infinity"; neither is a usable
    # coordinate, so anything that is not finite is rejected as well.
    if value is None or not value.is_finite():
        return add_error_msg(accumulated_msgs, "Line %d contains an incorrect decimal value (%s)." % (line_no, decimal_string))
    return accumulated_msgs
ae030e6e838c Uploaded
greg
parents:
diff changeset
47
ae030e6e838c Uploaded
greg
parents:
diff changeset
48
ae030e6e838c Uploaded
greg
parents:
diff changeset
def validate_email(line_no, email, accumulated_msgs):
    """
    Check that email looks like an address and is not too long.

    The shape check uses VALID_EMAIL_RE.match, which anchors at the start
    of the string only.  line_no is only used in the error messages.
    Returns accumulated_msgs unchanged when both checks pass, otherwise
    accumulated_msgs with one error message appended.  An empty string
    fails the shape check, so a missing value is reported as invalid.
    """
    if not VALID_EMAIL_RE.match(email):
        return add_error_msg(accumulated_msgs, "Line %d contains an invalid email address (%s). " % (line_no, email))
    if len(email) > EMAIL_MAX_LEN:
        # The original format string contained "(%)" and supplied only two
        # values, so reaching this branch raised ValueError instead of
        # reporting the problem.
        return add_error_msg(accumulated_msgs, "Line %d contains an email address (%s) that is longer than the maximum length, %d characters." % (line_no, email, EMAIL_MAX_LEN))
    return accumulated_msgs
ae030e6e838c Uploaded
greg
parents:
diff changeset
55
ae030e6e838c Uploaded
greg
parents:
diff changeset
56
ae030e6e838c Uploaded
greg
parents:
diff changeset
accumulated_msgs = ""
# Parse the input file, skipping the header, and validating
# that each data line consists of 31 tab-separated items.
#
# Column layout (0-based index).  Columns marked with * are checked
# below; all others are optional and are accepted as-is.
#    0 sample_id*          1 date_entered_db*     2 user_specimen_id*
#    3 duplicate_sample    4 matching_samples     5 field_call
#    6 bcoral_genet_id     7 bsym_genet_id        8 reef*
#    9 region*            10 latitude*           11 longitude*
#   12 geographic_origin  13 sample_location     14 latitude_outplant
#   15 longitude_outplant 16 depth               17 dist_shore
#   18 disease_resist     19 bleach_resist       20 mortality
#   21 tle                22 spawning            23 collector*
#   24 org*               25 collection_date*    26 contact_email*
#   27 seq_facility*      28 array_version       29 data_sharing
#   30 data_hold
#
# Line numbers in the messages count the header as line 0, so the
# first data line is reported as line 1.
with open(args.input, "r") as ih:
    for i, line in enumerate(ih):
        if i == 0:
            # Skip the header.
            continue
        line = line.rstrip("\r\n")
        # Data lines are numbered 1..96, so line 97 is one too many.
        if i > 96:
            accumulated_msgs = add_error_msg(accumulated_msgs, "The input file contains more than 96 data lines.")
            stop_error(accumulated_msgs)
        items = line.split("\t")
        if len(items) != 31:
            # The column indexes used below would be wrong, so give up
            # here and report everything collected so far.
            accumulated_msgs = add_error_msg(accumulated_msgs, "Line %d contains %s columns, (must be 31)." % (i, len(items)))
            stop_error(accumulated_msgs)
        # Required.
        sample_id = items[0]
        if not sample_id:
            accumulated_msgs = empty_value(i, "sample_id", accumulated_msgs)
        # Required and validated.
        date_entered_db = items[1]
        accumulated_msgs = validate_date_string(i, date_entered_db, accumulated_msgs)
        # Required.
        user_specimen_id = items[2]
        if not user_specimen_id:
            accumulated_msgs = empty_value(i, "user_specimen_id", accumulated_msgs)
        # Required.
        reef = items[8]
        if not reef:
            accumulated_msgs = empty_value(i, "reef", accumulated_msgs)
        # Required.
        region = items[9]
        if not region:
            accumulated_msgs = empty_value(i, "region", accumulated_msgs)
        # Required and validated.
        latitude = items[10]
        accumulated_msgs = validate_decimal(i, latitude, accumulated_msgs)
        # Required and validated.
        longitude = items[11]
        accumulated_msgs = validate_decimal(i, longitude, accumulated_msgs)
        # Required.
        collector = items[23]
        if not collector:
            accumulated_msgs = empty_value(i, "collector", accumulated_msgs)
        # Required.
        org = items[24]
        if not org:
            accumulated_msgs = empty_value(i, "org", accumulated_msgs)
        # Required and validated.
        collection_date = items[25]
        accumulated_msgs = validate_date_string(i, collection_date, accumulated_msgs)
        # Required and validated.
        contact_email = items[26]
        accumulated_msgs = validate_email(i, contact_email, accumulated_msgs)
        # Required.
        seq_facility = items[27]
        if not seq_facility:
            accumulated_msgs = empty_value(i, "seq_facility", accumulated_msgs)

# Report every problem found in one go; the output is only written
# when the whole file validated cleanly.
if accumulated_msgs:
    stop_error(accumulated_msgs)

shutil.copyfile(args.input, args.output)