comparison validate_affy_metadata.py @ 0:ae030e6e838c draft

Uploaded
author greg
date Wed, 14 Nov 2018 14:47:32 -0500
parents
children 613186952d42
comparison
equal deleted inserted replaced
-1:000000000000 0:ae030e6e838c
1 #!/usr/bin/env python
2 """
3 Validate the metadata file associated with Affymetrix 96 well plate data.
4 """
5 import argparse
6 import datetime
7 import decimal
8 import re
9 import shutil
10 import sys
11
# Command-line interface: --input is the metadata file to validate and
# --output is the path the validated file is copied to.
parser = argparse.ArgumentParser()
parser.add_argument('--input', dest='input', help='Metadata file for Affymetrix 96 well plate data')
# No trailing comma here: it would turn the call into a throwaway tuple.
parser.add_argument('--output', dest='output', help='Output dataset')
args = parser.parse_args()
16
# Longest email address that passes validation.
EMAIL_MAX_LEN = 255
# Minimal shape check: one "@" followed by a domain containing a dot.
# Written as a raw string so "\." reaches the regex engine untouched, and
# anchored with \Z so that, when used with match(), trailing text after a
# valid-looking prefix (for example a second "@") is rejected.
VALID_EMAIL_RE = re.compile(r"[^@]+@[^@]+\.[^@]+\Z")
19
20
def add_error_msg(accumulated_msgs, msg):
    """Return accumulated_msgs with msg appended on a new line.

    A newline separator is always inserted, even when accumulated_msgs is
    empty, so the first recorded message starts with a line break.
    """
    return "{0}\n{1}".format(accumulated_msgs, msg)
23
24
def empty_value(line_no, label, accumulated_msgs):
    """Record that the required column `label` has no value on line `line_no`.

    Returns accumulated_msgs extended, through add_error_msg, with the
    corresponding error text.
    """
    missing_msg = "The required %s value is missing on line %d." % (label, line_no)
    return add_error_msg(accumulated_msgs, missing_msg)
27
28
def stop_error(msg):
    """Abort the script by raising SystemExit with msg as its exit argument."""
    raise SystemExit(msg)
31
32
def validate_date_string(line_no, date_string, accumulated_msgs):
    """Check that date_string is a date written as YYYY/MM/DD.

    Returns accumulated_msgs unchanged when the value parses with the
    '%Y/%m/%d' format; otherwise returns accumulated_msgs extended with an
    error that names line_no.
    """
    try:
        datetime.datetime.strptime(date_string, '%Y/%m/%d')
    except ValueError:
        # %Y is a four-digit year, so the hint shown to the user must say
        # YYYY/MM/DD rather than YY/MM/DD.
        return add_error_msg(accumulated_msgs, "Line %d contains an incorrect date format (must be YYYY/MM/DD)." % line_no)
    return accumulated_msgs
39
40
def validate_decimal(line_no, decimal_string, accumulated_msgs):
    """Check that decimal_string is a finite decimal number.

    Returns accumulated_msgs unchanged when the value is acceptable;
    otherwise returns accumulated_msgs extended with an error that names
    line_no and echoes the offending value.
    """
    try:
        value = decimal.Decimal(decimal_string)
    except (decimal.InvalidOperation, TypeError, ValueError):
        # Only the failures Decimal() raises for bad input are treated as a
        # validation error; anything else is a real bug and should surface.
        value = None
    # Decimal() also parses "NaN" and "Infinity"; those are not usable
    # numeric values, so they are rejected as well.
    if value is not None and value.is_finite():
        return accumulated_msgs
    return add_error_msg(accumulated_msgs, "Line %d contains an incorrect decimal value (%s)." % (line_no, decimal_string))
47
48
def validate_email(line_no, email, accumulated_msgs):
    """Check that email looks like an address and is not too long.

    The value must match VALID_EMAIL_RE and be at most EMAIL_MAX_LEN
    characters.  Returns accumulated_msgs unchanged when both checks pass;
    otherwise returns accumulated_msgs extended with an error that names
    line_no and echoes the offending value.
    """
    if not VALID_EMAIL_RE.match(email):
        return add_error_msg(accumulated_msgs, "Line %d contains an invalid email address (%s). " % (line_no, email))
    if len(email) > EMAIL_MAX_LEN:
        # The message has three placeholders (line, address, limit); all
        # three values must be supplied or the formatting itself raises.
        return add_error_msg(accumulated_msgs, "Line %d contains an email address (%s) that is longer than the maximum length, %d characters." % (line_no, email, EMAIL_MAX_LEN))
    return accumulated_msgs
55
56
# Layout of the metadata file.  Line numbers are 1-based so that the numbers
# reported in error messages match what the user sees in an editor.
HEADER_LINE_NO = 1
MAX_DATA_LINES = 96
EXPECTED_COLUMNS = 31

# Columns that are checked, in file order: (column index, label, validator).
# A validator of None means the value only has to be non-empty; otherwise the
# validator is called as validator(line_no, value, accumulated_msgs).
# Every column not listed here is optional and accepted as-is:
#   3 duplicate_sample, 4 matching_samples, 5 field_call, 6 bcoral_genet_id,
#   7 bsym_genet_id, 12 geographic_origin, 13 sample_location,
#   14 latitude_outplant, 15 longitude_outplant, 16 depth, 17 dist_shore,
#   18 disease_resist, 19 bleach_resist, 20 mortality, 21 tle, 22 spawning,
#   28 array_version, 29 data_sharing, 30 data_hold.
COLUMN_CHECKS = (
    (0, "sample_id", None),
    (1, "date_entered_db", validate_date_string),
    (2, "user_specimen_id", None),
    (8, "reef", None),
    (9, "region", None),
    (10, "latitude", validate_decimal),
    (11, "longitude", validate_decimal),
    (23, "collector", None),
    (24, "org", None),
    (25, "collection_date", validate_date_string),
    (26, "contact_email", validate_email),
    (27, "seq_facility", None),
)

accumulated_msgs = ""
# Parse the input file, skipping the header, and validating
# that each data line consists of 31 tab-separated items.
with open(args.input, "r") as ih:
    for line_no, line in enumerate(ih, HEADER_LINE_NO):
        if line_no == HEADER_LINE_NO:
            # Skip the header.
            continue
        if line_no > HEADER_LINE_NO + MAX_DATA_LINES:
            # A plate has 96 wells; anything beyond that is a malformed file,
            # so stop immediately with whatever has been collected so far.
            accumulated_msgs = add_error_msg(accumulated_msgs, "The input file contains more than 96 data lines.")
            stop_error(accumulated_msgs)
        items = line.rstrip("\r\n").split("\t")
        if len(items) != EXPECTED_COLUMNS:
            # The per-column checks index into items, so a wrong column count
            # is fatal rather than just another accumulated message.
            accumulated_msgs = add_error_msg(accumulated_msgs, "Line %d contains %s columns, (must be %d)." % (line_no, len(items), EXPECTED_COLUMNS))
            stop_error(accumulated_msgs)
        for index, label, validator in COLUMN_CHECKS:
            value = items[index]
            if validator is not None:
                accumulated_msgs = validator(line_no, value, accumulated_msgs)
            elif len(value) == 0:
                accumulated_msgs = empty_value(line_no, label, accumulated_msgs)

# Report every problem found in one go instead of failing on the first.
if len(accumulated_msgs) > 0:
    stop_error(accumulated_msgs)

# The file is valid: hand it through unchanged as the output dataset.
shutil.copyfile(args.input, args.output)