comparison validate_affy_metadata.py @ 25:d9f3bcfeecfe draft default tip

Uploaded
author greg
date Thu, 15 Aug 2019 13:15:22 -0400
parents 8a826d1afe69
children
comparison
equal deleted inserted replaced
24:8a826d1afe69 25:d9f3bcfeecfe
35 return 'True' 35 return 'True'
36 else: 36 else:
37 return 'False' 37 return 'False'
38 38
39 39
40 def validate_date_string(line_no, date_string, accumulated_msgs): 40 def validate_date_string(line_no, date_string, column, accumulated_msgs):
41 if len(date_string) == 0: 41 if len(date_string) == 0:
42 return accumulated_msgs 42 return accumulated_msgs
43 try: 43 try:
44 datetime.datetime.strptime(date_string, '%Y-%m-%d') 44 datetime.datetime.strptime(date_string, '%Y-%m-%d')
45 return accumulated_msgs 45 return accumulated_msgs
46 except ValueError: 46 except ValueError:
47 return add_error_msg(accumulated_msgs, "Line %d contains an incorrect date format (%s must be YYYY-MM-DD)." % (line_no, date_string)) 47 return add_error_msg(accumulated_msgs, "Line %d contains an incorrect date format (%s must be YYYY-MM-DD) for column %s." % (line_no, date_string, column))
48 48
49 49
50 def validate_decimal(line_no, decimal_string, accumulated_msgs): 50 def validate_decimal(line_no, decimal_string, column, accumulated_msgs):
51 try: 51 try:
52 decimal.Decimal(decimal_string) 52 decimal.Decimal(decimal_string)
53 return accumulated_msgs 53 return accumulated_msgs
54 except Exception: 54 except Exception:
55 return add_error_msg(accumulated_msgs, "Line %d contains an incorrect decimal value (%s)." % (line_no, decimal_string)) 55 return add_error_msg(accumulated_msgs, "Line %d contains an incorrect decimal value (%s) for column %s." % (line_no, decimal_string, column))
56 56
57 57
58 def validate_email(line_no, email, accumulated_msgs): 58 def validate_email(line_no, email, accumulated_msgs):
59 if not (VALID_EMAIL_RE.match(email)): 59 if not (VALID_EMAIL_RE.match(email)):
60 return add_error_msg(accumulated_msgs, "Line %d contains an invalid email address (%s). " % (line_no, email)) 60 return add_error_msg(accumulated_msgs, "Line %d contains an invalid email address (%s). " % (line_no, email))
69 with open(args.input, "r") as ih: 69 with open(args.input, "r") as ih:
70 for i, line in enumerate(ih): 70 for i, line in enumerate(ih):
71 if i == 0: 71 if i == 0:
72 # Skip the header. 72 # Skip the header.
73 continue 73 continue
74 # Keep 1-based line value for error messages.
75 line_no = i + 1
74 line = line.rstrip("\r\n") 76 line = line.rstrip("\r\n")
75 if i > 97: 77 if i > 97:
76 accumulated_msgs = add_error_msg(accumulated_msgs, "The input file contains more than 97 lines (must be 1 header line and no more than 96 data lines).") 78 accumulated_msgs = add_error_msg(accumulated_msgs, "The input file contains more than 97 lines (must be 1 header line and no more than 96 data lines).")
77 stop_error(accumulated_msgs) 79 stop_error(accumulated_msgs)
78 items = line.split("\t") 80 items = line.split("\t")
79 if len(items) != 32: 81 if len(items) != 32:
80 accumulated_msgs = add_error_msg(accumulated_msgs, "Line %d contains %s columns, (must be 32)." % (i, len(items))) 82 accumulated_msgs = add_error_msg(accumulated_msgs, "Line %d contains %s columns, (must be 32)." % (line_no, len(items)))
81 stop_error(accumulated_msgs) 83 stop_error(accumulated_msgs)
82 # Required and validated. 84 # Required and validated.
83 # Required. 85 # Required.
84 user_specimen_id = items[0] 86 user_specimen_id = items[0]
85 if len(user_specimen_id) == 0: 87 if len(user_specimen_id) == 0:
86 accumulated_msgs = empty_value(i, "user_specimen_id", accumulated_msgs) 88 accumulated_msgs = empty_value(line_no, "user_specimen_id", accumulated_msgs)
87 # Optional. 89 # Optional.
88 field_call = items[1] 90 field_call = items[1]
89 # Optional. 91 # Optional.
90 bcoral_genet_id = items[2] 92 bcoral_genet_id = items[2]
91 # Optional. 93 # Optional.
92 bsym_genet_id = items[3] 94 bsym_genet_id = items[3]
93 # Required. 95 # Required.
94 reef = items[4] 96 reef = items[4]
95 if len(reef) == 0: 97 if len(reef) == 0:
96 accumulated_msgs = empty_value(i, "reef", accumulated_msgs) 98 accumulated_msgs = empty_value(line_no, "reef", accumulated_msgs)
97 # Required. 99 # Required.
98 region = items[5] 100 region = items[5]
99 if len(region) == 0: 101 if len(region) == 0:
100 accumulated_msgs = empty_value(i, "region", accumulated_msgs) 102 accumulated_msgs = empty_value(line_no, "region", accumulated_msgs)
101 # Required and validated. 103 # Required and validated.
102 latitude = items[6] 104 latitude = items[6]
103 accumulated_msgs = validate_decimal(i, latitude, accumulated_msgs) 105 accumulated_msgs = validate_decimal(line_no, latitude, "latitude", accumulated_msgs)
104 # Required and validated. 106 # Required and validated.
105 longitude = items[7] 107 longitude = items[7]
106 accumulated_msgs = validate_decimal(i, longitude, accumulated_msgs) 108 accumulated_msgs = validate_decimal(line_no, longitude, "longitude", accumulated_msgs)
107 # Optional. 109 # Optional.
108 geographic_origin = items[8] 110 geographic_origin = items[8]
109 # Optional. 111 # Optional.
110 sample_location = items[9] 112 colony_location = items[9]
111 # Optional. 113 # Optional.
112 latitude_outplant = items[10] 114 depth = items[10]
113 # Optional. 115 # Optional.
114 longitude_outplant = items[11] 116 disease_resist = items[11]
115 # Optional. 117 # Optional.
116 depth = items[12] 118 bleach_resist = items[12]
117 # Optional. 119 # Optional.
118 disease_resist = items[13] 120 mortality = items[13]
119 # Optional. 121 # Optional.
120 bleach_resist = items[14] 122 tle = items[14]
121 # Optional. 123 # Optional.
122 mortality = items[15] 124 spawning = string_as_boolean_string(items[15])
125 # Required.
126 collector_last_name = items[16]
127 if len(collector_last_name) == 0:
128 accumulated_msgs = empty_value(line_no, "collector_last_name", accumulated_msgs)
129 # Required.
130 collector_first_name = items[17]
131 if len(collector_first_name) == 0:
132 accumulated_msgs = empty_value(line_no, "collector_first_name", accumulated_msgs)
133 # Required.
134 org = items[18]
135 if len(org) == 0:
136 accumulated_msgs = empty_value(line_no, "org", accumulated_msgs)
137 # Required and validated.
138 collection_date = items[19]
139 accumulated_msgs = validate_date_string(line_no, collection_date, "collection_date", accumulated_msgs)
140 # Required and validated.
141 contact_email = items[20]
142 accumulated_msgs = validate_email(line_no, contact_email, accumulated_msgs)
143 # Required.
144 seq_facility = items[21]
145 if len(seq_facility) == 0:
146 accumulated_msgs = empty_value(line_no, "seq_facility", accumulated_msgs)
123 # Optional. 147 # Optional.
124 tle = items[16] 148 array_version = items[22]
125 # Optional. 149 # Optional.
126 spawning = string_as_boolean_string(items[17]) 150 public = string_as_boolean_string(items[23])
127 # Required. 151 # Optional.
128 collector_last_name = items[18] 152 public_after_date = items[24]
129 if len(collector_last_name) == 0: 153 accumulated_msga = validate_date_string(line_no, public_after_date, "public_after_date", accumulated_msgs)
130 accumulated_msgs = empty_value(i, "collector_last_name", accumulated_msgs)
131 # Required.
132 collector_first_name = items[19]
133 if len(collector_first_name) == 0:
134 accumulated_msgs = empty_value(i, "collector_first_name", accumulated_msgs)
135 # Required.
136 org = items[20]
137 if len(org) == 0:
138 accumulated_msgs = empty_value(i, "org", accumulated_msgs)
139 # Required and validated. 154 # Required and validated.
140 collection_date = items[21] 155 sperm_motility = items[25]
141 accumulated_msgs = validate_date_string(i, collection_date, accumulated_msgs) 156 accumulated_msgs = validate_decimal(line_no, sperm_motility, "sperm_motility", accumulated_msgs)
142 # Required and validated. 157 # Required and validated.
143 contact_email = items[22] 158 healing_time = items[26]
144 accumulated_msgs = validate_email(i, contact_email, accumulated_msgs) 159 accumulated_msgs = validate_decimal(line_no, healing_time, "healing_time", accumulated_msgs)
145 # Required.
146 seq_facility = items[23]
147 if len(seq_facility) == 0:
148 accumulated_msgs = empty_value(i, "seq_facility", accumulated_msgs)
149 # Optional. 160 # Optional.
150 array_version = items[24] 161 dna_extraction_method = items[27]
151 # Optional. 162 # Optional.
152 public = string_as_boolean_string(items[25]) 163 dna_concentration = items[28]
153 # Optional.
154 public_after_date = items[26]
155 accumulated_msga = validate_date_string(i, public_after_date, accumulated_msgs)
156 # Required and validated.
157 sperm_motility = items[27]
158 accumulated_msgs = validate_decimal(i, sperm_motility, accumulated_msgs)
159 # Required and validated.
160 healing_time = items[28]
161 accumulated_msgs = validate_decimal(i, healing_time, accumulated_msgs)
162 # Optional.
163 dna_extraction_method = items[29]
164 # Optional.
165 dna_concentration = items[30]
166 # If dna_concentration has a value, then it must be decimal. 164 # If dna_concentration has a value, then it must be decimal.
167 if len(dna_concentration) > 0: 165 if len(dna_concentration) > 0:
168 accumulated_msgs = validate_decimal(i, dna_concentration, accumulated_msgs) 166 accumulated_msgs = validate_decimal(line_no, dna_concentration, "dna_concentration", accumulated_msgs)
169 # Optional. 167 # Optional.
170 registry_id = items[31] 168 registry_id = items[29]
171 169 # Optional.
170 result_folder_name = items[30]
171 # Optional.
172 plate_barcode = items[31]
173
172 174
173 if len(accumulated_msgs) > 0: 175 if len(accumulated_msgs) > 0:
174 stop_error(accumulated_msgs) 176 stop_error(accumulated_msgs)
175 177
176 shutil.copyfile(args.input, args.output) 178 shutil.copyfile(args.input, args.output)