From 7d9a264de86c108beea79ce9641827071e215b73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hannah=20G=C3=B6tsch?= <100220185+Ha-nn-ah@users.noreply.github.com> Date: Mon, 9 Oct 2023 15:11:07 +0200 Subject: [PATCH] Update sf_extract_metadata.py ignore time (two different formats) in "collection_date" and only proceed with the date --- scripts/sf_extract_metadata.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/scripts/sf_extract_metadata.py b/scripts/sf_extract_metadata.py index 16804f9..cc46507 100644 --- a/scripts/sf_extract_metadata.py +++ b/scripts/sf_extract_metadata.py @@ -75,6 +75,15 @@ def organism_format(raw_data): # date processing if datacolct!='unknown': import re, calendar + # ignore time + match_time = re.search(r'\dT\d\d:\d\d:\d\dZ', datacolct) # for date-time format + if match_time: + start_index_time = datacolct.index(match_time.group()) + 1 + datacolct = datacolct[:start_index_time] + match_time = re.search(r'\dT\d\dZ', datacolct) # for date-time format but time deleted manually + if match_time: + start_index_time = datacolct.index(match_time.group()) + 1 + datacolct = datacolct[:start_index_time] datacolct= ''.join(datacolct.split('-')) dates=re.findall('\d+', datacolct); # two versions of date: 15-Seq-2011/2014-03-14