-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfetch_s3_catalog_lossless.py
executable file
·56 lines (48 loc) · 2.16 KB
/
fetch_s3_catalog_lossless.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#!/usr/bin/env python3
import pandas as pd
import boto3
from botocore import UNSIGNED
from botocore.config import Config
from datetime import datetime
def _annotate_catalog_item(item):
    """Add the derived catalog columns to one ``list_objects_v2`` entry, in place.

    The key is split on ``/`` (after dropping a single trailing slash) and the
    following fields are written onto ``item``:

    * ``Sensor``    - first path component.
    * ``Protocol``  - second path component when the key has exactly three
      components, otherwise the text after the last ``.`` of the file name.
    * ``Timestamp`` - epoch seconds parsed from the first two ``_``-separated
      fields of the file name, which must match ``%Y-%m-%d_%H-%M-%S``.
    * ``Filename``  - last path component.
    * ``LastModified`` - converted to integer epoch seconds.
    * ``ETag``      - double quotes removed.

    Raises:
        IndexError: the file name has fewer than two ``_``-separated fields.
        ValueError: the leading fields do not match the timestamp format.
        KeyError: an expected field is missing from ``item``.
    """
    key = item["Key"]
    if key.endswith("/"):
        key = key[:-1]
        item["Key"] = key
    parts = key.split("/")
    filename = parts[-1]

    item["Sensor"] = parts[0]
    item["Protocol"] = parts[1] if len(parts) == 3 else filename.split(".")[-1]

    stamp_fields = filename.split("_")
    # NOTE(review): strptime returns a naive datetime, so .timestamp()
    # interprets it in the local timezone of the machine running the script.
    # Confirm which timezone the file names are written in.
    parsed = datetime.strptime(
        stamp_fields[0] + "_" + stamp_fields[1], "%Y-%m-%d_%H-%M-%S"
    )
    item["Timestamp"] = int(parsed.timestamp())
    item["Filename"] = filename
    item["LastModified"] = int(item["LastModified"].timestamp())
    item["ETag"] = item["ETag"].replace('"', "")


def fetch_s3_catalog(bucket_name):
    """List every object in an S3 bucket and write the catalog to a CSV file.

    The bucket is listed with an unsigned (anonymous) client. Each object
    entry is extended with the columns derived by ``_annotate_catalog_item``.
    Entries whose key cannot be parsed are printed to stdout and left out of
    the catalog. The result is written to ``<bucket_name>.csv`` (a relative
    path) without the DataFrame index.

    Args:
        bucket_name: Name of the bucket to list; also the stem of the CSV
            file name.

    Returns:
        None. Errors raised by boto3 while listing are not caught here.
    """
    s3_client = boto3.client("s3", config=Config(signature_version=UNSIGNED))
    paginator = s3_client.get_paginator("list_objects_v2")

    catalog = []
    for page in paginator.paginate(Bucket=bucket_name):
        # "Contents" is absent from a page that holds no objects (e.g. an
        # empty bucket), so default to an empty list instead of raising.
        for item in page.get("Contents", []):
            try:
                _annotate_catalog_item(item)
            except (IndexError, KeyError, ValueError, OverflowError, OSError):
                # Best effort: report the entry that does not follow the
                # expected naming layout and carry on with the rest.
                print(item)
                continue
            catalog.append(item)

    pd.DataFrame(catalog).to_csv(f"{bucket_name}.csv", index=False)
if __name__ == "__main__":
    # Script entry point: build the catalog CSV for the single bucket named
    # below when this file is run directly rather than imported.
    fetch_s3_catalog(bucket_name="archive-orcasound-net")