-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy paths3_get.py
189 lines (173 loc) · 8.81 KB
/
s3_get.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
import boto3
from botocore.exceptions import ClientError
import datetime
from s3_generate import gen_python_dict_from_tagging_list#, gen_tagging_list_from_python_dict
# NOTE: at some point these lookups will likely be changed to use metadata stored by AWS, either in S3 or in RDS
## tag filter
def helper_tag_filter(tag_list:dict, source_tags:dict):
    '''
    helper for get_buckets_with_tags() and get_objects_with_tags_from_bucket()
    Returns True when every key-value pair of `tag_list` is also found in `source_tags`.
    Extra tags in `source_tags` are ignored and an empty `tag_list` matches anything.
    Parameters:
    `tag_list` dict
        the wanted tags as regular key-value pairs
    `source_tags` dict
        the tags actually present on the bucket/object as regular key-value pairs
    '''
    #lookup is done with .get(), so a missing key compares as None
    return all(source_tags.get(wanted_key) == wanted_value
               for wanted_key, wanted_value in tag_list.items())
def get_buckets_with_tags(session, tags: list[dict]|dict, s3_format = True) -> list[tuple[str]]:
    '''
    Returns a list of (bucket, tag set) tuples for every bucket that carries all of `tags`.
    The first tuple item is the bucket entry as listed by list_buckets(),
    the second one is the bucket's "TagSet" as returned by get_bucket_tagging().
    Buckets without any tag set are skipped, any other client error is printed and the bucket is skipped.
    Parameters:
    `session` boto3.session.Session()
    `tags` list[dict]|dict
        See `s3_format` for more information
    `s3_format` bool
        if True, `tags` is a list of S3 tag formatted dicts {"Key":key_arg, "Value":value_arg}
        if False, `tags` is a dict of regular key-value pairs {key_arg1: value_arg1, key_arg2: value_arg2}
    '''
    assert isinstance(tags,(list,dict))
    wanted_tags = gen_python_dict_from_tagging_list(tags) if s3_format else tags
    client = session.client("s3")
    matches = []
    for entry in client.list_buckets()["Buckets"]:
        try:
            tag_set = client.get_bucket_tagging(Bucket=entry["Name"])["TagSet"]
            present_tags = gen_python_dict_from_tagging_list(tag_set)
            if helper_tag_filter(wanted_tags, present_tags):
                matches.append((entry, tag_set))
        except ClientError as err:
            #an untagged bucket is expected and silently skipped, anything else is reported
            if err.response["Error"]["Code"] != "NoSuchTagSet":
                print("Error: ", err.response)
    return matches
def get_objects_with_tags_from_bucket(session, bucket_name:str, tags:list[dict]|dict,
                                      object_prefix:str|None = None, s3_format = True) -> list[tuple]:
    '''
    Returns a list of (object, tags) tuples for every object of `bucket_name` that carries all of `tags`.
    The first tuple item is the object entry as listed by S3,
    the second one is the object's tags converted to a regular key-value dict.
    An empty bucket (or a prefix that matches nothing) gives an empty list.
    Parameters:
    `session` boto3.session.Session()
    `bucket_name` str
        the name of the bucket to check
    `tags` list[dict]|dict
        See `s3_format` for more information
    `object_prefix` str
        Only objects whose key starts with this prefix are listed and checked.
        defaults to None, all objects in the bucket are checked for the tags
    `s3_format` bool
        if True, `tags` is a list of S3 tag formatted dicts {"Key":key_arg, "Value":value_arg}
        if False, `tags` is a dict of regular key-value pairs {key_arg1: value_arg1, key_arg2: value_arg2}
    '''
    s3_client = session.client("s3")
    if s3_format:
        tags = gen_python_dict_from_tagging_list(tags)
    listing_args = {"Bucket": bucket_name}
    if object_prefix: #None and "" both mean no prefix filter
        assert isinstance(object_prefix,str)
        listing_args["Prefix"] = object_prefix
    result = []
    #a single list_objects call is capped at 1000 keys, the paginator walks all of them
    for page in s3_client.get_paginator("list_objects").paginate(**listing_args):
        #"Contents" is missing from the response when no object matches
        for obj in page.get("Contents", []):
            tag_dict = gen_python_dict_from_tagging_list(
                s3_client.get_object_tagging(Bucket = bucket_name, Key = obj["Key"])["TagSet"]
            )
            if helper_tag_filter(tags, tag_dict):
                result.append((obj, tag_dict))
    return result
## name date filter
def helper_date_conversion(use_date):
    '''
    helper for name_date filters, converts use_date to datetime.datetime
    Parameters:
    `use_date` str|datetime.date|datetime.datetime or a list of two of those
        a str must be in the format "year-month-day-hour-minute-second",
        only year, month, and day are required.
    Returns:
        a one item list [datetime.datetime] for a single date,
        a new two item list for an interval (the list passed in is not modified),
        an error message str if a date string could not be parsed,
        `use_date` unchanged for any other type (None included).
    datetime.datetime values keep their time and tzinfo, datetime.date values become midnight.
    Raises AssertionError if a list does not hold exactly two dates.
    '''
    def to_datetime(value):
        #datetime.datetime is a subclass of datetime.date, so it has to be tested first
        #otherwise combine() would throw away its time and tzinfo
        if isinstance(value, datetime.datetime):
            return value
        if isinstance(value, datetime.date):
            return datetime.datetime.combine(value, datetime.time(0))
        if isinstance(value, str):
            return datetime.datetime(*[int(part) for part in value.split("-")])
        return value

    #everything int() and datetime.datetime() raise for malformed or out of range fields
    parse_errors = (ValueError, TypeError, OverflowError)
    if isinstance(use_date, list):
        assert len(use_date) == 2, "Date interval can only have two dates."
        converted = []
        for i, d in enumerate(use_date):
            try:
                converted.append(to_datetime(d))
            except parse_errors:
                return f"Incorrect string input for `use_date[{i}]` = {d}"
        return converted
    if isinstance(use_date, (str, datetime.date)):
        try:
            return [to_datetime(use_date)]
        except parse_errors:
            return f"Incorrect string input for `use_date` = {use_date}"
    return use_date
def helper_date_comparison(bucket_creationdate: datetime.datetime, *args) -> bool:
    '''
    helper for name_date filters, returns the date comparison result
    Parameters:
    `bucket_creationdate` datetime.datetime
        the timestamp to test
    `*args` datetime.datetime
        no date or a single None: no date filter, always True
        one date: True if `bucket_creationdate` falls on the same calendar date
        two dates: True if `bucket_creationdate` is inside the interval, both ends included
    Dates without tzinfo get the tzinfo of `bucket_creationdate` before comparing.
    Raises AssertionError if the start of an interval is not before its end.
    '''
    #None is the default of the name_date getters and means that all dates are retrieved
    if not args or (len(args) == 1 and args[0] is None):
        return True
    timezone = bucket_creationdate.tzinfo
    bounds = [d if d.tzinfo is not None else d.replace(tzinfo = timezone) for d in args]
    if len(bounds) == 2:
        start, end = bounds
        assert start < end, "start date must be before end date in `use_date`."
        return start <= bucket_creationdate <= end
    #will use the date portion for single comparison
    return bucket_creationdate.date() == bounds[0].date()
def get_buckets_with_name_date(session,
                               prefix: str,
                               use_date: list[str|datetime.datetime|datetime.date]
                                         | str|datetime.datetime|datetime.date
                                         | None = None) -> list[dict]:
    '''
    Returns the list of buckets whose name starts with `prefix` and that were made on or in (list) `use_date` using boto3.
    Every item is the bucket entry as listed by list_buckets() (it holds "Name" and "CreationDate").
    Parameters:
    `session` boto3.session.Session()
    `prefix` str
        the first len(prefix) characters in bucket name must be `prefix` to work.
        If you want all buckets, to only filter with date, use `prefix` = ""
    `use_date` list[str|datetime.datetime|datetime.date] or str|datetime.datetime|datetime.date
        defaults to None if only the name is needed as a search.
        a single date retrieves the buckets made on that date,
        a list of two dates retrieves the buckets made inside that interval.
        if str, must be in the format "year-month-day-hour-minute-second".
        only year, month, and day are required.
    Doesn't throw an indexing error if the prefix is longer than the bucket name.
    Raises ValueError if a date string in `use_date` cannot be parsed.
    '''
    assert use_date is None or isinstance(use_date, (str, list, datetime.date, datetime.datetime)), f"Invalid type(use_date) = {type(use_date)}"
    if use_date is not None:
        # turning use_date into datetime.datetime
        use_date = helper_date_conversion(use_date)
        if isinstance(use_date, str):
            #the conversion helper reports bad input as a message string, unpacking it below would be a bug
            raise ValueError(use_date)
    bucket_list = session.client("s3").list_buckets()["Buckets"]
    #removed from list comprehension to separate boto3 interacting with a loop
    return [bucket for bucket
            in bucket_list
            if bucket["Name"].startswith(prefix)
            and (use_date is None or helper_date_comparison(bucket["CreationDate"], *use_date))]
def get_objects_with_name_date(session,
                               bucket_name: str,
                               object_prefix: str,
                               use_date: list[str|datetime.datetime|datetime.date]
                                         | str|datetime.datetime|datetime.date
                                         | None = None) -> list[dict]:
    '''
    Returns the list of objects in `bucket_name` whose key starts with `object_prefix`
    and that were last modified on or in (list) `use_date` using boto3.
    Every item is the object entry as listed by S3 (it holds "Key" and "LastModified").
    An empty bucket (or a prefix that matches nothing) gives an empty list.
    Parameters:
    `session` boto3.session.Session()
    `bucket_name` str
        The name of the bucket to be checked.
    `object_prefix` str
        the first len(object_prefix) characters in an object key must be `object_prefix` to work.
        use `object_prefix` = "" to not filter by name
    `use_date` list[str|datetime.datetime|datetime.date] or str|datetime.datetime|datetime.date
        defaults to None if only the name is needed as a search.
        a single date retrieves the objects last modified on that date,
        a list of two dates retrieves the objects last modified inside that interval.
        if str, must be in the format "year-month-day-hour-minute-second".
        only year, month, and day are required.
    Raises ValueError if a date string in `use_date` cannot be parsed.
    '''
    assert use_date is None or isinstance(use_date, (str, list, datetime.date, datetime.datetime)), f"Invalid type(use_date) = {type(use_date)}"
    if use_date is not None:
        # turning use_date into datetime.datetime
        use_date = helper_date_conversion(use_date)
        if isinstance(use_date, str):
            #the conversion helper reports bad input as a message string, unpacking it below would be a bug
            raise ValueError(use_date)
    #a single list_objects call is capped at 1000 keys, the paginator walks all of them
    pages = session.client("s3").get_paginator("list_objects").paginate(Bucket = bucket_name,
                                                                        Prefix = object_prefix)
    #"Contents" is missing from the response when no object matches
    return [obj
            for page in pages
            for obj in page.get("Contents", [])
            if use_date is None or helper_date_comparison(obj["LastModified"], *use_date)]