-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathremote_storage.qmd
209 lines (154 loc) · 4.43 KB
/
remote_storage.qmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
---
title: "DB connectivity"
execute:
eval: true
warning: true
error: true
keep-ipynb: true
cache: true
jupyter: python3
pdf-engine: lualatex
# theme: pandoc
html:
code-tools: true
fold-code: false
author: Jonathan D. Rosenblatt
data: 04-20-2024
toc: true
number-sections: true
number-depth: 3
embed-resources: true
---
```{python}
#| echo: false
# pip install connectorx
# %pip install --upgrade pip
# %pip install --upgrade polars
# %pip install --upgrade pyarrow
# %pip install --upgrade Pandas
# %pip install --upgrade plotly
# %pip freeze > requirements.txt
```
```{python}
#| label: setup-env
# %pip install -r requirements.txt
```
```{python}
#| label: Polars-version
%pip show Polars # check you Polars version
```
```{python}
#| label: Pandas-version
%pip show Pandas # check you Pandas version
```
```{python}
#| label: preliminaries
import polars as pl
pl.Config(fmt_str_lengths=50)
import polars.selectors as cs
import pandas as pd
import numpy as np
import pyarrow as pa
import plotly.express as px
import string
import random
import os
import sys
%matplotlib inline
import matplotlib.pyplot as plt
from datetime import datetime
# Following two lines only required to view plotly when rendering from VScode.
import plotly.io as pio
# pio.renderers.default = "plotly_mimetype+notebook_connected+notebook"
pio.renderers.default = "plotly_mimetype+notebook"
```
What Polars module and dependencies are installed?
```{python}
#| label: show-versions
pl.show_versions()
```
# Read from S3
```{python}
import boto3
session = boto3.session.Session()
credentials = session.get_credentials()
pl.read_parquet(f's3://{bucket_name}/{file_path}',
storage_options={
"aws_access_key_id": credentials.access_key,
"aws_secret_access_key": credentials.secret_key,
"session_token":credentials.token,
"region": "us-west-2",
})
```
# read_database_uri()
```{python}
import json
from boto3.session import Session
from urllib.parse import quote_plus
REDSHIFT_DATABASE = 'dev'
REDSHIFT_CLUSTER_IDENTIFIER = 'fairmatic-dev'
REDSHIFT_HOST = 'redshift.fairmatic.org'
REDSHIFT_PORT = 5439
# Use boto to discover the AWS credentials to access redshift
session = Session(profile_name="fairmatic")
iam_client = session.client('iam')
redshift_client = session.client('redshift')
redshift_user_name = iam_client.get_user()['User']['UserName']
# Make an object with all redshift required credentials
redshift_creds = redshift_client.get_cluster_credentials(
DbUser=redshift_user_name,
DbName=REDSHIFT_DATABASE,
ClusterIdentifier=REDSHIFT_CLUSTER_IDENTIFIER,
DurationSeconds=3600,
AutoCreate=True,
)
# Tailor the connection string in the read_database_uri() expected format: "postgres://username:password@server:port/database"
REDSHIFT_CONNECTION_STRING = f"redshift://{quote_plus(redshift_creds['DbUser'])}:{quote_plus(redshift_creds['DbPassword'])}@{REDSHIFT_HOST}:{REDSHIFT_PORT}/{REDSHIFT_DATABASE}"
query = "select * from analytics.report_portfolio_policy_insights"
import polars as pl
v = pl.read_database_uri(query=query, uri=REDSHIFT_CONNECTION_STRING)
print(v)
```
# read_database()
```{python}
import json
from boto3.session import Session
from urllib.parse import quote_plus
REDSHIFT_DATABASE = 'dev'
REDSHIFT_CLUSTER_IDENTIFIER = 'fairmatic-dev'
REDSHIFT_HOST = 'redshift.fairmatic.org'
REDSHIFT_PORT = 5439
session = Session()
iam_client = session.client('iam')
redshift_client = session.client('redshift')
redshift_user_name = iam_client.get_user()['User']['UserName']
redshift_creds = redshift_client.get_cluster_credentials(
DbUser=redshift_user_name,
DbName=REDSHIFT_DATABASE,
ClusterIdentifier=REDSHIFT_CLUSTER_IDENTIFIER,
DurationSeconds=3600,
AutoCreate=True,
)
```
```{python}
REDSHIFT_CONNECTION_STRING = f"postgres://{quote_plus(redshift_creds['DbUser'])}:{quote_plus(redshift_creds['DbPassword'])}@{REDSHIFT_HOST}:{REDSHIFT_PORT}/{REDSHIFT_DATABASE}"
REDSHIFT_CONNECTION_STRING
```
```{python}
conn = 'postgres://username:password@server:port/database' # connection token
query = "SELECT * FROM table limit 10"
import connectorx as cx
cx.read_sql(
REDSHIFT_CONNECTION_STRING,
query,
)
```
```{python}
from sqlalchemy import create_engine
conn = create_engine(f"sqlite:///test.db")
query = "SELECT * FROM foo"
pl.read_database(
query=query,
connection=conn.connect()
)
```