Skip to content

Commit

Permalink
Merge pull request #28 from TidierOrg/gsheet-prlim
Browse files Browse the repository at this point in the history
Read Google Sheets
  • Loading branch information
drizk1 authored Feb 9, 2025
2 parents 8491e5b + 51f98b0 commit f73eb88
Show file tree
Hide file tree
Showing 8 changed files with 300 additions and 5 deletions.
10 changes: 8 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,18 +1,21 @@
name = "TidierFiles"
uuid = "8ae5e7a9-bdd3-4c93-9cc3-9df4d5d947db"
authors = ["Daniel Rizk <[email protected]> and contributors"]
version = "0.2.2"
version = "0.3.0"

[deps]
Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45"
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
Parquet2 = "98572fba-bba0-415d-956f-fa77e587d26d"
RData = "df47a6cb-8c03-5eed-afd8-b6050d6c41da"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
ReadStatTables = "52522f7a-9570-4e34-8ac6-c005c74d4b84"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
Sockets = "6462fe0b-24de-5631-8697-dd941f90decc"
XLSX = "fdbf4ff8-1666-58a4-91e7-1b58723a45e0"

[compat]
Expand All @@ -22,11 +25,14 @@ DataFrames = "1.5"
Dates = "1.9"
Documenter = "0.27, 1"
HTTP = "1.10"
JSON3 = "1.1"
Parquet2 = "0.2"
RData = "0.8, 1.0"
ReadStatTables = "0.3"
Reexport = "0.2, 1"
Sockets = "1.9 - 1.11"
Random = "0.5 - 10"
XLSX = "0.10"
RData = "0.8, 1.0"
julia = "1.10"

[extras]
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ Currently supported file types:
- `read_arrow` and `write_arrow`
- `read_parquet` and `write_parquet`
- `read_rdata` (.rdata and .rds)

- `read_gsheet` (Google Sheets)
Agnostic read and write functions that detect the type and dispatch the appropriate function.
- `read_file` and `write_file`

Expand Down
46 changes: 46 additions & 0 deletions docs/examples/UserGuide/gsheet.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# The `read_gsheet` function enables TidierFiles to read Google Sheets in as DataFrames.
# `read_gsheet` supports various arguments in line with other TidierFiles readers.

# ## Secrets
# At this time, to use `read_gsheet` the user will need to have a client ID and a client secret.
# To get these, go to the Google Cloud Console -> APIs and Services -> Credentials (in sidebar) -> Create Credentials (at the top) -> OAuth Client ID (choose for desktop) -> save that client id and client secret.
# In the future, we hope to remove this need, so that authorization can happen more simply.

# ## Authorization
# The browser will ask you to allow access. After you click allow, the browser will instruct you to close the browser window. The connection is stored globally for the functions to access.
# ```
# client_id = "527478*******3dh26e.apps.googleusercontent.com"
# client_secret = "GO******j9yG"
# connect_gsheet(client_id, client_secret)
# ```
# ## Reading
# File paths can be either full links or spreadsheet ids.
# The sheet name defaults to `Sheet1` but can be changed as shown below.
# ```
# spreadsheet_id = "1QbFtedQs56oIO1bjQKSwdpcWsMpo3r_04ZIrdg9_fFM"
# read_file(spreadsheet_id)
# 4×4 DataFrame
# Row │ this is test sheet
# │ Float64? String? String? String?
# ─────┼──────────────────────────────────────
# 1 │ 3.0 missing missing missing
# 2 │ missing COLUMNS missing missing
# 3 │ missing missing ARE missing
# 4 │ missing missing missing STRINGS
# ```
#
# ## Read a particular sheet
# ```
# public_sheet = "https://docs.google.com/spreadsheets/d/1BxiMVs0XRA5nFMdKvBdBZjgmUUqptlbs74OgvE2upms/edit?gid=0#gid=0"
# read_gsheet(public_sheet, sheet="Class Data", col_names=true, n_max=5)
# 5×6 DataFrame
# Row │ Student Name Gender Class Level Home State Major Extracurricular Activity
# │ String String String String String String
# ─────┼───────────────────────────────────────────────────────────────────────────────────
# 1 │ Alexandra Female 4. Senior CA English Drama Club
# 2 │ Andrew Male 1. Freshman SD Math Lacrosse
# 3 │ Anna Female 1. Freshman NC English Basketball
# 4 │ Becky Female 2. Sophomore SD Art Baseball
# 5 │ Benjamin Male 4. Senior WI English Basketball
# ```
#
1 change: 1 addition & 0 deletions docs/mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -122,4 +122,5 @@ nav:
- "Arrow Files": "examples/generated/UserGuide/Arrow.md"
- "Parquet Files": "examples/generated/UserGuide/parquet.md"
- "R Data Files": "examples/generated/UserGuide/r_files.md"
- "Google Sheets": "examples/generated/UserGuide/gsheet.md"
- "Reference" : "reference.md"
7 changes: 6 additions & 1 deletion src/TidierFiles.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,16 @@ using Reexport
using Parquet2
using Arrow
using RData
using JSON3
using Random
using Sockets

@reexport using DataFrames: DataFrame

export read_csv, write_csv, read_tsv, write_tsv, read_table, write_table, read_delim, read_xlsx, write_xlsx,
read_fwf, write_fwf, fwf_empty, fwf_positions, fwf_positions, read_sav, read_sas, read_dta, write_sav, write_sas,
write_dta, read_arrow, write_arrow, read_parquet, write_parquet, read_csv2, read_file, write_file, read_rdata, list_files
write_dta, read_arrow, write_arrow, read_parquet, write_parquet, read_csv2, read_file, write_file, read_rdata, list_files,
read_gsheet, connect_gsheet


include("docstrings.jl")
Expand All @@ -25,6 +29,7 @@ include("statsfiles.jl")
include("parquet_files.jl")
include("arrow_files.jl")
include("r_data.jl")
include("gsheets.jl")

"""
$docstring_read_csv
Expand Down
64 changes: 64 additions & 0 deletions src/docstrings.jl
Original file line number Diff line number Diff line change
Expand Up @@ -702,4 +702,68 @@ List all files in a directory that match a given pattern.
# Examples
- `list_files("/path/to/folder/", ".csv")`
"""

const docstring_connect_gsheet =
"""
connect_gsheet(client_id::String, client_secret::String; redirect_uri::String = "http://localhost:8081")
Connects to Google Sheets API by obtaining an access token using OAuth 2.0 authorization flow.
To obtain the credentials, go to the Google Cloud Console -> APIs and Services -> Credentials -> Create Credentials -> Create OAuth Client ID -> Desktop App.
This will contain the `client_id` and `client_secret`
# Arguments
- `client_id::String`: The client ID obtained from the Google Cloud Console.
- `client_secret::String`: The client secret obtained from the Google Cloud Console.
- `redirect_uri::String`: The URI to which the authorization server will redirect the user after granting access. Defaults to "http://localhost:8081".
# Returns
- An instance of `GSheetAuth` containing the client ID, client secret, redirect URI, and access token.
# Example
```julia
julia> connect_gsheet("your_client_id", "your_client_secret")
```
"""

const docstring_read_gsheet =
"""
read_gsheet(spreadsheet_id::String;
sheet::String="Sheet1",
range::String="",
col_names::Bool=true,
skip::Int=0,
n_max::Int=10000,
col_select=nothing,
missing_value::String="")
Read data from a Google Sheet into a DataFrame.
# Arguments
- `spreadsheet_id::String`: The unique identifier of the Google Sheet or the full URL.
- `sheet::String`: The name of the sheet to read from. Defaults to "Sheet1".
- `range::String`: The range of cells to read (e.g., "A1:D10"). Defaults to an empty string, which reads the entire sheet.
- `col_names::Bool`: Indicates whether the first row should be used as column names. Defaults to true.
- `skip::Int`: Number of rows to skip before starting to read data. Defaults to 0.
- `n_max::Int`: Maximum number of rows to read after skipping. Defaults to 10000.
- `col_select`: List of column names or indices to select specific columns. Defaults to nothing (all columns).
- `missing_value::String`: Value to represent missing data. Defaults to an empty string.
# Examples
```julia
julia> connect_gsheet("your_client_id", "your_client_secret")
julia> public_sheet = "https://docs.google.com/spreadsheets/d/1BxiMVs0XRA5nFMdKvBdBZjgmUUqptlbs74OgvE2upms/edit?gid=0#gid=0";
julia> read_gsheet(public_sheet, sheet="Class Data", n_max=5)
5×6 DataFrame
Row │ Student Name Gender Class Level Home State Major Extracurricular Activity
│ String String String String String String
─────┼───────────────────────────────────────────────────────────────────────────────────
1 │ Alexandra Female 4. Senior CA English Drama Club
2 │ Andrew Male 1. Freshman SD Math Lacrosse
3 │ Anna Female 1. Freshman NC English Basketball
4 │ Becky Female 2. Sophomore SD Art Baseball
5 │ Benjamin Male 4. Senior WI English Basketball
```
"""
4 changes: 3 additions & 1 deletion src/gen_fxn.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@ $docstring_read_file
"""
function read_file(filepath::String, args...; kwargs...)
ext = lowercase(splitext(filepath)[2])
if ext == ".csv"
if ext == "" || occursin(filepath, "docs.google.com")
return read_gsheet(filepath, args...; kwargs...)
elseif ext == ".csv"
return read_csv(filepath, args...; kwargs...)
elseif ext == ".tsv"
return read_tsv(filepath, args...; kwargs...)
Expand Down
171 changes: 171 additions & 0 deletions src/gsheets.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@

"""
    GSheetAuth(client_id, client_secret, redirect_uri, access_token)

Mutable record of the OAuth client credentials, the redirect URI and the
access token used when talking to the Google Sheets API.
"""
mutable struct GSheetAuth
    client_id::String      # OAuth client ID
    client_secret::String  # OAuth client secret
    redirect_uri::String   # URI the authorization server redirects to
    access_token::String   # token sent as the `Authorization: Bearer` header
end

# Module-wide slot for the active connection. It starts out unassigned;
# `connect_gsheet` stores into it and `read_gsheet` reads the token from it.
const GSHEET_AUTH = Ref{GSheetAuth}()
# Helper function to open a URL in the default browser.
#
# If no launcher is known for the current platform, or the launcher fails
# (for example `xdg-open` is not installed), the URL is printed so the user
# can open it manually instead of the call silently doing nothing.
# Always returns `nothing`.
function open_browser(url::String)
    launcher = if Sys.iswindows()
        # `start` is a cmd.exe builtin rather than an executable, so it cannot
        # be spawned directly. rundll32 hands the URL to the default protocol
        # handler without a shell, so `&` in the query string is not
        # interpreted as a command separator.
        `rundll32 "url.dll,FileProtocolHandler" $url`
    elseif Sys.isapple()
        `open $url`
    elseif Sys.islinux()
        `xdg-open $url`
    else
        nothing
    end

    if launcher !== nothing
        try
            run(launcher)
            return nothing
        catch err
            # Best effort only: fall through to the manual instructions below.
            @warn "Could not launch a browser automatically" exception = err
        end
    end

    println("Please open the following URL in your browser:")
    println(url)
    return nothing
end

# Start a simple loopback server to capture the OAuth authorization code.
#
# Listens on `port`, opens `auth_url` in the browser, accepts a single
# connection, and extracts the `code` query parameter from the request.
# The code is returned exactly as it appears in the request (still
# percent-encoded). Throws an error if the request carries no `code`.
#
# The raw request is deliberately NOT printed: it contains the authorization
# code, which should not end up in terminal scrollback or logs.
function get_authorization_code(port::Int, auth_url::String)
    # Listen before opening the browser so the redirect can never arrive
    # while nothing is bound to the port.
    server = Sockets.listen(port)
    try
        open_browser(auth_url)
        println("Waiting for redirect on http://localhost:$port ...")

        sock = Sockets.accept(server)
        try
            request = String(readavailable(sock))

            # Look for the 'code' parameter in the GET request
            m = match(r"code=([^&\s]+)", request)
            succeeded = m !== nothing

            heading = succeeded ? "Authorization complete" : "Authorization failed"
            detail = if succeeded
                "You can now read Google Sheets from TidierFiles. " *
                "Feel free to close this window."
            else
                "No authorization code was received. " *
                "Please return to Julia and try again."
            end
            page = "<html><body><h1>$heading</h1>\n$detail</body></html>\n"

            # Always answer the browser so the tab does not hang, even on failure.
            response = "HTTP/1.1 200 OK\r\n" *
                       "Content-Type: text/html\r\n" *
                       "Content-Length: $(sizeof(page))\r\n" *
                       "Connection: close\r\n" *
                       "\r\n" *
                       page
            write(sock, response)

            succeeded || error("Authorization code not found in the request")
            return String(m.captures[1])
        finally
            close(sock)
        end
    finally
        # Release the port even when an error is thrown above.
        close(server)
    end
end

# Exchange the authorization code for an access token.
#
# Arguments
# - `code`: authorization code from the redirect; may be percent-encoded
#   (as captured from the request line) or already decoded.
# - `client_id`, `client_secret`, `redirect_uri`: the same values used to
#   build the authorization URL.
#
# Returns the access token as a `String`. Throws an error carrying the
# response body when the token endpoint does not answer with HTTP 200, or
# when the response has no `access_token` field.
function get_access_token(code::String, client_id::String, client_secret::String, redirect_uri::String)
    token_url = "https://oauth2.googleapis.com/token"

    # Decode the code first so that it ends up percent-encoded exactly once,
    # regardless of whether the caller passed it encoded or decoded.
    params = [
        "code" => HTTP.unescapeuri(code),
        "client_id" => client_id,
        "client_secret" => client_secret,
        "redirect_uri" => redirect_uri,
        "grant_type" => "authorization_code",
    ]
    body = join(("$key=$(HTTP.escapeuri(value))" for (key, value) in params), "&")

    # HTTP.jl throws on non-2xx responses by default, which would make the
    # status check below unreachable; disable that so the server's error
    # message is reported.
    response = HTTP.post(token_url,
                         ["Content-Type" => "application/x-www-form-urlencoded"],
                         body;
                         status_exception=false)
    if response.status != 200
        error("Failed to get access token: $(String(response.body))")
    end

    token_data = JSON3.read(String(response.body))
    token = get(token_data, :access_token, nothing)
    token === nothing && error("Failed to get access token: response has no `access_token` field")
    return String(token)
end

# Function to connect to Google Sheets and get the access token.
# Runs the OAuth authorization-code flow through a local loopback listener
# and stores the resulting credentials in `GSHEET_AUTH`.
"""
$docstring_connect_gsheet
"""
function connect_gsheet(client_id::String, client_secret::String; redirect_uri::String = "http://localhost:8081")
    scope = "https://www.googleapis.com/auth/spreadsheets"

    # Fresh random value per authorization request instead of a fixed string.
    # NOTE(review): the `state` echoed back in the redirect is not verified
    # yet, because `get_authorization_code` only returns the code.
    state = Random.randstring(32)

    auth_url = "https://accounts.google.com/o/oauth2/v2/auth?" *
               "client_id=$(HTTP.escapeuri(client_id))&" *
               "redirect_uri=$(HTTP.escapeuri(redirect_uri))&" *
               "response_type=code&" *
               "scope=$(HTTP.escapeuri(scope))&" *
               "state=$state"

    # Listen on the port named in `redirect_uri` so a custom redirect URI
    # actually reaches the loopback server; fall back to 8081 when the URI
    # carries no explicit port.
    port_match = match(r"^[a-z][a-z0-9+.\-]*://[^/:]+:(\d+)"i, redirect_uri)
    port = port_match === nothing ? 8081 : parse(Int, port_match.captures[1])

    code = get_authorization_code(port, auth_url)
    access_token = get_access_token(code, client_id, client_secret, redirect_uri)

    GSHEET_AUTH[] = GSheetAuth(client_id, client_secret, redirect_uri, access_token)
    return GSHEET_AUTH[]
end

# Convert one raw cell string into a typed value:
#   - an empty string becomes `missing`
#   - text that parses as a `Float64` becomes that number
#   - anything else is returned unchanged
# `missing_value` is accepted for signature compatibility but is not consulted.
function parse_value(value::String, missing_value::String)
    isempty(value) && return missing
    number = tryparse(Float64, value)
    return number === nothing ? value : number
end

"""
$docstring_read_gsheet
"""
function read_gsheet(spreadsheet_id::String;
                     sheet::String="Sheet1",
                     range::String="",
                     col_names::Bool=true,
                     skip::Int=0,
                     n_max::Int=10000,
                     col_select=nothing,
                     missing_value::String="")

    # Fail with an actionable message instead of an UndefRefError when no
    # connection has been established yet.
    isassigned(GSHEET_AUTH) ||
        error("Not connected to Google Sheets. Call `connect_gsheet(client_id, client_secret)` first.")

    # If a full Google Sheets URL is provided, extract the spreadsheet id
    # (stopping at a path, query or fragment separator).
    id_match = match(r"spreadsheets/d/([^/?#]+)", spreadsheet_id)
    sheet_id = id_match === nothing ? spreadsheet_id : String(id_match.captures[1])

    # A1 notation requires single quotes around sheet names containing spaces
    # or special characters; embedded single quotes are doubled.
    already_quoted = startswith(sheet, "'") && endswith(sheet, "'")
    needs_quotes = !isempty(sheet) && !already_quoted && !occursin(r"^\w+$", sheet)
    sheet_ref = needs_quotes ? "'" * replace(sheet, "'" => "''") * "'" : sheet

    # With no explicit range, address the whole sheet by name rather than a
    # fixed "A1:Z1000" window that silently drops columns past Z and rows
    # past 1000.
    a1_range = isempty(range) ? sheet_ref : sheet_ref * "!" * range

    url = "https://sheets.googleapis.com/v4/spreadsheets/$sheet_id/values/$(HTTP.escapeuri(a1_range))"
    headers = ["Authorization" => "Bearer $(GSHEET_AUTH[].access_token)"]
    response = HTTP.get(url, headers)
    data = JSON3.read(String(response.body))

    # The API omits `values` entirely when the requested range holds no data.
    values = get(data, :values, nothing)
    if values === nothing || isempty(values)
        return DataFrame()
    end

    if col_names
        header = [string(name) for name in values[1]]
        rows = values[2:end]
    else
        header = ["Column$i" for i in 1:length(values[1])]
        rows = values
    end

    # An empty index range (skip past the end, or n_max <= 0) yields no rows.
    rows = rows[skip+1:min(skip + n_max, length(rows))]

    # The API drops trailing empty cells, so rows can be shorter OR longer
    # than the header. Size every row to the widest of the two; `init=0`
    # keeps this well-defined when there are no data rows at all.
    n_cols = max(length(header), maximum(length, rows; init=0))
    n_cols == 0 && return DataFrame()

    # Parse each cell and pad short rows with `missing`, the same value an
    # empty cell parses to.
    parsed_rows = map(rows) do row
        cells = Any[parse_value(string(cell), missing_value) for cell in row]
        return append!(cells, fill(missing, n_cols - length(cells)))
    end

    # Columns beyond the header get generated names x1, x2, ...
    extra_names = ["x$(i - length(header))" for i in (length(header)+1):n_cols]
    col_labels = vcat(header, extra_names)

    # makeunique avoids an error on sheets with duplicated header cells.
    df = DataFrame([Symbol(col_labels[i]) => [row[i] for row in parsed_rows] for i in 1:n_cols];
                   makeunique=true)

    if col_select !== nothing
        df = df[:, col_select]
    end

    # A non-empty `missing_value` replaces every missing cell in the result.
    if missing_value != ""
        for col in names(df)
            df[!, col] = coalesce.(df[!, col], missing_value)
        end
    end

    return df
end

0 comments on commit f73eb88

Please sign in to comment.