Skip to content

Commit

Permalink
feat: support argument .groups in summarize() (#149)
Browse files Browse the repository at this point in the history
* docs: add additional repo in installation method

* init
  • Loading branch information
etiennebacher authored Oct 18, 2024
1 parent 9144fbf commit 1f40ea5
Show file tree
Hide file tree
Showing 9 changed files with 248 additions and 46 deletions.
13 changes: 13 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,16 @@
# tidypolars (development)

## Breaking changes

* `summarize()` now drops the last group of the output by default (for
consistency with `dplyr`). Previously it kept the same groups as in the input
data (#149).

## New features

* Add support for argument `.groups` in `summarize()`. Value `"rowwise"` is not
supported for now (#149).

# tidypolars 0.11.0

`tidypolars` requires `polars` >= 0.20.0.
Expand Down
35 changes: 30 additions & 5 deletions R/summarize.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,16 @@
#'
#' @param .data A Polars Data/LazyFrame
#' @inheritParams mutate.RPolarsDataFrame
#' @param .groups Grouping structure of the result. Must be one of:
#' * `"drop_last"` (default): drop the last level of grouping;
#' * `"drop"`: all levels of grouping are dropped;
#' * `"keep"`: keep the same grouping structure as `.data`.
#'
#' For now, `"rowwise"` is not supported. Note that `dplyr` uses `.groups =
#' NULL` by default, whose behavior depends on the number of rows per group in
#' the output. However, returning several rows per group in `summarize()` is
#' deprecated (one should use `reframe()` instead), which is why `.groups =
#' NULL` is not supported by `tidypolars`.
#'
#' @export
#' @examplesIf require("dplyr", quietly = TRUE) && require("tidyr", quietly = TRUE)
Expand All @@ -19,17 +29,23 @@
#' mtcars |>
#' as_polars_df() |>
#' summarize(m_gear = mean(gear), sd_gear = sd(gear), .by = cyl)

summarize.RPolarsDataFrame <- function(.data, ..., .by = NULL) {

summarize.RPolarsDataFrame <- function(.data, ..., .by = NULL, .groups = "drop_last") {
grps <- get_grps(.data, rlang::enquo(.by), env = rlang::current_env())
mo <- attributes(.data)$maintain_grp_order
if (is.null(mo)) mo <- FALSE
is_grouped <- !is.null(grps)
is_rowwise <- attributes(.data)$grp_type == "rowwise"

# do not take the groups into account, especially useful when applying across()
# on everything()
# Technically, .groups can be NULL and then the value depends on the number
# of rows for each group after aggregation, but returning multiple rows is
# deprecated so I only use those 4 values.
.groups <- rlang::arg_match0(.groups, values = c("drop_last", "drop", "keep", "rowwise"))
if (.groups == "rowwise") {
abort("`tidypolars` doesn't support `.groups = \"rowwise\"` for now.")
}

# Do not take the groups into account, especially useful when applying across()
# on everything().
.data_for_translation <- select(.data, -all_of(grps))
polars_exprs <- translate_dots(
.data = .data_for_translation,
Expand Down Expand Up @@ -57,6 +73,15 @@ summarize.RPolarsDataFrame <- function(.data, ..., .by = NULL) {
}

out <- if (is_grouped && missing(.by)) {
grps <- switch(.groups,
"drop_last" = grps[-length(grps)],
"drop" = character(0),
"keep" = grps,
abort("Unreachable")
)
if (length(grps) == 0) {
return(.data)
}
group_by(.data, all_of(grps), maintain_order = mo)
} else if (isTRUE(is_rowwise)) {
rowwise(.data)
Expand Down
2 changes: 1 addition & 1 deletion README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ R-universe.

```{r eval=FALSE}
Sys.setenv(NOT_CRAN = "true")
install.packages("tidypolars", repos = "https://community.r-multiverse.org")
install.packages("tidypolars", repos = c("https://community.r-multiverse.org", 'https://cloud.r-project.org'))
```


Expand Down
29 changes: 15 additions & 14 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ thorough, representative benchmarks about `polars`, take a look at

``` r
library(collapse, warn.conflicts = FALSE)
#> collapse 2.0.15, see ?`collapse-package` or ?`collapse-documentation`
#> collapse 2.0.16, see ?`collapse-package` or ?`collapse-documentation`
library(dplyr, warn.conflicts = FALSE)
library(dtplyr)
library(polars)
Expand Down Expand Up @@ -114,16 +114,17 @@ bench::mark(
check = FALSE,
iterations = 40
)
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> Warning: Some expressions had a GC in every iteration;
#> so filtering is disabled.
#> # A tibble: 5 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 polars 142.5ms 173.96ms 4.43 4.51MB 0.222
#> 2 tidypolars 161.9ms 206.56ms 4.70 1.78MB 2.00
#> 3 dplyr 3.8s 4.07s 0.231 1.79GB 0.554
#> 4 dtplyr 810.6ms 1s 0.999 1.72GB 2.82
#> 5 collapse 400.8ms 493.3ms 1.97 745.96MB 1.33
#> expression min median `itr/sec` mem_alloc
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt>
#> 1 polars 260.22ms 317.05ms 3.03 19.2KB
#> 2 tidypolars 305.11ms 362.84ms 2.21 157.66KB
#> 3 dplyr 2.85s 3.19s 0.290 1.79GB
#> 4 dtplyr 1.36s 2.53s 0.416 1.72GB
#> 5 collapse 662.73ms 825.88ms 1.21 745.96MB
#> # ℹ 1 more variable: `gc/sec` <dbl>

# NOTE: do NOT take the "mem_alloc" results into account.
# `bench::mark()` doesn't report the accurate memory usage for packages calling
Expand All @@ -134,13 +135,13 @@ bench::mark(

## Installation

`tidypolars` is built on `polars`, which is not available on CRAN. This means
that `tidypolars` also can't be on CRAN. However, you can install it from
R-universe.
`tidypolars` is built on `polars`, which is not available on CRAN. This
means that `tidypolars` also can't be on CRAN. However, you can install
it from R-multiverse.

``` r
Sys.setenv(NOT_CRAN = "true")
install.packages("tidypolars", repos = "https://community.r-multiverse.org")
install.packages("tidypolars", repos = c("https://community.r-multiverse.org", 'https://cloud.r-project.org'))
```

## Contributing
Expand Down
19 changes: 15 additions & 4 deletions man/summarize.RPolarsDataFrame.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 16 additions & 0 deletions tests/testthat/_snaps/summarize-lazy.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# argument .groups works

Code
current$collect()
Condition
Error in `summarise()`:
! `tidypolars` doesn't support `.groups = "rowwise"` for now.

---

Code
current$collect()
Condition
Error in `summarise()`:
! `.groups` must be one of "drop_last", "drop", "keep", or "rowwise", not "foobar".

16 changes: 16 additions & 0 deletions tests/testthat/_snaps/summarize.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# argument .groups works

Code
summarise(group_by(pl_mtcars, am, cyl, vs), cyl_n = n(), .groups = "rowwise")
Condition
Error in `summarise()`:
! `tidypolars` doesn't support `.groups = "rowwise"` for now.

---

Code
summarise(group_by(pl_mtcars, am, cyl, vs), cyl_n = n(), .groups = "foobar")
Condition
Error in `summarise()`:
! `.groups` must be one of "drop_last", "drop", "keep", or "rowwise", not "foobar".

82 changes: 71 additions & 11 deletions tests/testthat/test-summarize-lazy.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,9 @@ test_that("basic behavior works", {

expect_equal_lazy(
summarize(pl_iris_g,
x = sum(Sepal.Length),
y = mean(Sepal.Length)) |>
x = sum(Sepal.Length),
y = mean(Sepal.Length)
) |>
pull(y),
c(5.006, 5.936, 6.588)
)
Expand All @@ -49,35 +50,37 @@ test_that("basic behavior works", {
})

test_that("correctly handles attributes", {
pl_iris <- polars::pl$LazyFrame(iris)
pl_iris_g <- pl_iris |>
group_by(Species, maintain_order = TRUE)
pl_mtcars <- polars::pl$LazyFrame(mtcars)
pl_mtcars_g <- pl_mtcars |>
group_by(cyl, am, maintain_order = TRUE)

expect_equal_lazy(
summarize(pl_iris_g, x = mean(Sepal.Length)) |>
summarize(pl_mtcars_g, x = mean(mpg)) |>
attr("pl_grps"),
"Species"
"cyl"
)

expect_equal_lazy(
summarize(pl_iris_g, x = mean(Sepal.Length)) |>
summarize(pl_mtcars_g, x = mean(mpg)) |>
attr("maintain_grp_order"),
TRUE
)

expect_equal_lazy(
summarize(pl_iris, x = mean(Sepal.Length), .by = Species) |>
summarize(pl_mtcars, x = mean(mpg), .by = c(cyl, am)) |>
attr("pl_grps"),
NULL
)

expect_equal_lazy(
summarize(pl_iris, x = mean(Sepal.Length), .by = Species) |>
summarize(pl_mtcars, x = mean(mpg), .by = c(cyl, am)) |>
attr("maintain_grp_order"),
NULL
)

expect_is_tidypolars(summarize(pl_iris, x = mean(Sepal.Length), .by = Species))
expect_is_tidypolars(
summarize(pl_mtcars, x = mean(mpg), .by = c(cyl, am))
)
})

test_that("works with a local variable defined in a function", {
Expand Down Expand Up @@ -112,4 +115,61 @@ test_that("check .add argument of group_by works", {
)
})

# Exercises the `.groups` argument of `summarise()` on a grouped LazyFrame.
# Each equality check aggregates with `cyl_n = n()` and then compares the
# grouping variables reported by `group_vars()` with the expected vector; the
# two snapshot checks record the error raised for values that are rejected.
test_that("argument .groups works", {
pl_mtcars <- as_polars_lf(mtcars)

# default is "drop_last"
# Grouped by (am, cyl, vs) with no `.groups` given: the last grouping
# variable (`vs`) is expected to be gone from the result.
expect_equal_lazy(
pl_mtcars |>
group_by(am, cyl, vs) |>
summarise(cyl_n = n()) |>
group_vars(),
c("am", "cyl")
)

# other values
# Explicit "drop_last" must give the same grouping as the default above.
expect_equal_lazy(
pl_mtcars |>
group_by(am, cyl, vs) |>
summarise(cyl_n = n(), .groups = "drop_last") |>
group_vars(),
c("am", "cyl")
)
# "keep": all three input grouping variables are expected in the output.
expect_equal_lazy(
pl_mtcars |>
group_by(am, cyl, vs) |>
summarise(cyl_n = n(), .groups = "keep") |>
group_vars(),
c("am", "cyl", "vs")
)
# "drop": no grouping variable is expected to remain.
expect_equal_lazy(
pl_mtcars |>
group_by(am, cyl, vs) |>
summarise(cyl_n = n(), .groups = "drop") |>
group_vars(),
character(0)
)
# "rowwise" is expected to raise an error (`error = TRUE`); the message is
# compared against the recorded snapshot.
expect_snapshot_lazy(
pl_mtcars |>
group_by(am, cyl, vs) |>
summarise(cyl_n = n(), .groups = "rowwise"),
error = TRUE
)
# A value outside the accepted set ("foobar") is also expected to error.
expect_snapshot_lazy(
pl_mtcars |>
group_by(am, cyl, vs) |>
summarise(cyl_n = n(), .groups = "foobar"),
error = TRUE
)

# "drop_last" with one group originally
# Dropping the only grouping variable (`am`) should leave no groups at all.
expect_equal_lazy(
pl_mtcars |>
group_by(am) |>
summarise(cyl_n = n(), .groups = "drop_last") |>
group_vars(),
character(0)
)
})

Sys.setenv('TIDYPOLARS_TEST' = FALSE)
Loading

0 comments on commit 1f40ea5

Please sign in to comment.