Compute expressions over the given groups
Description
This expression is similar to performing a group by aggregation and joining the result back into the original DataFrame. The outcome is similar to how window functions work in PostgreSQL.
Usage
<Expr>$over(..., mapping_strategy = "group_to_rows")
Arguments
…
|
Column(s) to group by. Accepts expression input. Characters are parsed as column names. |
mapping_strategy
|
One of the following:
|
Value
Expr
Examples
library(polars)
# Pass the name of a column to compute the expression over that column.
df = pl$DataFrame(
a = c("a", "a", "b", "b", "b"),
b = c(1, 2, 3, 5, 3),
c = c(5, 4, 2, 1, 3)
)
df$with_columns(
pl$col("c")$max()$over("a")$name$suffix("_max")
)
#> shape: (5, 4)
#> ┌─────┬─────┬─────┬───────┐
#> │ a ┆ b ┆ c ┆ c_max │
#> │ --- ┆ --- ┆ --- ┆ --- │
#> │ str ┆ f64 ┆ f64 ┆ f64 │
#> ╞═════╪═════╪═════╪═══════╡
#> │ a ┆ 1.0 ┆ 5.0 ┆ 5.0 │
#> │ a ┆ 2.0 ┆ 4.0 ┆ 5.0 │
#> │ b ┆ 3.0 ┆ 2.0 ┆ 3.0 │
#> │ b ┆ 5.0 ┆ 1.0 ┆ 3.0 │
#> │ b ┆ 3.0 ┆ 3.0 ┆ 3.0 │
#> └─────┴─────┴─────┴───────┘
# Expression input is supported.
df$with_columns(
pl$col("c")$max()$over(pl$col("b") %/% 2)$name$suffix("_max")
)
#> shape: (5, 4)
#> ┌─────┬─────┬─────┬───────┐
#> │ a ┆ b ┆ c ┆ c_max │
#> │ --- ┆ --- ┆ --- ┆ --- │
#> │ str ┆ f64 ┆ f64 ┆ f64 │
#> ╞═════╪═════╪═════╪═══════╡
#> │ a ┆ 1.0 ┆ 5.0 ┆ 5.0 │
#> │ a ┆ 2.0 ┆ 4.0 ┆ 4.0 │
#> │ b ┆ 3.0 ┆ 2.0 ┆ 4.0 │
#> │ b ┆ 5.0 ┆ 1.0 ┆ 1.0 │
#> │ b ┆ 3.0 ┆ 3.0 ┆ 4.0 │
#> └─────┴─────┴─────┴───────┘
# Group by multiple columns by passing a character vector of column names
# or list of expressions.
df$with_columns(
pl$col("c")$min()$over(c("a", "b"))$name$suffix("_min")
)
#> shape: (5, 4)
#> ┌─────┬─────┬─────┬───────┐
#> │ a ┆ b ┆ c ┆ c_min │
#> │ --- ┆ --- ┆ --- ┆ --- │
#> │ str ┆ f64 ┆ f64 ┆ f64 │
#> ╞═════╪═════╪═════╪═══════╡
#> │ a ┆ 1.0 ┆ 5.0 ┆ 5.0 │
#> │ a ┆ 2.0 ┆ 4.0 ┆ 4.0 │
#> │ b ┆ 3.0 ┆ 2.0 ┆ 2.0 │
#> │ b ┆ 5.0 ┆ 1.0 ┆ 1.0 │
#> │ b ┆ 3.0 ┆ 3.0 ┆ 2.0 │
#> └─────┴─────┴─────┴───────┘
#> shape: (5, 4)
#> ┌─────┬─────┬─────┬───────┐
#> │ a ┆ b ┆ c ┆ c_min │
#> │ --- ┆ --- ┆ --- ┆ --- │
#> │ str ┆ f64 ┆ f64 ┆ f64 │
#> ╞═════╪═════╪═════╪═══════╡
#> │ a ┆ 1.0 ┆ 5.0 ┆ 5.0 │
#> │ a ┆ 2.0 ┆ 4.0 ┆ 4.0 │
#> │ b ┆ 3.0 ┆ 2.0 ┆ 2.0 │
#> │ b ┆ 5.0 ┆ 1.0 ┆ 1.0 │
#> │ b ┆ 3.0 ┆ 3.0 ┆ 2.0 │
#> └─────┴─────┴─────┴───────┘
# Or use positional arguments to group by multiple columns in the same way.
df$with_columns(
pl$col("c")$min()$over("a", pl$col("b") %% 2)$name$suffix("_min")
)
#> shape: (5, 4)
#> ┌─────┬─────┬─────┬───────┐
#> │ a ┆ b ┆ c ┆ c_min │
#> │ --- ┆ --- ┆ --- ┆ --- │
#> │ str ┆ f64 ┆ f64 ┆ f64 │
#> ╞═════╪═════╪═════╪═══════╡
#> │ a ┆ 1.0 ┆ 5.0 ┆ 5.0 │
#> │ a ┆ 2.0 ┆ 4.0 ┆ 4.0 │
#> │ b ┆ 3.0 ┆ 2.0 ┆ 1.0 │
#> │ b ┆ 5.0 ┆ 1.0 ┆ 1.0 │
#> │ b ┆ 3.0 ┆ 3.0 ┆ 1.0 │
#> └─────┴─────┴─────┴───────┘
# Alternative mapping strategy: join values in a list output
df$with_columns(
top_2 = pl$col("c")$top_k(2)$over("a", mapping_strategy = "join")
)
#> shape: (5, 4)
#> ┌─────┬─────┬─────┬────────────┐
#> │ a ┆ b ┆ c ┆ top_2 │
#> │ --- ┆ --- ┆ --- ┆ --- │
#> │ str ┆ f64 ┆ f64 ┆ list[f64] │
#> ╞═════╪═════╪═════╪════════════╡
#> │ a ┆ 1.0 ┆ 5.0 ┆ [5.0, 4.0] │
#> │ a ┆ 2.0 ┆ 4.0 ┆ [5.0, 4.0] │
#> │ b ┆ 3.0 ┆ 2.0 ┆ [3.0, 2.0] │
#> │ b ┆ 5.0 ┆ 1.0 ┆ [3.0, 2.0] │
#> │ b ┆ 3.0 ┆ 3.0 ┆ [3.0, 2.0] │
#> └─────┴─────┴─────┴────────────┘