#####################################################
### Replication Script for the JSS Article:
### collapse: Advanced and Fast Statistical Computing
### and Data Transformation in R
### By: Sebastian Krantz, IfW Kiel
### E-Mail: sebastian.krantz@ifw-kiel.de
#####################################################
###################################################
### code chunk number 0: Preliminaries
###################################################
options(prompt = "R> ", continue = "+ ", width = 77, digits = 4, useFancyQuotes = FALSE, warn = 1)
# Loading libraries and installing if unavailable
if(!requireNamespace("fastverse", quietly = TRUE)) install.packages("fastverse")
options(fastverse.styling = FALSE)
library(fastverse) # loads data.table, collapse, magrittr and kit (kit not used directly)
## -- Attaching packages ------------------------------------ fastverse 0.3.2 --
## v data.table 1.15.0 v kit 0.0.13
## v magrittr 2.0.3 v collapse 2.0.10
fastverse_extend(microbenchmark, Rfast, fixest, install = TRUE) # loads these packages, installing any that are missing
## -- Attaching extension packages -------------------------- fastverse 0.3.2 --
## Warning: package 'Rcpp' was built under R version 4.3.1
## v microbenchmark 1.4.10 v fixest 0.11.3
## v Rfast 2.1.0
## -- Conflicts --------------------------------------- fastverse_conflicts() --
## x fixest::fdim() masks collapse::fdim()
## x Rfast::group() masks collapse::group()
## x Rfast::transpose() masks data.table::transpose()
# Package versions used in the article:
# fastverse 0.3.2, collapse 2.0.10, data.table 1.15.0, magrittr 2.0.3,
# microbenchmark 1.4.10, Rfast 2.1.0, and fixest 0.11.3
###################################################
### code chunk number 1: collapse Topics and Documentation
###################################################
.COLLAPSE_TOPICS
## [1] "collapse-documentation" "fast-statistical-functions"
## [3] "fast-grouping-ordering" "fast-data-manipulation"
## [5] "quick-conversion" "advanced-aggregation"
## [7] "data-transformations" "time-series-panel-series"
## [9] "list-processing" "summary-statistics"
## [11] "recode-replace" "efficient-programming"
## [13] "small-helpers" "collapse-options"
help("collapse-documentation")
###################################################
### code chunk number 2: Fast Statistical Functions: Basic Examples
###################################################
fmean(mtcars$mpg)
## [1] 20.09
fmean(EuStockMarkets)
## DAX SMI CAC FTSE
## 2531 3376 2228 3566
fmean(mtcars[5:10])
## drat wt qsec vs am gear
## 3.5966 3.2173 17.8488 0.4375 0.4062 3.6875
fmean(mtcars$mpg, w = mtcars$wt)
## [1] 18.55
fmean(mtcars$mpg, g = mtcars$cyl)
## 4 6 8
## 26.66 19.74 15.10
fmean(mtcars$mpg, g = mtcars$cyl, w = mtcars$wt)
## 4 6 8
## 25.94 19.65 14.81
fmean(mtcars[5:10], g = mtcars$cyl, w = mtcars$wt)
## drat wt qsec vs am gear
## 4 4.031 2.415 19.38 0.9149 0.6498 4.047
## 6 3.569 3.152 18.12 0.6212 0.3788 3.821
## 8 3.206 4.133 16.89 0.0000 0.1204 3.241
fmean(mtcars$mpg, g = mtcars$cyl, TRA = "fill") |> head(20)
## [1] 19.74 19.74 26.66 19.74 15.10 19.74 15.10 26.66 26.66 19.74 19.74 15.10
## [13] 15.10 15.10 15.10 15.10 15.10 26.66 26.66 26.66
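# Optional cross-check (not in the article): the weighted and grouped means
# above agree with base R equivalents such as weighted.mean() and tapply().
with(mtcars, weighted.mean(mpg, wt))   # cf. fmean(mtcars$mpg, w = mtcars$wt)
with(mtcars, tapply(mpg, cyl, mean))   # cf. fmean(mtcars$mpg, g = mtcars$cyl)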
###################################################
### code chunk number 3: Airquality Dataset
###################################################
fnobs(airquality)
## Ozone Solar.R Wind Temp Month Day
## 116 146 153 153 153 153
###################################################
### code chunk number 4: Imputation by Reference
###################################################
fmedian(airquality[1:2], airquality$Month, TRA = "replace_na", set = TRUE)
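# Since set = TRUE imputes by reference, a quick follow-up check (not in the
# article): fnobs() should show that the missing values in Ozone and Solar.R
# have been filled with their monthly medians.
fnobs(airquality[1:2])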
###################################################
### code chunk number 5: Transformation Example
###################################################
airquality |> fmutate(
rad_day = fsum(as.double(Solar.R), Day, TRA = "/"),
ozone_deg = Ozone / Temp,
ozone_amed = Ozone > fmedian(Ozone, Month, TRA = "fill"),
ozone_resid = fmean(Ozone, list(Month, ozone_amed), ozone_deg, "-")
) |> head(3)
## Ozone Solar.R Wind Temp Month Day rad_day ozone_deg ozone_amed ozone_resid
## 1 41 190 7.4 67 5 1 0.191 0.6119 TRUE -10.279
## 2 36 118 8.0 72 5 2 0.135 0.5000 TRUE -15.279
## 3 12 149 12.6 74 5 3 0.168 0.1622 FALSE -3.035
###################################################
### code chunk number 6: GRP Objects
###################################################
str(g <- GRP(mtcars, ~ cyl + vs + am))
## Class 'GRP' hidden list of 9
## $ N.groups : int 7
## $ group.id : int [1:32] 4 4 3 5 6 5 6 2 2 5 ...
## $ group.sizes : int [1:7] 1 3 7 3 4 12 2
## $ groups :'data.frame': 7 obs. of 3 variables:
## ..$ cyl: num [1:7] 4 4 4 6 6 8 8
## ..$ vs : num [1:7] 0 1 1 0 1 0 0
## ..$ am : num [1:7] 1 0 1 1 0 0 1
## $ group.vars : chr [1:3] "cyl" "vs" "am"
## $ ordered : Named logi [1:2] TRUE FALSE
## ..- attr(*, "names")= chr [1:2] "ordered" "sorted"
## $ order : int [1:32] 27 8 9 21 3 18 19 20 26 28 ...
## ..- attr(*, "starts")= int [1:7] 1 2 5 12 15 19 31
## ..- attr(*, "maxgrpn")= int 12
## ..- attr(*, "sorted")= logi FALSE
## $ group.starts: int [1:7] 27 8 3 1 4 5 29
## $ call : language GRP.default(X = mtcars, by = ~cyl + vs + am)
###################################################
### code chunk number 7: Aggregation with GRP Objects
###################################################
dat <- get_vars(mtcars, c("mpg", "disp")); w <- mtcars$wt
add_vars(g$groups,
fmean(dat, g, w, use.g.names = FALSE) |> add_stub("w_mean_"),
fsd(dat, g, w, use.g.names = FALSE) |> add_stub("w_sd_")) |> head(2)
## cyl vs am w_mean_mpg w_mean_disp w_sd_mpg w_sd_disp
## 1 4 0 1 26.00 120.3 0.000 0.0
## 2 4 1 0 23.02 137.1 1.236 11.6
###################################################
### code chunk number 8: Transformation with GRP Objects
###################################################
mtcars |> add_vars(fmean(dat, g, w, "-") |> add_stub("w_demean_"),
fscale(dat, g, w) |> add_stub("w_scale_")) |> head(2)
## mpg cyl disp hp drat wt qsec vs am gear carb w_demean_mpg
## Mazda RX4 21 6 160 110 3.9 2.620 16.46 0 1 4 4 0.4357
## Mazda RX4 Wag 21 6 160 110 3.9 2.875 17.02 0 1 4 4 0.4357
## w_demean_disp w_scale_mpg w_scale_disp
## Mazda RX4 5.027 0.6657 0.6657
## Mazda RX4 Wag 5.027 0.6657 0.6657
###################################################
### code chunk number 9: fsummarise Integration
###################################################
mtcars |>
fsubset(mpg > 11) |>
fgroup_by(cyl, vs, am) |>
fsummarise(across(c(mpg, carb, hp), fmean),
qsec_w_med = fmean(qsec, wt)) |> head(2)
## cyl vs am mpg carb hp qsec_w_med
## 1 4 0 1 26.0 2.000 91.00 16.70
## 2 4 1 0 22.9 1.667 84.67 21.04
###################################################
### code chunk number 10: grouped_df Methods for Fast Statistical Functions
###################################################
mtcars |>
fsubset(mpg > 11, cyl, vs, am, mpg, carb, hp, wt) |>
fgroup_by(cyl, vs, am) |>
fmean(wt) |> head(2)
## cyl vs am sum.wt mpg carb hp
## 1 4 0 1 2.140 26.00 2.00 91.0
## 2 4 1 0 8.805 23.02 1.72 83.6
###################################################
### code chunk number 11: Vectorized Grouped Linear Regression
###################################################
mtcars |>
fgroup_by(vs) |>
fmutate(dm_carb = fmean(carb, TRA = "-")) |>
fsummarise(slope = fsum(mpg, dm_carb) %/=% fsum(dm_carb^2))
## vs slope
## 1 0 -0.5557
## 2 1 -2.0706
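# Sanity check (not in the article): the grouped slopes above are the OLS
# slopes of mpg on carb within each vs group, so lm() reproduces them.
sapply(split(mtcars, mtcars$vs), function(d) coef(lm(mpg ~ carb, data = d))["carb"])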
###################################################
### code chunk number 12: Advanced Weighted Group Statistics
###################################################
mtcars |>
fgroup_by(cyl, vs, am) |>
fmutate(o = radixorder(GRPid(), mpg)) |>
fsummarise(mpg_min = fmin(mpg),
mpg_Q1 = fnth(mpg, 0.25, wt, o = o, ties = "q8"),
mpg_mean = fmean(mpg, wt),
mpg_median = fmedian(mpg, wt, o = o, ties = "q8"),
mpg_mode = fmode(mpg, wt, ties = "max"),
mpg_Q3 = fnth(mpg, 0.75, wt, o = o, ties = "q8"),
mpg_max = fmax(mpg)) |> head(3)
## cyl vs am mpg_min mpg_Q1 mpg_mean mpg_median mpg_mode mpg_Q3 mpg_max
## 1 4 0 1 26.0 26.00 26.00 26.00 26.0 26.00 26.0
## 2 4 1 0 21.5 21.90 23.02 23.16 24.4 24.38 24.4
## 3 4 1 1 21.4 22.37 27.74 28.28 30.4 31.51 33.9
###################################################
### code chunk number 13: Data Aggregation with collap()
###################################################
collap(wlddev, country + PCGDP + LIFEEX ~ year + income, w = ~ POP) |>
head(4)
## country year income PCGDP LIFEEX POP
## 1 United States 1960 High income 12768.7 68.59 7.495e+08
## 2 Ethiopia 1960 Low income 658.5 38.33 1.474e+08
## 3 India 1960 Lower middle income 500.8 45.27 9.280e+08
## 4 China 1960 Upper middle income 1166.1 49.86 1.184e+09
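# The same aggregation written out with fgroup_by()/fsummarise() (a sketch,
# assuming collap()'s defaults: fmean for numeric and fmode for categorical
# columns, weighted by POP, with the weight variable itself summed):
wlddev |>
  fgroup_by(year, income) |>
  fsummarise(country = fmode(country, w = POP),
             PCGDP = fmean(PCGDP, w = POP),
             LIFEEX = fmean(LIFEEX, w = POP),
             POP = fsum(POP)) |> head(4)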
###################################################
### code chunk number 14: Growth Rate of Airmiles Time Series
###################################################
fgrowth(airmiles) |> round(2)
## Time Series:
## Start = 1937
## End = 1960
## Frequency = 1
## [1] NA 16.50 42.29 54.03 31.65 2.38 15.23 33.29 54.36 76.92 2.71 -2.10
## [13] 12.91 18.51 32.03 18.57 17.82 13.61 18.19 12.83 13.32 0.01 15.49 4.25
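# Cross-check (not in the article): fgrowth() computes growth rates in
# percent, i.e. 100 * (x/lag(x) - 1), which base R reproduces via diff():
am_vec <- as.numeric(airmiles)
all.equal(as.numeric(fgrowth(airmiles))[-1],
          100 * diff(am_vec) / am_vec[-length(am_vec)])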
###################################################
### code chunk number 15: Creating an Irregular Series and Demonstrating Indexation
###################################################
am_ir <- airmiles[-c(3, 15)]
t <- time(airmiles)[-c(3, 15)]
fgrowth(am_ir, t = t) |> round(2)
## [1] NA 16.50 NA 31.65 2.38 15.23 33.29 54.36 76.92 2.71 -2.10 12.91
## [13] 18.51 NA 17.82 13.61 18.19 12.83 13.32 0.01 15.49 4.25
fgrowth(am_ir, -1:3, t = t) |> head(4)
## FG1 -- G1 L2G1 L3G1
## [1,] -14.167 412 NA NA NA
## [2,] NA 480 16.50 NA NA
## [3,] -24.043 1052 NA 119.2 155.3
## [4,] -2.327 1385 31.65 NA 188.5
###################################################
### code chunk number 16: Ad-Hoc Transformations on World Bank Panel Data Supplied with collapse
###################################################
G(wlddev, c(1, 10), by = POP + LIFEEX ~ iso3c, t = ~ year) |> head(3)
## iso3c year G1.POP L10G1.POP G1.LIFEEX L10G1.LIFEEX
## 1 AFG 1960 NA NA NA NA
## 2 AFG 1961 1.917 NA 1.590 NA
## 3 AFG 1962 1.985 NA 1.544 NA
settransform(wlddev, POP_growth = G(POP, g = iso3c, t = year))
###################################################
### code chunk number 17: Integration with Data Manipulation Functions
###################################################
wlddev |> fgroup_by(iso3c) |> fselect(iso3c, year, POP, LIFEEX) |>
fmutate(across(c(POP, LIFEEX), G, t = year)) |> head(2)
## iso3c year POP LIFEEX G1.POP G1.LIFEEX
## 1 AFG 1960 8996973 32.45 NA NA
## 2 AFG 1961 9169410 32.96 1.917 1.59
###################################################
### code chunk number 18: Two Solutions for Grouped Scaling
###################################################
iris |> fgroup_by(Species) |> fscale() |> head(2)
## Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 setosa 0.2667 0.1899 -0.357 -0.4365
## 2 setosa -0.3007 -1.1291 -0.357 -0.4365
STD(iris, ~ Species) |> head(2)
## Species STD.Sepal.Length STD.Sepal.Width STD.Petal.Length STD.Petal.Width
## 1 setosa 0.2667 0.1899 -0.357 -0.4365
## 2 setosa -0.3007 -1.1291 -0.357 -0.4365
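# Quick check (not in the article): after grouped scaling each Species group
# has (approximately) zero mean and unit standard deviation.
fmean(STD(iris, ~ Species)[-1], g = iris$Species) |> round(3)
fsd(STD(iris, ~ Species)[-1], g = iris$Species)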
###################################################
### code chunk number 19: Fixed Effects Regression a la Mundlak (1978)
###################################################
lm(mpg ~ carb + B(carb, cyl), data = mtcars) |> coef()
## (Intercept) carb B(carb, cyl)
## 34.8297 -0.4655 -4.7750
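# By Mundlak (1978), the coefficient on carb above equals the within (fixed
# effects) estimator, here verified with explicit cyl dummies (check not in
# the article):
lm(mpg ~ carb + factor(cyl), data = mtcars) |> coef()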
###################################################
### code chunk number 20: Detrending with Country-Level Cubic Polynomials: Requires {fixest}
###################################################
HDW(wlddev, PCGDP + LIFEEX ~ iso3c * poly(year, 3), stub = FALSE) |> head(2)
## PCGDP LIFEEX
## 1 8.885 0.023614
## 2 13.685 0.006724
###################################################
### code chunk number 21: Indexed Frame
###################################################
wldi <- wlddev |> findex_by(iso3c, year)
wldi |> fsubset(-3, iso3c, year, PCGDP:POP) |> G() |> head(4)
## iso3c year G1.PCGDP G1.LIFEEX G1.GINI G1.ODA G1.POP
## 1 AFG 1960 NA NA NA NA NA
## 2 AFG 1961 NA 1.590 NA 98.75 1.917
## 3 AFG 1963 NA NA NA NA NA
## 4 AFG 1964 NA 1.448 NA 24.48 2.112
##
## Indexed by: iso3c [1] | year [4 (61)]
###################################################
### code chunk number 22: Indexed Series
###################################################
LIFEEXi <- wldi$LIFEEX
str(LIFEEXi, width = 70, strict = "cut")
## 'indexed_series' num [1:13176] 32.4 33 33.5 34 34.5 ...
## - attr(*, "label")= chr "Life expectancy at birth, total (years)"
## - attr(*, "index_df")=Classes 'index_df', 'pindex' and 'data.frame'..
## ..$ iso3c: Factor w/ 216 levels "ABW","AFG","AGO",..: 2 2 2 2 2 2 ..
## .. ..- attr(*, "label")= chr "Country Code"
## ..$ year : Ord.factor w/ 61 levels "1960"<"1961"<..: 1 2 3 4 5 6 7..
## .. ..- attr(*, "label")= chr "Year"
c(is_irregular(LIFEEXi), is_irregular(LIFEEXi[-5]))
## [1] FALSE TRUE
G(LIFEEXi[c(1:5, 7:10)])
## [1] NA 1.590 1.544 1.494 1.448 NA 1.366 1.362 1.365
##
## Indexed by: iso3c [1] | year [9 (61)]
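# The index spares us from passing g and t explicitly; a cross-check (not in
# the article) against the ad-hoc call on the full panel:
all.equal(unattrib(G(LIFEEXi)),
          unattrib(fgrowth(wlddev$LIFEEX, g = wlddev$iso3c, t = wlddev$year)))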
###################################################
### code chunk number 23: Demonstrating Deep Indexation
###################################################
settransform(wldi, PCGDP_ld = Dlog(PCGDP))
lm(D(LIFEEX) ~ L(PCGDP_ld, 0:5) + B(PCGDP_ld), wldi) |>
summary() |> coef() |> round(3)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.299 0.007 44.412 0.000
## L(PCGDP_ld, 0:5)-- 0.300 0.080 3.735 0.000
## L(PCGDP_ld, 0:5)L1 0.269 0.081 3.332 0.001
## L(PCGDP_ld, 0:5)L2 0.227 0.079 2.854 0.004
## L(PCGDP_ld, 0:5)L3 0.200 0.078 2.563 0.010
## L(PCGDP_ld, 0:5)L4 0.143 0.076 1.871 0.061
## L(PCGDP_ld, 0:5)L5 0.095 0.073 1.301 0.193
## B(PCGDP_ld) -1.021 0.316 -3.234 0.001
###################################################
### code chunk number 24: Using 3rd Party Functions: Rolling Average
###################################################
BY(LIFEEXi, findex(LIFEEXi)$iso3c, data.table::frollmean, 5) |> head(10)
## [1] NA NA NA NA 33.46 33.96 34.46 34.95 35.43 35.92
##
## Indexed by: iso3c [1] | year [10 (61)]
###################################################
### code chunk number 25: Joins: Adding Join Column
###################################################
df1 <- data.frame(id1 = c(1, 1, 2, 3),
id2 = c("a", "b", "b", "c"),
name = c("John", "Jane", "Bob", "Carl"),
age = c(35, 28, 42, 50))
df2 <- data.frame(id1 = c(1, 2, 3, 3),
id2 = c("a", "b", "c", "e"),
salary = c(60000, 55000, 70000, 80000),
dept = c("IT", "Marketing", "Sales", "IT"))
join(df1, df2, on = c("id1", "id2"), how = "full", column = TRUE)
## full join: df1[id1, id2] 3/4 (75%) <m:m> df2[id1, id2] 3/4 (75%)
## id1 id2 name age salary dept .join
## 1 1 a John 35 60000 IT matched
## 2 1 b Jane 28 NA <NA> df1
## 3 2 b Bob 42 55000 Marketing matched
## 4 3 c Carl 50 70000 Sales matched
## 5 3 e <NA> NA 80000 IT df2
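# join() also supports other join types via 'how', e.g. an inner join keeping
# only matched records (extra illustration, not in the article):
join(df1, df2, on = c("id1", "id2"), how = "inner")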
###################################################
### code chunk number 26: Validation + Join Attribute
###################################################
join(df1, df2, on = c("id1", "id2"), validate = "1:1", attr = "join") |>
attr("join") |> str(width = 70, strict = "cut")
## left join: df1[id1, id2] 3/4 (75%) <1:1> df2[id1, id2] 3/4 (75%)
## List of 3
## $ call : language join(x = df1, y = df2, on = c("id1", "id2"), v"..
## $ on.cols:List of 2
## ..$ x: chr [1:2] "id1" "id2"
## ..$ y: chr [1:2] "id1" "id2"
## $ match : 'qG' int [1:4] 1 NA 2 3
## ..- attr(*, "N.nomatch")= int 1
## ..- attr(*, "N.groups")= int 4
## ..- attr(*, "N.distinct")= int 3
###################################################
### code chunk number 27: Overidentification Warning
###################################################
df2$name <- df1$name # a shared non-key column provokes the overidentification warning
join(df1, df2) |> capture.output(type="m") |> strwrap(77) |> cat(sep="\n")
## Warning in fmatch(x[ixon], y[iyon], nomatch = NA_integer_, count = count, :
## Overidentified match/join: the first 2 of 3 columns uniquely match the
## records. With overid > 0, fmatch() continues to match columns. Consider
## removing columns or setting overid = 0 to terminate the algorithm after 2
## columns (the results may differ, see ?fmatch). Alternatively set overid = 2
## to silence this warning.
## left join: df1[id1, id2, name] 1/4 (25%) <m:m> df2[id1, id2, name] 1/4 (25%)
## id1 id2 name age salary dept
## 1 1 a John 35 60000 IT
## 2 1 b Jane 28 NA <NA>
## 3 2 b Bob 42 NA <NA>
## 4 3 c Carl 50 NA <NA>
###################################################
### code chunk number 28: Automatic Renaming
###################################################
join(df1, df2, on = c("id1", "id2"))
## left join: df1[id1, id2] 3/4 (75%) <m:m> df2[id1, id2] 3/4 (75%)
## duplicate columns: name => renamed using suffix '_df2' for y
## id1 id2 name age salary dept name_df2
## 1 1 a John 35 60000 IT John
## 2 1 b Jane 28 NA <NA> <NA>
## 3 2 b Bob 42 55000 Marketing Jane
## 4 3 c Carl 50 70000 Sales Bob
###################################################
### code chunk number 29: Data for Pivots
###################################################
data <- data.frame(type = rep(c("A", "B"), each = 2),
type_name = rep(c("Apples", "Bananas"), each = 2),
id = rep(1:2, 2), r = abs(rnorm(4)), h = abs(rnorm(4)*2))
setrelabel(data, id = "Fruit Id", r = "Fruit Radius", h = "Fruit Height")
print(data)
## type type_name id r h
## 1 A Apples 1 0.1465 3.1713
## 2 A Apples 2 0.4870 1.7201
## 3 B Bananas 1 0.7365 0.4584
## 4 B Bananas 2 2.4150 0.9972
vlabels(data)
## type type_name id r h
## NA NA "Fruit Id" "Fruit Radius" "Fruit Height"
###################################################
### code chunk number 30: Pivot Longer
###################################################
(dl <- pivot(data, ids = c("type", "type_name", "id"), labels = "label"))
## type type_name id variable label value
## 1 A Apples 1 r Fruit Radius 0.1465
## 2 A Apples 2 r Fruit Radius 0.4870
## 3 B Bananas 1 r Fruit Radius 0.7365
## 4 B Bananas 2 r Fruit Radius 2.4150
## 5 A Apples 1 h Fruit Height 3.1713
## 6 A Apples 2 h Fruit Height 1.7201
## 7 B Bananas 1 h Fruit Height 0.4584
## 8 B Bananas 2 h Fruit Height 0.9972
vlabels(dl)
## type type_name id variable label value
## NA NA "Fruit Id" NA NA NA
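# The long frame can be pivoted back to the original layout (a sketch,
# assuming 'wider' mode reassigns the stored labels via the 'labels' column):
pivot(dl, ids = c("type", "type_name", "id"), values = "value",
      names = "variable", labels = "label", how = "wider")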
###################################################
### code chunk number 31: Pivot Wider
###################################################
(dw <- pivot(data, "id", names = "type", labels = "type_name", how = "w"))
## id r_A r_B h_A h_B
## 1 1 0.1465 0.7365 3.171 0.4584
## 2 2 0.4870 2.4150 1.720 0.9972
namlab(dw)
## Variable Label
## 1 id Fruit Id
## 2 r_A Fruit Radius - Apples
## 3 r_B Fruit Radius - Bananas
## 4 h_A Fruit Height - Apples
## 5 h_B Fruit Height - Bananas
###################################################
### code chunk number 32: Pivot Recast
###################################################
(dr <- pivot(data, ids = "id", names = list(from = "type"),
labels = list(from = "type_name", to = "label"), how = "r"))
## id variable label A B
## 1 1 r Fruit Radius 0.1465 0.7365
## 2 2 r Fruit Radius 0.4870 2.4150
## 3 1 h Fruit Height 3.1713 0.4584
## 4 2 h Fruit Height 1.7201 0.9972
vlabels(dr)
## id variable label A B
## "Fruit Id" NA NA "Apples" "Bananas"
###################################################
### code chunk number 33: Recursive Splitting: Creates Nested List of Data Frames
###################################################
(dl <- mtcars |> rsplit(mpg + hp + carb ~ vs + am)) |> str(max.level = 2)
## List of 2
## $ 0:List of 2
## ..$ 0:'data.frame': 12 obs. of 3 variables:
## ..$ 1:'data.frame': 6 obs. of 3 variables:
## $ 1:List of 2
## ..$ 0:'data.frame': 7 obs. of 3 variables:
## ..$ 1:'data.frame': 7 obs. of 3 variables:
###################################################
### code chunk number 34: Fitting Linear Models and Obtaining Coefficient Matrices
###################################################
nest_lm_coef <- dl |>
rapply2d(lm, formula = mpg ~ .) |>
rapply2d(summary, classes = "lm") |>
get_elem("coefficients")
nest_lm_coef |> str(give.attr = FALSE, strict = "cut")
## List of 2
## $ 0:List of 2
## ..$ 0: num [1:3, 1:4] 15.8791 0.0683 -4.5715 3.655 0.0345 ...
## ..$ 1: num [1:3, 1:4] 26.9556 -0.0319 -0.308 2.293 0.0149 ...
## $ 1:List of 2
## ..$ 0: num [1:3, 1:4] 30.896903 -0.099403 -0.000332 3.346033 0.03587 ...
## ..$ 1: num [1:3, 1:4] 37.0012 -0.1155 0.4762 7.3316 0.0894 ...
###################################################
### code chunk number 35: Unlisting to Data Frame
###################################################
nest_lm_coef |> unlist2d(c("vs", "am"), row.names = "variable") |> head(2)
## vs am variable Estimate Std. Error t value Pr(>|t|)
## 1 0 0 (Intercept) 15.87915 3.65495 4.345 0.001865
## 2 0 0 hp 0.06832 0.03449 1.981 0.078938
###################################################
### code chunk number 36: Removing Generated Series (Hidden)
###################################################
wldi <- wldi[1:13]
###################################################
### code chunk number 37: Which Columns/Countries have Time Varying Information?
###################################################
varying(wldi)
## country date year decade region income OECD PCGDP LIFEEX
## FALSE TRUE TRUE TRUE FALSE FALSE FALSE TRUE TRUE
## GINI ODA POP
## TRUE TRUE TRUE
varying(wldi, any_group = FALSE) |> head(3)
## country date year decade region income OECD PCGDP LIFEEX GINI ODA POP
## ABW FALSE TRUE TRUE TRUE FALSE FALSE FALSE TRUE TRUE NA TRUE TRUE
## AFG FALSE TRUE TRUE TRUE FALSE FALSE FALSE TRUE TRUE NA TRUE TRUE
## AGO FALSE TRUE TRUE TRUE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE
###################################################
### code chunk number 38: Demonstrating Panel Decomposition
###################################################
all.equal(fvar(W(LIFEEXi)) + fvar(B(LIFEEXi)), fvar(LIFEEXi))
## [1] TRUE
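# W() and B() decompose the series as x = W(x) + B(x) into orthogonal within-
# and between-group components, which is why the variances add up. A direct
# check (not in the article):
all.equal(unattrib(W(LIFEEXi) + B(LIFEEXi)), unattrib(LIFEEXi))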
###################################################
### code chunk number 39: Panel Summary Statistics
###################################################
qsu(LIFEEXi)
## N/T Mean SD Min Max
## Overall 11670 64.2963 11.4764 18.907 85.4171
## Between 207 64.9537 9.8936 40.9663 85.4171
## Within 56.3768 64.2963 6.0842 32.9068 84.4198
###################################################
### code chunk number 40: Weighted Panel Summary Statistics by Groups
###################################################
qsu(LIFEEXi, g = wlddev$OECD, w = wlddev$POP, higher = TRUE) |> aperm()
## , , FALSE
##
## N/T Mean SD Min Max Skew Kurt
## Overall 9503 63.5476 9.2368 18.907 85.4171 -0.7394 2.7961
## Between 171 63.5476 6.0788 43.0905 85.4171 -0.8041 3.082
## Within 55.5731 65.8807 6.9545 30.3388 82.8832 -1.0323 4.0998
##
## , , TRUE
##
## N/T Mean SD Min Max Skew Kurt
## Overall 2156 74.9749 5.3627 45.369 84.3563 -1.2966 6.5505
## Between 36 74.9749 2.9256 66.2983 78.6733 -1.3534 4.5999
## Within 59.8889 65.8807 4.4944 44.9513 77.2733 -0.627 3.9839
###################################################
### code chunk number 41: Detailed (Grouped, Weighted) Statistical Description
###################################################
descr(wlddev, LIFEEX ~ OECD, w = ~ replace_na(POP))
## Dataset: wlddev, 1 Variables, N = 13176, WeightSum = 313233706778
## Grouped by: OECD [2]
## N Perc WeightSum Perc
## FALSE 10980 83.33 2.49344474e+11 79.6
## TRUE 2196 16.67 6.38892329e+10 20.4
## -----------------------------------------------------------------------------
## LIFEEX (numeric): Life expectancy at birth, total (years)
## Statistics (N = 11659, 11.51% NAs)
## N Perc Ndist Mean SD Min Max Skew Kurt
## FALSE 9503 81.51 8665 63.55 9.24 18.91 85.42 -0.74 2.8
## TRUE 2156 18.49 2016 74.97 5.36 45.37 84.36 -1.3 6.55
##
## Quantiles
## 1% 5% 10% 25% 50% 75% 90% 95% 99%
## FALSE 41.39 45.78 49.08 57.51 65.98 70.14 74.12 75.63 76.91
## TRUE 56.65 65.98 69.7 71.85 75.38 78.64 81.26 82.43 83.6
## -----------------------------------------------------------------------------
###################################################
### code chunk number 42: qtab: Basic Usage
###################################################
library(magrittr)
# World after 2015 (latest country data)
wlda15 <- wlddev |> fsubset(year >= 2015) |> fgroup_by(iso3c) |> flast()
wlda15 %$% qtab(OECD, income)
## income
## OECD High income Low income Lower middle income Upper middle income
## FALSE 45 30 47 58
## TRUE 34 0 0 2
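# For comparison (not in the article): base table() yields the same counts,
# but without the weighting options demonstrated below.
wlda15 %$% table(OECD, income)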
###################################################
### code chunk number 43: qtab: Population Counts
###################################################
wlda15 %$% qtab(OECD, income, w = POP) %>% divide_by(1e6)
## income
## OECD High income Low income Lower middle income Upper middle income
## FALSE 93.01 694.89 3063.54 2459.71
## TRUE 1098.75 0.00 0.00 211.01
###################################################
### code chunk number 44: qtab: Average Life Expectancy
###################################################
wlda15 %$% qtab(OECD, income, w = LIFEEX, wFUN = fmean) %>% replace_na(0)
## income
## OECD High income Low income Lower middle income Upper middle income
## FALSE 78.75 62.81 68.30 73.81
## TRUE 81.09 0.00 0.00 76.37
###################################################
### code chunk number 45: qtab: Population Weighted Average Life Expectancy
###################################################
wlda15 %$% qtab(OECD, income, w = LIFEEX, wFUN = fmean,
wFUN.args = list(w = POP)) %>% replace_na(0)
## income
## OECD High income Low income Lower middle income Upper middle income
## FALSE 77.91 63.81 68.76 75.93
## TRUE 81.13 0.00 0.00 76.10
###################################################
### code chunk number 46: Benchmark: Statistics and Data Manipulation
###################################################
setDTthreads(4)
set_collapse(na.rm = FALSE, sort = FALSE, nthreads = 4)
set.seed(101)
m <- matrix(rnorm(1e7), ncol = 1000)
data <- qDT(replicate(100, rnorm(1e5), simplify = FALSE))
g <- sample.int(1e4, 1e5, TRUE)
microbenchmark(R = colMeans(m),
Rfast = Rfast::colmeans(m, parallel = TRUE, cores = 4),
collapse = fmean(m))
## Warning in microbenchmark(R = colMeans(m), Rfast = Rfast::colmeans(m,
## parallel = TRUE, : less accurate nanosecond times to avoid potential integer
## overflows
## Unit: milliseconds
## expr min lq mean median uq max neval
## R 14.993 20.939 21.907 22.855 22.886 34.40 100
## Rfast 1.900 2.750 3.247 2.872 2.894 17.56 100
## collapse 1.304 1.365 1.688 1.413 1.506 11.55 100
microbenchmark(R = rowsum(data, g, reorder = FALSE),
data.table = data[, lapply(.SD, sum), by = g],
collapse = fsum(data, g))
## Unit: milliseconds
## expr min lq mean median uq max neval
## R 25.605 26.828 28.462 27.96 29.312 35.27 100
## data.table 20.276 26.763 29.495 28.69 30.079 98.10 100
## collapse 3.723 4.654 5.357 4.95 5.247 12.75 100
add_vars(data) <- g
microbenchmark(data.table = data[, lapply(.SD, median), by = g],
collapse = data |> fgroup_by(g) |> fmedian())
## Unit: milliseconds
## expr min lq mean median uq max neval
## data.table 251.56 265.33 274.76 276.70 283.5 312.6 100
## collapse 84.87 91.22 96.28 94.81 101.6 112.7 100
d <- data.table(g = unique(g), x = 1, y = 2, z = 3)
microbenchmark(data.table = d[data, on = "g"],
collapse = join(data, d, on = "g", verbose = 0))
## Unit: milliseconds
## expr min lq mean median uq max neval
## data.table 14.302 17.93 39.755 23.818 34.00 108.242 100
## collapse 2.902 3.14 3.438 3.309 3.79 4.086 100
microbenchmark(data.table = melt(data, "g"),
collapse = pivot(data, "g"))
## Unit: milliseconds
## expr min lq mean median uq max neval
## data.table 13.90 17.76 26.84 19.48 21.39 74.73 100
## collapse 13.89 17.40 26.00 18.98 20.48 86.71 100
settransform(data, id = rowid(g))
cols <- grep("^V", names(data), value = TRUE)
microbenchmark(data.table = dcast(data, g ~ id, value.var = cols),
collapse = pivot(data, ids = "g", names = "id", how = "w"))
## Unit: milliseconds
## expr min lq mean median uq max neval
## data.table 121.6 210.6 216.6 218.6 221.9 259.4 100
## collapse 112.3 137.8 143.4 142.9 148.8 184.1 100
###################################################
### code chunk number 47: Benchmark: Unique Values and Matching
###################################################
set.seed(101)
g_int <- sample.int(1e3, 1e7, replace = TRUE)
char <- c(letters, LETTERS, month.abb, month.name)
char <- outer(char, char, paste0)
g_char <- sample(char, 1e7, replace = TRUE)
microbenchmark(base_int = unique(g_int), collapse_int = funique(g_int),
base_char = unique(g_char), collapse_char = funique(g_char))
## Unit: milliseconds
## expr min lq mean median uq max neval
## base_int 92.95 98.78 103.21 102.06 104.94 171.41 100
## collapse_int 13.31 13.92 15.97 14.97 15.78 29.41 100
## base_char 143.94 150.15 159.89 161.40 166.20 231.22 100
## collapse_char 33.99 35.14 38.79 36.72 39.88 101.54 100
microbenchmark(base_int = match(g_int, 1:1000),
collapse_int = fmatch(g_int, 1:1000),
base_char = match(g_char, char),
data.table_char = chmatch(g_char, char),
collapse_char = fmatch(g_char, char), times = 10)
## Unit: milliseconds
## expr min lq mean median uq max neval
## base_int 42.31 42.87 48.17 46.14 49.44 69.59 10
## collapse_int 13.35 13.55 15.54 14.73 17.24 21.11 10
## base_char 123.00 124.22 134.12 128.18 139.92 176.19 10
## data.table_char 66.00 66.25 70.52 66.51 73.50 89.45 10
## collapse_char 43.25 43.34 47.29 43.60 46.45 64.42 10
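# Apart from speed, fmatch() returns the same integer positions as match();
# a correctness check (not in the article):
all.equal(match(g_char, char), unattrib(fmatch(g_char, char)))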
###################################################
### Print Session Information
###################################################
sessionInfo()
## R version 4.3.0 (2023-04-21)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.4.1
##
## Matrix products: default
## BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/Los_Angeles
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] fixest_0.11.3 Rfast_2.1.0 RcppParallel_5.1.7
## [4] RcppZiggurat_0.1.6 Rcpp_1.0.12 microbenchmark_1.4.10
## [7] collapse_2.0.10 kit_0.0.13 magrittr_2.0.3
## [10] data.table_1.15.0 fastverse_0.3.2
##
## loaded via a namespace (and not attached):
## [1] Formula_1.2-5 numDeriv_2016.8-1.1 xfun_0.39
## [4] lattice_0.21-8 stringmagic_1.0.0 zoo_1.8-12
## [7] knitr_1.43 parallel_4.3.0 dreamerr_1.4.0
## [10] sandwich_3.1-0 grid_4.3.0 compiler_4.3.0
## [13] rstudioapi_0.14 tools_4.3.0 nlme_3.1-162
## [16] evaluate_0.21