Skip to content

Commit e7d10d9

Browse files
authored
Merge pull request #362 from stan-dev/rootogram
adding discrete option to ppc_rootogram
2 parents 0efe3ec + e0bb2ee commit e7d10d9

File tree

7 files changed

+430
-97
lines changed

7 files changed

+430
-97
lines changed

NEWS.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
* Add `ppc_dots()` and `ppd_dots()` by @behramulukir (#357)
66
* Add `x` argument to `ppc_error_binned` by @behramulukir (#359)
77
* Add `x` argument to `ppc_error_scatter_avg()` by @behramulukir (#367)
8+
* Add `discrete` style to `ppc_rootogram` by @behramulukir (#362)
89

910
# bayesplot 1.13.0
1011

R/ppc-discrete.R

Lines changed: 194 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,12 @@
2121
#' @param size,fatten,linewidth For bar plots, `size`, `fatten`, and `linewidth`
2222
#' are passed to [ggplot2::geom_pointrange()] to control the appearance of the
2323
#' `yrep` points and intervals. For rootograms `size` is passed to
24-
#' [ggplot2::geom_line()].
24+
#' [ggplot2::geom_line()] and [ggplot2::geom_pointrange()].
2525
#' @param freq For bar plots only, if `TRUE` (the default) the y-axis will
2626
#' display counts. Setting `freq=FALSE` will put proportions on the y-axis.
27+
#' @param bound_distinct For `ppc_rootogram(style = "discrete)`,
28+
#' if `TRUE` then the observed counts will be plotted with different shapes
29+
#' depending on whether they are within the bounds of the `y` quantiles.
2730
#'
2831
#' @template return-ggplot-or-data
2932
#'
@@ -44,18 +47,29 @@
4447
#' }
4548
#' \item{`ppc_rootogram()`}{
4649
#' Rootograms allow for diagnosing problems in count data models such as
47-
#' overdispersion or excess zeros. They consist of a histogram of `y` with the
48-
#' expected counts based on `yrep` overlaid as a line along with uncertainty
49-
#' intervals. The y-axis represents the square roots of the counts to
50-
#' approximately adjust for scale differences and thus ease comparison between
51-
#' observed and expected counts. Using the `style` argument, the histogram
52-
#' style can be adjusted to focus on different aspects of the data:
50+
#' overdispersion or excess zeros. In `standing`, `hanging`, and `suspended`
51+
#' styles, they consist of a histogram of `y` with the expected counts based on
52+
#' `yrep` overlaid as a line along with uncertainty intervals.
53+
#'
54+
#' Meanwhile, in `discrete` style, median counts based on `yrep` are laid
55+
#' as a point range with uncertainty intervals along with dots
56+
#' representing the `y`.
57+
#'
58+
#' The y-axis represents the square roots of the counts to approximately
59+
#' adjust for scale differences and thus ease comparison between observed
60+
#' and expected counts. Using the `style` argument, the rootogram can be
61+
#' adjusted to focus on different aspects of the data:
5362
#' * _Standing_: basic histogram of observed counts with curve
5463
#' showing expected counts.
55-
#' * _Hanging_: observed counts counts hanging from the curve
64+
#' * _Hanging_: observed counts hanging from the curve
5665
#' representing expected counts.
5766
#' * _Suspended_: histogram of the differences between expected and
5867
#' observed counts.
68+
#' * _Discrete_: a dot-and-whisker plot of the median counts and
69+
#' dots representing observed counts.
70+
#'
71+
#' As it emphasizes the discrete nature of the count data,
72+
#' using `discrete` style is suggested.
5973
#'
6074
#' **All of the rootograms are plotted on the square root scale**. See Kleiber
6175
#' and Zeileis (2016) for advice on interpreting rootograms and selecting
@@ -198,22 +212,22 @@ ppc_bars_grouped <-
198212
fatten = 2.5,
199213
linewidth = 1,
200214
freq = TRUE) {
201-
check_ignored_arguments(...)
202-
call <- match.call(expand.dots = FALSE)
203-
g <- eval(ungroup_call("ppc_bars", call), parent.frame())
204-
if (fixed_y(facet_args)) {
205-
g <- g + expand_limits(y = 1.05 * max(g$data[["h"]], na.rm = TRUE))
215+
check_ignored_arguments(...)
216+
call <- match.call(expand.dots = FALSE)
217+
g <- eval(ungroup_call("ppc_bars", call), parent.frame())
218+
if (fixed_y(facet_args)) {
219+
g <- g + expand_limits(y = 1.05 * max(g$data[["h"]], na.rm = TRUE))
220+
}
221+
g +
222+
bars_group_facets(facet_args) +
223+
force_axes_in_facets()
206224
}
207-
g +
208-
bars_group_facets(facet_args) +
209-
force_axes_in_facets()
210-
}
211225

212226

213227
#' @rdname PPC-discrete
214228
#' @export
215229
#' @param style For `ppc_rootogram`, a string specifying the rootogram
216-
#' style. The options are `"standing"`, `"hanging"`, and
230+
#' style. The options are `"discrete"`, `"standing"`, `"hanging"`, and
217231
#' `"suspended"`. See the **Plot Descriptions** section, below, for
218232
#' details on the different styles.
219233
#'
@@ -234,61 +248,34 @@ ppc_bars_grouped <-
234248
#'
235249
#' ppc_rootogram(y, yrep, style = "hanging", prob = 0.8)
236250
#' ppc_rootogram(y, yrep, style = "suspended")
251+
#' ppc_rootogram(y, yrep, style = "discrete")
237252
#'
238253
ppc_rootogram <- function(y,
239254
yrep,
240-
style = c("standing", "hanging", "suspended"),
255+
style = c("standing", "hanging", "suspended", "discrete"),
241256
...,
242257
prob = 0.9,
243-
size = 1) {
258+
size = 1,
259+
bound_distinct = TRUE) {
244260
check_ignored_arguments(...)
245261
style <- match.arg(style)
246-
y <- validate_y(y)
247-
yrep <- validate_predictions(yrep, length(y))
248-
if (!all_counts(y)) {
249-
abort("ppc_rootogram expects counts as inputs to 'y'.")
250-
}
251-
if (!all_counts(yrep)) {
252-
abort("ppc_rootogram expects counts as inputs to 'yrep'.")
253-
}
254262

255-
alpha <- (1 - prob) / 2
256-
probs <- c(alpha, 1 - alpha)
257-
ymax <- max(y, yrep)
258-
xpos <- 0L:ymax
259-
260-
# prepare a table for yrep
261-
tyrep <- as.list(rep(NA, nrow(yrep)))
262-
for (i in seq_along(tyrep)) {
263-
tyrep[[i]] <- table(yrep[i,])
264-
matches <- match(xpos, rownames(tyrep[[i]]))
265-
tyrep[[i]] <- as.numeric(tyrep[[i]][matches])
266-
}
267-
tyrep <- do.call(rbind, tyrep)
268-
tyrep[is.na(tyrep)] <- 0
269-
tyexp <- sqrt(colMeans(tyrep))
270-
tyquantile <- sqrt(t(apply(tyrep, 2, quantile, probs = probs)))
271-
colnames(tyquantile) <- c("tylower", "tyupper")
272-
273-
# prepare a table for y
274-
ty <- table(y)
275-
ty <- sqrt(as.numeric(ty[match(xpos, rownames(ty))]))
276-
if (style == "suspended") {
277-
ty <- tyexp - ty
278-
}
279-
ty[is.na(ty)] <- 0
280-
ypos <- ty / 2
281-
if (style == "hanging") {
282-
ypos <- tyexp - ypos
283-
}
263+
data <- .ppc_rootogram_data(
264+
y = y,
265+
yrep = yrep,
266+
style = style,
267+
prob = prob,
268+
bound_distinct = bound_distinct
269+
)
284270

285-
data <- data.frame(xpos, ypos, ty, tyexp, tyquantile)
286-
graph <- ggplot(data) +
287-
aes(
288-
ymin = .data$tylower,
289-
ymax = .data$tyupper,
290-
height = .data$ty
291-
) +
271+
# Building geoms for y and y_rep
272+
geom_y <- if (style == "discrete") {
273+
geom_point(
274+
aes(y = .data$obs, shape = .data$obs_shape),
275+
size = size * 1.5,
276+
color = get_color("d"),
277+
fill = get_color("d"))
278+
} else {
292279
geom_tile(
293280
aes(
294281
x = .data$xpos,
@@ -298,34 +285,69 @@ ppc_rootogram <- function(y,
298285
color = get_color("lh"),
299286
linewidth = 0.25,
300287
width = 1
301-
) +
302-
bayesplot_theme_get()
303-
304-
if (style != "standing") {
305-
graph <- graph + hline_0(size = 0.4)
288+
)
306289
}
307290

308-
graph <- graph +
291+
geom_yrep <- if (style == "discrete") {
292+
geom_pointrange(
293+
aes(y = .data$pred_median, ymin = .data$lower, ymax = .data$upper, color = "y_rep"),
294+
fill = get_color("lh"),
295+
linewidth = size,
296+
size = size,
297+
fatten = 2,
298+
alpha = 1
299+
)
300+
} else {
309301
geom_smooth(
310-
aes(
311-
x = .data$xpos,
312-
y = .data$tyexp,
313-
color = "Expected"
314-
),
302+
aes(x = .data$xpos, y = .data$tyexp, color = "Expected"),
315303
fill = get_color("d"),
316304
linewidth = size,
317305
stat = "identity"
318-
) +
319-
scale_fill_manual("", values = get_color("l")) +
320-
scale_color_manual("", values = get_color("dh")) +
321-
labs(x = expression(italic(y)),
322-
y = expression(sqrt(Count)))
323-
324-
if (style == "standing") {
325-
graph <- graph + dont_expand_y_axis()
306+
)
326307
}
327308

328-
graph + reduce_legend_spacing(0.25)
309+
# Creating the graph
310+
graph <- ggplot(data)
311+
312+
if (style == "discrete") {
313+
graph <- graph +
314+
geom_yrep +
315+
geom_y +
316+
aes(x = .data$xpos) +
317+
scale_y_sqrt() +
318+
scale_fill_manual("", values = get_color("d"), guide = "none") +
319+
scale_color_manual("", values = get_color("lh"), labels = yrep_label()) +
320+
labs(x = expression(italic(y)), y = "Count") +
321+
bayesplot_theme_get() +
322+
reduce_legend_spacing(0.25) +
323+
scale_shape_manual(values = c("In" = 22, "Out" = 23, "y" = 22), guide = "legend", labels = c("y" = expression(italic(y))))
324+
if (bound_distinct) {
325+
graph <- graph + guides(shape = guide_legend(expression(italic(y)~within~bounds)))
326+
} else {
327+
graph <- graph + guides(shape = guide_legend(" "))
328+
}
329+
} else {
330+
graph <- graph +
331+
geom_y +
332+
geom_yrep +
333+
aes(
334+
ymin = .data$tylower,
335+
ymax = .data$tyupper,
336+
height = .data$ty
337+
) +
338+
scale_fill_manual("", values = get_color("l")) +
339+
scale_color_manual("", values = get_color("dh")) +
340+
labs(x = expression(italic(y)), y = expression(sqrt(Count))) +
341+
bayesplot_theme_get() +
342+
reduce_legend_spacing(0.25)
343+
if (style == "standing") {
344+
graph <- graph + dont_expand_y_axis()
345+
} else {
346+
graph <- graph + hline_0(size = 0.4)
347+
}
348+
}
349+
350+
return(graph)
329351
}
330352

331353

@@ -395,7 +417,7 @@ ppc_bars_data <-
395417
data <-
396418
reshape2::melt(tmp_data, id.vars = "group") %>%
397419
count(.data$group, .data$value, .data$variable) %>%
398-
tidyr::complete(.data$group, .data$value, .data$variable, fill = list(n = 0)) %>%
420+
tidyr::complete(.data$group, .data$value, .data$variable, fill = list(n = 0)) %>%
399421
group_by(.data$variable, .data$group) %>%
400422
mutate(proportion = .data$n / sum(.data$n)) %>%
401423
ungroup() %>%
@@ -440,3 +462,90 @@ bars_group_facets <- function(facet_args, scales_default = "fixed") {
440462
fixed_y <- function(facet_args) {
441463
!isTRUE(facet_args[["scales"]] %in% c("free", "free_y"))
442464
}
465+
466+
#' Internal function for `ppc_rootogram()`
467+
#' @param y,yrep User's `y` and `yrep` arguments.
468+
#' @param style,prob,bound_distinct User's `style`, `prob`, and
469+
#' (if applicable) `bound_distinct` arguments.
470+
#' @noRd
471+
.ppc_rootogram_data <- function(y,
472+
yrep,
473+
style = c("standing", "hanging", "suspended", "discrete"),
474+
prob = 0.9,
475+
bound_distinct) {
476+
477+
y <- validate_y(y)
478+
yrep <- validate_predictions(yrep, length(y))
479+
if (!all_counts(y)) {
480+
abort("ppc_rootogram expects counts as inputs to 'y'.")
481+
}
482+
if (!all_counts(yrep)) {
483+
abort("ppc_rootogram expects counts as inputs to 'yrep'.")
484+
}
485+
486+
alpha <- (1 - prob) / 2
487+
probs <- c(alpha, 1 - alpha)
488+
ymax <- max(y, yrep)
489+
xpos <- 0L:ymax
490+
491+
# prepare a table for yrep
492+
tyrep <- as.list(rep(NA, nrow(yrep)))
493+
for (i in seq_along(tyrep)) {
494+
tyrep[[i]] <- table(yrep[i,])
495+
matches <- match(xpos, rownames(tyrep[[i]]))
496+
tyrep[[i]] <- as.numeric(tyrep[[i]][matches])
497+
}
498+
tyrep <- do.call(rbind, tyrep)
499+
tyrep[is.na(tyrep)] <- 0
500+
501+
# discrete style
502+
if (style == "discrete"){
503+
pred_median <- apply(tyrep, 2, median)
504+
pred_quantile <- t(apply(tyrep, 2, quantile, probs = probs))
505+
colnames(pred_quantile) <- c("lower", "upper")
506+
507+
# prepare a table for y
508+
ty <- table(y)
509+
y_count <- as.numeric(ty[match(xpos, rownames(ty))])
510+
y_count[is.na(y_count)] <- 0
511+
512+
if (bound_distinct) {
513+
# If the observed count is within the bounds of the predicted quantiles,
514+
# use a different shape for the point
515+
obs_shape <- obs_shape <- ifelse(y_count >= pred_quantile[, "lower"] & y_count <= pred_quantile[, "upper"], "In", "Out")
516+
} else {
517+
obs_shape <- rep("y", length(y_count)) # all points are the same shape for observed
518+
}
519+
520+
data <- data.frame(
521+
xpos = xpos,
522+
obs = y_count,
523+
pred_median = pred_median,
524+
lower = pred_quantile[, "lower"],
525+
upper = pred_quantile[, "upper"],
526+
obs_shape = obs_shape
527+
)
528+
}
529+
# standing, hanging, suspended styles
530+
else {
531+
tyexp <- sqrt(colMeans(tyrep))
532+
tyquantile <- sqrt(t(apply(tyrep, 2, quantile, probs = probs)))
533+
colnames(tyquantile) <- c("tylower", "tyupper")
534+
535+
# prepare a table for y
536+
ty <- table(y)
537+
ty <- sqrt(as.numeric(ty[match(xpos, rownames(ty))]))
538+
if (style == "suspended") {
539+
ty <- tyexp - ty
540+
}
541+
ty[is.na(ty)] <- 0
542+
ypos <- ty / 2
543+
if (style == "hanging") {
544+
ypos <- tyexp - ypos
545+
}
546+
547+
data <- data.frame(xpos, ypos, ty, tyexp, tyquantile)
548+
}
549+
550+
return(data)
551+
}

0 commit comments

Comments
 (0)