Crie um gráfico em cascata (Waterfall Chart) usando ggplot

Este gráfico foi criado em resposta a essa pergunta no StackOverflow. O objetivo é replicar o seguinte gráfico: ! [] (waterfall_original.png)

O truque para plotar gráficos em cascata com ggplot2 é criar um conjunto de dados com os grupos (valores x - estou chamando isso no meu código como x.axis.Var) no ordem exata que você deseja plotar. Depois disso, você precisa obter os pontos inicial e final das barras para cada categoria (categorias em sua legenda - cat.Var) dentro dos grupos. Em seguida, crie outro grupo com os * totais por categoria *. Você também precisará de um índice numérico para os grupos manipularem as barras. Finalmente, obtenha uma coluna com o * total por grupo * para os números acima das barras.

Suponha que seu banco de dados esteja assim:

df <- 
  data.frame(
    x.axis.Var = rep(c("Widgets", "Gridgets", "Groms", "Wobs"), 3),
    cat.Var = rep(c("High End", "Mid Range", "Low End"), each = 4),
    values = c(600, 500, 300, 200, # high end
               300, 200, 300, 250, # mid range
               100, 80, 200, 150   # low end
               )
  )

Ou,

   x.axis.Var   cat.Var values
1     Widgets  High End    600
2    Gridgets  High End    500
3       Groms  High End    300
4        Wobs  High End    200
5     Widgets Mid Range    300
6    Gridgets Mid Range    200
7       Groms Mid Range    300
8        Wobs Mid Range    250
9     Widgets   Low End    100
10   Gridgets   Low End     80
11      Groms   Low End    200
12       Wobs   Low End    150

Siga os passos abaixo para obter um novo banco de dados:

library('tidyverse')

df.tmp <- df %>%
  # \_Set the factor levels in the order you want ----
  mutate(
    x.axis.Var = factor(x.axis.Var,
                        levels = c("Widgets", "Gridgets", "Groms", "Wobs")),
    cat.Var = factor(cat.Var,
                        levels = c("Low End", "Mid Range", "High End"))
  ) %>%
  # \_Sort by Group and Category ----
  arrange(x.axis.Var, desc(cat.Var)) %>%
  # \_Get the start and end points of the bars ----
  mutate(end.Bar = cumsum(values),
         start.Bar = c(0, head(end.Bar, -1))) %>%
  # \_Add a new Group called 'Total' with total by category ----
  rbind(
    df %>%
      # \___Sum by Categories ----
      group_by(cat.Var) %>% 
      summarise(values = sum(values)) %>%
      # \___Create new Group: 'Total' ----
      mutate(
        x.axis.Var = "Total",
        cat.Var = factor(cat.Var,
                         levels = c("Low End", "Mid Range", "High End"))
      ) %>%
      # \___Sort by Group and Category ----
      arrange(x.axis.Var, desc(cat.Var)) %>%
      # \___Get the start and end points of the bars ----
      mutate(end.Bar = cumsum(values),
             start.Bar = c(0, head(end.Bar, -1))) %>%
      # \___Put variables in the same order ----
      select(names(df),end.Bar,start.Bar)
  ) %>%
  # \_Get numeric index for the groups ----
  mutate(group.id = group_indices(., x.axis.Var)) %>%
  # \_Create new variable with total by group ----
  group_by(x.axis.Var) %>%
  mutate(total.by.x = sum(values)) %>%
  # \_Order the columns ----
  select(x.axis.Var, cat.Var, group.id, start.Bar, values, end.Bar, total.by.x)

Isso produz:

x.axis.Var cat.Var   group.id start.Bar values end.Bar total.by.x
   <fct>      <fct>        <int>     <dbl>  <dbl>   <dbl>      <dbl>
 1 Widgets    High End         1         0    600     600       1000
 2 Widgets    Mid Range        1       600    300     900       1000
 3 Widgets    Low End          1       900    100    1000       1000
 4 Gridgets   High End         2      1000    500    1500        780
 5 Gridgets   Mid Range        2      1500    200    1700        780
 6 Gridgets   Low End          2      1700     80    1780        780
 7 Groms      High End         3      1780    300    2080        800
 8 Groms      Mid Range        3      2080    300    2380        800
 9 Groms      Low End          3      2380    200    2580        800
10 Wobs       High End         4      2580    200    2780        600
11 Wobs       Mid Range        4      2780    250    3030        600
12 Wobs       Low End          4      3030    150    3180        600
13 Total      High End         5         0   1600    1600       3180
14 Total      Mid Range        5      1600   1050    2650       3180
15 Total      Low End          5      2650    530    3180       3180

Então, eu posso usar o seguinte código para obter o gráfico que eu quero:

ggplot(df.tmp, aes(x = group.id, fill = cat.Var)) + 
  # \_Simple Waterfall Chart ----
  geom_rect(aes(x = group.id,
                xmin = group.id - 0.25, # control bar gap width
                xmax = group.id + 0.25, 
                ymin = end.Bar,
                ymax = start.Bar),
            color="black", 
            alpha=0.95) + 
  # \_Lines Between Bars ----
  geom_segment(aes(x=ifelse(group.id == last(group.id),
                            last(group.id),
                            group.id+0.25), 
                   xend=ifelse(group.id == last(group.id),
                               last(group.id),
                               group.id+0.75), 
                   y=ifelse(cat.Var == "Low End",
                            end.Bar,
                            # these will be removed once we set the y limits
                            max(end.Bar)*2), 
                   yend=ifelse(cat.Var == "Low End",
                               end.Bar,
                               # these will be removed once we set the y limits
                               max(end.Bar)*2)), 
               colour="black") +
  # \_Numbers inside bars (each category) ----
  geom_text(
    mapping = 
      aes(
        label = ifelse(values < 150, 
                       "",
                       ifelse(nchar(values) == 3,
                              as.character(values),
                              sub("(.{1})(.*)", "\\1.\\2", 
                                  as.character(values)
                              )
                            )
                       ),
        y = rowSums(cbind(start.Bar,values/2))
        ),
    color = "white",
    fontface = "bold"
    ) + 
  # \_Total for each category above bars ----
  geom_text(
    mapping = 
      aes(
        label = ifelse(cat.Var != "Low End", 
                       "",
                       ifelse(nchar(total.by.x) == 3,
                              as.character(total.by.x),
                              sub("(.{1})(.*)", "\\1.\\2", 
                                  as.character(total.by.x)
                                )
                            )
                      ),
        y = end.Bar+200
      ),
    color = "#4e4d47",
    fontface = "bold"
  ) + 
  # \_Change colors ----
  scale_fill_manual(values=c('#c8f464','#ff6969','#55646e')) +
  # \_Change y axis to same scale as original ----
  scale_y_continuous(
    expand=c(0,0),
    limits = c(0, 3500),
    breaks = seq(0, 3500, 500),
    labels = ifelse(nchar(seq(0, 3500, 500)) < 4,
                    as.character(seq(0, 3500, 500)),
                    sub("(.{1})(.*)", "\\1.\\2", 
                        as.character(seq(0, 3500, 500))
                    )
    )
  ) +
  # \_Add tick marks on x axis to look like the original plot ----
  scale_x_continuous(
    expand=c(0,0),
    limits = c(min(df.tmp$group.id)-0.5,max(df.tmp$group.id)+0.5),
    breaks = c(min(df.tmp$group.id)-0.5,
               unique(df.tmp$group.id), 
               unique(df.tmp$group.id) + 0.5
               ),
    labels = 
      c("", 
        as.character(unique(df.tmp$x.axis.Var)), 
        rep(c(""), length(unique(df.tmp$x.axis.Var)))
      )
  ) +
  # \_Theme options to make it look like the original plot ----
  theme(
    text = element_text(size = 14, color = "#4e4d47"),
    axis.text = element_text(size = 10, color = "#4e4d47", face = "bold"),
    axis.text.y = element_text(margin = margin(r = 0.3, unit = "cm")),
    axis.ticks.x =
      element_line(color =
                     c("black",
                       rep(NA, length(unique(df.tmp$x.axis.Var))),
                       rep("black", length(unique(df.tmp$x.axis.Var))-1)
                     )
                   ),
    axis.line = element_line(colour = "#4e4d47", size = 0.5),
    axis.ticks.length = unit(.15, "cm"),
    axis.title.x =       element_blank(),
    axis.title.y =       element_blank(),
    panel.background =   element_blank(),
    plot.margin =        unit(c(1, 1, 1, 1), "lines"),
    legend.text =        element_text(size = 10, 
                                      color = "#4e4d47",
                                      face = "bold",
                                      margin = margin(l = 0.25, unit = "cm")
                                      ),
    legend.title =       element_blank()
  )

E o gráfico final: