Adding professional polish to your data visualizations (CC398)
In this livestream, Pat refactors a plot from a paper recently published in Nature Microbiology that had considerable room for improvement. This type of plot, showing LDA values generated using LEfSe, is frequently used in the microbiome literature. Here he refactors the plot to make it easier to interpret and to add some professional polish. He created the figure using R, dplyr, ggplot2, ggtext, and other tools from the tidyverse. The functions he used from these packages included aes, annotate, case_when, coord_cartesian, element_blank, element_line, element_text, fct_reorder, geom_col, geom_richtext, ggplot, ggsave, if_else, labs, library, margin, mutate, scale_fill_identity, scale_x_continuous, seq, str_remove, str_replace, theme, tibble, and unit. The original Nature Microbiology article can be found here (sadly not open access). If you have a figure that you would like to see me discuss in a future newsletter and episode of Code Club, email me at pat@riffomonas.org!
library(tidyverse)
library(glue)
library(ggtext)
# Full rank names keyed by the one-letter prefix used in the taxon strings.
rank_names <- c(
  p = "Phylum",
  c = "Class",
  o = "Order",
  f = "Family",
  g = "Genus",
  s = "Species"
)

# One row per taxon. Each name is "<rank letter>_<name>"; the LDA scores are
# generated with seq(): 25 positive values from 5.2 down to 3.8 followed by
# 7 negative values from -4.9 down to -5.8.
# NOTE(review): "g_Campylobacteria" and "c_Gammaproteabacteria" look like they
# may be transcription errors -- confirm against the original figure.
lda_scores <- tibble(
  taxa = c(
    "c_Bacteroidia", "p_Bacteroidota", "o_Bacteroidales",
    "f_Muribaculaceae", "f_Lactobacillaceae", "g_Lactobacillus",
    "o_Lactobacillales", "p_Firmicutes", "c_Bacilli",
    "s_Lactobacillus_murinus", "s_Lactobacillus_intestinalis",
    "s_Lactobacillus_johnsonii", "f_Rikenellaceae", "f_Prevotellaceae",
    "g_Alloprevotella", "s_Lactobacillus_reuteri", "o_Campylobacterales",
    "g_Campylobacteria", "p_Campylobacterota", "g_Helicobacter",
    "f_Helicobacteraceae", "g_Parasutterella", "f_Sutterellaceae",
    "o_Burkholderiales", "g_Alistipes", "s_Escherichia_coli",
    "g_Escherichia_Shigella", "g_Klebsiella",
    "p_Proteobacteria", "c_Gammaproteabacteria", "f_Enterobacteriaceae",
    "o_Enterobacterales"
  ),
  lda = c(
    seq(5.2, 3.8, length.out = 25),
    seq(-4.9, -5.8, length.out = 7)
  )
) %>%
  mutate(
    # Translate the leading rank letter into its full name; a letter that is
    # not in rank_names yields NA.
    tax_level = unname(rank_names[str_replace(taxa, "^(.)_.*", "\\1")]),
    # Drop the "<letter>_" prefix, then turn the first remaining underscore
    # into a space.
    taxa = str_replace(str_remove(taxa, "^._"), "_", " "),
    # Markdown trick: the asterisks close and reopen the italics that are
    # wrapped around genus names below, so the slash itself is not italic.
    taxa = if_else(
      taxa == "Escherichia Shigella",
      "Escherichia*/*Shigella",
      taxa
    ),
    # Genus and species names are italicised (markdown); other ranks are not.
    pretty_taxa = if_else(
      tax_level == "Genus" | tax_level == "Species",
      glue("*{taxa}* ({tax_level})"),
      glue("{taxa} ({tax_level})")
    ),
    # Order the labels along the y axis by LDA score.
    pretty_taxa = fct_reorder(pretty_taxa, lda),
    # Bar colour depends on the sign of the score.
    fill = if_else(lda > 0, "red", "blue"),
    # Each label sits just across the zero line from its bar: right-aligned
    # at x = -0.05 for positive scores, left-aligned at x = 0.05 otherwise.
    pretty_hjust = if_else(lda > 0, 1, 0),
    pretty_x = if_else(lda > 0, -0.05, 0.05)
  )

# The plot is left as a top-level expression so that it is displayed and
# becomes the plot that the ggsave() call below writes out.
ggplot(lda_scores, aes(x = lda, y = pretty_taxa, fill = fill)) +
  geom_col() +
  # Taxon labels rendered as markdown; y is inherited from the plot mapping.
  # The white, borderless label box is a fixed setting, not a mapped colour.
  geom_richtext(
    aes(x = pretty_x, label = pretty_taxa, hjust = pretty_hjust),
    fill = "white",
    label.color = NA,
    size = 3.5,
    label.padding = unit(c(2, 0, 2, 0), "pt")
  ) +
  # Group headings at y = 33.25, above the upper y limit of 32.5; they stay
  # visible because clipping is switched off in coord_cartesian().
  annotate(
    geom = "text",
    x = c(-3, 3),
    y = rep(33.25, 2),
    label = c("Ampicillin", "Specific Pathogen Free"),
    fontface = "bold"
  ) +
  # Use the colour names stored in the fill column as-is.
  scale_fill_identity() +
  scale_x_continuous(breaks = seq(-6, 6, by = 1.2)) +
  coord_cartesian(
    xlim = c(-6, 6),
    ylim = c(0.5, 32.5),
    expand = FALSE,
    clip = "off"
  ) +
  labs(
    x = "LDA Score (log10)",
    y = NULL
  ) +
  theme(
    axis.text.x = element_text(color = "black"),
    axis.text.y = element_blank(),
    axis.ticks = element_blank(),
    panel.background = element_blank(),
    # Remove every grid line, then restore only the major vertical ones.
    panel.grid = element_blank(),
    panel.grid.major.x = element_line(
      color = "gray",
      linewidth = 0.2,
      linetype = "dashed"
    ),
    # The larger top margin presumably makes room for the group headings.
    plot.margin = margin(t = 20, r = 10, b = 0, l = 10)
  )

ggsave("lda_plot.png", width = 5, height = 6)