Testing for significance with microbiome data on individual taxa using R (CC122)
Code
This is the R script that I generated in this episode
library(tidyverse)
library(broom)
library(ggtext)
set.seed(19760620)
shared <- read_tsv("raw_data/baxter.subsample.shared",
col_types = cols(Group = col_character(),
.default = col_double())) %>%
rename_all(tolower) %>%
select(group, starts_with("otu")) %>%
pivot_longer(-group, names_to="otu", values_to="count")
taxonomy <- read_tsv("raw_data/baxter.cons.taxonomy") %>%
rename_all(tolower) %>%
select(otu, taxonomy) %>%
mutate(otu = tolower(otu),
taxonomy = str_replace_all(taxonomy, "\\(\\d+\\)", ""),
taxonomy = str_replace(taxonomy, ";unclassified", "_unclassified"),
taxonomy = str_replace_all(taxonomy, ";unclassified", ""),
taxonomy = str_replace_all(taxonomy, ";$", ""),
taxonomy = str_replace_all(taxonomy, ".*;", "")
)
metadata <- read_tsv("raw_data/baxter.metadata.tsv",
col_types=cols(sample = col_character())) %>%
rename_all(tolower) %>%
rename(group = sample) %>%
mutate(srn = dx_bin == "Adv Adenoma" | dx_bin == "Cancer",
lesion = dx_bin == "Adv Adenoma" | dx_bin == "Cancer" | dx_bin == "Adenoma")
composite <- inner_join(shared, taxonomy, by="otu") %>%
group_by(group, taxonomy) %>%
summarize(count = sum(count), .groups="drop") %>%
group_by(group) %>%
mutate(rel_abund = count / sum(count)) %>%
ungroup() %>%
select(-count) %>%
inner_join(., metadata, by="group")
sig_genera <- composite %>%
nest(data = -taxonomy) %>%
mutate(test = map(.x=data, ~wilcox.test(rel_abund~srn, data=.x) %>% tidy)) %>%
unnest(test) %>%
mutate(p.adjust = p.adjust(p.value, method="BH")) %>%
filter(p.adjust < 0.05) %>%
select(taxonomy, p.adjust)
composite %>%
inner_join(sig_genera, by="taxonomy") %>%
mutate(rel_abund = 100 * (rel_abund + 1/20000),
taxonomy = str_replace(taxonomy, "(.*)", "*\\1*"),
taxonomy = str_replace(taxonomy, "\\*(.*)_unclassified\\*",
"Unclassified<br>*\\1*"),
srn = factor(srn, levels = c(T, F))) %>%
ggplot(aes(x=rel_abund, y=taxonomy, color=srn, fill=srn)) +
geom_vline(xintercept = 100/10530, size=0.5, color="gray") +
geom_jitter(position = position_jitterdodge(dodge.width = 0.8,
jitter.width = 0.5),
shape=21) +
stat_summary(fun.data = median_hilow, fun.args = list(conf.int=0.5),
geom="pointrange",
position = position_dodge(width=0.8),
color="black", show.legend = FALSE) +
scale_x_log10() +
scale_color_manual(NULL,
breaks = c(F, T),
values = c("gray", "dodgerblue"),
labels = c("Healthy", "SRN")) +
scale_fill_manual(NULL,
breaks = c(F, T),
values = c("gray", "dodgerblue"),
labels = c("Healthy", "SRN")) +
labs(x= "Relative abundance (%)", y=NULL) +
theme_classic() +
theme(
axis.text.y = element_markdown()
)
ggsave("figures/significant_genera.tiff", width=6, height=4)
Installations
If you haven’t been following along, you can get caught up by doing the following:
- (windows) Install the Ubuntu Linux BASH shell for Windows 10
- (mac) Install
homebrew
andgit
/usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" brew install git
- To get to where we are at the beginning of this episode (you won’t have the same issue numbers at Pat)…
- Set up a GitHub account
- Create a new GitHub repository
- Call it “mikropml_demo”
- Make it Public
- Don’t check the box next to “Initialize this repository with a README”
- Click the green “Create repository” button
-
Go to your command line and enter the following replacing
<your_github_id>
with your GitHub user idgit clone git@github.com:SchlossLab/mikropml_demo.git cd mikropml_demo git reset --hard 793d7973aca94e22aed4e1133094971d8e99f2f2 git remote set-url origin git@github.com:<your_github_id>/mikropml_demo.git git push -u origin master
- Return to GitHub and refresh your browser.
- Go to the
mikropml_demo
directory on your computer and double click on themikropml_demo.Rproj
icon. This will launch RStudio and you’ll be good to go.