source("custom_functions.R")Geography-independent mucosal microbiota alterations in primary sclerosing cholangitis persist after liver transplantation
5. Hypothesis: PSC disease is associated with fecal calprotectin
1 Introduction
1.1 About
An IBD vs. no-IBD comparison was performed within PSC patients (pre-LTx and rPSC individuals combined) to discover IBD-specific pattern.
IBD vs no_IBD analysis on merged data
Alpha diversity –> group effect, country effect, interaction effect
Beta diversity – PERMANOVA, PCA -> group effect, country effect, interaction effect
DAA ->
o Group effect – linDA + MaAsLin2 intersection
o Country effect – – taxa with significant interaction effect were excluded based on individual post-hoc analysis. Taxa had to have the same direction in both countries
IBD vs no_IBD – finding IBD-specific pattern in PSC patients
Importing libraries and custom functions built for this analysis
1.2 Data Import
Importing ASV, taxa and metadata tables for both Czech and Norway samples.
Czech
MICROBIOME
path = "../../../../data/analysis_ready_data/ikem/"
asv_tab_ikem <- as.data.frame(fread(file.path(path,"asv_table_ikem.csv"),
check.names = FALSE))
taxa_tab_ikem <- as.data.frame(fread(file.path(path,"taxa_table_ikem.csv"),
check.names = FALSE))
metadata_ikem <- as.data.frame(fread(file.path(path,"metadata_ikem.csv"),
check.names = FALSE))Fecal calprotectin info
# clinical metadata
metadata_clinical <- read.csv("../../../../data/clinical_data/clinical_metadata_cz.csv")
metadata_clinical$PatientID <- as.character(metadata_clinical$PatientID)Norway
MICROBIOME
path = "../../../../data/analysis_ready_data/norway/"
asv_tab_norway <- as.data.frame(fread(file.path(path,"asv_table_norway.csv"),
check.names = FALSE))
taxa_tab_norway <- as.data.frame(fread(file.path(path,"taxa_table_norway.csv"),
check.names = FALSE))
metadata_norway <- as.data.frame(fread(file.path(path,"metadata_norway.csv"),
check.names = FALSE))Fecal calprotectin info
# clinical metadata
metadata_clinical_no <- read.csv("../../../../data/clinical_data/clinical_metadata_no.csv")
metadata_clinical_no$PatientID <- paste0("NO_", as.character(metadata_clinical_no$subjectid))1.2.1 Merging
Clinical data
metadata_ikem <- merge(metadata_ikem,metadata_clinical[,c("SampleID","Fecal.calprotectin")],by="SampleID",all.x=TRUE) %>%
dplyr::mutate(Calprotectin=Fecal.calprotectin) %>%
dplyr::mutate(Calprotectin = case_when(
Calprotectin == ">6000" ~ "6000",
TRUE ~ Calprotectin)) %>%
dplyr::mutate(Calprotectin= as.numeric(Calprotectin))
metadata_norway <- merge(metadata_norway,metadata_clinical_no[,c("SampleID","Calprotectin")],
by="SampleID",all.x=TRUE) %>%
dplyr::mutate(Calprotectin= as.numeric(Calprotectin))Terminal ileum
ileum_data <- merging_data(asv_tab_1=asv_tab_ikem,
asv_tab_2= asv_tab_norway,
taxa_tab_1=taxa_tab_ikem,
taxa_tab_2=taxa_tab_norway,
metadata_1=metadata_ikem,
metadata_2=metadata_norway,
segment="TI",Q="Q5")Removing 1498 ASV(s)
Removing 1834 ASV(s)
Merging at ASV level
Finding inconsistencies in taxonomy, trying to keep the ones that have better taxonomy assignment
Removing 2296 ASV(s)
ileum_asv_tab <- ileum_data[[1]]
ileum_taxa_tab <- ileum_data[[2]]
ileum_metadata <- ileum_data[[3]]ileum_metadata %<>% dplyr::mutate(
log_Calprotectin=log(Calprotectin,base = 10))
median(ileum_metadata$Calprotectin,na.rm = TRUE)[1] 84.5
median(ileum_metadata$log_Calprotectin,na.rm = TRUE)[1] 1.926667
p1 <- ggplot(ileum_metadata,aes(log_Calprotectin)) +
geom_histogram() +
geom_vline(xintercept = log(250,base=10), linetype="dashed",
color = "red")Splitting to groups
ileum_metadata %<>% dplyr::mutate(Calprotectin_groups = case_when(
Calprotectin <= 250 ~ "low",
Calprotectin > 250 ~ "high"))
ileum_metadata %>% group_by(Calprotectin_groups) %>%
count()# A tibble: 3 × 2
# Groups: Calprotectin_groups [3]
Calprotectin_groups n
<chr> <int>
1 high 32
2 low 62
3 <NA> 14
p2 <- ggplot(ileum_metadata %>% dplyr::filter(!is.na(Calprotectin_groups)),aes(Calprotectin_groups)) +
geom_bar()p <- ggarrange(p1,p2,ncol = 2)`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 14 rows containing non-finite outside the scale range
(`stat_bin()`).
pileum_metadata %<>% drop_na(Calprotectin) %>% dplyr::filter(Group!="healthy")
ileum_asv_tab %<>% dplyr::select("SeqID",ileum_metadata$SampleID)
ileum_taxa_tab %<>% dplyr::filter(SeqID %in% ileum_asv_tab$SeqID)Colon
colon_data <- merging_data(asv_tab_1=asv_tab_ikem,
asv_tab_2=asv_tab_norway,
taxa_tab_1=taxa_tab_ikem,
taxa_tab_2=taxa_tab_norway,
metadata_1=metadata_ikem,
metadata_2=metadata_norway,
segment="colon",Q="Q5")Removing 739 ASV(s)
Removing 266 ASV(s)
Merging at ASV level
Finding inconsistencies in taxonomy, trying to keep the ones that have better taxonomy assignment
Removing 2852 ASV(s)
colon_asv_tab <- colon_data[[1]]
colon_taxa_tab <- colon_data[[2]]
colon_metadata <- colon_data[[3]]colon_metadata %<>% dplyr::mutate(
log_Calprotectin=log(Calprotectin,base = 10))
colon_metadata_forplot <- colon_metadata %>%
group_by(Patient) %>%
distinct(Patient, .keep_all = TRUE) %>%
as.data.frame()
median(colon_metadata_forplot$Calprotectin,na.rm = TRUE)[1] 79
median(colon_metadata_forplot$log_Calprotectin,na.rm = TRUE)[1] 1.897488
p1 <- ggplot(colon_metadata_forplot,aes(log_Calprotectin)) +
geom_histogram() +
geom_vline(xintercept = log(250,base=10), linetype="dashed",
color = "red")Splitting to groups
colon_metadata %<>% dplyr::mutate(Calprotectin_groups = case_when(
Calprotectin <= 250 ~ "low",
Calprotectin > 250 ~ "high"))
colon_metadata_forplot %<>% dplyr::mutate(Calprotectin_groups = case_when(
Calprotectin <= 250 ~ "low",
Calprotectin > 250 ~ "high"))
colon_metadata_forplot %>% group_by(Calprotectin_groups) %>%
count()# A tibble: 3 × 2
# Groups: Calprotectin_groups [3]
Calprotectin_groups n
<chr> <int>
1 high 37
2 low 87
3 <NA> 22
p2 <- ggplot(colon_metadata_forplot %>% dplyr::filter(!is.na(Calprotectin_groups)),aes(Calprotectin_groups)) +
geom_bar()p <- ggarrange(p1,p2,ncol = 2)`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 22 rows containing non-finite outside the scale range
(`stat_bin()`).
pcolon_metadata %<>% drop_na(Calprotectin) %>% dplyr::filter(Group!="healthy")
colon_asv_tab %<>% dplyr::select("SeqID",colon_metadata$SampleID)
colon_taxa_tab %<>% dplyr::filter(SeqID %in% colon_asv_tab$SeqID)1.3 CHI SQUARE TEST
1.3.1 pre_LTx, rPSC, non-rPSC
metadata <- rbind(ileum_metadata,colon_metadata) %>%
group_by(Patient) %>%
distinct(Patient, .keep_all = TRUE) %>%
as.data.frame() df <- metadata %>% dplyr::group_by(Calprotectin_groups,Group) %>%
distinct(Patient, .keep_all = TRUE) %>%
count() %>% drop_na()
# Summarize by Nancy and Group
data <- df %>%
group_by(Calprotectin_groups, Group) %>%
summarise(n = sum(n), .groups = "drop") %>%
pivot_wider(names_from = Group, values_from = n) %>%
column_to_rownames("Calprotectin_groups") %>%
as.matrix()
chi_res <- chisq.test(data)
data pre_ltx rPSC
high 25 13
low 76 17
chi_res$expected pre_ltx rPSC
high 29.29771 8.70229
low 71.70229 21.29771
chi_res
Pearson's Chi-squared test with Yates' continuity correction
data: data
X-squared = 3.0279, df = 1, p-value = 0.08184
2 Data Analysis - Terminal ileum
segment="terminal_ileum"2.1 Filtering
Rules:
sequencing depth > 10000
nearZeroVar() with default settings
Rarefaction Curve
path="../../../intermediate_files/rarecurves"
seq_depth_threshold <- 10000ps <- construct_phyloseq(ileum_asv_tab,ileum_taxa_tab,ileum_metadata)
rareres <- get_rarecurve(obj=ps, chunks=500)
save(rareres,file = file.path(path,"rarefaction_ileum.Rdata"))load(file.path(path,"rarefaction_ileum.Rdata"))
prare <- ggrarecurve(obj=rareres,
factorNames="Country",
indexNames=c("Observe")) +
theme_bw() +
theme(axis.text=element_text(size=8), panel.grid=element_blank(),
strip.background = element_rect(colour=NA,fill="grey"),
strip.text.x = element_text(face="bold")) +
geom_vline(xintercept = seq_depth_threshold,
linetype="dashed",
color = "red") +
xlim(0, 20000)The color has been set automatically, you can reset it manually by adding scale_color_manual(values=yourcolors)
prareLibrary size
read_counts(ileum_asv_tab, line = c(5000,10000))Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
2.1.1 Sequencing depth
data_filt <- seq_depth_filtering(ileum_asv_tab,
ileum_taxa_tab,
ileum_metadata,
seq_depth_threshold = 10000)Removing 238 ASV(s)
filt_ileum_asv_tab <- data_filt[[1]]; alpha_ileum_asv_tab <- filt_ileum_asv_tab
filt_ileum_taxa_tab <- data_filt[[2]]; alpha_ileum_taxa_tab <- filt_ileum_taxa_tab
filt_ileum_metadata <- data_filt[[3]]; alpha_ileum_metadata <- filt_ileum_metadata
seq_step <- dim(filt_ileum_asv_tab)[1]Library size
read_counts(filt_ileum_asv_tab,line = c(5000,10000))Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
2.1.2 NearZeroVar
data_filt <- nearzerovar_filtering(filt_ileum_asv_tab,
filt_ileum_taxa_tab,
filt_ileum_metadata)
filt_ileum_asv_tab <- data_filt[[1]]
filt_ileum_taxa_tab <- data_filt[[2]]
nearzero_step <- dim(filt_ileum_asv_tab)[1]Library size
read_counts(filt_ileum_asv_tab,line = c(5000,10000))Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
2.1.3 Final Counts
final_counts_filtering(ileum_asv_tab,
filt_ileum_asv_tab,
filt_ileum_metadata,
seq_step, 0, nearzero_step) %>% `colnames<-`("Count") Count
Raw data: ASVs 2836
Raw data: Samples 94
Sequencing depth filt: ASVs 2598
Prevalence filt: ASVs 0
NearZeroVar filt: ASVs 627
Filt data: ASVs 627
Filt data: Samples 90
Filt data: Patients 90
Filt data: Patients.1 0
Filtered samples 4
Matrices TI
healthy 0
non-rPSC 0
rPSC 26
pre_ltx 64
post_ltx 0
ETOH 0
2.2 Alpha diversity
path = "../../../results/merged_sites/main_analysis/Q5/alpha_diversity"Calculation
# Construct MPSE object
alpha_ileum_metadata$Sample <- alpha_ileum_metadata$SampleID
ileum_mpse <- as.MPSE(construct_phyloseq(alpha_ileum_asv_tab,
alpha_ileum_taxa_tab,
alpha_ileum_metadata))
ileum_mpse %<>% mp_rrarefy(raresize = 10000,seed = 123)
# Calculate alpha diversity - rarefied counts
ileum_mpse %<>% mp_cal_alpha(.abundance=RareAbundance, force=TRUE)alpha_div_plots <- list()
# preparing data frame
alpha_data <- data.frame(SampleID=ileum_mpse$Sample.x,
Observe=ileum_mpse$Observe,
Shannon=ileum_mpse$Shannon,
Simpson=ileum_mpse$Simpson,
Pielou=ileum_mpse$Pielou,
Group=ileum_mpse$Calprotectin_groups,
Country=ileum_mpse$Country,
Patient=ileum_mpse$Patient)
write.csv(alpha_data,file.path(path,paste0("alpha_indices_",segment,".csv")),
row.names = FALSE)
alpha_data %<>% drop_na()2.2.1 Plots
p_boxplot_alpha <- alpha_diversity_countries(alpha_data,
show_legend = FALSE,
size=1,gap=FALSE)Using SampleID, Group, Country, Patient as id variables
Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
ℹ Please use the `linewidth` argument instead.
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
Using SampleID, Group, Country, Patient as id variables
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
# save the results
alpha_div_plots[[paste(segment,"Country")]] <- p_boxplot_alpha
# see the results
p_boxplot_alpha2.2.2 Linear Model
path = "../../../results/merged_sites/main_analysis/Q5/alpha_diversity"
alpha_data <- read.csv(file.path(path,paste0("alpha_indices_",segment,".csv")))Richness
results_model <- pairwise.lm(formula = "Observe ~ Group * Country",
factors=alpha_data$Group,
data=alpha_data)
# check interaction
if (!is.data.frame(results_model)){
results_model_observe <- results_model[[1]]
results_model_observe_emeans <- results_model[[2]]
} else {
results_model_observe <- results_model
results_model_observe_emeans <- NA
}
# save the results
pc_observed <- list();
pc_observed[[segment]] <- results_model_observe# see the results
knitr::kable(results_model_observe,digits = 3,
caption = "Raw results of linear model of richness estimation.")| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| low vs Grouphigh | -9.672 | 16.717 | -0.579 | 0.564 |
| low , high - CZ vs NO | 3.275 | 14.092 | 0.232 | 0.817 |
| low vs Grouphigh:CountryNO | -15.219 | 23.797 | -0.640 | 0.524 |
knitr::kable(results_model_observe_emeans,digits = 3,
caption = "Raw results of independent country analysis")Table: Raw results of independent country analysis
Shannon
results_model <- pairwise.lm(formula = "Shannon ~ Group * Country",
factors=alpha_data$Group,
data=alpha_data)
# check interaction
if (!is.data.frame(results_model)){
results_model_shannon <- results_model[[1]]
results_model_shannon_emeans <- results_model[[2]]
} else {
results_model_shannon <- results_model
results_model_shannon_emeans <- NA
}
# save the results
pc_shannon <- list();
pc_shannon[[segment]] <- as.data.frame(results_model_shannon)# see the results
knitr::kable(results_model_shannon,digits = 3,
caption = "Raw results of linear model of Shannon estimation.")| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| low vs Grouphigh | -0.010 | 0.219 | -0.044 | 0.965 |
| low , high - CZ vs NO | 0.030 | 0.185 | 0.164 | 0.870 |
| low vs Grouphigh:CountryNO | -0.323 | 0.312 | -1.034 | 0.304 |
knitr::kable(results_model_shannon_emeans,digits = 3,
caption = "Raw results of independent country analysis")Table: Raw results of independent country analysis
Simpson
results_model <- pairwise.lm(formula = "Simpson ~ Group * Country",
factors=alpha_data$Group,
data=alpha_data)
# check interaction
if (!is.data.frame(results_model)){
results_model_simpson <- results_model[[1]]
results_model_simpson_emeans <- results_model[[2]]
} else {
results_model_simpson <- results_model
results_model_simpson_emeans <- NA
}
# save the results
pc_simpson <- list();
pc_simpson[[segment]] <- as.data.frame(results_model_simpson)# see the results
knitr::kable(results_model_simpson,digits = 3,
caption = "Raw results of linear model of Simpson estimation.")| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| low vs Grouphigh | 0.000 | 0.029 | -0.001 | 0.999 |
| low , high - CZ vs NO | 0.001 | 0.024 | 0.055 | 0.956 |
| low vs Grouphigh:CountryNO | -0.034 | 0.041 | -0.826 | 0.411 |
knitr::kable(results_model_simpson_emeans,digits = 3,
caption = "Raw results of independent country analysis")Table: Raw results of independent country analysis
2.2.3 Saving results
alpha_list <- list(
Richness=pc_observed[[segment]],
Shannon=pc_shannon[[segment]])
write.xlsx(alpha_list,
file = file.path(path,paste0("alpha_diversity_results_",segment,".xlsx")))2.3 Beta diversity
Calculating Aitchison distance (euclidean distance on clr-transformed data), both at ASV and genus level.
2.3.1 Main analysis
Genus level, Aitchison distance
level="genus"path = "../../../results/merged_sites/main_analysis/Q5/beta_diversity"pairwise_aitchison_raw <- list()
pca_plots_list <- list()Aggregation, filtering
# Aggregation
genus_data <- aggregate_taxa(ileum_asv_tab,
ileum_taxa_tab,
taxonomic_level=level,
names=TRUE)
# Filtration
filt_data <- filtering_steps(genus_data[[1]],
genus_data[[2]],
ileum_metadata,
seq_depth_threshold=10000)Removing 17 ASV(s)
filt_ileum_genus_tab <- filt_data[[1]]
filt_ileum_genus_taxa <- filt_data[[2]]
filt_ileum_metadata <- filt_data[[3]] %>% dplyr::mutate(Group=Calprotectin_groups)2.3.1.0.1 PERMANOVA
pairwise_df <- filt_ileum_genus_tab %>% column_to_rownames("SeqID") %>% t()
# main effect
pp_main <- pairwise.adonis(pairwise_df,
filt_ileum_metadata$Group,
covariate = filt_ileum_metadata$Country, sim.method = "robust.aitchison", p.adjust.m="BH")
# interaction
pp_int <- pairwise.adonis(pairwise_df,filt_ileum_metadata$Group,covariate = filt_ileum_metadata$Country, interaction = TRUE, sim.method = "robust.aitchison", p.adjust.m="BH")
# tidy the results
pp_factor <- pp_main[[1]]
pp_cov <- pp_main[[2]]
pp_fac.cov <- pp_int[[3]]
cols <- c("pairs","Df","SumsOfSqs", "F.Model","R2","p.value", "p.adjusted", "sig")
colnames(pp_factor) <- cols; colnames(pp_cov) <- cols; colnames(pp_fac.cov) <- cols;
# save raw results
pairwise_aitchison_raw[[paste(level, segment)]] <- rbind(pp_factor,pp_cov,pp_fac.cov)# see the results
knitr::kable(pp_factor,digits = 3,caption = "PERMANOVA, GROUP separation")| pairs | Df | SumsOfSqs | F.Model | R2 | p.value | p.adjusted | sig |
|---|---|---|---|---|---|---|---|
| low vs high | 1 | 186.558 | 1.046 | 0.012 | 0.32 | 0.32 |
knitr::kable(pp_cov,digits = 3,caption = "PERMANOVA, COUNTRY separation")| pairs | Df | SumsOfSqs | F.Model | R2 | p.value | p.adjusted | sig |
|---|---|---|---|---|---|---|---|
| low vs high , Country | 1 | 473.188 | 2.652 | 0.029 | 0.001 | 0.001 | *** |
knitr::kable(pp_fac.cov,digits = 3,caption = "PERMANOVA, INTERACTION GROUP:Country")| pairs | Df | SumsOfSqs | F.Model | R2 | p.value | p.adjusted | sig |
|---|---|---|---|---|---|---|---|
| low vs high : Country | 1 | 171.142 | 0.959 | 0.011 | 0.567 | 0.567 |
Interaction check
interaction_sig <- pp_fac.cov$pairs[pp_fac.cov$p.adjusted < 0.1]
if (length(interaction_sig)>0){
for (i in 1:length(interaction_sig)){
group1 <- unlist(strsplit(interaction_sig[i],split = " vs "))[1]
group2 <- unlist(strsplit(interaction_sig[i],split = " vs "))[2]
group2 <- unlist(strsplit(group2,split = " : "))[1]
result_list <- adonis_postanalysis(x=pairwise_df,
factors = filt_ileum_metadata$v,
covariate = filt_ileum_metadata$Country,
group1 = group1,
group2 = group2)
print(result_list)
}
}2.3.1.0.2 Plots
PCA
p <- pca_plot_custom(filt_ileum_genus_tab,
filt_ileum_genus_taxa,
filt_ileum_metadata,
show_boxplots = TRUE,
variable = "Group", size=3, show_legend=FALSE)
# save the results
pca_plots_list[[paste(segment,level,"custom")]] <- p
# see the results
p2.3.1.1 Saving results
write.xlsx(pairwise_aitchison_raw[[paste(level, segment)]],
file = file.path(path,
paste0("beta_diversity_results_", segment,".xlsx")))2.4 Univariate Analysis
2.4.1 Main analysis
level="genus"# needed paths
path = "../../../results/merged_sites/main_analysis/Q5/univariate_analysis"
path_maaslin=file.path("../../../intermediate_files/maaslin/Q5",level)# variables
raw_linda_results_genus <- list();
raw_linda_results_genus[[segment]] <- list()
linda_results_genus <- list();
linda_results_genus[[segment]] <- list()
# country and interaction problems
list_intersections <- list()
list_venns <- list()
uni_statistics <- list()
# workbook for final df
wb <- createWorkbook()Aggregate taxa
genus_data <- aggregate_taxa(ileum_asv_tab,
ileum_taxa_tab,
taxonomic_level = level)
ileum_genus_tab <- genus_data[[1]]
ileum_genus_taxa_tab <- genus_data[[2]]
ileum_genus_asv_taxa_tab <- create_asv_taxa_table(ileum_genus_tab,
ileum_genus_taxa_tab)2.4.1.1 low vs high
group <- c("low","high")
comparison_name <- paste0(group[1], " vs ",group[2])2.4.1.1.1 linDA
ileum_metadata %<>% dplyr::mutate(Group=Calprotectin_groups)
# prepare the data
linda_data <- binomial_prep(ileum_genus_tab,
ileum_genus_taxa_tab,
ileum_metadata,
group, usage="linDA")Removing 11 ASV(s)
Removing 6 ASV(s)
filt_ileum_uni_data <- linda_data[[1]]
filt_ileum_uni_taxa <- linda_data[[2]]
filt_ileum_uni_metadata <- linda_data[[3]]
# fit the model
linda.obj <- linda(filt_ileum_uni_data,
filt_ileum_uni_metadata,
formula = '~ Group * Country')0 features are filtered!
The filtered data has 90 samples and 202 features will be tested!
Imputation approach is used.
Fit linear models ...
Completed.
linda.output <- linda.obj$output
linda.output <- linda_renaming(linda.output, group)
# save the results
group1 <- paste0(group[1], " vs ","Group",group[2])
group2 <- paste0(group[1], " , ",group[2], " - CZ vs NO")
group3 <- paste0(group[1], " vs ","Group",group[2], ":CountryNO")
for (grp in c(group1,group2,group3)){
raw_linda_results_genus[[segment]][[grp]] <-
rawlinda.df(linda.output,
grp,
filt_ileum_uni_data,
filt_ileum_uni_taxa)
linda_results_genus[[segment]][[grp]] <-
linda.df(linda.output,
grp,
filt_ileum_uni_data,
filt_ileum_uni_taxa)
}# volcano plot
volcano_1 <- volcano_plot_linda(linda.output,
group1,
taxa_table = filt_ileum_uni_taxa) +
ggtitle(comparison_title)
volcano_2 <- volcano_plot_linda(linda.output, group2,
taxa_table = filt_ileum_uni_taxa) +
ggtitle("NO vs CZ")
volcano_3 <- volcano_plot_linda(linda.output, group3,
taxa_table = filt_ileum_uni_taxa) +
ggtitle("Interaction effect")
volcano <- ggarrange(volcano_1,volcano_2,volcano_3, ncol=3)
volcano2.4.1.1.2 MaAsLin2
volcano1 <- volcano_plot_maaslin(fit_data,filt_ileum_uni_taxa) +
ggtitle(comparison_title)
volcano2 <- volcano_plot_maaslin(fit_data,filt_ileum_uni_taxa,variable="Country") +
ggtitle("Country effect")
volcano <- ggarrange(volcano1,volcano2, ncol=2)
volcano2.4.1.1.3 Group - Intersection
intersection_results <- group_intersection(group, list_intersections, list_venns,
linda.output, fit_data,
raw_linda_results_genus,
segment = segment,
level=level)
list_intersections <- intersection_results[[1]]
list_venns <- intersection_results[[2]]
venn <- intersection_results[[3]]
# show the results
venn2.4.1.1.4 Interaction effect
list_interaction_significant <- country_interaction(group,
linda.output,
list_intersections,
filt_ileum_uni_data,
filt_ileum_uni_metadata,
segment=segment,
level=level)
# see the result
## significant interaction effect
list_interaction_significant[[1]]NULL
## results for czech cohort
list_interaction_significant[[2]][1] NA
## results for norwegian cohort
list_interaction_significant[[3]][1] NA
Removing problematic taxa
list_intersections <- removing_interaction_problems(group,
list_interaction_significant,
list_intersections,
segment=segment,
level=level)2.4.1.1.5 Basic statistics
uni_df <- merge(basic_univariate_statistics(linda_data,group),
raw_linda_results_genus[[segment]][[group1]],
by="SeqID",all=TRUE)[1] "low"
[1] "high"
[1] "low"
[1] "high"
uni_df[["final_sig"]] <- uni_df$SeqID %in% list_intersections[[paste(segment,level,comparison_name)]][["SeqID"]]
uni_statistics[[segment]][[paste(level,comparison_name)]] <- uni_df
# for comparison
new_name <- comparison_name
addWorksheet(wb, sheetName = new_name)
writeData(wb, sheet = new_name, uni_df, rowNames=FALSE)2.4.1.2 Visualization
Heatmap visualizing the linDA’s logFoldChange for taxa with p < 0.1.
list_heatmap <- list_intersections[grep(paste(segment,level),
names(list_intersections),value=TRUE)]
if (length((list_heatmap[[1]][[1]]))>1){
p_heatmap_linda <- heatmap_linda(list_heatmap,ileum_taxa_tab)
p_heatmap_linda
}Dotheatmap
if (length((list_heatmap[[1]][[1]]))>0){
dotheatmap_linda <- dot_heatmap_linda(list_heatmap,
uni_statistics$terminal_ileum[grepl(level,names(uni_statistics$terminal_ileum))],
ileum_taxa_tab)
dotheatmap_linda
}Horizontal bar plot
if (length((list_heatmap[[1]][[1]]))>0){
p_prevalence <- horizontal_barplot(wb,taxa=levels(dotheatmap_linda$data$SeqID))
p_prevalence_final <- ggarrange(p_prevalence,
ggplot() + theme_minimal(),
nrow = 2,heights = c(1,0.085))
p <- ggarrange(dotheatmap_linda + theme(legend.position = "none"),p_prevalence_final,ncol=2,widths = c(1,0.2))
p
}2.4.1.3 Saving results
# ALL DATA
saveWorkbook(wb,file.path(path,paste0("uni_analysis_wb_",segment,".xlsx")),
overwrite = TRUE)
# SIGNIFICANT taxa
write.xlsx(list_intersections[grepl(segment,names(list_intersections))] %>%
`names<-`(gsub(segment, "", names(
list_intersections[grepl(segment,names(list_intersections))]))),
file.path(path,paste0("significant_taxa_",segment,".xlsx")))2.5 Machine learning
path = "../../../results/merged_sites/main_analysis/Q5/models"2.5.1 ElasticNet
model="enet"2.5.1.1 ASV level
level="ASV"2.5.1.1.1 low vs high
group <- c("low","high")
comparison_name <- paste0(group[1], " vs ",group[2])model_name <- paste(comparison_name,level,segment)
# prepare the data
filt_ileum_uni_data <- binomial_prep(ileum_asv_tab,
ileum_taxa_tab,
ileum_metadata,
group, usage="ml_clr")Removing 167 ASV(s)
Removing 71 ASV(s)
# fit the model
enet_model <- glmnet_binomial(filt_ileum_uni_data,
sample_method = "atypboot",
outcome="Group",
N=10,
reuse=TRUE,
file=model_name,
Q="Q5")
# ROC curve
roc_c <- roc_curve(enet_model, group)
# save the results
models_summ <- list()
models_cm <- list()
betas <- list()
roc_cs <- list()
models_summ[[model_name]] <- enet_model$model_summary
models_cm[[model_name]] <- enet_model$conf_matrices$original
roc_cs[[model_name]] <- enet_model$kfold_rocobjs
betas[[model_name]] <- as.matrix(enet_model$betas)
# see the results
enet_model$model_summary %>% t() [,1]
alpha 0.0000000
lambda 9.3347781
auc 0.9694444
auc_czech 0.9638889
auc_no 0.9729167
auc_optimism_corrected 0.6255978
auc_optimism_corrected_CIL 0.4005091
auc_optimism_corrected_CIU 0.7861538
accuracy 0.6777778
accuracy_czech NaN
accuracy_no 0.7692308
accuracy_optimism_corrected 0.6413384
accuracy_optimism_corrected_CIL 0.4346841
accuracy_optimism_corrected_CIU 0.7214410
enet_model$conf_matrices$original
Predicted
True 0 1
0 1 29
1 0 60
$czech
Predicted
True 0 1
0 1 17
1 0 20
$no
1
0 12 0
1 40 0
enet_model$plot`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
roc_c2.5.1.2 Genus level
level="genus"Aggregate taxa
genus_data <- aggregate_taxa(ileum_asv_tab,
ileum_taxa_tab,
taxonomic_level = level)
ileum_genus_tab <- genus_data[[1]]
ileum_genus_taxa_tab <- genus_data[[2]]2.5.1.2.1 low vs high
group <- c("low","high")
comparison_name <- paste0(group[1], " vs ",group[2])model_name <- paste(comparison_name,level,segment)
# prepare the data
filt_ileum_uni_data <- binomial_prep(ileum_genus_tab,
ileum_genus_taxa_tab,
ileum_metadata,
group,
usage="ml_clr")Removing 11 ASV(s)
Removing 6 ASV(s)
# fit the model
enet_model <- glmnet_binomial(filt_ileum_uni_data,
sample_method = "atypboot",
outcome="Group",
N=10,
reuse=TRUE,
file=model_name,
Q="Q5")
# ROC
roc_c <- roc_curve(enet_model, group)
# save the results
models_summ[[model_name]] <- enet_model$model_summary
models_cm[[model_name]] <- enet_model$conf_matrices$original
roc_cs[[model_name]] <- enet_model$kfold_rocobjs
betas[[model_name]] <- as.matrix(enet_model$betas)
# see the results
enet_model$model_summary %>% t() [,1]
alpha 0.0000000
lambda 0.1511817
auc 0.9420571
auc_czech 0.8925971
auc_no 0.9579125
auc_optimism_corrected 0.6199997
auc_optimism_corrected_CIL 0.5498124
auc_optimism_corrected_CIU 0.6872995
accuracy 0.8465909
accuracy_czech NaN
accuracy_no 0.9027356
accuracy_optimism_corrected 0.6540349
accuracy_optimism_corrected_CIL 0.5724537
accuracy_optimism_corrected_CIU 0.7100395
enet_model$conf_matrices$original
Predicted
True 0 1
0 90 67
1 14 357
$czech
Predicted
True 0 1
0 62 41
1 8 88
$no
Predicted
True 0 1
0 28 26
1 6 269
enet_model$plot`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
roc_c2.5.1.3 Saving results
models_summ_df_ileum <- do.call(rbind,
models_summ[grep(segment,names(models_summ),value = TRUE)])
write.csv(models_summ_df_ileum,file.path(path,paste0("elastic_net_",segment,".csv")))2.6 Results overview
2.6.0.1 Alpha diversity
pc_observed[[segment]] Estimate Std. Error t value Pr(>|t|)
low vs Grouphigh -9.672222 16.71735 -0.5785741 0.5643888
low , high - CZ vs NO 3.275000 14.09151 0.2324095 0.8167721
low vs Grouphigh:CountryNO -15.219444 23.79693 -0.6395550 0.5241619
pc_shannon[[segment]] Estimate Std. Error t value Pr(>|t|)
low vs Grouphigh -0.009581973 0.2194188 -0.0436698 0.9652689
low , high - CZ vs NO 0.030263347 0.1849541 0.1636263 0.8704093
low vs Grouphigh:CountryNO -0.322952862 0.3123398 -1.0339792 0.3040448
pc_simpson[[segment]] Estimate Std. Error t value Pr(>|t|)
low vs Grouphigh -2.636033e-05 0.02876313 -0.0009164625 0.9992709
low , high - CZ vs NO 1.331900e-03 0.02424523 0.0549345433 0.9563180
low vs Grouphigh:CountryNO -3.380300e-02 0.04094395 -0.8255921413 0.4113199
Plots
alpha_div_plots[[paste(segment,"Country")]]2.6.0.2 Beta diversity
Main results
pairwise_aitchison_raw[[paste("genus", segment)]] pairs Df SumsOfSqs F.Model R2 p.value p.adjusted
1 low vs high 1 186.5579 1.0457378 0.01150575 0.320 0.320
2 low vs high , Country 1 473.1877 2.6524217 0.02918333 0.001 0.001
3 low vs high : Country 1 171.1419 0.9588709 0.01055499 0.567 0.567
sig
1
2 ***
3
PCA
pca_plots_list[[paste(segment,"genus custom")]]2.6.0.3 Univariate analysis
Number of significant taxa
knitr::kable(as.data.frame(lapply(list_intersections,nrow)) %>% t() %>%
`colnames<-`("Count") %>%
`rownames<-`(c(names(list_intersections))),caption="Number of significant taxa")| Count | |
|---|---|
| terminal_ileum genus low vs high | 0 |
2.6.0.4 Machine learning
Main models
knitr::kable(models_summ_df_ileum %>% dplyr::select(
"alpha","lambda",
"auc_optimism_corrected",
"auc_optimism_corrected_CIL",
"auc_optimism_corrected_CIU"),
digits=2,caption="Elastic net results")| alpha | lambda | auc_optimism_corrected | auc_optimism_corrected_CIL | auc_optimism_corrected_CIU | |
|---|---|---|---|---|---|
| low vs high ASV terminal_ileum | 0 | 9.33 | 0.63 | 0.40 | 0.79 |
| low vs high genus terminal_ileum | 0 | 0.15 | 0.62 | 0.55 | 0.69 |
ROC - Genus level
p <- roc_curve_all_custom(roc_cs[2],Q="Q5",
model_name="enet_model")
p3 Data Analysis - Colon
segment="colon"3.1 Filtering
Rules:
sequencing depth > 10000
nearZeroVar() with default settings
Rarefaction Curve
path="../../../intermediate_files/rarecurves"
seq_depth_threshold <- 10000ps <- construct_phyloseq(colon_asv_tab,colon_taxa_tab,colon_metadata)
rareres <- get_rarecurve(obj=ps, chunks=500)
save(rareres,file = file.path(path,"rarefaction_colon.Rdata"))load(file.path(path,"rarefaction_colon.Rdata"))
prare <- ggrarecurve(obj=rareres,
factorNames="Country",
indexNames=c("Observe")) +
theme_bw() +
theme(axis.text=element_text(size=8), panel.grid=element_blank(),
strip.background = element_rect(colour=NA,fill="grey"),
strip.text.x = element_text(face="bold")) +
geom_vline(xintercept = seq_depth_threshold,
linetype="dashed",
color = "red") +
xlim(0, 20000)The color has been set automatically, you can reset it manually by adding scale_color_manual(values=yourcolors)
prareLibrary size
read_counts(colon_asv_tab, line = c(5000,10000))Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
3.1.1 Sequencing depth
data_filt <- seq_depth_filtering(colon_asv_tab,
colon_taxa_tab,
colon_metadata,
seq_depth_threshold = 10000)Removing 308 ASV(s)
filt_colon_asv_tab <- data_filt[[1]]; alpha_colon_asv_tab <- filt_colon_asv_tab
filt_colon_taxa_tab <- data_filt[[2]]; alpha_colon_taxa_tab <- filt_colon_taxa_tab
filt_colon_metadata <- data_filt[[3]]; alpha_colon_metadata <- filt_colon_metadata
seq_step <- dim(filt_colon_asv_tab)[1]Library size
read_counts(filt_colon_asv_tab,line = c(5000,10000))Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
3.1.2 NearZeroVar
data_filt <- nearzerovar_filtering(filt_colon_asv_tab,
filt_colon_taxa_tab,
filt_colon_metadata)
filt_colon_asv_tab <- data_filt[[1]]
filt_colon_taxa_tab <- data_filt[[2]]
nearzero_step <- dim(filt_colon_asv_tab)[1]Library size
read_counts(filt_colon_asv_tab,line = c(5000,10000))Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
3.1.3 Final Counts
final_counts_filtering(colon_asv_tab,
filt_colon_asv_tab,
filt_colon_metadata,
seq_step, 0, nearzero_step) %>% `colnames<-`("Count") Count
Raw data: ASVs 4084
Raw data: Samples 306
Sequencing depth filt: ASVs 3776
Prevalence filt: ASVs 0
NearZeroVar filt: ASVs 342
Filt data: ASVs 342
Filt data: Samples 293
Filt data: Patients 121
Filt data: Patients.1 0
Filtered samples 13
Matrices Cecum;Rectum;CD;CA;SI
healthy 0
non-rPSC 0
rPSC 73
pre_ltx 220
post_ltx 0
ETOH 0
3.2 Alpha diversity
path = "../../../results/merged_sites/main_analysis/Q5/alpha_diversity"Calculation
# Construct MPSE object
alpha_colon_metadata$Sample <- alpha_colon_metadata$SampleID
colon_mpse <- as.MPSE(construct_phyloseq(alpha_colon_asv_tab,
alpha_colon_taxa_tab,
alpha_colon_metadata))
colon_mpse %<>% mp_rrarefy(raresize = 10000,seed = 123)
# Calculate alpha diversity - rarefied counts
colon_mpse %<>% mp_cal_alpha(.abundance=RareAbundance, force=TRUE)# preparing data frame
alpha_data <- data.frame(SampleID=colon_mpse$Sample.x,
Observe=colon_mpse$Observe,
Shannon=colon_mpse$Shannon,
Simpson=colon_mpse$Simpson,
Pielou=colon_mpse$Pielou,
Group=colon_mpse$Calprotectin_groups,
Country=colon_mpse$Country,
Patient=colon_mpse$Patient)
write.csv(alpha_data,file.path(path,paste0("alpha_indices_",segment,".csv")),
row.names = FALSE)
alpha_data %<>% drop_na()3.2.1 Plots
p_boxplot_alpha <- alpha_diversity_countries(alpha_data,
show_legend = FALSE,
size=1,gap=FALSE)Using SampleID, Group, Country, Patient as id variables
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
Using SampleID, Group, Country, Patient as id variables
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
# save the results
alpha_div_plots[[paste(segment,"Country")]] <- p_boxplot_alpha
# see the results
p_boxplot_alpha3.2.2 Linear Model
path = "../../../results/merged_sites/main_analysis/Q5/alpha_diversity"
alpha_data <- read.csv(file.path(path,paste0("alpha_indices_",segment,".csv")))Richness
results_model <- pairwise.lmer(formula = "Observe ~ Group * Country + (1|Patient)",
factors=alpha_data$Group,
data=alpha_data)
# check interaction
if (!is.data.frame(results_model)){
results_model_observe <- results_model[[1]]
results_model_observe_emeans <- results_model[[2]]
} else {
results_model_observe <- results_model
results_model_observe_emeans <- NA
}
# save the results
pc_observed[[segment]] <- results_model_observe# see the results
knitr::kable(results_model_observe,digits = 3,
caption = "Raw results of linear model of richness estimation.")| Estimate | Std..Error | df | t.value | Pr…t.. | p.adj | sig | |
|---|---|---|---|---|---|---|---|
| low vs Grouphigh | -10.382 | 17.179 | 118.864 | -0.604 | 0.547 | 0.547 | |
| low vs high - CZ vs NO | -14.423 | 13.672 | 119.220 | -1.055 | 0.294 | 0.294 | |
| low vs Grouphigh:CountryNO | -10.672 | 22.099 | 117.864 | -0.483 | 0.630 | 0.630 |
knitr::kable(results_model_observe_emeans,digits = 3,
caption = "Raw results of independent country analysis")Table: Raw results of independent country analysis
Shannon
results_model <- pairwise.lmer(formula = "Shannon ~ Group * Country + (1|Patient)",
factors=alpha_data$Group,
data=alpha_data)
# check interaction
if (!is.data.frame(results_model)){
results_model_shannon <- results_model[[1]]
results_model_shannon_emeans <- results_model[[2]]
} else {
results_model_shannon <- results_model
results_model_shannon_emeans <- NA
}
# save the results
pc_shannon[[segment]] <- as.data.frame(results_model_shannon)# see the results
knitr::kable(results_model_shannon,digits = 3,
caption = "Raw results of linear model of Shannon estimation.")| Estimate | Std..Error | df | t.value | Pr…t.. | p.adj | sig | |
|---|---|---|---|---|---|---|---|
| low vs Grouphigh | -0.203 | 0.238 | 119.622 | -0.853 | 0.395 | 0.395 | |
| low vs high - CZ vs NO | -0.259 | 0.190 | 120.062 | -1.363 | 0.175 | 0.175 | |
| low vs Grouphigh:CountryNO | -0.005 | 0.307 | 118.364 | -0.015 | 0.988 | 0.988 |
knitr::kable(results_model_shannon_emeans,digits = 3,
caption = "Raw results of independent country analysis")Table: Raw results of independent country analysis
Simpson
results_model <- pairwise.lmer(formula = "Simpson ~ Group * Country + (1|Patient)",
factors=alpha_data$Group,
data=alpha_data)
# check interaction
if (!is.data.frame(results_model)){
results_model_simpson <- results_model[[1]]
results_model_simpson_emeans <- results_model[[2]]
} else {
results_model_simpson <- results_model
results_model_simpson_emeans <- NA
}
# save the results
pc_simpson[[segment]] <- as.data.frame(results_model_simpson)# see the results
knitr::kable(results_model_simpson,digits = 3,
caption = "Raw results of linear model of Simpson estimation.")| Estimate | Std..Error | df | t.value | Pr…t.. | p.adj | sig | |
|---|---|---|---|---|---|---|---|
| low vs Grouphigh | -0.035 | 0.040 | 119.722 | -0.888 | 0.376 | 0.376 | |
| low vs high - CZ vs NO | -0.038 | 0.032 | 120.159 | -1.206 | 0.230 | 0.230 | |
| low vs Grouphigh:CountryNO | 0.023 | 0.051 | 118.473 | 0.455 | 0.650 | 0.650 |
knitr::kable(results_model_simpson_emeans,digits = 3,
caption = "Raw results of independent country analysis")Table: Raw results of independent country analysis
3.2.3 Saving results
alpha_list <- list(
Richness=pc_observed[[segment]],
Shannon=pc_shannon[[segment]])
write.xlsx(alpha_list,
file = file.path(path,paste0("alpha_diversity_results_",segment,".xlsx")))3.3 Beta diversity
Calculating Aitchison distance (euclidean distance on clr-transformed data), both at ASV and genus level.
3.3.1 Main analysis
Genus level, Aitchison distance
level="genus"path = "../../../results/merged_sites/main_analysis/Q5/beta_diversity"Aggregation, filtering
# Aggregation
genus_data <- aggregate_taxa(colon_asv_tab,
colon_taxa_tab,
taxonomic_level=level,
names=TRUE)
# Filtration
filt_data <- filtering_steps(genus_data[[1]],
genus_data[[2]],
colon_metadata,
seq_depth_threshold=10000)Removing 32 ASV(s)
filt_colon_genus_tab <- filt_data[[1]]
filt_colon_genus_taxa <- filt_data[[2]]
filt_colon_metadata <- filt_data[[3]]
filt_colon_metadata %<>% dplyr::mutate(Group=Calprotectin_groups)3.3.1.0.1 PERMANOVA
pairwise_df <- filt_colon_genus_tab %>% column_to_rownames("SeqID") %>% t()
# main effect
pp_main <- pairwise.adonis(pairwise_df,filt_colon_metadata$Group,
covariate = filt_colon_metadata$Country,
patients = filt_colon_metadata$Patient,
sim.method = "robust.aitchison", p.adjust.m="BH")
# interaction
pp_int <- pairwise.adonis(pairwise_df,filt_colon_metadata$Group,
covariate = filt_colon_metadata$Country,
interaction = TRUE,
patients = filt_colon_metadata$Patient,
sim.method = "robust.aitchison", p.adjust.m="BH")
# tidy the results
pp_factor <- pp_main[[1]]
pp_cov <- pp_main[[2]]
pp_fac.cov <- pp_int[[3]]
cols <- c("pairs","Df","SumsOfSqs", "F.Model","R2","p.value", "p.adjusted", "sig")
colnames(pp_factor) <- cols; colnames(pp_cov) <- cols; colnames(pp_fac.cov) <- cols;
# save raw results
pairwise_aitchison_raw[[paste(level, segment)]] <-rbind(pp_factor,pp_cov,pp_fac.cov)# see the results
knitr::kable(pp_factor,digits = 3,caption = "PERMANOVA, GROUP separation")| pairs | Df | SumsOfSqs | F.Model | R2 | p.value | p.adjusted | sig |
|---|---|---|---|---|---|---|---|
| low vs high | 1 | 318.485 | 2.059 | 0.007 | 0.857 | 0.857 |
knitr::kable(pp_cov,digits = 3,caption = "PERMANOVA, COUNTRY separation")| pairs | Df | SumsOfSqs | F.Model | R2 | p.value | p.adjusted | sig |
|---|---|---|---|---|---|---|---|
| low vs high , Country | 1 | 661.475 | 4.277 | 0.014 | 0.003 | 0.003 | ** |
knitr::kable(pp_fac.cov,digits = 3,caption = "PERMANOVA, INTERACTION GROUP:Country")| pairs | Df | SumsOfSqs | F.Model | R2 | p.value | p.adjusted | sig |
|---|---|---|---|---|---|---|---|
| low vs high : Country | 1 | 246.488 | 1.597 | 0.005 | 0.955 | 0.955 |
Interaction check
interaction_sig <- pp_fac.cov$pairs[pp_fac.cov$p.adjusted < 0.1]
if (length(interaction_sig)>0){
for (i in 1:length(interaction_sig)){
group1 <- unlist(strsplit(interaction_sig[i],split = " vs "))[1]
group2 <- unlist(strsplit(interaction_sig[i],split = " vs "))[2]
group2 <- unlist(strsplit(group2,split = " : "))[1]
result_list <- adonis_postanalysis(x=pairwise_df,
factors = filt_colon_genus_metadata$Group,
covariate = filt_colon_genus_metadata$Country,
group1 = group1,
group2 = group2,
patients = filt_colon_genus_metadata$Patient)
print(result_list)
}
}3.3.1.0.2 Plots
PCoA custom
p <- pca_plot_custom(filt_colon_genus_tab,
filt_colon_genus_taxa,
filt_colon_metadata,
show_boxplots = TRUE,
variable = "Group", size=2,
show_legend=FALSE)
# save the results
pca_plots_list[[paste(segment,level,"custom")]] <- p
# see the results
p3.3.1.1 Saving results
write.xlsx(pairwise_aitchison_raw[[paste(level, segment)]],
file = file.path(path,
paste0("beta_diversity_results_", segment,".xlsx")))3.4 Univariate Analysis
3.4.1 Main - Genus level
level="genus"# needed paths
path = "../../../results/merged_sites/main_analysis/Q5/univariate_analysis"
path_maaslin=file.path("../../../intermediate_files/maaslin/Q5",level)# variables
raw_linda_results_genus[[segment]] <- list()
linda_results_genus[[segment]] <- list()
# country and interaction problems
list_intersections <- list()
list_venns <- list()
uni_statistics <- list()
# workbook for final df
wb <- createWorkbook()3.4.1.1 Genus level
level="genus"Aggregate taxa
genus_data <- aggregate_taxa(colon_asv_tab,
colon_taxa_tab,
taxonomic_level = level)
colon_genus_tab <- genus_data[[1]]
colon_genus_taxa_tab <- genus_data[[2]]
colon_genus_asv_taxa_tab <- create_asv_taxa_table(colon_genus_tab,
colon_genus_taxa_tab)3.4.1.1.1 low vs high
group <- c("low","high")
comparison_name <- paste0(group[1], " vs ",group[2])3.4.1.1.1.1 linda
colon_metadata %<>% dplyr::mutate(Group=Calprotectin_groups)
# prepare the data
linda_data <- binomial_prep(colon_genus_tab,
colon_genus_taxa_tab,
colon_metadata,
group, usage="linDA")Removing 25 ASV(s)
Removing 7 ASV(s)
filt_colon_uni_data <- linda_data[[1]]
filt_colon_uni_taxa <- linda_data[[2]]
filt_colon_uni_metadata <- linda_data[[3]]
# fit the model
linda.obj <- linda(filt_colon_uni_data,
filt_colon_uni_metadata,
formula = '~ Group * Country + (1|Patient)')0 features are filtered!
The filtered data has 293 samples and 164 features will be tested!
Imputation approach is used.
Fit linear mixed effects models ...
Completed.
linda.output <- linda.obj$output
linda.output <- linda_renaming(linda.output, group)
# save the results
group1 <- paste0(group[1], " vs ","Group",group[2])
group2 <- paste0(group[1], " , ",group[2], " - CZ vs NO")
group3 <- paste0(group[1], " vs ","Group",group[2], ":CountryNO")
for (grp in c(group1,group2,group3)){
raw_linda_results_genus[[segment]][[grp]] <-
rawlinda.df(linda.output,
grp,
filt_colon_uni_data,
filt_colon_uni_taxa)
linda_results_genus[[segment]][[grp]] <-
linda.df(linda.output,
grp,
filt_colon_uni_data,
filt_colon_uni_taxa)
}# volcano plot
volcano_1 <- volcano_plot_linda(linda.output, group1,
taxa_table = filt_colon_uni_taxa) +
ggtitle(paste(group,collapse=" vs "))
volcano_2 <- volcano_plot_linda(linda.output, group2,
taxa_table = filt_colon_uni_taxa) +
ggtitle("NO vs CZ")
volcano_3 <- volcano_plot_linda(linda.output, group3,
taxa_table = filt_colon_uni_taxa) +
ggtitle("Interaction effect")
volcano <- ggarrange(volcano_1,volcano_2,volcano_3, ncol=3)
# see the plot
volcano3.4.1.1.1.2 MaAsLin2
volcano1 <- volcano_plot_maaslin(fit_data,filt_colon_uni_taxa) +
ggtitle(paste(group[1], "vs", group[2]))
volcano2 <- volcano_plot_maaslin(fit_data,filt_colon_uni_taxa,variable="Country") +
ggtitle("Country effect")
volcano <- ggarrange(volcano1,volcano2, ncol=2)
volcano3.4.1.1.1.3 Group - Intersection
intersection_results <- group_intersection(group, list_intersections, list_venns,
linda.output, fit_data,
raw_linda_results_genus,
segment = segment,
level=level)
list_intersections <- intersection_results[[1]]
list_venns <- intersection_results[[2]]
venn <- intersection_results[[3]]
# show the results
venn3.4.1.1.1.4 Interaction effect
list_interaction_significant <- country_interaction(group,
linda.output,
list_intersections,
filt_colon_uni_data,
filt_colon_uni_metadata,
segment=segment,
level=level)
# see the result
## significant interaction effect
list_interaction_significant[[1]]NULL
## results for czech cohort
list_interaction_significant[[2]][1] NA
## results for norwegian cohort
list_interaction_significant[[3]][1] NA
Removing problematic taxa
list_intersections <- removing_interaction_problems(group,
list_interaction_significant,
list_intersections,
segment=segment,
level=level)3.4.1.1.2 Basic statistics
uni_df <- merge(basic_univariate_statistics(linda_data,group),
raw_linda_results_genus[[segment]][[group1]],
by="SeqID",all=TRUE)[1] "low"
[1] "high"
[1] "low"
[1] "high"
uni_df[["final_sig"]] <- uni_df$SeqID %in% list_intersections[[paste(segment,level,comparison_name)]][["SeqID"]]
uni_statistics[[segment]][[paste(level,comparison_name)]] <- uni_df
# for comparison
new_name <- comparison_name
addWorksheet(wb, sheetName = new_name)
writeData(wb, sheet = new_name, uni_df, rowNames=FALSE)3.4.1.1.3 Visualization
Heatmap visualizing the linDA’s logFoldChange for taxa with p < 0.1.
list_heatmap <- list_intersections[grep(paste(segment,level),
names(list_intersections),value=TRUE)]
if (length((list_heatmap[[1]][[1]]))>1){
p_heatmap_linda <- heatmap_linda(list_heatmap,colon_taxa_tab)
p_heatmap_linda
}Dot heatmap
if (length((list_heatmap[[1]][[1]]))>0){
dotheatmap_linda <- dot_heatmap_linda(list_heatmap,
uni_statistics$terminal_ileum[grepl(level,names(uni_statistics$terminal_ileum))],
colon_taxa_tab)
dotheatmap_linda
}Horizontal bar plot
if (length((list_heatmap[[1]][[1]]))>0){
p_prevalence <- horizontal_barplot(wb,taxa=levels(dotheatmap_linda$data$SeqID))
p_prevalence_final <- ggarrange(p_prevalence,
ggplot() + theme_minimal(),
nrow = 2,heights = c(1,0.085))
p <- ggarrange(dotheatmap_linda + theme(legend.position = "none"),p_prevalence_final,ncol=2,widths = c(1,0.2))
p
}3.4.1.2 Saving results
# ALL DATA
saveWorkbook(wb,file.path(path,paste0("uni_analysis_wb_",segment,".xlsx")),
overwrite = TRUE)
# SIGNIFICANT taxa
write.xlsx(list_intersections[grepl(segment,names(list_intersections))] %>%
`names<-`(gsub(segment, "", names(
list_intersections[grepl(segment,names(list_intersections))]))),
file.path(path,paste0("significant_taxa_",segment,".xlsx")))3.5 Machine learning
path = "../../../results/merged_sites/main_analysis/Q5/models"3.5.1 ElasticNet
model="enet"3.5.1.1 ASV level
level="ASV"#####low vs high
group <- c("low","high")
comparison_name <- paste0(group[1], " vs ",group[2])model_name <- paste(comparison_name,level,segment)
# prepare the data
filt_colon_uni_data <- binomial_prep(colon_asv_tab,
colon_taxa_tab,
colon_metadata,
group, usage="ml_clr",patient = TRUE)Removing 264 ASV(s)
Removing 44 ASV(s)
# fit the model
enet_model <- glmnet_binomial(filt_colon_uni_data,
sample_method = "atypboot",
outcome="Group",
N=10,
reuse=TRUE,
file=model_name,clust_var = "Patient",
Q="Q5")
# ROC curve
roc_c <- roc_curve(enet_model, group)
# save the results
models_summ <- list()
models_cm <- list()
betas <- list()
roc_cs <- list()
models_summ[[model_name]] <- enet_model$model_summary
models_cm[[model_name]] <- enet_model$conf_matrices$original
roc_cs[[model_name]] <- enet_model$kfold_rocobjs
betas[[model_name]] <- as.matrix(enet_model$betas)
# see the results
enet_model$model_summary %>% t() [,1]
alpha 1.000000000
lambda 0.002352995
auc 1.000000000
auc_czech 1.000000000
auc_no 1.000000000
auc_optimism_corrected 0.576388400
auc_optimism_corrected_CIL 0.418467650
auc_optimism_corrected_CIU 0.690289785
accuracy 1.000000000
accuracy_czech NaN
accuracy_no 1.000000000
accuracy_optimism_corrected 0.640802202
accuracy_optimism_corrected_CIL 0.528153409
accuracy_optimism_corrected_CIU 0.733313084
enet_model$conf_matrices$original
Predicted
True 0 1
0 83 0
1 0 210
$czech
Predicted
True 0 1
0 36 0
1 0 30
$no
Predicted
True 0 1
0 47 0
1 0 180
enet_model$plot`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
roc_c3.5.1.2 Genus level
level="genus"Aggregate taxa
genus_data <- aggregate_taxa(colon_asv_tab,
colon_taxa_tab,
taxonomic_level = level)
colon_genus_tab <- genus_data[[1]]
colon_genus_taxa_tab <- genus_data[[2]]3.5.1.2.1 low vs high
group <- c("low","high")
comparison_name <- paste0(group[1], " vs ",group[2])model_name <- paste(comparison_name,level,segment)
# prepare the data
filt_colon_uni_data <- binomial_prep(colon_genus_tab,
colon_genus_taxa_tab,
colon_metadata,
group,
usage="ml_clr",patient = TRUE)Removing 25 ASV(s)
Removing 7 ASV(s)
# fit the model
enet_model <- glmnet_binomial(filt_colon_uni_data,
sample_method = "atypboot",
outcome="Group",
N=10,
reuse=TRUE,clust_var = "Patient",
file=model_name,
Q="Q5")
# ROC
roc_c <- roc_curve(enet_model, group)
# save the results
models_summ[[model_name]] <- enet_model$model_summary
models_cm[[model_name]] <- enet_model$conf_matrices$original
roc_cs[[model_name]] <- enet_model$kfold_rocobjs
betas[[model_name]] <- as.matrix(enet_model$betas)
# see the results
enet_model$model_summary %>% t() [,1]
alpha 0.20000000
lambda 0.01770854
auc 1.00000000
auc_czech 1.00000000
auc_no 1.00000000
auc_optimism_corrected 0.61887455
auc_optimism_corrected_CIL 0.50314091
auc_optimism_corrected_CIU 0.73986549
accuracy 0.99658703
accuracy_czech NaN
accuracy_no 0.99559471
accuracy_optimism_corrected 0.66245735
accuracy_optimism_corrected_CIL 0.55821114
accuracy_optimism_corrected_CIU 0.72898906
enet_model$conf_matrices$original
Predicted
True 0 1
0 82 1
1 0 210
$czech
Predicted
True 0 1
0 36 0
1 0 30
$no
Predicted
True 0 1
0 46 1
1 0 180
enet_model$plot`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
roc_c3.5.1.3 Saving results
models_summ_df_colon <- do.call(rbind,
models_summ[grep(segment,names(models_summ),value = TRUE)])
write.csv(models_summ_df_colon,file.path(path,paste0("elastic_net_",segment,".csv")))3.6 Results overview
3.6.0.1 Alpha diversity
pc_observed[[segment]] Estimate Std..Error df t.value Pr...t..
low vs Grouphigh -10.38203 17.17915 118.8636 -0.6043391 0.5467700
low vs high - CZ vs NO -14.42260 13.67170 119.2200 -1.0549238 0.2935933
low vs Grouphigh:CountryNO -10.67238 22.09860 117.8642 -0.4829439 0.6300314
p.adj sig
low vs Grouphigh 0.5467700
low vs high - CZ vs NO 0.2935933
low vs Grouphigh:CountryNO 0.6300314
pc_shannon[[segment]] Estimate Std..Error df t.value
low vs Grouphigh -0.203414982 0.2384651 119.6223 -0.85301770
low vs high - CZ vs NO -0.258723672 0.1898142 120.0617 -1.36303680
low vs Grouphigh:CountryNO -0.004629771 0.3065856 118.3638 -0.01510107
Pr...t.. p.adj sig
low vs Grouphigh 0.3953543 0.3953543
low vs high - CZ vs NO 0.1754218 0.1754218
low vs Grouphigh:CountryNO 0.9879770 0.9879770
pc_simpson[[segment]] Estimate Std..Error df t.value Pr...t..
low vs Grouphigh -0.03519075 0.03962315 119.7223 -0.8881361 0.3762484
low vs high - CZ vs NO -0.03804238 0.03153914 120.1589 -1.2061961 0.2301115
low vs Grouphigh:CountryNO 0.02320291 0.05094297 118.4727 0.4554683 0.6496060
p.adj sig
low vs Grouphigh 0.3762484
low vs high - CZ vs NO 0.2301115
low vs Grouphigh:CountryNO 0.6496060
Plots
alpha_div_plots[[paste(segment,"Country")]]3.6.0.2 Beta diversity
Main results
pairwise_aitchison_raw[[paste("genus", segment)]] pairs Df SumsOfSqs F.Model R2 p.value p.adjusted
1 low vs high 1 318.4846 2.059227 0.006943927 0.857 0.857
2 low vs high , Country 1 661.4748 4.276901 0.014422151 0.003 0.003
3 low vs high : Country 1 246.4880 1.596999 0.005374183 0.955 0.955
sig
1
2 **
3
PCA
pca_plots_list[[paste(segment,"genus custom")]]3.6.0.3 Univariate analysis
Number of significant taxa
knitr::kable(as.data.frame(lapply(list_intersections,nrow)) %>% t() %>%
`colnames<-`("Count") %>%
`rownames<-`(c(names(list_intersections))),caption="Number of significant taxa")| Count | |
|---|---|
| colon genus low vs high | 0 |
3.6.0.4 Machine learning
Main models
Summary
knitr::kable(models_summ_df_colon %>% dplyr::select(
"alpha","lambda",
"auc_optimism_corrected",
"auc_optimism_corrected_CIL",
"auc_optimism_corrected_CIU"),
digits=3,caption="Elastic net results")| alpha | lambda | auc_optimism_corrected | auc_optimism_corrected_CIL | auc_optimism_corrected_CIU | |
|---|---|---|---|---|---|
| low vs high ASV colon | 1.0 | 0.002 | 0.576 | 0.418 | 0.69 |
| low vs high genus colon | 0.2 | 0.018 | 0.619 | 0.503 | 0.74 |
ROC - Genus level
p <- roc_curve_all_custom(roc_cs[2],Q="Q5",
model_name="enet_model")
p4 Paper-ready visualizations
Alpha diversity
p_A <- alpha_div_plots$`terminal_ileum Country` +
ggtitle("Terminal ileum")+
theme(plot.title = element_text(hjust=0.5,face = "bold",size = 15))
p_B <- alpha_div_plots$`colon Country` +
ggtitle("Colon") +
theme(plot.title = element_text(hjust=0.5,face = "bold",size = 15))
Q5_alpha <- ggarrange(p_A,ggplot() + theme_minimal(),p_B,nrow=1, ncol=3,
widths = c(1,0.1,1))
Q5_alphaBeta diversity
pca_ti <- pca_plots_list$`terminal_ileum genus custom`
pca_colon <- pca_plots_list$`colon genus custom`
genus_Q5_beta <- ggarrange(pca_ti,
ggplot() + theme_minimal(),
pca_colon,ncol=3,
widths = c(1,0.1,1))
genus_Q5_betasave(Q5_alpha,genus_Q5_beta,file="../../../results/merged_sites/main_analysis/Q5/figures.RData")Alpha + Beta diversity
alpha_beta <- ggarrange(Q5_alpha,genus_Q5_beta,
ncol = 1,nrow=2,labels = c("A","B"))
alpha_beta4.1 Supplement Figure
alpha_beta <- ggarrange(alpha_beta,ggplot() + theme_minimal(),ncol=2,
widths = c(1,0.15))
alpha_betaQ3+Q5
load("../../../results/merged_sites/main_analysis/Q3/figures.RData")p <- ggarrange(Q3_alpha,Q5_alpha,
genus_Q3_beta,genus_Q5_beta,ncol=1,labels = LETTERS)
p <- ggarrange(p,ncol=2,widths = c(1,0.15))
pDAA
if (exists("dot_heatmap_ileum")){
p_ileum <- dot_heatmap_ileum +
ggtitle("Terminal ileum") +
theme(plot.title = element_text(hjust=0.5,face = "bold",size = 15),
legend.position = "none")
}
if (exists("dot_heatmap_colon")){
p_colon <- dot_heatmap_colon +
ggtitle("Colon") +
theme(plot.title = element_text(hjust=0.5,face = "bold",size = 15),
legend.position = "none")
}
if (exists("p_ileum") | exists("p_colon")){
heatmap_plot <- ggarrange(p_ileum,
p_colon,
ncol = 2,labels=c("A","B"))
heatmap_plot
}5 Session info
sessionInfo()R version 4.3.1 (2023-06-16 ucrt)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 11 x64 (build 26100)
Matrix products: default
locale:
[1] LC_COLLATE=Czech_Czechia.utf8 LC_CTYPE=Czech_Czechia.utf8
[3] LC_MONETARY=Czech_Czechia.utf8 LC_NUMERIC=C
[5] LC_TIME=Czech_Czechia.utf8
time zone: Europe/Prague
tzcode source: internal
attached base packages:
[1] parallel stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] mice_3.17.0 picante_1.8.2 ape_5.8-1
[4] lubridate_1.9.3 forcats_1.0.0 stringr_1.5.1
[7] tidyverse_2.0.0 kableExtra_1.4.0 tidyr_1.3.1
[10] gbm_2.2.2 doParallel_1.0.17 iterators_1.0.14
[13] foreach_1.5.2 ranger_0.17.0 ggvenn_0.1.15
[16] Maaslin2_1.16.0 purrr_1.0.2 pROC_1.18.5
[19] glmnet_4.1-8 MicrobiomeStat_1.2 caret_6.0-94
[22] openxlsx_4.2.8 magrittr_2.0.3 emmeans_1.11.2-8
[25] lmerTest_3.1-3 robustlmm_3.3-1 lme4_1.1-37
[28] Matrix_1.6-5 mgcv_1.9-1 nlme_3.1-164
[31] pheatmap_1.0.13 reshape2_1.4.4 vegan_2.6-10
[34] lattice_0.22-6 permute_0.9-8 ggplotify_0.1.2
[37] ggrepel_0.9.6 ggpubr_0.6.1 MicrobiotaProcess_1.14.1
[40] phyloseq_1.46.0 ggplot2_3.5.2 tibble_3.2.1
[43] dplyr_1.1.4 cowplot_1.2.0 readr_2.1.5
[46] igraph_2.1.4 ccrepe_1.38.1 data.table_1.15.4
loaded via a namespace (and not attached):
[1] fs_1.6.5 matrixStats_1.5.0
[3] bitops_1.0-9 RColorBrewer_1.1-3
[5] numDeriv_2016.8-1.1 tools_4.3.1
[7] backports_1.4.1 utf8_1.2.4
[9] R6_2.6.1 lazyeval_0.2.2
[11] jomo_2.7-6 rhdf5filters_1.14.1
[13] withr_3.0.2 gridExtra_2.3
[15] cli_3.6.2 Biobase_2.62.0
[17] logging_0.10-108 biglm_0.9-3
[19] sandwich_3.1-1 labeling_0.4.3
[21] mvtnorm_1.3-3 robustbase_0.99-4-1
[23] pbapply_1.7-4 systemfonts_1.2.2
[25] yulab.utils_0.2.1 svglite_2.1.3
[27] parallelly_1.38.0 rstudioapi_0.17.1
[29] generics_0.1.4 gridGraphics_0.5-1
[31] shape_1.4.6.1 car_3.1-3
[33] zip_2.3.2 biomformat_1.30.0
[35] S4Vectors_0.40.2 abind_1.4-8
[37] infotheo_1.2.0.1 lifecycle_1.0.4
[39] multcomp_1.4-28 yaml_2.3.10
[41] carData_3.0-5 SummarizedExperiment_1.32.0
[43] Rtsne_0.17 rhdf5_2.46.1
[45] recipes_1.3.1 SparseArray_1.2.4
[47] grid_4.3.1 mitml_0.4-5
[49] crayon_1.5.3 pillar_1.11.0
[51] knitr_1.50 optparse_1.7.5
[53] GenomicRanges_1.54.1 statip_0.2.3
[55] boot_1.3-32 estimability_1.5.1
[57] future.apply_1.11.3 codetools_0.2-20
[59] pan_1.9 glue_1.7.0
[61] ggfun_0.2.0 vctrs_0.6.5
[63] treeio_1.26.0 Rdpack_2.6.4
[65] gtable_0.3.6 gower_1.0.1
[67] xfun_0.52 rbibutils_2.3
[69] S4Arrays_1.2.1 prodlim_2024.06.25
[71] libcoin_1.0-10 coda_0.19-4.1
[73] reformulas_0.4.1 pcaPP_2.0-5
[75] modeest_2.4.0 survival_3.5-8
[77] timeDate_4041.110 hardhat_1.4.2
[79] lava_1.8.1 statmod_1.5.0
[81] TH.data_1.1-4 ipred_0.9-15
[83] ggtree_3.10.1 GenomeInfoDb_1.38.8
[85] fBasics_4041.97 rpart_4.1.23
[87] colorspace_2.1-1 BiocGenerics_0.48.1
[89] DBI_1.2.3 nnet_7.3-19
[91] ade4_1.7-23 tidyselect_1.2.1
[93] timeSeries_4041.111 compiler_4.3.1
[95] microbiome_1.24.0 xml2_1.3.8
[97] DelayedArray_0.28.0 scales_1.4.0
[99] DEoptimR_1.1-4 spatial_7.3-18
[101] rappdirs_0.3.3 digest_0.6.35
[103] minqa_1.2.8 rmarkdown_2.29
[105] XVector_0.42.0 htmltools_0.5.8.1
[107] pkgconfig_2.0.3 MatrixGenerics_1.14.0
[109] stabledist_0.7-2 fastmap_1.2.0
[111] rlang_1.1.3 htmlwidgets_1.6.4
[113] farver_2.1.2 zoo_1.8-13
[115] jsonlite_2.0.0 ModelMetrics_1.2.2.2
[117] RCurl_1.98-1.17 modeltools_0.2-24
[119] Formula_1.2-5 GenomeInfoDbData_1.2.11
[121] patchwork_1.3.2 Rhdf5lib_1.24.2
[123] Rcpp_1.0.12 ggnewscale_0.5.2
[125] fastGHQuad_1.0.1 stringi_1.8.3
[127] ggstar_1.0.4 stable_1.1.6
[129] zlibbioc_1.48.2 MASS_7.3-60.0.1
[131] plyr_1.8.9 listenv_0.9.1
[133] Biostrings_2.70.3 splines_4.3.1
[135] hash_2.2.6.3 multtest_2.58.0
[137] hms_1.1.3 ggtreeExtra_1.12.0
[139] ggsignif_0.6.4 stats4_4.3.1
[141] rmutil_1.1.10 evaluate_1.0.5
[143] nloptr_2.2.1 tzdb_0.5.0
[145] getopt_1.20.4 future_1.34.0
[147] clue_0.3-66 coin_1.4-3
[149] broom_1.0.9 xtable_1.8-4
[151] tidytree_0.4.6 rstatix_0.7.2
[153] viridisLite_0.4.2 class_7.3-22
[155] aplot_0.2.8 IRanges_2.36.0
[157] cluster_2.1.8.1 timechange_0.3.0
[159] globals_0.18.0