This script is for microbial co-ocurrence networks in 3 different enviroments.
library(phyloseq)
library(ggplot2)
library(scales)
library(RColorBrewer)
library(ggpubr)
library(readr)
library(patchwork)
library(plyr)
library(tidyverse)
library(cluster)
library(factoextra)
library(ggcorrplot)
library(corrplot)
raw_biom <- import_biom("~/dc_workshop/pocean_metagenome/data/pacific_ocean.biom")
#Give name to taxonomic categories
raw_biom@tax_table@.Data<-substring(raw_biom@tax_table@.Data,4)
colnames(raw_biom@tax_table@.Data)<- c("Kingdom","Phylum", "Class", "Order","Family", "Genus", "Species")
# Read metadata
metadatos<-read.csv("~/dc_workshop/pocean_metagenome/data/metadata_sampling.csv")
row.names(metadatos) <- metadatos$Samples
metadatos$Zone<-factor(metadatos$Zone, levels = c("Epipelagic", "Mesopelagic", "Abyssopelagic"))
# Generate otu_table
otu<-otu_table(raw_biom@otu_table@.Data, taxa_are_rows = TRUE)
colnames(otu) <-metadatos$Samples
# Generate taxonomy table
tax<-tax_table(raw_biom@tax_table@.Data)
# Generate metadata table
metad<-sample_data(metadatos)
# It is the complete phyloseq object with the 3 previous tables
global<-phyloseq(otu, tax, metad)
# Put the complete species names in the column for Species (instead of the epithet only)
global@tax_table@.Data[,7]<-paste(global@tax_table@.Data[,6], global@tax_table@.Data[,7], sep= "_")
Make the object
# Agglomerate at Phylum level
phylum_global<- tax_glom(global, "Phylum")
#Relative abundance
relative_phylum_global<- transform_sample_counts(phylum_global, function(x) x / sum(x) )
rel_phyl_global_df<-psmelt(relative_phylum_global) # convert to data frame
rel_phyl_global_df$Phylum<- as.character(rel_phyl_global_df$Phylum) # convert to character type
# Agglomerate phylum with less than 0.01 abundance
rel_phyl_global_df$Phylum[rel_phyl_global_df$Abundance < 0.01] <- "Fila < 1% de abundancia"
rel_phyl_global_df$Phylum<- as.factor(rel_phyl_global_df$Phylum) # convert to factor type
rel_phyl_global_df$Samples<- factor(rel_phyl_global_df$Samples, levels = c("SRR5788415", "SRR5788416", "SRR5788417", "SRR5788420", "SRR5788421", "SRR5788422")) # Put Samples names in levels
Make the abundance plot by samples.
phylum_colors<- brewer.pal(length(levels(rel_phyl_global_df$Phylum)),"Dark2")
global_rel_phyl_plot<-ggplot(rel_phyl_global_df, aes(x=Samples, y=Abundance, fill=Phylum)) +
geom_bar(aes(), stat="identity", position="stack") +
scale_fill_manual(values=phylum_colors) +
labs(x= "Samples", y = "Relative abundance") +
facet_grid(~Zone, scales = "free_x")+
ggtitle("Relative abundance by Zones")+
theme(axis.title = element_text(size = 14), axis.text.x = element_text(size = 8), plot.title = element_text(size = 20))
ggsave("~/dc_workshop/pocean_metagenome/images/abund_rel_phyl_global.svg", plot = global_rel_phyl_plot, width = 6.5, height = 3.65, units = "in", dpi = 400)
global_rel_phyl_plot
Make the objects
# Absolute abundance
abs_phyl_global_df<-psmelt(phylum_global) # convert to data frame
# Filter OTU table
abs_phyl_global_df_E <- abs_phyl_global_df %>%
filter(abs_phyl_global_df$Zone == "Epipelagic") %>%
select(OTU, Sample, Abundance) %>%
pivot_wider(names_from = OTU, values_from = Abundance)
abs_phyl_global_df_M <- abs_phyl_global_df %>%
filter(abs_phyl_global_df$Zone == "Mesopelagic")%>%
select(OTU, Sample, Abundance) %>%
pivot_wider(names_from = OTU, values_from = Abundance)
abs_phyl_global_df_A <- abs_phyl_global_df %>%
filter(abs_phyl_global_df$Zone == "Abyssopelagic")%>%
select(OTU, Sample, Abundance) %>%
pivot_wider(names_from = OTU, values_from = Abundance)
Create the otutable directory
mkdir otutable
Save otu tables without row names.
#write.csv(abs_phyl_global_df_E,paste("~/dc_workshop/pocean_metagenome/Overrepresentation_analysis/otutable/","Epipelagic",".csv",sep=""), row.names= FALSE)
#write.csv(abs_phyl_global_df_M,paste("~/dc_workshop/pocean_metagenome/Overrepresentation_analysis/otutable/","Mesopelagic",".csv",sep=""), row.names= FALSE)
#write.csv(abs_phyl_global_df_A,paste("~/dc_workshop/pocean_metagenome/Overrepresentation_analysis/otutable/","Abyssopelagic",".csv",sep=""), row.names= FALSE)
Calculate the ordinate
objet with Jaccard distance.
meta.ord <- ordinate(physeq = relative_phylum_global, method = "NMDS",
distance = "jaccard")
## Run 0 stress 0
## Run 1 stress 0
## ... Procrustes: rmse 0.1116033 max resid 0.1575744
## Run 2 stress 0
## ... Procrustes: rmse 0.09833171 max resid 0.1393525
## Run 3 stress 9.802441e-05
## ... Procrustes: rmse 0.1344138 max resid 0.2384963
## Run 4 stress 9.966834e-05
## ... Procrustes: rmse 0.1466619 max resid 0.249883
## Run 5 stress 0.1557467
## Run 6 stress 0
## ... Procrustes: rmse 0.1343425 max resid 0.1768282
## Run 7 stress 0.0002896358
## ... Procrustes: rmse 0.1709236 max resid 0.3062635
## Run 8 stress 0.1557467
## Run 9 stress 0
## ... Procrustes: rmse 0.05797811 max resid 0.09812407
## Run 10 stress 0.1557467
## Run 11 stress 0
## ... Procrustes: rmse 0.09930643 max resid 0.1464909
## Run 12 stress 0
## ... Procrustes: rmse 0.1150026 max resid 0.1439802
## Run 13 stress 9.84357e-05
## ... Procrustes: rmse 0.1054029 max resid 0.2141803
## Run 14 stress 0
## ... Procrustes: rmse 0.1734049 max resid 0.227205
## Run 15 stress 0.2761342
## Run 16 stress 8.980857e-05
## ... Procrustes: rmse 0.1599885 max resid 0.2219469
## Run 17 stress 0
## ... Procrustes: rmse 0.1090451 max resid 0.1326152
## Run 18 stress 9.992838e-05
## ... Procrustes: rmse 0.1050076 max resid 0.210506
## Run 19 stress 0
## ... Procrustes: rmse 0.1207826 max resid 0.1558563
## Run 20 stress 9.986879e-05
## ... Procrustes: rmse 0.1111995 max resid 0.2149204
## *** No convergence -- monoMDS stopping criteria:
## 1: no. of iterations >= maxit
## 15: stress < smin
## 1: stress ratio > sratmax
## 3: scale factor of the gradient < sfgrmin
Plot the clusters.
rel_phylum_cluster_Jaccard <- plot_ordination(physeq = relative_phylum_global, ordination = meta.ord, color="Zone")+
geom_text(aes(label=Samples), size=3)
ggsave("cluster_jaccard.svg", plot = rel_phylum_cluster_Jaccard, width = 6.5, height = 3.65, units = "in", dpi = 400)
rel_phylum_cluster_Jaccard
Compare clusters with different methods.
dist = "jaccard" # especify distance
ord_meths = c("DCA", "CCA", "RDA", "NMDS", "MDS", "PCoA") #vector of methods names
plist = llply(as.list(ord_meths), function(i, physeq, dist){
ordi = ordinate(physeq, method=i, distance=dist)
plot_ordination(physeq, ordi, "Samples", color="Zone")
}, physeq=relative_phylum_global, dist)
## Run 0 stress 0
## Run 1 stress 0
## ... Procrustes: rmse 0.09862521 max resid 0.1270772
## Run 2 stress 0
## ... Procrustes: rmse 0.09227392 max resid 0.1138774
## Run 3 stress 0
## ... Procrustes: rmse 0.1242844 max resid 0.197133
## Run 4 stress 9.443781e-05
## ... Procrustes: rmse 0.1147886 max resid 0.2212059
## Run 5 stress 0
## ... Procrustes: rmse 0.1249011 max resid 0.172647
## Run 6 stress 0.2761337
## Run 7 stress 0
## ... Procrustes: rmse 0.1212303 max resid 0.1632644
## Run 8 stress 0
## ... Procrustes: rmse 0.1539816 max resid 0.2022742
## Run 9 stress 0.2269841
## Run 10 stress 0.001590993
## Run 11 stress 1.992066e-05
## ... Procrustes: rmse 0.1999354 max resid 0.2476482
## Run 12 stress 9.915231e-05
## ... Procrustes: rmse 0.1371223 max resid 0.23079
## Run 13 stress 0
## ... Procrustes: rmse 0.08179261 max resid 0.09947291
## Run 14 stress 0.0002141331
## ... Procrustes: rmse 0.1832949 max resid 0.2824196
## Run 15 stress 0.1557467
## Run 16 stress 0
## ... Procrustes: rmse 0.09299676 max resid 0.1300814
## Run 17 stress 1.506566e-05
## ... Procrustes: rmse 0.1659642 max resid 0.2325868
## Run 18 stress 9.762141e-05
## ... Procrustes: rmse 0.1572659 max resid 0.2592858
## Run 19 stress 1.029677e-05
## ... Procrustes: rmse 0.1104634 max resid 0.2153393
## Run 20 stress 0.2181459
## *** No convergence -- monoMDS stopping criteria:
## 2: no. of iterations >= maxit
## 14: stress < smin
## 2: stress ratio > sratmax
## 2: scale factor of the gradient < sfgrmin
names(plist) <- ord_meths
#Dataframe for plots
pdataframe = ldply(plist, function(x){
df = x$data[, 1:2]
colnames(df) = c("Axis_1", "Axis_2")
return(cbind(df, x$data))
})
names(pdataframe)[1] = "method"
#Plots with all methods
p = ggplot(pdataframe, aes(Axis_1, Axis_2, color=Zone, shape=Zone))
p = p + geom_point(size=2) + geom_text(aes(label=Samples), size=2,vjust = "inward", hjust = "inward")
p = p + facet_wrap(~method, scales="free")
p = p + scale_fill_brewer(type="qual", palette="Set1")
p = p + scale_colour_brewer(type="qual", palette="Set1")
p= p + labs(x="", y="")
p
cluster
and
factoExtra
packages.Filter the data set by samples and Phylum.
abs_filter <- daply(
.data = rel_phyl_global_df,
.variables = c("Samples", "Phylum"),
.fun = function(x) sum(x$Abundance)
)
abs_filter <- as.data.frame(abs_filter)
abs_filter[is.na(abs_filter)] = 0
Compute the distance matrix with pearson
distance.
#Computing correlation based distance: pearson, spearman or kendall
dist_corr <- get_dist(abs_filter, method = "pearson")
Plot distance matrix.
Clusters with different methods
set.seed(12345)
hc_single <- hclust(d=dist_corr, method = "single")
hc_average <- hclust(d=dist_corr, method = "average")
hc_complete <- hclust(d=dist_corr, method = "complete")
hc_ward <- hclust(d=dist_corr, method = "ward.D2")
hc_median <- hclust(d=dist_corr, method = "median")
hc_centroid <- hclust(d=dist_corr, method = "centroid")
Plot clusters with different methods.
Compare methods.
## average single complete ward.D2 median centroid
## 0.9876304 0.9876253 0.9876278 0.9876183 0.9876304 0.9876304
PCA proyection.
Groups obtained.
grupos_average <-sort(cutree(hc_average, 3))
grupos_average
## SRR5788415 SRR5788416 SRR5788417 SRR5788420 SRR5788421 SRR5788422
## 1 1 1 2 2 3
cor_base <- cor(abs_filter, use="complete.obs", method = "spearman")
cor.mtest <- function(mat, ...) {
mat <- as.matrix(mat)
n <- ncol(mat)
p.mat<- matrix(NA, n, n)
diag(p.mat) <- 0
for (i in 1:(n - 1)) {
for (j in (i + 1):n) {
tmp <- cor.test(mat[, i], mat[, j], ...)
p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
}
}
colnames(p.mat) <- rownames(p.mat) <- colnames(mat)
p.mat
}
# matrix of the p-value of the correlation
p.mat <- cor.mtest(cor_base)
corr_plot <- corrplot(cor_base, type = "upper", order = "hclust", tl.col = "black", tl.srt = 45,
p.mat = p.mat, sig.level = 0.05)
[1] Ma, B., Wang, Y., Ye, S. et al. Earth microbial co-occurrence network reveals interconnection pattern across microbiomes. Microbiome 8, 82 (2020). https://doi.org/10.1186/s40168-020-00857-2
[2] C. Zirion. Zamia-bacterial-BGCs. https://czirion.github.io/Zamia-bacterial-BGCs/
[3] Data processing and visualization for metagenomics. https://carpentries-incubator.github.io/metagenomics/