This script is for microbial co-ocurrence networks in 3 different enviroments.
library(phyloseq)
library(ggplot2)
library(scales)
library(RColorBrewer)
library(ggpubr)
library(readr)
library(patchwork)
library(plyr)
library(tidyverse)
library(cluster)
library(factoextra)
library(ggcorrplot)
library(corrplot)
raw_biom <- import_biom("~/dc_workshop/pocean_metagenome/data/pacific_ocean.biom")
#Give name to taxonomic categories
raw_biom@tax_table@.Data<-substring(raw_biom@tax_table@.Data,4)
colnames(raw_biom@tax_table@.Data)<- c("Kingdom","Phylum", "Class", "Order","Family", "Genus", "Species")
# Read metadata
metadatos<-read.csv("~/dc_workshop/pocean_metagenome/data/metadata_sampling.csv")
row.names(metadatos) <- metadatos$Samples
metadatos$Zone<-factor(metadatos$Zone, levels = c("Epipelagic", "Mesopelagic", "Abyssopelagic"))
# Generate otu_table
otu<-otu_table(raw_biom@otu_table@.Data, taxa_are_rows = TRUE)
colnames(otu) <-metadatos$Samples
# Generate taxonomy table
tax<-tax_table(raw_biom@tax_table@.Data)
# Generate metadata table
metad<-sample_data(metadatos)
# It is the complete phyloseq object with the 3 previous tables
global<-phyloseq(otu, tax, metad)
# Put the complete species names in the column for Species (instead of the epithet only)
global@tax_table@.Data[,7]<-paste(global@tax_table@.Data[,6], global@tax_table@.Data[,7], sep= "_")
Make the object
# Agglomerate at Phylum level
phylum_global<- tax_glom(global, "Phylum")
#Relative abundance
relative_phylum_global<- transform_sample_counts(phylum_global, function(x) x / sum(x) )
rel_phyl_global_df<-psmelt(relative_phylum_global) # convert to data frame
rel_phyl_global_df$Phylum<- as.character(rel_phyl_global_df$Phylum) # convert to character type
# Agglomerate phylum with less than 0.01 abundance
rel_phyl_global_df$Phylum[rel_phyl_global_df$Abundance < 0.01] <- "Fila < 1% de abundancia"
rel_phyl_global_df$Phylum<- as.factor(rel_phyl_global_df$Phylum) # convert to factor type
rel_phyl_global_df$Samples<- factor(rel_phyl_global_df$Samples, levels = c("SRR5788415", "SRR5788416", "SRR5788417", "SRR5788420", "SRR5788421", "SRR5788422")) # Put Samples names in levels
Make the abundance plot by samples.
phylum_colors<- brewer.pal(length(levels(rel_phyl_global_df$Phylum)),"Dark2")
global_rel_phyl_plot<-ggplot(rel_phyl_global_df, aes(x=Samples, y=Abundance, fill=Phylum)) +
geom_bar(aes(), stat="identity", position="stack") +
scale_fill_manual(values=phylum_colors) +
labs(x= "Samples", y = "Relative abundance") +
facet_grid(~Zone, scales = "free_x")+
ggtitle("Relative abundance by Zones")+
theme(axis.title = element_text(size = 14), axis.text.x = element_text(size = 8), plot.title = element_text(size = 20))
ggsave("~/dc_workshop/pocean_metagenome/images/abund_rel_phyl_global.svg", plot = global_rel_phyl_plot, width = 6.5, height = 3.65, units = "in", dpi = 400)
global_rel_phyl_plot
Make the objects
# Absolute abundance
abs_phyl_global_df<-psmelt(phylum_global) # convert to data frame
# Filter OTU table
abs_phyl_global_df_E <- abs_phyl_global_df %>%
filter(abs_phyl_global_df$Zone == "Epipelagic") %>%
select(OTU, Sample, Abundance) %>%
pivot_wider(names_from = OTU, values_from = Abundance)
abs_phyl_global_df_M <- abs_phyl_global_df %>%
filter(abs_phyl_global_df$Zone == "Mesopelagic")%>%
select(OTU, Sample, Abundance) %>%
pivot_wider(names_from = OTU, values_from = Abundance)
abs_phyl_global_df_A <- abs_phyl_global_df %>%
filter(abs_phyl_global_df$Zone == "Abyssopelagic")%>%
select(OTU, Sample, Abundance) %>%
pivot_wider(names_from = OTU, values_from = Abundance)
Create the otutable directory
mkdir otutable
Save otu tables without row names.
#write.csv(abs_phyl_global_df_E,paste("~/dc_workshop/pocean_metagenome/Overrepresentation_analysis/otutable/","Epipelagic",".csv",sep=""), row.names= FALSE)
#write.csv(abs_phyl_global_df_M,paste("~/dc_workshop/pocean_metagenome/Overrepresentation_analysis/otutable/","Mesopelagic",".csv",sep=""), row.names= FALSE)
#write.csv(abs_phyl_global_df_A,paste("~/dc_workshop/pocean_metagenome/Overrepresentation_analysis/otutable/","Abyssopelagic",".csv",sep=""), row.names= FALSE)
Calculate the ordinate
objet with Jaccard distance.
meta.ord <- ordinate(physeq = relative_phylum_global, method = "NMDS",
distance = "jaccard")
## Run 0 stress 0
## Run 1 stress 0
## ... Procrustes: rmse 0.1116033 max resid 0.1575744
## Run 2 stress 0
## ... Procrustes: rmse 0.09833171 max resid 0.1393525
## Run 3 stress 9.802441e-05
## ... Procrustes: rmse 0.1344138 max resid 0.2384963
## Run 4 stress 9.966834e-05
## ... Procrustes: rmse 0.1466619 max resid 0.249883
## Run 5 stress 0.1557467
## Run 6 stress 0
## ... Procrustes: rmse 0.1343425 max resid 0.1768282
## Run 7 stress 0.0002896358
## ... Procrustes: rmse 0.1709236 max resid 0.3062635
## Run 8 stress 0.1557467
## Run 9 stress 0
## ... Procrustes: rmse 0.05797811 max resid 0.09812407
## Run 10 stress 0.1557467
## Run 11 stress 0
## ... Procrustes: rmse 0.09930643 max resid 0.1464909
## Run 12 stress 0
## ... Procrustes: rmse 0.1150026 max resid 0.1439802
## Run 13 stress 9.84357e-05
## ... Procrustes: rmse 0.1054029 max resid 0.2141803
## Run 14 stress 0
## ... Procrustes: rmse 0.1734049 max resid 0.227205
## Run 15 stress 0.2761342
## Run 16 stress 8.980857e-05
## ... Procrustes: rmse 0.1599885 max resid 0.2219469
## Run 17 stress 0
## ... Procrustes: rmse 0.1090451 max resid 0.1326152
## Run 18 stress 9.992838e-05
## ... Procrustes: rmse 0.1050076 max resid 0.210506
## Run 19 stress 0
## ... Procrustes: rmse 0.1207826 max resid 0.1558563
## Run 20 stress 9.986879e-05
## ... Procrustes: rmse 0.1111995 max resid 0.2149204
## *** No convergence -- monoMDS stopping criteria:
## 1: no. of iterations >= maxit
## 15: stress < smin
## 1: stress ratio > sratmax
## 3: scale factor of the gradient < sfgrmin
Plot the clusters.
rel_phylum_cluster_Jaccard <- plot_ordination(physeq = relative_phylum_global, ordination = meta.ord, color="Zone")+
geom_text(aes(label=Samples), size=3)
ggsave("cluster_jaccard.svg", plot = rel_phylum_cluster_Jaccard, width = 6.5, height = 3.65, units = "in", dpi = 400)
rel_phylum_cluster_Jaccard
Compare clusters with different methods.
dist = "jaccard" # especify distance
ord_meths = c("DCA", "CCA", "RDA", "NMDS", "MDS", "PCoA") #vector of methods names
plist = llply(as.list(ord_meths), function(i, physeq, dist){
ordi = ordinate(physeq, method=i, distance=dist)
plot_ordination(physeq, ordi, "Samples", color="Zone")
}, physeq=relative_phylum_global, dist)
## Run 0 stress 0
## Run 1 stress 0
## ... Procrustes: rmse 0.09862521 max resid 0.1270772
## Run 2 stress 0
## ... Procrustes: rmse 0.09227392 max resid 0.1138774
## Run 3 stress 0
## ... Procrustes: rmse 0.1242844 max resid 0.197133
## Run 4 stress 9.443781e-05
## ... Procrustes: rmse 0.1147886 max resid 0.2212059
## Run 5 stress 0
## ... Procrustes: rmse 0.1249011 max resid 0.172647
## Run 6 stress 0.2761337
## Run 7 stress 0
## ... Procrustes: rmse 0.1212303 max resid 0.1632644
## Run 8 stress 0
## ... Procrustes: rmse 0.1539816 max resid 0.2022742
## Run 9 stress 0.2269841
## Run 10 stress 0.001590993
## Run 11 stress 1.992066e-05
## ... Procrustes: rmse 0.1999354 max resid 0.2476482
## Run 12 stress 9.915231e-05
## ... Procrustes: rmse 0.1371223 max resid 0.23079
## Run 13 stress 0
## ... Procrustes: rmse 0.08179261 max resid 0.09947291
## Run 14 stress 0.0002141331
## ... Procrustes: rmse 0.1832949 max resid 0.2824196
## Run 15 stress 0.1557467
## Run 16 stress 0
## ... Procrustes: rmse 0.09299676 max resid 0.1300814
## Run 17 stress 1.506566e-05
## ... Procrustes: rmse 0.1659642 max resid 0.2325868
## Run 18 stress 9.762141e-05
## ... Procrustes: rmse 0.1572659 max resid 0.2592858
## Run 19 stress 1.029677e-05
## ... Procrustes: rmse 0.1104634 max resid 0.2153393
## Run 20 stress 0.2181459
## *** No convergence -- monoMDS stopping criteria:
## 2: no. of iterations >= maxit
## 14: stress < smin
## 2: stress ratio > sratmax
## 2: scale factor of the gradient < sfgrmin
names(plist) <- ord_meths
#Dataframe for plots
pdataframe = ldply(plist, function(x){
df = x$data[, 1:2]
colnames(df) = c("Axis_1", "Axis_2")
return(cbind(df, x$data))
})
names(pdataframe)[1] = "method"
#Plots with all methods
p = ggplot(pdataframe, aes(Axis_1, Axis_2, color=Zone, shape=Zone))
p = p + geom_point(size=2) + geom_text(aes(label=Samples), size=2,vjust = "inward", hjust = "inward")
p = p + facet_wrap(~method, scales="free")
p = p + scale_fill_brewer(type="qual", palette="Set1")
p = p + scale_colour_brewer(type="qual", palette="Set1")
p= p + labs(x="", y="")
p
Clusters with all methods
cluster
and
factoExtra
packages.Filter the data set by samples and Phylum.
abs_filter <- daply(
.data = rel_phyl_global_df,
.variables = c("Samples", "Phylum"),
.fun = function(x) sum(x$Abundance)
)
abs_filter <- as.data.frame(abs_filter)
abs_filter[is.na(abs_filter)] = 0
Compute the distance matrix with pearson
distance.
#Computing correlation based distance: pearson, spearman or kendall
dist_corr <- get_dist(abs_filter, method = "pearson")
Plot distance matrix.
Heatmap of distance matrix
Clusters with different methods
set.seed(12345)
hc_single <- hclust(d=dist_corr, method = "single")
hc_average <- hclust(d=dist_corr, method = "average")
hc_complete <- hclust(d=dist_corr, method = "complete")
hc_ward <- hclust(d=dist_corr, method = "ward.D2")
hc_median <- hclust(d=dist_corr, method = "median")
hc_centroid <- hclust(d=dist_corr, method = "centroid")
Plot clusters with different methods.
Cluster with average
method
Cluster with ward
method
Cluster with single
method
Cluster with complete
method
Cluster with median
method
Cluster with centroid
method
Compare methods.
## average single complete ward.D2 median centroid
## 0.9876304 0.9876253 0.9876278 0.9876183 0.9876304 0.9876304
PCA proyection.
Clusters and PCA
Groups obtained.
grupos_average <-sort(cutree(hc_average, 3))
grupos_average
## SRR5788415 SRR5788416 SRR5788417 SRR5788420 SRR5788421 SRR5788422
## 1 1 1 2 2 3
cor_base <- cor(abs_filter, use="complete.obs", method = "spearman")
cor.mtest <- function(mat, ...) {
mat <- as.matrix(mat)
n <- ncol(mat)
p.mat<- matrix(NA, n, n)
diag(p.mat) <- 0
for (i in 1:(n - 1)) {
for (j in (i + 1):n) {
tmp <- cor.test(mat[, i], mat[, j], ...)
p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
}
}
colnames(p.mat) <- rownames(p.mat) <- colnames(mat)
p.mat
}
# matrix of the p-value of the correlation
p.mat <- cor.mtest(cor_base)
corr_plot <- corrplot(cor_base, type = "upper", order = "hclust", tl.col = "black", tl.srt = 45,
p.mat = p.mat, sig.level = 0.05)
[1] Ma, B., Wang, Y., Ye, S. et al. Earth microbial co-occurrence network reveals interconnection pattern across microbiomes. Microbiome 8, 82 (2020). https://doi.org/10.1186/s40168-020-00857-2
[2] C. Zirion. Zamia-bacterial-BGCs. https://czirion.github.io/Zamia-bacterial-BGCs/
[3] Data processing and visualization for metagenomics. https://carpentries-incubator.github.io/metagenomics/