Co-ocurrence networks with Ocean data

This script is for microbial co-ocurrence networks in 3 different enviroments.

Load the libraries

library(phyloseq)
library(ggplot2)
library(scales)
library(RColorBrewer)
library(ggpubr)
library(readr)
library(patchwork)
library(plyr)
library(tidyverse)
library(cluster)
library(factoextra)
library(ggcorrplot)
library(corrplot)

Read the biom file

raw_biom <- import_biom("~/dc_workshop/pocean_metagenome/data/pacific_ocean.biom")

#Give name to taxonomic categories
raw_biom@tax_table@.Data<-substring(raw_biom@tax_table@.Data,4)
colnames(raw_biom@tax_table@.Data)<- c("Kingdom","Phylum", "Class", "Order","Family", "Genus", "Species")

# Read metadata
metadatos<-read.csv("~/dc_workshop/pocean_metagenome/data/metadata_sampling.csv") 
row.names(metadatos) <- metadatos$Samples
metadatos$Zone<-factor(metadatos$Zone, levels = c("Epipelagic", "Mesopelagic", "Abyssopelagic"))

# Generate otu_table
otu<-otu_table(raw_biom@otu_table@.Data, taxa_are_rows = TRUE) 
colnames(otu) <-metadatos$Samples

# Generate taxonomy table
tax<-tax_table(raw_biom@tax_table@.Data)

# Generate metadata table
metad<-sample_data(metadatos)

# It is the complete phyloseq object with the 3 previous tables
global<-phyloseq(otu, tax, metad)

# Put the complete species names in the column for Species (instead of the epithet only)
global@tax_table@.Data[,7]<-paste(global@tax_table@.Data[,6], global@tax_table@.Data[,7], sep= "_")

Make the abundance plots

Make the object

# Agglomerate at Phylum level
phylum_global<- tax_glom(global, "Phylum")

#Relative abundance
relative_phylum_global<- transform_sample_counts(phylum_global, function(x) x / sum(x) ) 
rel_phyl_global_df<-psmelt(relative_phylum_global) # convert to data frame
rel_phyl_global_df$Phylum<- as.character(rel_phyl_global_df$Phylum) # convert to character type

# Agglomerate phylum with less than 0.01 abundance
rel_phyl_global_df$Phylum[rel_phyl_global_df$Abundance < 0.01] <- "Fila < 1% de abundancia" 

rel_phyl_global_df$Phylum<- as.factor(rel_phyl_global_df$Phylum) # convert to factor type
rel_phyl_global_df$Samples<- factor(rel_phyl_global_df$Samples, levels = c("SRR5788415", "SRR5788416", "SRR5788417", "SRR5788420", "SRR5788421", "SRR5788422"))  # Put Samples names in levels

Make the abundance plot by samples.

phylum_colors<- brewer.pal(length(levels(rel_phyl_global_df$Phylum)),"Dark2") 
global_rel_phyl_plot<-ggplot(rel_phyl_global_df, aes(x=Samples, y=Abundance, fill=Phylum)) +
  geom_bar(aes(), stat="identity", position="stack") +
  scale_fill_manual(values=phylum_colors) + 
  labs(x= "Samples", y = "Relative abundance") +
  facet_grid(~Zone, scales = "free_x")+
  ggtitle("Relative abundance by Zones")+
  theme(axis.title = element_text(size = 14), axis.text.x = element_text(size = 8), plot.title = element_text(size = 20))

ggsave("~/dc_workshop/pocean_metagenome/images/abund_rel_phyl_global.svg", plot = global_rel_phyl_plot, width = 6.5, height = 3.65, units = "in", dpi = 400)
global_rel_phyl_plot

Filter OTUS tables with absolute abundance by Zone

Make the objects

# Absolute abundance
abs_phyl_global_df<-psmelt(phylum_global) # convert to data frame

# Filter OTU table
abs_phyl_global_df_E <- abs_phyl_global_df %>%
  filter(abs_phyl_global_df$Zone == "Epipelagic") %>%
  select(OTU, Sample, Abundance) %>%
  pivot_wider(names_from = OTU, values_from = Abundance)

abs_phyl_global_df_M <- abs_phyl_global_df %>%
  filter(abs_phyl_global_df$Zone == "Mesopelagic")%>%
  select(OTU, Sample, Abundance) %>%
  pivot_wider(names_from = OTU, values_from = Abundance)

abs_phyl_global_df_A <- abs_phyl_global_df %>%
  filter(abs_phyl_global_df$Zone == "Abyssopelagic")%>%
  select(OTU, Sample, Abundance) %>%
  pivot_wider(names_from = OTU, values_from = Abundance)

Create the otutable directory

mkdir otutable

Save otu tables without row names.

#write.csv(abs_phyl_global_df_E,paste("~/dc_workshop/pocean_metagenome/Overrepresentation_analysis/otutable/","Epipelagic",".csv",sep=""), row.names= FALSE)
#write.csv(abs_phyl_global_df_M,paste("~/dc_workshop/pocean_metagenome/Overrepresentation_analysis/otutable/","Mesopelagic",".csv",sep=""), row.names= FALSE)
#write.csv(abs_phyl_global_df_A,paste("~/dc_workshop/pocean_metagenome/Overrepresentation_analysis/otutable/","Abyssopelagic",".csv",sep=""), row.names= FALSE)

Clusters

Calculate the ordinate objet with Jaccard distance.

meta.ord <- ordinate(physeq = relative_phylum_global, method = "NMDS", 
                     distance = "jaccard")
## Run 0 stress 0 
## Run 1 stress 0 
## ... Procrustes: rmse 0.1116033  max resid 0.1575744 
## Run 2 stress 0 
## ... Procrustes: rmse 0.09833171  max resid 0.1393525 
## Run 3 stress 9.802441e-05 
## ... Procrustes: rmse 0.1344138  max resid 0.2384963 
## Run 4 stress 9.966834e-05 
## ... Procrustes: rmse 0.1466619  max resid 0.249883 
## Run 5 stress 0.1557467 
## Run 6 stress 0 
## ... Procrustes: rmse 0.1343425  max resid 0.1768282 
## Run 7 stress 0.0002896358 
## ... Procrustes: rmse 0.1709236  max resid 0.3062635 
## Run 8 stress 0.1557467 
## Run 9 stress 0 
## ... Procrustes: rmse 0.05797811  max resid 0.09812407 
## Run 10 stress 0.1557467 
## Run 11 stress 0 
## ... Procrustes: rmse 0.09930643  max resid 0.1464909 
## Run 12 stress 0 
## ... Procrustes: rmse 0.1150026  max resid 0.1439802 
## Run 13 stress 9.84357e-05 
## ... Procrustes: rmse 0.1054029  max resid 0.2141803 
## Run 14 stress 0 
## ... Procrustes: rmse 0.1734049  max resid 0.227205 
## Run 15 stress 0.2761342 
## Run 16 stress 8.980857e-05 
## ... Procrustes: rmse 0.1599885  max resid 0.2219469 
## Run 17 stress 0 
## ... Procrustes: rmse 0.1090451  max resid 0.1326152 
## Run 18 stress 9.992838e-05 
## ... Procrustes: rmse 0.1050076  max resid 0.210506 
## Run 19 stress 0 
## ... Procrustes: rmse 0.1207826  max resid 0.1558563 
## Run 20 stress 9.986879e-05 
## ... Procrustes: rmse 0.1111995  max resid 0.2149204 
## *** No convergence -- monoMDS stopping criteria:
##      1: no. of iterations >= maxit
##     15: stress < smin
##      1: stress ratio > sratmax
##      3: scale factor of the gradient < sfgrmin

Plot the clusters.

rel_phylum_cluster_Jaccard <- plot_ordination(physeq = relative_phylum_global, ordination = meta.ord, color="Zone")+
  geom_text(aes(label=Samples), size=3)

ggsave("cluster_jaccard.svg", plot = rel_phylum_cluster_Jaccard, width = 6.5, height = 3.65, units = "in", dpi = 400)
rel_phylum_cluster_Jaccard

Compare clusters with different methods.

dist = "jaccard" # especify distance
ord_meths = c("DCA", "CCA", "RDA", "NMDS", "MDS", "PCoA") #vector of methods names
plist = llply(as.list(ord_meths), function(i, physeq, dist){
  ordi = ordinate(physeq, method=i, distance=dist)
  plot_ordination(physeq, ordi, "Samples", color="Zone")
}, physeq=relative_phylum_global, dist)
## Run 0 stress 0 
## Run 1 stress 0 
## ... Procrustes: rmse 0.09862521  max resid 0.1270772 
## Run 2 stress 0 
## ... Procrustes: rmse 0.09227392  max resid 0.1138774 
## Run 3 stress 0 
## ... Procrustes: rmse 0.1242844  max resid 0.197133 
## Run 4 stress 9.443781e-05 
## ... Procrustes: rmse 0.1147886  max resid 0.2212059 
## Run 5 stress 0 
## ... Procrustes: rmse 0.1249011  max resid 0.172647 
## Run 6 stress 0.2761337 
## Run 7 stress 0 
## ... Procrustes: rmse 0.1212303  max resid 0.1632644 
## Run 8 stress 0 
## ... Procrustes: rmse 0.1539816  max resid 0.2022742 
## Run 9 stress 0.2269841 
## Run 10 stress 0.001590993 
## Run 11 stress 1.992066e-05 
## ... Procrustes: rmse 0.1999354  max resid 0.2476482 
## Run 12 stress 9.915231e-05 
## ... Procrustes: rmse 0.1371223  max resid 0.23079 
## Run 13 stress 0 
## ... Procrustes: rmse 0.08179261  max resid 0.09947291 
## Run 14 stress 0.0002141331 
## ... Procrustes: rmse 0.1832949  max resid 0.2824196 
## Run 15 stress 0.1557467 
## Run 16 stress 0 
## ... Procrustes: rmse 0.09299676  max resid 0.1300814 
## Run 17 stress 1.506566e-05 
## ... Procrustes: rmse 0.1659642  max resid 0.2325868 
## Run 18 stress 9.762141e-05 
## ... Procrustes: rmse 0.1572659  max resid 0.2592858 
## Run 19 stress 1.029677e-05 
## ... Procrustes: rmse 0.1104634  max resid 0.2153393 
## Run 20 stress 0.2181459 
## *** No convergence -- monoMDS stopping criteria:
##      2: no. of iterations >= maxit
##     14: stress < smin
##      2: stress ratio > sratmax
##      2: scale factor of the gradient < sfgrmin
names(plist) <- ord_meths

#Dataframe for plots
pdataframe = ldply(plist, function(x){
  df = x$data[, 1:2]
  colnames(df) = c("Axis_1", "Axis_2")
  return(cbind(df, x$data))
})
names(pdataframe)[1] = "method"


#Plots with all methods
p = ggplot(pdataframe, aes(Axis_1, Axis_2, color=Zone, shape=Zone))
p = p + geom_point(size=2) + geom_text(aes(label=Samples), size=2,vjust = "inward", hjust = "inward")
p = p + facet_wrap(~method, scales="free")
p = p + scale_fill_brewer(type="qual", palette="Set1")
p = p + scale_colour_brewer(type="qual", palette="Set1") 
p= p + labs(x="", y="")
p
Clusters with all methods

Clusters with all methods

Cluster analysis using cluster and factoExtra packages.

Filter the data set by samples and Phylum.

abs_filter <- daply(
  .data = rel_phyl_global_df,
  .variables = c("Samples", "Phylum"),
  .fun = function(x) sum(x$Abundance)
)
abs_filter <- as.data.frame(abs_filter)
abs_filter[is.na(abs_filter)] = 0

Compute the distance matrix with pearson distance.

#Computing correlation based distance: pearson, spearman or kendall
dist_corr <- get_dist(abs_filter, method = "pearson")

Plot distance matrix.

Heatmap of distance matrix

Heatmap of distance matrix

Clusters with different methods

set.seed(12345)
hc_single <- hclust(d=dist_corr, method = "single")
hc_average <- hclust(d=dist_corr, method = "average")
hc_complete <- hclust(d=dist_corr, method = "complete")
hc_ward <- hclust(d=dist_corr, method = "ward.D2")
hc_median <- hclust(d=dist_corr, method = "median")
hc_centroid <- hclust(d=dist_corr, method = "centroid")

Plot clusters with different methods.

Cluster with `average` method

Cluster with average method

Cluster with `ward` method

Cluster with ward method

Cluster with `single` method

Cluster with single method

Cluster with `complete` method

Cluster with complete method

Cluster with `median` method

Cluster with median method

Cluster with `centroid` method

Cluster with centroid method

Compare methods.

##   average    single  complete   ward.D2    median  centroid 
## 0.9876304 0.9876253 0.9876278 0.9876183 0.9876304 0.9876304

PCA proyection.

Clusters and PCA

Clusters and PCA

Groups obtained.

grupos_average <-sort(cutree(hc_average, 3))
grupos_average
## SRR5788415 SRR5788416 SRR5788417 SRR5788420 SRR5788421 SRR5788422 
##          1          1          1          2          2          3

Corplot

cor_base <- cor(abs_filter, use="complete.obs", method = "spearman")

cor.mtest <- function(mat, ...) {
  mat <- as.matrix(mat)
  n <- ncol(mat)
  p.mat<- matrix(NA, n, n)
  diag(p.mat) <- 0
  for (i in 1:(n - 1)) {
    for (j in (i + 1):n) {
      tmp <- cor.test(mat[, i], mat[, j], ...)
      p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
    }
  }
  colnames(p.mat) <- rownames(p.mat) <- colnames(mat)
  p.mat
}
# matrix of the p-value of the correlation
p.mat <- cor.mtest(cor_base)

corr_plot <- corrplot(cor_base, type = "upper", order = "hclust", tl.col = "black", tl.srt = 45,
         p.mat = p.mat, sig.level = 0.05)

To do

References

[1] Ma, B., Wang, Y., Ye, S. et al. Earth microbial co-occurrence network reveals interconnection pattern across microbiomes. Microbiome 8, 82 (2020). https://doi.org/10.1186/s40168-020-00857-2

[2] C. Zirion. Zamia-bacterial-BGCs. https://czirion.github.io/Zamia-bacterial-BGCs/

[3] Data processing and visualization for metagenomics. https://carpentries-incubator.github.io/metagenomics/