Overrepresentation analysis

Co-ocurrence networks with Ocean data

This script is for microbial co-ocurrence networks in 3 different enviroments.

Load the libraries

library(phyloseq)
library(ggplot2)
library(scales)
library(RColorBrewer)
library(ggpubr)
library(readr)
library(patchwork)
library(plyr)
library(tidyverse)
library(cluster)
library(factoextra)
library(ggcorrplot)
library(corrplot)

Read the biom file

raw_biom <- import_biom("~/dc_workshop/pocean_metagenome/data/pacific_ocean.biom")

#Give name to taxonomic categories
raw_biom@tax_table@.Data<-substring(raw_biom@tax_table@.Data,4)
colnames(raw_biom@tax_table@.Data)<- c("Kingdom","Phylum", "Class", "Order","Family", "Genus", "Species")

# Read metadata
metadatos<-read.csv("~/dc_workshop/pocean_metagenome/data/metadata_sampling.csv") 
row.names(metadatos) <- metadatos$Samples
metadatos$Zone<-factor(metadatos$Zone, levels = c("Epipelagic", "Mesopelagic", "Abyssopelagic"))

# Generate otu_table
otu<-otu_table(raw_biom@otu_table@.Data, taxa_are_rows = TRUE) 
colnames(otu) <-metadatos$Samples

# Generate taxonomy table
tax<-tax_table(raw_biom@tax_table@.Data)

# Generate metadata table
metad<-sample_data(metadatos)

# It is the complete phyloseq object with the 3 previous tables
global<-phyloseq(otu, tax, metad)

# Put the complete species names in the column for Species (instead of the epithet only)
global@tax_table@.Data[,7]<-paste(global@tax_table@.Data[,6], global@tax_table@.Data[,7], sep= "_")

Make the abundance plots

Make the object

# Agglomerate at Phylum level
phylum_global<- tax_glom(global, "Phylum")

#Relative abundance
relative_phylum_global<- transform_sample_counts(phylum_global, function(x) x / sum(x) ) 
rel_phyl_global_df<-psmelt(relative_phylum_global) # convert to data frame
rel_phyl_global_df$Phylum<- as.character(rel_phyl_global_df$Phylum) # convert to character type

# Agglomerate phylum with less than 0.01 abundance
rel_phyl_global_df$Phylum[rel_phyl_global_df$Abundance < 0.01] <- "Fila < 1% de abundancia" 

rel_phyl_global_df$Phylum<- as.factor(rel_phyl_global_df$Phylum) # convert to factor type
rel_phyl_global_df$Samples<- factor(rel_phyl_global_df$Samples, levels = c("SRR5788415", "SRR5788416", "SRR5788417", "SRR5788420", "SRR5788421", "SRR5788422"))  # Put Samples names in levels

Make the abundance plot by samples.

phylum_colors<- brewer.pal(length(levels(rel_phyl_global_df$Phylum)),"Dark2") 
global_rel_phyl_plot<-ggplot(rel_phyl_global_df, aes(x=Samples, y=Abundance, fill=Phylum)) +
  geom_bar(aes(), stat="identity", position="stack") +
  scale_fill_manual(values=phylum_colors) + 
  labs(x= "Samples", y = "Relative abundance") +
  facet_grid(~Zone, scales = "free_x")+
  ggtitle("Relative abundance by Zones")+
  theme(axis.title = element_text(size = 14), axis.text.x = element_text(size = 8), plot.title = element_text(size = 20))

ggsave("~/dc_workshop/pocean_metagenome/images/abund_rel_phyl_global.svg", plot = global_rel_phyl_plot, width = 6.5, height = 3.65, units = "in", dpi = 400)
global_rel_phyl_plot

Filter OTUS tables with absolute abundance by Zone

Make the objects

# Absolute abundance
abs_phyl_global_df<-psmelt(phylum_global) # convert to data frame

# Filter OTU table
abs_phyl_global_df_E <- abs_phyl_global_df %>%
  filter(abs_phyl_global_df$Zone == "Epipelagic") %>%
  select(OTU, Sample, Abundance) %>%
  pivot_wider(names_from = OTU, values_from = Abundance)

abs_phyl_global_df_M <- abs_phyl_global_df %>%
  filter(abs_phyl_global_df$Zone == "Mesopelagic")%>%
  select(OTU, Sample, Abundance) %>%
  pivot_wider(names_from = OTU, values_from = Abundance)

abs_phyl_global_df_A <- abs_phyl_global_df %>%
  filter(abs_phyl_global_df$Zone == "Abyssopelagic")%>%
  select(OTU, Sample, Abundance) %>%
  pivot_wider(names_from = OTU, values_from = Abundance)

Create the otutable directory

mkdir otutable

Save otu tables without row names.

#write.csv(abs_phyl_global_df_E,paste("~/dc_workshop/pocean_metagenome/Overrepresentation_analysis/otutable/","Epipelagic",".csv",sep=""), row.names= FALSE)
#write.csv(abs_phyl_global_df_M,paste("~/dc_workshop/pocean_metagenome/Overrepresentation_analysis/otutable/","Mesopelagic",".csv",sep=""), row.names= FALSE)
#write.csv(abs_phyl_global_df_A,paste("~/dc_workshop/pocean_metagenome/Overrepresentation_analysis/otutable/","Abyssopelagic",".csv",sep=""), row.names= FALSE)

Clusters

Calculate the ordinate objet with Jaccard distance.

meta.ord <- ordinate(physeq = relative_phylum_global, method = "NMDS", 
                     distance = "jaccard")

## Run 0 stress 0 
## Run 1 stress 0 
## ... Procrustes: rmse 0.1116033  max resid 0.1575744 
## Run 2 stress 0 
## ... Procrustes: rmse 0.09833171  max resid 0.1393525 
## Run 3 stress 9.802441e-05 
## ... Procrustes: rmse 0.1344138  max resid 0.2384963 
## Run 4 stress 9.966834e-05 
## ... Procrustes: rmse 0.1466619  max resid 0.249883 
## Run 5 stress 0.1557467 
## Run 6 stress 0 
## ... Procrustes: rmse 0.1343425  max resid 0.1768282 
## Run 7 stress 0.0002896358 
## ... Procrustes: rmse 0.1709236  max resid 0.3062635 
## Run 8 stress 0.1557467 
## Run 9 stress 0 
## ... Procrustes: rmse 0.05797811  max resid 0.09812407 
## Run 10 stress 0.1557467 
## Run 11 stress 0 
## ... Procrustes: rmse 0.09930643  max resid 0.1464909 
## Run 12 stress 0 
## ... Procrustes: rmse 0.1150026  max resid 0.1439802 
## Run 13 stress 9.84357e-05 
## ... Procrustes: rmse 0.1054029  max resid 0.2141803 
## Run 14 stress 0 
## ... Procrustes: rmse 0.1734049  max resid 0.227205 
## Run 15 stress 0.2761342 
## Run 16 stress 8.980857e-05 
## ... Procrustes: rmse 0.1599885  max resid 0.2219469 
## Run 17 stress 0 
## ... Procrustes: rmse 0.1090451  max resid 0.1326152 
## Run 18 stress 9.992838e-05 
## ... Procrustes: rmse 0.1050076  max resid 0.210506 
## Run 19 stress 0 
## ... Procrustes: rmse 0.1207826  max resid 0.1558563 
## Run 20 stress 9.986879e-05 
## ... Procrustes: rmse 0.1111995  max resid 0.2149204 
## *** No convergence -- monoMDS stopping criteria:
##      1: no. of iterations >= maxit
##     15: stress < smin
##      1: stress ratio > sratmax
##      3: scale factor of the gradient < sfgrmin

Plot the clusters.

rel_phylum_cluster_Jaccard <- plot_ordination(physeq = relative_phylum_global, ordination = meta.ord, color="Zone")+
  geom_text(aes(label=Samples), size=3)

ggsave("cluster_jaccard.svg", plot = rel_phylum_cluster_Jaccard, width = 6.5, height = 3.65, units = "in", dpi = 400)
rel_phylum_cluster_Jaccard

Compare clusters with different methods.

dist = "jaccard" # especify distance
ord_meths = c("DCA", "CCA", "RDA", "NMDS", "MDS", "PCoA") #vector of methods names
plist = llply(as.list(ord_meths), function(i, physeq, dist){
  ordi = ordinate(physeq, method=i, distance=dist)
  plot_ordination(physeq, ordi, "Samples", color="Zone")
}, physeq=relative_phylum_global, dist)

## Run 0 stress 0 
## Run 1 stress 0 
## ... Procrustes: rmse 0.09862521  max resid 0.1270772 
## Run 2 stress 0 
## ... Procrustes: rmse 0.09227392  max resid 0.1138774 
## Run 3 stress 0 
## ... Procrustes: rmse 0.1242844  max resid 0.197133 
## Run 4 stress 9.443781e-05 
## ... Procrustes: rmse 0.1147886  max resid 0.2212059 
## Run 5 stress 0 
## ... Procrustes: rmse 0.1249011  max resid 0.172647 
## Run 6 stress 0.2761337 
## Run 7 stress 0 
## ... Procrustes: rmse 0.1212303  max resid 0.1632644 
## Run 8 stress 0 
## ... Procrustes: rmse 0.1539816  max resid 0.2022742 
## Run 9 stress 0.2269841 
## Run 10 stress 0.001590993 
## Run 11 stress 1.992066e-05 
## ... Procrustes: rmse 0.1999354  max resid 0.2476482 
## Run 12 stress 9.915231e-05 
## ... Procrustes: rmse 0.1371223  max resid 0.23079 
## Run 13 stress 0 
## ... Procrustes: rmse 0.08179261  max resid 0.09947291 
## Run 14 stress 0.0002141331 
## ... Procrustes: rmse 0.1832949  max resid 0.2824196 
## Run 15 stress 0.1557467 
## Run 16 stress 0 
## ... Procrustes: rmse 0.09299676  max resid 0.1300814 
## Run 17 stress 1.506566e-05 
## ... Procrustes: rmse 0.1659642  max resid 0.2325868 
## Run 18 stress 9.762141e-05 
## ... Procrustes: rmse 0.1572659  max resid 0.2592858 
## Run 19 stress 1.029677e-05 
## ... Procrustes: rmse 0.1104634  max resid 0.2153393 
## Run 20 stress 0.2181459 
## *** No convergence -- monoMDS stopping criteria:
##      2: no. of iterations >= maxit
##     14: stress < smin
##      2: stress ratio > sratmax
##      2: scale factor of the gradient < sfgrmin

names(plist) <- ord_meths

#Dataframe for plots
pdataframe = ldply(plist, function(x){
  df = x$data[, 1:2]
  colnames(df) = c("Axis_1", "Axis_2")
  return(cbind(df, x$data))
})
names(pdataframe)[1] = "method"


#Plots with all methods
p = ggplot(pdataframe, aes(Axis_1, Axis_2, color=Zone, shape=Zone))
p = p + geom_point(size=2) + geom_text(aes(label=Samples), size=2,vjust = "inward", hjust = "inward")
p = p + facet_wrap(~method, scales="free")
p = p + scale_fill_brewer(type="qual", palette="Set1")
p = p + scale_colour_brewer(type="qual", palette="Set1") 
p= p + labs(x="", y="")
p

Clusters with all methods

Cluster analysis using `cluster` and `factoExtra` packages.

Filter the data set by samples and Phylum.

abs_filter <- daply(
  .data = rel_phyl_global_df,
  .variables = c("Samples", "Phylum"),
  .fun = function(x) sum(x$Abundance)
)
abs_filter <- as.data.frame(abs_filter)
abs_filter[is.na(abs_filter)] = 0

Compute the distance matrix with pearson distance.

#Computing correlation based distance: pearson, spearman or kendall
dist_corr <- get_dist(abs_filter, method = "pearson")

Plot distance matrix.

Heatmap of distance matrix

Clusters with different methods

set.seed(12345)
hc_single <- hclust(d=dist_corr, method = "single")
hc_average <- hclust(d=dist_corr, method = "average")
hc_complete <- hclust(d=dist_corr, method = "complete")
hc_ward <- hclust(d=dist_corr, method = "ward.D2")
hc_median <- hclust(d=dist_corr, method = "median")
hc_centroid <- hclust(d=dist_corr, method = "centroid")

Plot clusters with different methods.

Cluster with average method

Cluster with ward method

Cluster with single method

Cluster with complete method

Cluster with median method

Cluster with centroid method

Compare methods.

##   average    single  complete   ward.D2    median  centroid 
## 0.9876304 0.9876253 0.9876278 0.9876183 0.9876304 0.9876304

PCA proyection.

Clusters and PCA

Groups obtained.

grupos_average <-sort(cutree(hc_average, 3))
grupos_average

## SRR5788415 SRR5788416 SRR5788417 SRR5788420 SRR5788421 SRR5788422 
##          1          1          1          2          2          3

Corplot

cor_base <- cor(abs_filter, use="complete.obs", method = "spearman")

cor.mtest <- function(mat, ...) {
  mat <- as.matrix(mat)
  n <- ncol(mat)
  p.mat<- matrix(NA, n, n)
  diag(p.mat) <- 0
  for (i in 1:(n - 1)) {
    for (j in (i + 1):n) {
      tmp <- cor.test(mat[, i], mat[, j], ...)
      p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
    }
  }
  colnames(p.mat) <- rownames(p.mat) <- colnames(mat)
  p.mat
}
# matrix of the p-value of the correlation
p.mat <- cor.mtest(cor_base)

corr_plot <- corrplot(cor_base, type = "upper", order = "hclust", tl.col = "black", tl.srt = 45,
         p.mat = p.mat, sig.level = 0.05)

To do

Follow the instructions on https://github.com/microbma/earthnetwork/ with the filter OTU tables.

References

[1] Ma, B., Wang, Y., Ye, S. et al. Earth microbial co-occurrence network reveals interconnection pattern across microbiomes. Microbiome 8, 82 (2020). https://doi.org/10.1186/s40168-020-00857-2

[2] C. Zirion. Zamia-bacterial-BGCs. https://czirion.github.io/Zamia-bacterial-BGCs/

[3] Data processing and visualization for metagenomics. https://carpentries-incubator.github.io/metagenomics/