BBobs

Bacterial diversity associated to a coastal tidal front

This is a reproductible workflow of the 16S amplicon analysis of the paper “Linking Spatial and temporal dynamic of bacterioplankton communities with ecological strategies across a coastal frontal area”_

Raw data files can be accessed here

Table of content

I. From raw data to OTU table

  1. Quality-filtering and merging
  2. Clustering using Swarm
  3. Chimera detection and removal
  4. Taxonomic assignation
  5. Prepare tables

II. Beta-diversity and statistics

  1. Normalisation
  2. NMDS
  3. PERMANOVA
  4. PCA

III. Network analysis

  1. Matrix filtration
  2. Covariance estimation using SPIEC-EASI
  3. Module detection with Louvain algorithm
  4. Calculate module eigengenes and correlations with the environment
  5. Modules relative abundance across the samples
  6. Relative abundance of the main families across the modules
  7. Calcul of CV of each OTU

I. From raw data to OTU table

Quality-filtering and merging

All these first steps were done using Illumina utils scripts written by Meren (Eren et al., 2013)

Quality filtering

cd /raw_data

# Create a qual_file

ls *.fastq | \
awk 'BEGIN{FS="_R"}{print $1}' | \
uniq | \
awk 'BEGIN{print "sample\tr1\tr2"}
     {print $0 "\t" $0 "_R1.fastq\t" $0"_R2.fastq"}' > qual_file
     
# Create a merge file

ls *.fastq | \
awk 'BEGIN{FS="_R"}{print $1}' | \
uniq | \
awk 'BEGIN{print "sample\tr1\tr2"}
     {print $0 "\t" $0 "-QUALITY_PASSED_R1.fastq\t" $0"-QUALITY_PASSED_R2.fastq"}' > merge_file

# Generate a config file (.ini) as follow
iu-gen-configs qual_file --r1-prefix "^.....CCAGCAGC[C,T]GCGGTAA." --r2-prefix "^CCGTC[A,T]ATT[C,T].TTT[A,G]A.T"

# create a loop file boucle_quality.sh 
for file in *.ini
do iu-filter-quality-minoche $file
done

# activate it 
chmod 755 boucle_quality.sh

# run it
./boucle_quality.sh

Merging

cd /raw_data

# Generate a config file (.ini) as follow
iu-gen-configs merge_file --r1-prefix "^.....CCAGCAGC[C,T]GCGGTAA." --r2-prefix "^CCGTC[A,T]ATT[C,T].TTT[A,G]A.T"

# Create a loop file boucle_merge.sh as follow :
for file in *.ini
do iu-merge-pairs $file
done

# activate it 
chmod 755 boucle_merge.sh

# run it
./boucle_merge.sh


Clustering using Swarm

Clustering of sequences in OTUs was done using Swarm algorithm (Mahé et al., 2014)

Dereplicate the sequences

cd /raw_data

# Using derep function in Vsearch-2.3.4

# Create a loop boucle_derep.sh
for file in *_MERGED
do vsearch --derep_fulllength  $file  --sizeout --relabel_sha1 --fasta_width 0 --output ./$file.fasta
done

# Defline needs to be change for downstream analysis. Replace "size=" by "_"
for file in *.fasta
do sed 's/;size=/_/' $file > s_$file
done

# Move s_* files in a new folder

mkdir s_fasta
mv s_* ./s_fasta
cd ./s_fasta

# Create a fasta of dereplicated sequences for all the samples 

export LC_ALL=C
cat *.fasta| \
awk 'BEGIN {RS = ">" ; FS = "[_\n]"}
     {if (NR != 1) {abundances[$1] += $2 ; sequences[$1] = $3}}
     END {for (amplicon in sequences) {
     print ">" amplicon "_" abundances[amplicon] "_" sequences[amplicon]}}' | \
sort --temporary-directory=$(pwd) -t "_" -k2,2nr -k1.2,1d | \
sed -e 's/\_/\n/2' > M2BiPAT_derep.fa

# Create an amplicon contingency table

awk 'BEGIN {FS = "[>_]"}

     # Parse the sample files
     /^>/ {contingency[$2][FILENAME] = $3
           amplicons[$2] += $3
           if (FNR == 1) {
               samples[++i] = FILENAME
           }
          }

     END {# Create table header
          printf "amplicon"
          s = length(samples)
          for (i = 1; i <= s; i++) {
              printf "\t%s", samples[i]
          }
          printf "\t%s\n", "total"

          # Sort amplicons by decreasing total abundance (use a coprocess)
          command = "LC_ALL=C sort -k1,1nr -k2,2d"
          for (amplicon in amplicons) {
               printf "%d\t%s\n", amplicons[amplicon], amplicon |& command
          }
          close(command, "to")
          FS = "\t"
          while ((command |& getline) > 0) {
              amplicons_sorted[++j] = $2
          }
          close(command)

          # Print the amplicon occurrences in the different samples
          n = length(amplicons_sorted)
          for (i = 1; i <= n; i++) {
               amplicon = amplicons_sorted[i]
               printf "%s", amplicon
               for (j = 1; j <= s; j++) {
                   printf "\t%d", contingency[amplicon][samples[j]]
               }
               printf "\t%d\n", amplicons[amplicon]
          }}' *.fasta > M2BiPAT_amplicon_contingency_table.csv

Perform Swarm

# Using Swarm-2.2.2

swarm M2BiPAT_derep.fa -d 1 -f -t 8 -w M2BiPAT_OTU_rep -s M2BiPAT_amplicons.stats -o M2BiPAT_amplicons.swarms

# Create an OTU contingency table

STATS="M2BiPAT_amplicons.stats"
SWARMS="M2BiPAT_amplicons.swarms"
AMPLICON_TABLE="M2BiPAT_amplicon_contingency_table.csv"
OTU_TABLE="M2BiPAT_OTU_contingency_table.csv"

# Header
echo -e "OTU\t$(head -n 1 "${AMPLICON_TABLE}")" > "${OTU_TABLE}"

# Compute "per sample abundance" for each OTU
awk -v SWARM="${SWARMS}" \
    -v TABLE="${AMPLICON_TABLE}" \
    'BEGIN {FS = " "
            while ((getline < SWARM) > 0) {
                swarms[$1] = $0
            }
            FS = "\t"
            while ((getline < TABLE) > 0) {
                table[$1] = $0
            }
           }

     {# Parse the stat file (OTUs sorted by decreasing abundance)
      seed = $3 "_" $4
      n = split(swarms[seed], OTU, "[ _]")
      for (i = 1; i < n; i = i + 2) {
          s = split(table[OTU[i]], abundances, "\t")
          for (j = 1; j < s; j++) {
              samples[j] += abundances[j+1]
          }
      }
      printf "%s\t%s", NR, $3
      for (j = 1; j < s; j++) {
          printf "\t%s", samples[j]
      }
     printf "\n"
     delete samples
     }' "${STATS}" >> "${OTU_TABLE}"

Chimera detection and removal


cd /s_fasta

# Again, have to change the defline

sed 's/;_/;size=/' M2BiPAT_OTU_rep > v_M2BiPAT_OTU_rep 

sed '/^>/ s/$/;/' v_M2BiPAT_OTU_rep  > v2_M2BiPAT_OTU_rep 

# Chimera detection must be done in screen

vsearch --uchime_denovo v2_M2BiPAT_OTU_rep --chimeras M2BiPAT_chimeras.fa --nonchimeras M2BiPAT_OTU_rep_whithout_chimera.fa

# Remove chimera sequences from OTU contingency table

grep '>' M2BiPAT_chimeras.fa > chimera_names_M2BiPAT
sed 's/;size=[0-9]*//' chimera_names_M2BiPAT > chimera_the_file
sed 's/>//' chimera_the_file > chimera

awk > M2BiPAT_OTU_table.txt 'NR==FNR{arr[$0];next}!($2 in arr)' FS="\t" chimera M2BiPAT_OTU_contingency_table.csv 

Taxonomic assignation


# done with mothur v.1.35.1

# Using Silva v123 trimmed for v4-v5 region

mothur > classify.seqs(fasta=> M2BiPAT_OTU_rep_whithout_chimera.fa, template= Silva123_V4_V5_trimmed.fasta, taxonomy=SIlva123_slv_taxonomy.txt.nds, processors=8)

Prepare tables


# load the different tables, 
the metadata table is available as "supplementary_data_2.csv" in the paper

OTU_table <- read.table("M2BiPAT_OTU_table.txt", header = T, sep = "\t")
metadata <- read.table("Metadata_M2BiPAT.csv", header = T, sep = ";")
taxonomy <- read.table("M2BiPAT.txt.wang.taxonomy", header = T, sep = "\t")
colnames(taxonomy) <- c("amplicon","taxonomy")
taxonomy$amplicon <- gsub('_[0-9]*','', taxonomy$amplicon)

# Add taxonomy to the OTU_table
OTU_table_tax <- merge(OTU_table, taxonomy, by="amplicon") 
OTU_table_tax_ordered <- OTU_table_tax[order(OTU_table_tax$total, decreasing = T),]

# Remove unwanted taxa
OTU_table_tax_ordered <- OTU_table_tax_ordered[grep("Eukaryota|Archaea|Mitochondria|Chloroplast", OTU_table_tax_ordered$taxonomy, invert =T),]


# good OTU_table
OTU_table <- OTU_table_tax_ordered[,-c(1,189,190)]
rownames(OTU_table) <- OTU_table$OTU 
OTU_table <- OTU_table[,colnames(OTU_table) %in% rownames(metadata)]
write.table(OTU_table, file = "./M2BiPAT_OTU_table.txt", sep= "\t")

# good taxonomy file
taxOTUs <- OTU_table_tax_ordered[,c(2,190)]
write.table(taxOTUs, file = "./M2BiPAT_taxOTUs.txt", sep= "\t")

II. Beta-diversity and statistics

Normalisation


# Normalisation with DESeq package

library(DESeq)

Matrice <- read.delim("M2BiPAT_OTU_table.txt", header=T, sep="\t", row.names=1)

triceMa <- t(Matrice)
triceMa <- cbind(sample = rownames(triceMa), triceMa)
triceMa <- triceMa[,1]
mat <- newCountDataSet(Matrice, triceMa)

mat <- estimateSizeFactors(mat)
sizeFactors(mat)
mat <- counts(mat, normalized=TRUE)

write.table(mat, file = "M2BiPAT_OTU_DESeq.txt", sep= "\t", row.names=T)


# Create a table of relative abundance
OTU <- read.table("M2BiPAT_OTU_DESeq.txt", header=T, row.names=1)

genre_table <- as.matrix(t(OTU))

s_num<-dim(genre_table)[2]
otu.perc<-matrix(rep("NA",times=(dim(genre_table)[1]*s_num)),nrow=dim(genre_table)[1],ncol=s_num)
rownames(otu.perc)<-rownames(genre_table)
colnames(otu.perc)=colnames(genre_table) 
totals<-colSums(genre_table)

for(s in c(1:s_num)){
  vec<-(genre_table[,s]/totals[s])*100
  otu.perc[,s]<-vec
  rm(vec)
}

write.table(otu.perc, "M2BiPAT_OTU_percent.txt", col.names = NA, row.names = T, sep = "\t")

NMDS


library(vegan)
library(ggplot2)

metadata<- read.table("../data/Metadata_M2BiPAT.csv", header = T, sep=";", row.names=1)
Matrice <- read.table("M2BiPAT_OTU_DESeq.txt", header=T, row.names=1)
Matrice <- t(Matrice)

metadata <- metadata[ order(row.names(metadata)), ]
Matrice <- Matrice[ order(row.names(Matrice)), ]
Matrice <- Matrice[rownames(Matrice) %in% rownames(metadata),]
metadata <- metadata[rownames(metadata) %in% rownames(Matrice),]

bray <- vegdist(Matrice, method="bray")
nmds <- metaMDS(bray, wascore = F)

data.score <- as.data.frame(scores(nmds))
data.score$site <- rownames(data.score)

data.score$Mois <- metadata$Mois
data.score$Prof <- metadata$prof
data.score$Station <- metadata$station
data.score <- na.omit(data.score)

ggplot(data=data.score, aes(x=NMDS1, y=NMDS2, size=as.numeric(Prof), shape=Station)) +
  geom_point(data=data.score, aes(size=Prof, fill= Mois)) +
  scale_fill_manual(values=c("#8D68B0","#FFD062","#8FBC8F","#779ECB"))+
  scale_shape_manual(values = c(25,24,21,22,23)) +
  scale_size_manual(values = c(4,6,8))

PERMANOVA


# Test if depth significantly differentiate different communities for each stations of the summer cruises.

metadata_P <- metadata[,c(1,3,7,12)]
metadata_P <- na.omit(metadata_P)

Matrice_P <- Matrice[rownames(Matrice) %in% rownames(metadata_P),]

metadata_Summer <- metadata_P[grep("2015_March", metadata$Mois, invert=T),]
Matrice_Summer <- Matrice_P[rownames(Matrice_P) %in% rownames(metadata_Summer),]

meta1<- metadata_Summer[grep("Station 5", metadata_Summer$station),]

Matrice <- Matrice_Summer[rownames(Matrice_Summer) %in% rownames(meta1),]

bray <- vegdist(Matrice, method="bray")

adonis(formula = bray ~ prof , 
       data = meta1,
       permutations = 999, 
       method = "bray", 
       binary = TRUE)

PCA


library(vegan)
library(ggbiplot)

metadata<- read.csv("../data/Metadata_M2BiPAT.csv", header = T, sep=";", row.names=1)

df <- metadata[,c(3,7,12,16,22,23,24,29,30,31,32)]
df <- na.omit(df)

log.ir <- log(df[,c(4,5,6,7,8,9,10,11)])
log.ir <- log(df[,c(4:11)]+1)
ir.species <- df[,1]
ir <- df[,2]

ir.pca <- prcomp(log.ir, 
                 scale. = TRUE, center=TRUE) 


g <- ggbiplot(ir.pca, groups=ir.species) 

print(g)

III. Network analysis


#Required packages 

library(phyloseq)
library(SpiecEasi)
library(Matrix)
library(reshape2)
library(scales)
library(tidygraph)
library(tidyverse)
library(igraph)
library(SpiecEasi)
library(phyloseq)
source("https://raw.githubusercontent.com/genomewalker/osd2014_analysis/master/osd2014_16S_asv/lib/graph_lib.R")
library(WGCNA)
library(tidyverse)
library(broom)

Matrix filtration


# Load abundance matrix, metadata and taxonomy file
MatricePhylo <- read.delim("M2BiPAT_OTU_table.txt", header=T, sep="\t", row.names=1)
metadata<- read.csv("Metadata_M2BiPAT.csv", row.names=1, header = T, sep=";")
df <- t(MatricePhylo)
taxmat <- read.csv("M2BiPAT_Taxotu_separated.csv", header=T, row.names=1, sep=";")
taxmat <- taxmat[rownames(taxmat) %in% rownames(Matrice_M2BiPAT),]
taxmat <- as.matrix(taxmat)

# Phyloseq object
OTU = otu_table(t(Matrice_M2BiPAT), taxa_are_rows = TRUE)
TAX = tax_table(taxmat)
sampledata = sample_data(metadata)
Phymatrice = phyloseq(OTU,TAX, sampledata)

# Filter OTUs that have 50 sequences in at least 3 samples
subset_3samples = genefilter_sample(Phymatrice, filterfun_sample(function(x) x >= 50), A=3)
Phymatrice2 = prune_taxa(subset_3samples, Phymatrice)
ntaxa(Phymatrice2)
nsamples(Phymatrice2)

# Print the filtered matrix
otus <- otu_table(Phymatrice2)
otus <- as.data.frame(otus)

write.table(otus, file="Phymatrice_M2BiPAT_50seqs3samples_brut.txt", sep="\t")

Covariance estimation using SPIEC-EASI


load("Phymatrice2")
load("Phymatrice_M2BiPAT_50seqs3samples_brut.txt")
mytaxonomy <- read.csv("mytaxonomy.csv", header=T, sep=";") # A table where the taxonomy levels are splitted in different columns 

# Perform SPIEC-EASI
M2BiPAT.OSD.ctl.gl <- spiec.easi(Phymatrice2, method="glasso", lambda.min.ratio=0.01, nlambda=30, icov.select.params = list(rep.num=30, ncores=16))

# Export SPIEC-EASI correlation matrix and model 
secor <- cov2cor(forceSymmetric(getOptCov(OSD.ctl.gl), ifelse(sum(Matrix::tril(getOptCov(OSD.ctl.gl)))>sum(Matrix::triu(getOptCov(OSD.ctl.gl))), 'L', 'U')))
refit <- forceSymmetric(getRefit(OSD.ctl.gl), ifelse(sum(Matrix::tril(getRefit(OSD.ctl.gl)))>sum(Matrix::triu(getRefit(OSD.ctl.gl))), 'L', 'U'))
OSD.ctl.gl_net <- adj2igraph(as.matrix(secor*refit),  vertex.attr=list(name=taxa_names(PhyMatrice)))
E(OSD.ctl.gl_net)$weight %>% hist

graph.density(OSD.ctl.gl_net)
OSD.ctl.gl$est$sparsity[getOptInd(OSD.ctl.gl)]

g <- OSD.ctl.gl_net %>%
  as_tbl_graph()

Module detection with Louvain algorithm


# Determine communities with Louvain algorithm
community_bin <- file.path("~/Desktop/Network_Antonio/deterministic_louvain//community ")
convert_bin <- file.path("~/Desktop/Network_Antonio/deterministic_louvain//convert")
hierarchy_bin <- file.path("~/Desktop/Network_Antonio/deterministic_louvain//hierarchy -n")

community_options <- "725 -l -1 -v" #14
g_louvain <- run_louvain(X = "g", graph = g, community_bin = community_bin, convert_bin = convert_bin, hierarchy_bin = hierarchy_bin, community_options = community_options )

com_l_0 <- make_clusters(g, as.numeric(g_louvain[[1]]), algorithm = 'Louvain', modularity = TRUE)
com_l_1 <- make_clusters(g, as.numeric(g_louvain[[2]]), algorithm = 'Louvain', modularity = TRUE)
com_l_2 <- make_clusters(g, as.numeric(g_louvain[[3]]), algorithm = 'Louvain', modularity = TRUE)

V(g)$com <-  as.character(membership(com_l_1))

counts <- dplyr::as_data_frame(g) %>%
  group_by(com) %>%
  dplyr::count() %>%
  dplyr::rename(n_mem = n)

g <- g %>%
  left_join(counts)

g.c <- contract.vertices(as.igraph(g), V(as.igraph(g))$com,
                         vertex.attr.comb=list(name= function(x) paste(x, collapse="|"),
                                               com= function(x) unique(x),
                                               n_mem = function(x) unique(x))
)

g.c <- igraph::simplify(g.c, edge.attr.comb = list(weight = function(x) median(x)), remove.loops = TRUE)

g <- g %>% activate(nodes) %>% mutate(com = as.character(paste("com", com, sep = "_")))
g.c <- as_tbl_graph(g.c) %>% activate(nodes) %>% mutate(com = as.character(paste("com", com, sep = "_")))

g.c <- as_tbl_graph(g.c) %>%
  activate(edges) %>%
  mutate(sign = ifelse(weight < 0, "NEG", "POS"),
         weight_orig = weight,
         weight = abs(weight))

g <- as_tbl_graph(g) %>%
  activate(edges) %>%
  mutate(sign = ifelse(weight < 0, "NEG", "POS"),
         weight_orig = weight,
         weight = abs(weight))

write.graph(g, "./OSD.ctl.gl_net.graphml", format = "graphml")
write.graph(g.c, "./OSD.ctl.gl_net_aggregated.graphml", format = "graphml")

write_tsv(g %>% activate("nodes") %>% as_tibble(), "~/Downloads/OSD.ctl.gl_net.tsv")

Calculate module eigengenes and correlations with the environment


MatriceF <- read.delim("M2BiPAT_OTU_percent.txt", header=T, sep="\t", row.names=1)
Matrice <- read.table("Phymatrice_50seqs3samples_brut.txt", header=T, sep="\t", row.names=1)
MatriceF <- MatriceF[rownames(MatriceF) %in% rownames(Matrice),]

df <- t(MatriceF)

mat <- Matrice_M2BiPAT
df_nodes <- dplyr::as_data_frame(g %>% activate(nodes))
colnames(mat) <- df_nodes$name
PCs <- WGCNA::moduleEigengenes(mat, colors=df_nodes$com)
ME <- PCs$eigengenes
rownames(ME) <- rownames(mat)

metadata <- as(metadata_M2BiPAT, "matrix") %>% tbl_df()

datExpr = Matrice_M2BiPAT
datTraits = metadata_M2BiPAT[,c(16:56)]

sampleTree2 = hclust(dist(datExpr), method = "average")
traitColors = numbers2colors(datTraits, signed = FALSE) 
plotDendroAndColors(sampleTree2, traitColors,
                    groupLabels = names(datTraits),
                    main = "Sample dendrogram and trait heatmap")

# Define numbers of genes and samples
nGenes = ncol(datExpr);
nSamples = nrow(datExpr);

# Recalculate MEs with color labels
MEs0 = ME
MEs = orderMEs(MEs0)
moduleTraitCor = cor(MEs, datTraits, use = "p");
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)

sizeGrWindow(10,6)

# Will display correlations and their p-values
textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
                   signif(moduleTraitPvalue, 1), ")", sep = "");

dim(textMatrix) = dim(moduleTraitCor)
par(mar = c(6, 8.5, 3, 3));

# Display the correlation values within a heatmap plot
labeledHeatmap(Matrix = moduleTraitCor,
               xLabels = names(datTraits),
               yLabels = names(MEs),
               ySymbols = names(MEs),
               colorLabels = FALSE,
               colors = blueWhiteRed(50),
               textMatrix = textMatrix,
               setStdMargins = FALSE,
               cex.text = 0.5,
               zlim = c(-1,1),
               main = paste("Module-trait relationships"))

Modules relative abundance across the samples

taxmat = read.table(file="mytaxonomy.csv", header=T, row.names=1, sep=";")
rownames(taxmat) <- paste("OTU", rownames(taxmat), sep="")

annot = taxmat
dim(annot)
names(annot)
probes = colnames(datExpr)
probes2annot = match(probes, rownames(annot))

moduleColors <- df_nodes$com

geneInfo0 = data.frame(OTUs = probes,
                       Phylum = annot$Phylum[probes2annot],
                       Class = annot$Class[probes2annot],
                       Order = annot$Order[probes2annot],
                       Family = annot$Family[probes2annot],
                       Genus = annot$Genus[probes2annot],
                       moduleColor = moduleColors)

geneInfo0 = data.frame(OTUs = probes,
                       taxonomy = annot$taxonomy[probes2annot],
                       moduleColor = moduleColors)

table_module <- geneInfo0[,c(1:7)]
table_perc <- t(Matrice_M2BiPAT)
table_perc <- as.data.frame(table_perc)
table_perc$tot <- rowSums(table_perc)
table_perc <- cbind(OTUs = rownames(table_perc), table_perc)
abond <- table_perc[,c(1,136)]
abond <- as.data.frame(abond)

table_perc <- t(table_perc)
table_perc <- table_perc[-c(1,136),]

perc_module <- merge(table_module, abond, by = "OTUs")

module_com_1 <- perc_module[perc_module$moduleColor == 'com_1',]
module_com_2 <- perc_module[perc_module$moduleColor == 'com_2',]
module_com_3 <- perc_module[perc_module$moduleColor == 'com_3',]
module_com_4 <- perc_module[perc_module$moduleColor == 'com_4',]
module_com_5 <- perc_module[perc_module$moduleColor == 'com_5',]
module_com_6 <- perc_module[perc_module$moduleColor == 'com_6',]
module_com_7 <- perc_module[perc_module$moduleColor == 'com_7',]
module_com_8 <- perc_module[perc_module$moduleColor == 'com_8',]
module_com_9 <- perc_module[perc_module$moduleColor == 'com_9',]
module_com_10 <- perc_module[perc_module$moduleColor == 'com_10',]
module_com_11 <- perc_module[perc_module$moduleColor == 'com_11',]
module_com_12 <- perc_module[perc_module$moduleColor == 'com_12',]
module_com_13 <- perc_module[perc_module$moduleColor == 'com_13',]
module_com_14 <- perc_module[perc_module$moduleColor == 'com_14',]

write.table(table_perc, "table_perc.txt", sep="\t")

table_perc <- read.table("table_perc.txt", sep="\t", header=T, row.names=1)

Syneccho <- data.frame(com_1 = apply(table_perc[,colnames(table_perc) %in% module_com_1$OTUs], 1, sum),
                       com_2 = apply(table_perc[,colnames(table_perc) %in% module_com_2$OTUs], 1, sum),
                       com_3 = apply(table_perc[,colnames(table_perc) %in% module_com_3$OTUs], 1, sum),
                       com_4 = apply(table_perc[,colnames(table_perc) %in% module_com_4$OTUs], 1, sum),
                       com_5 = apply(table_perc[,colnames(table_perc) %in% module_com_5$OTUs], 1, sum),
                       com_6 = apply(table_perc[,colnames(table_perc) %in% module_com_6$OTUs], 1, sum),
                       com_7 = apply(table_perc[,colnames(table_perc) %in% module_com_7$OTUs], 1, sum),
                       com_8 = apply(table_perc[,colnames(table_perc) %in% module_com_8$OTUs], 1, sum),
                       com_9 = apply(table_perc[,colnames(table_perc) %in% module_com_9$OTUs], 1, sum),
                       com_10 = apply(table_perc[,colnames(table_perc) %in% module_com_10$OTUs], 1, sum),
                       com_11 = apply(table_perc[,colnames(table_perc) %in% module_com_11$OTUs], 1, sum),
                       com_12 = apply(table_perc[,colnames(table_perc) %in% module_com_12$OTUs], 1, sum),
                       com_13 = apply(table_perc[,colnames(table_perc) %in% module_com_13$OTUs], 1, sum),
                       com_14 = apply(table_perc[,colnames(table_perc) %in% module_com_14$OTUs], 1, sum))

df <- cbind(samples = rownames(Syneccho), Syneccho)
df <- cbind(Date = metadata_M2BiPAT$Mois, df)
df <- cbind(Station = metadata_M2BiPAT$Station, df)
df <- cbind(Prof = metadata_M2BiPAT$prof, df)

df_m <- melt(df, id.vars=c("samples", "Date","Station","Prof"))

df_m$Station <- factor(df_m$Station, levels = c("Large","Front","Centre","Noires", "StMat"))
df_m$Date <- factor(df_m$Date, levels = c("2014_September","2015_March","2015_July","2015_September"))
df_m$Prof <- factor(df_m$Prof, levels = c("bottom", "median", "surface"))

df_m <- na.omit(df_m)

mcast <- dcast(df_m, Prof + Station + Date ~ variable, mean )
df_mm <- melt(mcast, id.vars=c("Prof","Station",'Date'))

df_mm <- na.omit(df_mm)

ggplot(df_mm, aes(x=Prof, y = value, fill=variable)) +
  geom_bar(stat = "identity", position = "fill") +
  facet_grid(Date~Station, scales="free", space="free_x") +
  scale_fill_manual(values=c("#d73b3e","#ffa089","#00cc99","#ffdf80","#417dc1","#aa98a9","#d2b48c","#ff865a","#b2beb5","#bdb76b","#ffb6c1","#536878","#0a3d6e","#fad6a5")) +
  coord_flip() +
  #geom_text(aes(label=Prof), position=position_dodge(width=0.9), vjust=-0.25) +
  theme(strip.text.y = element_text(size = 10, angle = 0), axis.text.x = element_text(size = 12 , angle = 90), axis.text.y = element_text(size = 14)) 

Relative abundance of the main families across the modules


# Group otus at family level

Matrice = load_meta("M2BiPAT_OTU_table.txt", sep= "\t") 

# Charge table Taxa
taxmat = read.delim("M2BiPAT_Taxotu_separated.csv", sep = ";", stringsAsFactors=FALSE, row.names=1)
taxmat$Domain <- gsub("\\s*\\([^\\)]+\\)","", taxmat$Domain)
taxmat$Phylum <- gsub("\\s*\\([^\\)]+\\)","", taxmat$Phylum)
taxmat$Class <- gsub("\\s*\\([^\\)]+\\)","", taxmat$Class)
taxmat$Order <- gsub("\\s*\\([^\\)]+\\)","", taxmat$Order)
taxmat$Family <- gsub("\\s*\\([^\\)]+\\)","", taxmat$Family)
taxmat$Genus <- gsub("\\s*\\([^\\)]+\\)","", taxmat$Genus)

# AnnotatedDataFrame
OTUdata = AnnotatedDataFrame(taxmat)


# MRexperiment object

obj = newMRexperiment(Matrice$counts,featureData=OTUdata)

# Group by family
obj_agg = aggTax(obj, lvl = "Family", out = "matrix")

# Export
exportMat(obj_agg, file = "M2BiPAT_agg_Family.txt")

# Plot the relative abundance 
table <- perc_module[,c(5,7,8)]
table <- read.table("M2BiPAT_agg_Family.txt", header=T, row.names=1)
table <- table[,c(5,7,8)]
cast <- dcast(table, Family ~ moduleColor, sum) 
rownames(cast) <- cast$Family
cast <- cast[,-1]

genre_table <- as.matrix(cast)

s_num<-dim(genre_table)[2]
otu.perc<-matrix(rep("NA",times=(dim(genre_table)[1]*s_num)),nrow=dim(genre_table)[1],ncol=s_num)
rownames(otu.perc)<-rownames(genre_table)
colnames(otu.perc)=colnames(genre_table) 
totals<-colSums(genre_table)

for(s in c(1:s_num)){
  vec<-(genre_table[,s]/totals[s])*100
  otu.perc[,s]<-vec
  rm(vec)
}

df_m <- melt(otu.perc)
colnames(df_m) <- c("Family","moduleColor","value")

# Keep the 30 most abundant families
cast$tot <- rowSums(cast)
cast_order <- cast[order(cast$tot, decreasing=TRUE),]
cast <- cast_order[c(1:30),]
cast <- cast[-4,]
df_mm <- df_m[df_m$Family %in% row.names(cast),]

df_mm[df_mm == 0] <- NA
df_mm <- na.omit(df_mm)

write.table(df_mm, "family_community.txt", sep="\t")

df_mm <- read.table("family_community.txt", header=T, row.names=1)

df_mm$moduleColor <- factor(df_mm$moduleColor, levels = c("com_5","com_13","com_12","com_14","com_10","com_7","com_6","com_3","com_11","com_4","com_2","com_1","com_8","com_9"))
df_mm$Family <- factor(df_mm$Family, levels=rev(row.names(pouet)))
df_mm$value <- as.numeric(df_mm$value)

ggplot(df_mm, aes(x=moduleColor , y = Family), size=as.numeric(value), color=as.numeric(value)) +
  geom_point(aes(size=as.numeric(value)),color="#757272", position = "identity")+ 
  scale_size(limits = c(0,100), range=c(1,10), breaks=c(10,20,30,40,50,60,70,80,90)) +
  scale_color_gradient2(limits=c(0,100),low="#00ffff", mid="#383838",high="#f26846", midpoint = 45, na.value=NA)

Calcul of CV of each OTU


df_m <- melt(t(table_perc))
colnames(df_m) <- c("OTUs","sample","value")
  
# Calculate CV
  mcast <- aggregate( value ~ OTUs, df_m, mean )
  sdcast <- aggregate( value ~ OTUs, df_m, sd )
  sdcast <- sdcast[,-1]
  sdcast <- as.data.frame(sdcast)
  colnames(sdcast) <- paste("sd", colnames(sdcast), sep="_")
  mcast$sd <- sdcast$sd_sdcast
  mcast$CV <- (mcast$sd/mcast$value)*100

# Calcule occurence (>0.05) in samples 

M <- t(table_perc)
M150 <- M
M150[M150 >= 0.05] <- 1
M150[M150 < 0.05] <- 0
M150 <- as.data.frame(M150)
M150$Tot <- rowSums(M150)
M150$label[ M150$Tot < 20] <- "S"
M150$label[ M150$Tot > 65] <- "G"
M <- M150[,c(135,136)]
M$OTUs <- rownames(M)

OTU_mod <- perc_module[,c(1,7)]
CV_mod <- merge(mcast, OTU_mod, by="OTUs")
CV_abund <- merge(CV_mod, M, by="OTUs")
CV_f <- CV_abund[,c(1,4,5,6,7)]
lmObject <- lm(CV ~ Tot, data = CV_f)
mpi <- cbind(CV_f, predict(lmObject, interval = "prediction", level=0.5))
  
# See outliers 

plotPoints <- mpi[which(!(mpi$CV > mpi$lwr & mpi$CV < mpi$upr)),]
good <- merge(plotPoints, M, by="OTUs")

ggplot(data=plotPoints, aes(x = Tot, y = CV)) + geom_point()

# Plot the CV against the occurence

ggplot(CV_f, aes(x=Tot, y = CV, color=moduleColor)) +
    geom_point(aes(color=moduleColor)) +
    geom_smooth(method='lm', se= FALSE) +
    geom_ribbon(data=mpi, aes(ymin = lwr, ymax = upr), fill = "#36454f", alpha = 0.1) +
    scale_y_continuous(trans = "log") +
    scale_color_manual(values=c("#d73b3e","#bdb76b","#ffb6c1","#536878","#0a3d6e","#fad6a5","#ffa089","#00cc99","#ffdf80","#417dc1","#aa98a9","#d2b48c","#ff865a","#b2beb5"))