This is a reproductible workflow of the 16S amplicon analysis of the paper “Linking Spatial and temporal dynamic of bacterioplankton communities with ecological strategies across a coastal frontal area”_
Raw data files can be accessed here
II. Beta-diversity and statistics
III. Network analysis
All these first steps were done using Illumina utils scripts written by Meren (Eren et al., 2013)
cd /raw_data
# Create a qual_file
ls *.fastq | \
awk 'BEGIN{FS="_R"}{print $1}' | \
uniq | \
awk 'BEGIN{print "sample\tr1\tr2"}
{print $0 "\t" $0 "_R1.fastq\t" $0"_R2.fastq"}' > qual_file
# Create a merge file
ls *.fastq | \
awk 'BEGIN{FS="_R"}{print $1}' | \
uniq | \
awk 'BEGIN{print "sample\tr1\tr2"}
{print $0 "\t" $0 "-QUALITY_PASSED_R1.fastq\t" $0"-QUALITY_PASSED_R2.fastq"}' > merge_file
# Generate a config file (.ini) as follow
iu-gen-configs qual_file --r1-prefix "^.....CCAGCAGC[C,T]GCGGTAA." --r2-prefix "^CCGTC[A,T]ATT[C,T].TTT[A,G]A.T"
# create a loop file boucle_quality.sh
for file in *.ini
do iu-filter-quality-minoche $file
done
# activate it
chmod 755 boucle_quality.sh
# run it
./boucle_quality.sh
cd /raw_data
# Generate a config file (.ini) as follow
iu-gen-configs merge_file --r1-prefix "^.....CCAGCAGC[C,T]GCGGTAA." --r2-prefix "^CCGTC[A,T]ATT[C,T].TTT[A,G]A.T"
# Create a loop file boucle_merge.sh as follow :
for file in *.ini
do iu-merge-pairs $file
done
# activate it
chmod 755 boucle_merge.sh
# run it
./boucle_merge.sh
Clustering of sequences in OTUs was done using Swarm algorithm (Mahé et al., 2014)
cd /raw_data
# Using derep function in Vsearch-2.3.4
# Create a loop boucle_derep.sh
for file in *_MERGED
do vsearch --derep_fulllength $file --sizeout --relabel_sha1 --fasta_width 0 --output ./$file.fasta
done
# Defline needs to be change for downstream analysis. Replace "size=" by "_"
for file in *.fasta
do sed 's/;size=/_/' $file > s_$file
done
# Move s_* files in a new folder
mkdir s_fasta
mv s_* ./s_fasta
cd ./s_fasta
# Create a fasta of dereplicated sequences for all the samples
export LC_ALL=C
cat *.fasta| \
awk 'BEGIN {RS = ">" ; FS = "[_\n]"}
{if (NR != 1) {abundances[$1] += $2 ; sequences[$1] = $3}}
END {for (amplicon in sequences) {
print ">" amplicon "_" abundances[amplicon] "_" sequences[amplicon]}}' | \
sort --temporary-directory=$(pwd) -t "_" -k2,2nr -k1.2,1d | \
sed -e 's/\_/\n/2' > M2BiPAT_derep.fa
# Create an amplicon contingency table
awk 'BEGIN {FS = "[>_]"}
# Parse the sample files
/^>/ {contingency[$2][FILENAME] = $3
amplicons[$2] += $3
if (FNR == 1) {
samples[++i] = FILENAME
}
}
END {# Create table header
printf "amplicon"
s = length(samples)
for (i = 1; i <= s; i++) {
printf "\t%s", samples[i]
}
printf "\t%s\n", "total"
# Sort amplicons by decreasing total abundance (use a coprocess)
command = "LC_ALL=C sort -k1,1nr -k2,2d"
for (amplicon in amplicons) {
printf "%d\t%s\n", amplicons[amplicon], amplicon |& command
}
close(command, "to")
FS = "\t"
while ((command |& getline) > 0) {
amplicons_sorted[++j] = $2
}
close(command)
# Print the amplicon occurrences in the different samples
n = length(amplicons_sorted)
for (i = 1; i <= n; i++) {
amplicon = amplicons_sorted[i]
printf "%s", amplicon
for (j = 1; j <= s; j++) {
printf "\t%d", contingency[amplicon][samples[j]]
}
printf "\t%d\n", amplicons[amplicon]
}}' *.fasta > M2BiPAT_amplicon_contingency_table.csv
# Using Swarm-2.2.2
swarm M2BiPAT_derep.fa -d 1 -f -t 8 -w M2BiPAT_OTU_rep -s M2BiPAT_amplicons.stats -o M2BiPAT_amplicons.swarms
# Create an OTU contingency table
STATS="M2BiPAT_amplicons.stats"
SWARMS="M2BiPAT_amplicons.swarms"
AMPLICON_TABLE="M2BiPAT_amplicon_contingency_table.csv"
OTU_TABLE="M2BiPAT_OTU_contingency_table.csv"
# Header
echo -e "OTU\t$(head -n 1 "${AMPLICON_TABLE}")" > "${OTU_TABLE}"
# Compute "per sample abundance" for each OTU
awk -v SWARM="${SWARMS}" \
-v TABLE="${AMPLICON_TABLE}" \
'BEGIN {FS = " "
while ((getline < SWARM) > 0) {
swarms[$1] = $0
}
FS = "\t"
while ((getline < TABLE) > 0) {
table[$1] = $0
}
}
{# Parse the stat file (OTUs sorted by decreasing abundance)
seed = $3 "_" $4
n = split(swarms[seed], OTU, "[ _]")
for (i = 1; i < n; i = i + 2) {
s = split(table[OTU[i]], abundances, "\t")
for (j = 1; j < s; j++) {
samples[j] += abundances[j+1]
}
}
printf "%s\t%s", NR, $3
for (j = 1; j < s; j++) {
printf "\t%s", samples[j]
}
printf "\n"
delete samples
}' "${STATS}" >> "${OTU_TABLE}"
cd /s_fasta
# Again, have to change the defline
sed 's/;_/;size=/' M2BiPAT_OTU_rep > v_M2BiPAT_OTU_rep
sed '/^>/ s/$/;/' v_M2BiPAT_OTU_rep > v2_M2BiPAT_OTU_rep
# Chimera detection must be done in screen
vsearch --uchime_denovo v2_M2BiPAT_OTU_rep --chimeras M2BiPAT_chimeras.fa --nonchimeras M2BiPAT_OTU_rep_whithout_chimera.fa
# Remove chimera sequences from OTU contingency table
grep '>' M2BiPAT_chimeras.fa > chimera_names_M2BiPAT
sed 's/;size=[0-9]*//' chimera_names_M2BiPAT > chimera_the_file
sed 's/>//' chimera_the_file > chimera
awk > M2BiPAT_OTU_table.txt 'NR==FNR{arr[$0];next}!($2 in arr)' FS="\t" chimera M2BiPAT_OTU_contingency_table.csv
# done with mothur v.1.35.1
# Using Silva v123 trimmed for v4-v5 region
mothur > classify.seqs(fasta=> M2BiPAT_OTU_rep_whithout_chimera.fa, template= Silva123_V4_V5_trimmed.fasta, taxonomy=SIlva123_slv_taxonomy.txt.nds, processors=8)
# load the different tables,
the metadata table is available as "supplementary_data_2.csv" in the paper
OTU_table <- read.table("M2BiPAT_OTU_table.txt", header = T, sep = "\t")
metadata <- read.table("Metadata_M2BiPAT.csv", header = T, sep = ";")
taxonomy <- read.table("M2BiPAT.txt.wang.taxonomy", header = T, sep = "\t")
colnames(taxonomy) <- c("amplicon","taxonomy")
taxonomy$amplicon <- gsub('_[0-9]*','', taxonomy$amplicon)
# Add taxonomy to the OTU_table
OTU_table_tax <- merge(OTU_table, taxonomy, by="amplicon")
OTU_table_tax_ordered <- OTU_table_tax[order(OTU_table_tax$total, decreasing = T),]
# Remove unwanted taxa
OTU_table_tax_ordered <- OTU_table_tax_ordered[grep("Eukaryota|Archaea|Mitochondria|Chloroplast", OTU_table_tax_ordered$taxonomy, invert =T),]
# good OTU_table
OTU_table <- OTU_table_tax_ordered[,-c(1,189,190)]
rownames(OTU_table) <- OTU_table$OTU
OTU_table <- OTU_table[,colnames(OTU_table) %in% rownames(metadata)]
write.table(OTU_table, file = "./M2BiPAT_OTU_table.txt", sep= "\t")
# good taxonomy file
taxOTUs <- OTU_table_tax_ordered[,c(2,190)]
write.table(taxOTUs, file = "./M2BiPAT_taxOTUs.txt", sep= "\t")
# Normalisation with DESeq package
library(DESeq)
Matrice <- read.delim("M2BiPAT_OTU_table.txt", header=T, sep="\t", row.names=1)
triceMa <- t(Matrice)
triceMa <- cbind(sample = rownames(triceMa), triceMa)
triceMa <- triceMa[,1]
mat <- newCountDataSet(Matrice, triceMa)
mat <- estimateSizeFactors(mat)
sizeFactors(mat)
mat <- counts(mat, normalized=TRUE)
write.table(mat, file = "M2BiPAT_OTU_DESeq.txt", sep= "\t", row.names=T)
# Create a table of relative abundance
OTU <- read.table("M2BiPAT_OTU_DESeq.txt", header=T, row.names=1)
genre_table <- as.matrix(t(OTU))
s_num<-dim(genre_table)[2]
otu.perc<-matrix(rep("NA",times=(dim(genre_table)[1]*s_num)),nrow=dim(genre_table)[1],ncol=s_num)
rownames(otu.perc)<-rownames(genre_table)
colnames(otu.perc)=colnames(genre_table)
totals<-colSums(genre_table)
for(s in c(1:s_num)){
vec<-(genre_table[,s]/totals[s])*100
otu.perc[,s]<-vec
rm(vec)
}
write.table(otu.perc, "M2BiPAT_OTU_percent.txt", col.names = NA, row.names = T, sep = "\t")
library(vegan)
library(ggplot2)
metadata<- read.table("../data/Metadata_M2BiPAT.csv", header = T, sep=";", row.names=1)
Matrice <- read.table("M2BiPAT_OTU_DESeq.txt", header=T, row.names=1)
Matrice <- t(Matrice)
metadata <- metadata[ order(row.names(metadata)), ]
Matrice <- Matrice[ order(row.names(Matrice)), ]
Matrice <- Matrice[rownames(Matrice) %in% rownames(metadata),]
metadata <- metadata[rownames(metadata) %in% rownames(Matrice),]
bray <- vegdist(Matrice, method="bray")
nmds <- metaMDS(bray, wascore = F)
data.score <- as.data.frame(scores(nmds))
data.score$site <- rownames(data.score)
data.score$Mois <- metadata$Mois
data.score$Prof <- metadata$prof
data.score$Station <- metadata$station
data.score <- na.omit(data.score)
ggplot(data=data.score, aes(x=NMDS1, y=NMDS2, size=as.numeric(Prof), shape=Station)) +
geom_point(data=data.score, aes(size=Prof, fill= Mois)) +
scale_fill_manual(values=c("#8D68B0","#FFD062","#8FBC8F","#779ECB"))+
scale_shape_manual(values = c(25,24,21,22,23)) +
scale_size_manual(values = c(4,6,8))
# Test if depth significantly differentiate different communities for each stations of the summer cruises.
metadata_P <- metadata[,c(1,3,7,12)]
metadata_P <- na.omit(metadata_P)
Matrice_P <- Matrice[rownames(Matrice) %in% rownames(metadata_P),]
metadata_Summer <- metadata_P[grep("2015_March", metadata$Mois, invert=T),]
Matrice_Summer <- Matrice_P[rownames(Matrice_P) %in% rownames(metadata_Summer),]
meta1<- metadata_Summer[grep("Station 5", metadata_Summer$station),]
Matrice <- Matrice_Summer[rownames(Matrice_Summer) %in% rownames(meta1),]
bray <- vegdist(Matrice, method="bray")
adonis(formula = bray ~ prof ,
data = meta1,
permutations = 999,
method = "bray",
binary = TRUE)
library(vegan)
library(ggbiplot)
metadata<- read.csv("../data/Metadata_M2BiPAT.csv", header = T, sep=";", row.names=1)
df <- metadata[,c(3,7,12,16,22,23,24,29,30,31,32)]
df <- na.omit(df)
log.ir <- log(df[,c(4,5,6,7,8,9,10,11)])
log.ir <- log(df[,c(4:11)]+1)
ir.species <- df[,1]
ir <- df[,2]
ir.pca <- prcomp(log.ir,
scale. = TRUE, center=TRUE)
g <- ggbiplot(ir.pca, groups=ir.species)
print(g)
#Required packages
library(phyloseq)
library(SpiecEasi)
library(Matrix)
library(reshape2)
library(scales)
library(tidygraph)
library(tidyverse)
library(igraph)
library(SpiecEasi)
library(phyloseq)
source("https://raw.githubusercontent.com/genomewalker/osd2014_analysis/master/osd2014_16S_asv/lib/graph_lib.R")
library(WGCNA)
library(tidyverse)
library(broom)
# Load abundance matrix, metadata and taxonomy file
MatricePhylo <- read.delim("M2BiPAT_OTU_table.txt", header=T, sep="\t", row.names=1)
metadata<- read.csv("Metadata_M2BiPAT.csv", row.names=1, header = T, sep=";")
df <- t(MatricePhylo)
taxmat <- read.csv("M2BiPAT_Taxotu_separated.csv", header=T, row.names=1, sep=";")
taxmat <- taxmat[rownames(taxmat) %in% rownames(Matrice_M2BiPAT),]
taxmat <- as.matrix(taxmat)
# Phyloseq object
OTU = otu_table(t(Matrice_M2BiPAT), taxa_are_rows = TRUE)
TAX = tax_table(taxmat)
sampledata = sample_data(metadata)
Phymatrice = phyloseq(OTU,TAX, sampledata)
# Filter OTUs that have 50 sequences in at least 3 samples
subset_3samples = genefilter_sample(Phymatrice, filterfun_sample(function(x) x >= 50), A=3)
Phymatrice2 = prune_taxa(subset_3samples, Phymatrice)
ntaxa(Phymatrice2)
nsamples(Phymatrice2)
# Print the filtered matrix
otus <- otu_table(Phymatrice2)
otus <- as.data.frame(otus)
write.table(otus, file="Phymatrice_M2BiPAT_50seqs3samples_brut.txt", sep="\t")
load("Phymatrice2")
load("Phymatrice_M2BiPAT_50seqs3samples_brut.txt")
mytaxonomy <- read.csv("mytaxonomy.csv", header=T, sep=";") # A table where the taxonomy levels are splitted in different columns
# Perform SPIEC-EASI
M2BiPAT.OSD.ctl.gl <- spiec.easi(Phymatrice2, method="glasso", lambda.min.ratio=0.01, nlambda=30, icov.select.params = list(rep.num=30, ncores=16))
# Export SPIEC-EASI correlation matrix and model
secor <- cov2cor(forceSymmetric(getOptCov(OSD.ctl.gl), ifelse(sum(Matrix::tril(getOptCov(OSD.ctl.gl)))>sum(Matrix::triu(getOptCov(OSD.ctl.gl))), 'L', 'U')))
refit <- forceSymmetric(getRefit(OSD.ctl.gl), ifelse(sum(Matrix::tril(getRefit(OSD.ctl.gl)))>sum(Matrix::triu(getRefit(OSD.ctl.gl))), 'L', 'U'))
OSD.ctl.gl_net <- adj2igraph(as.matrix(secor*refit), vertex.attr=list(name=taxa_names(PhyMatrice)))
E(OSD.ctl.gl_net)$weight %>% hist
graph.density(OSD.ctl.gl_net)
OSD.ctl.gl$est$sparsity[getOptInd(OSD.ctl.gl)]
g <- OSD.ctl.gl_net %>%
as_tbl_graph()
# Determine communities with Louvain algorithm
community_bin <- file.path("~/Desktop/Network_Antonio/deterministic_louvain//community ")
convert_bin <- file.path("~/Desktop/Network_Antonio/deterministic_louvain//convert")
hierarchy_bin <- file.path("~/Desktop/Network_Antonio/deterministic_louvain//hierarchy -n")
community_options <- "725 -l -1 -v" #14
g_louvain <- run_louvain(X = "g", graph = g, community_bin = community_bin, convert_bin = convert_bin, hierarchy_bin = hierarchy_bin, community_options = community_options )
com_l_0 <- make_clusters(g, as.numeric(g_louvain[[1]]), algorithm = 'Louvain', modularity = TRUE)
com_l_1 <- make_clusters(g, as.numeric(g_louvain[[2]]), algorithm = 'Louvain', modularity = TRUE)
com_l_2 <- make_clusters(g, as.numeric(g_louvain[[3]]), algorithm = 'Louvain', modularity = TRUE)
V(g)$com <- as.character(membership(com_l_1))
counts <- dplyr::as_data_frame(g) %>%
group_by(com) %>%
dplyr::count() %>%
dplyr::rename(n_mem = n)
g <- g %>%
left_join(counts)
g.c <- contract.vertices(as.igraph(g), V(as.igraph(g))$com,
vertex.attr.comb=list(name= function(x) paste(x, collapse="|"),
com= function(x) unique(x),
n_mem = function(x) unique(x))
)
g.c <- igraph::simplify(g.c, edge.attr.comb = list(weight = function(x) median(x)), remove.loops = TRUE)
g <- g %>% activate(nodes) %>% mutate(com = as.character(paste("com", com, sep = "_")))
g.c <- as_tbl_graph(g.c) %>% activate(nodes) %>% mutate(com = as.character(paste("com", com, sep = "_")))
g.c <- as_tbl_graph(g.c) %>%
activate(edges) %>%
mutate(sign = ifelse(weight < 0, "NEG", "POS"),
weight_orig = weight,
weight = abs(weight))
g <- as_tbl_graph(g) %>%
activate(edges) %>%
mutate(sign = ifelse(weight < 0, "NEG", "POS"),
weight_orig = weight,
weight = abs(weight))
write.graph(g, "./OSD.ctl.gl_net.graphml", format = "graphml")
write.graph(g.c, "./OSD.ctl.gl_net_aggregated.graphml", format = "graphml")
write_tsv(g %>% activate("nodes") %>% as_tibble(), "~/Downloads/OSD.ctl.gl_net.tsv")
MatriceF <- read.delim("M2BiPAT_OTU_percent.txt", header=T, sep="\t", row.names=1)
Matrice <- read.table("Phymatrice_50seqs3samples_brut.txt", header=T, sep="\t", row.names=1)
MatriceF <- MatriceF[rownames(MatriceF) %in% rownames(Matrice),]
df <- t(MatriceF)
mat <- Matrice_M2BiPAT
df_nodes <- dplyr::as_data_frame(g %>% activate(nodes))
colnames(mat) <- df_nodes$name
PCs <- WGCNA::moduleEigengenes(mat, colors=df_nodes$com)
ME <- PCs$eigengenes
rownames(ME) <- rownames(mat)
metadata <- as(metadata_M2BiPAT, "matrix") %>% tbl_df()
datExpr = Matrice_M2BiPAT
datTraits = metadata_M2BiPAT[,c(16:56)]
sampleTree2 = hclust(dist(datExpr), method = "average")
traitColors = numbers2colors(datTraits, signed = FALSE)
plotDendroAndColors(sampleTree2, traitColors,
groupLabels = names(datTraits),
main = "Sample dendrogram and trait heatmap")
# Define numbers of genes and samples
nGenes = ncol(datExpr);
nSamples = nrow(datExpr);
# Recalculate MEs with color labels
MEs0 = ME
MEs = orderMEs(MEs0)
moduleTraitCor = cor(MEs, datTraits, use = "p");
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
sizeGrWindow(10,6)
# Will display correlations and their p-values
textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep = "");
dim(textMatrix) = dim(moduleTraitCor)
par(mar = c(6, 8.5, 3, 3));
# Display the correlation values within a heatmap plot
labeledHeatmap(Matrix = moduleTraitCor,
xLabels = names(datTraits),
yLabels = names(MEs),
ySymbols = names(MEs),
colorLabels = FALSE,
colors = blueWhiteRed(50),
textMatrix = textMatrix,
setStdMargins = FALSE,
cex.text = 0.5,
zlim = c(-1,1),
main = paste("Module-trait relationships"))
taxmat = read.table(file="mytaxonomy.csv", header=T, row.names=1, sep=";")
rownames(taxmat) <- paste("OTU", rownames(taxmat), sep="")
annot = taxmat
dim(annot)
names(annot)
probes = colnames(datExpr)
probes2annot = match(probes, rownames(annot))
moduleColors <- df_nodes$com
geneInfo0 = data.frame(OTUs = probes,
Phylum = annot$Phylum[probes2annot],
Class = annot$Class[probes2annot],
Order = annot$Order[probes2annot],
Family = annot$Family[probes2annot],
Genus = annot$Genus[probes2annot],
moduleColor = moduleColors)
geneInfo0 = data.frame(OTUs = probes,
taxonomy = annot$taxonomy[probes2annot],
moduleColor = moduleColors)
table_module <- geneInfo0[,c(1:7)]
table_perc <- t(Matrice_M2BiPAT)
table_perc <- as.data.frame(table_perc)
table_perc$tot <- rowSums(table_perc)
table_perc <- cbind(OTUs = rownames(table_perc), table_perc)
abond <- table_perc[,c(1,136)]
abond <- as.data.frame(abond)
table_perc <- t(table_perc)
table_perc <- table_perc[-c(1,136),]
perc_module <- merge(table_module, abond, by = "OTUs")
module_com_1 <- perc_module[perc_module$moduleColor == 'com_1',]
module_com_2 <- perc_module[perc_module$moduleColor == 'com_2',]
module_com_3 <- perc_module[perc_module$moduleColor == 'com_3',]
module_com_4 <- perc_module[perc_module$moduleColor == 'com_4',]
module_com_5 <- perc_module[perc_module$moduleColor == 'com_5',]
module_com_6 <- perc_module[perc_module$moduleColor == 'com_6',]
module_com_7 <- perc_module[perc_module$moduleColor == 'com_7',]
module_com_8 <- perc_module[perc_module$moduleColor == 'com_8',]
module_com_9 <- perc_module[perc_module$moduleColor == 'com_9',]
module_com_10 <- perc_module[perc_module$moduleColor == 'com_10',]
module_com_11 <- perc_module[perc_module$moduleColor == 'com_11',]
module_com_12 <- perc_module[perc_module$moduleColor == 'com_12',]
module_com_13 <- perc_module[perc_module$moduleColor == 'com_13',]
module_com_14 <- perc_module[perc_module$moduleColor == 'com_14',]
write.table(table_perc, "table_perc.txt", sep="\t")
table_perc <- read.table("table_perc.txt", sep="\t", header=T, row.names=1)
Syneccho <- data.frame(com_1 = apply(table_perc[,colnames(table_perc) %in% module_com_1$OTUs], 1, sum),
com_2 = apply(table_perc[,colnames(table_perc) %in% module_com_2$OTUs], 1, sum),
com_3 = apply(table_perc[,colnames(table_perc) %in% module_com_3$OTUs], 1, sum),
com_4 = apply(table_perc[,colnames(table_perc) %in% module_com_4$OTUs], 1, sum),
com_5 = apply(table_perc[,colnames(table_perc) %in% module_com_5$OTUs], 1, sum),
com_6 = apply(table_perc[,colnames(table_perc) %in% module_com_6$OTUs], 1, sum),
com_7 = apply(table_perc[,colnames(table_perc) %in% module_com_7$OTUs], 1, sum),
com_8 = apply(table_perc[,colnames(table_perc) %in% module_com_8$OTUs], 1, sum),
com_9 = apply(table_perc[,colnames(table_perc) %in% module_com_9$OTUs], 1, sum),
com_10 = apply(table_perc[,colnames(table_perc) %in% module_com_10$OTUs], 1, sum),
com_11 = apply(table_perc[,colnames(table_perc) %in% module_com_11$OTUs], 1, sum),
com_12 = apply(table_perc[,colnames(table_perc) %in% module_com_12$OTUs], 1, sum),
com_13 = apply(table_perc[,colnames(table_perc) %in% module_com_13$OTUs], 1, sum),
com_14 = apply(table_perc[,colnames(table_perc) %in% module_com_14$OTUs], 1, sum))
df <- cbind(samples = rownames(Syneccho), Syneccho)
df <- cbind(Date = metadata_M2BiPAT$Mois, df)
df <- cbind(Station = metadata_M2BiPAT$Station, df)
df <- cbind(Prof = metadata_M2BiPAT$prof, df)
df_m <- melt(df, id.vars=c("samples", "Date","Station","Prof"))
df_m$Station <- factor(df_m$Station, levels = c("Large","Front","Centre","Noires", "StMat"))
df_m$Date <- factor(df_m$Date, levels = c("2014_September","2015_March","2015_July","2015_September"))
df_m$Prof <- factor(df_m$Prof, levels = c("bottom", "median", "surface"))
df_m <- na.omit(df_m)
mcast <- dcast(df_m, Prof + Station + Date ~ variable, mean )
df_mm <- melt(mcast, id.vars=c("Prof","Station",'Date'))
df_mm <- na.omit(df_mm)
ggplot(df_mm, aes(x=Prof, y = value, fill=variable)) +
geom_bar(stat = "identity", position = "fill") +
facet_grid(Date~Station, scales="free", space="free_x") +
scale_fill_manual(values=c("#d73b3e","#ffa089","#00cc99","#ffdf80","#417dc1","#aa98a9","#d2b48c","#ff865a","#b2beb5","#bdb76b","#ffb6c1","#536878","#0a3d6e","#fad6a5")) +
coord_flip() +
#geom_text(aes(label=Prof), position=position_dodge(width=0.9), vjust=-0.25) +
theme(strip.text.y = element_text(size = 10, angle = 0), axis.text.x = element_text(size = 12 , angle = 90), axis.text.y = element_text(size = 14))
# Group otus at family level
Matrice = load_meta("M2BiPAT_OTU_table.txt", sep= "\t")
# Charge table Taxa
taxmat = read.delim("M2BiPAT_Taxotu_separated.csv", sep = ";", stringsAsFactors=FALSE, row.names=1)
taxmat$Domain <- gsub("\\s*\\([^\\)]+\\)","", taxmat$Domain)
taxmat$Phylum <- gsub("\\s*\\([^\\)]+\\)","", taxmat$Phylum)
taxmat$Class <- gsub("\\s*\\([^\\)]+\\)","", taxmat$Class)
taxmat$Order <- gsub("\\s*\\([^\\)]+\\)","", taxmat$Order)
taxmat$Family <- gsub("\\s*\\([^\\)]+\\)","", taxmat$Family)
taxmat$Genus <- gsub("\\s*\\([^\\)]+\\)","", taxmat$Genus)
# AnnotatedDataFrame
OTUdata = AnnotatedDataFrame(taxmat)
# MRexperiment object
obj = newMRexperiment(Matrice$counts,featureData=OTUdata)
# Group by family
obj_agg = aggTax(obj, lvl = "Family", out = "matrix")
# Export
exportMat(obj_agg, file = "M2BiPAT_agg_Family.txt")
# Plot the relative abundance
table <- perc_module[,c(5,7,8)]
table <- read.table("M2BiPAT_agg_Family.txt", header=T, row.names=1)
table <- table[,c(5,7,8)]
cast <- dcast(table, Family ~ moduleColor, sum)
rownames(cast) <- cast$Family
cast <- cast[,-1]
genre_table <- as.matrix(cast)
s_num<-dim(genre_table)[2]
otu.perc<-matrix(rep("NA",times=(dim(genre_table)[1]*s_num)),nrow=dim(genre_table)[1],ncol=s_num)
rownames(otu.perc)<-rownames(genre_table)
colnames(otu.perc)=colnames(genre_table)
totals<-colSums(genre_table)
for(s in c(1:s_num)){
vec<-(genre_table[,s]/totals[s])*100
otu.perc[,s]<-vec
rm(vec)
}
df_m <- melt(otu.perc)
colnames(df_m) <- c("Family","moduleColor","value")
# Keep the 30 most abundant families
cast$tot <- rowSums(cast)
cast_order <- cast[order(cast$tot, decreasing=TRUE),]
cast <- cast_order[c(1:30),]
cast <- cast[-4,]
df_mm <- df_m[df_m$Family %in% row.names(cast),]
df_mm[df_mm == 0] <- NA
df_mm <- na.omit(df_mm)
write.table(df_mm, "family_community.txt", sep="\t")
df_mm <- read.table("family_community.txt", header=T, row.names=1)
df_mm$moduleColor <- factor(df_mm$moduleColor, levels = c("com_5","com_13","com_12","com_14","com_10","com_7","com_6","com_3","com_11","com_4","com_2","com_1","com_8","com_9"))
df_mm$Family <- factor(df_mm$Family, levels=rev(row.names(pouet)))
df_mm$value <- as.numeric(df_mm$value)
ggplot(df_mm, aes(x=moduleColor , y = Family), size=as.numeric(value), color=as.numeric(value)) +
geom_point(aes(size=as.numeric(value)),color="#757272", position = "identity")+
scale_size(limits = c(0,100), range=c(1,10), breaks=c(10,20,30,40,50,60,70,80,90)) +
scale_color_gradient2(limits=c(0,100),low="#00ffff", mid="#383838",high="#f26846", midpoint = 45, na.value=NA)
df_m <- melt(t(table_perc))
colnames(df_m) <- c("OTUs","sample","value")
# Calculate CV
mcast <- aggregate( value ~ OTUs, df_m, mean )
sdcast <- aggregate( value ~ OTUs, df_m, sd )
sdcast <- sdcast[,-1]
sdcast <- as.data.frame(sdcast)
colnames(sdcast) <- paste("sd", colnames(sdcast), sep="_")
mcast$sd <- sdcast$sd_sdcast
mcast$CV <- (mcast$sd/mcast$value)*100
# Calcule occurence (>0.05) in samples
M <- t(table_perc)
M150 <- M
M150[M150 >= 0.05] <- 1
M150[M150 < 0.05] <- 0
M150 <- as.data.frame(M150)
M150$Tot <- rowSums(M150)
M150$label[ M150$Tot < 20] <- "S"
M150$label[ M150$Tot > 65] <- "G"
M <- M150[,c(135,136)]
M$OTUs <- rownames(M)
OTU_mod <- perc_module[,c(1,7)]
CV_mod <- merge(mcast, OTU_mod, by="OTUs")
CV_abund <- merge(CV_mod, M, by="OTUs")
CV_f <- CV_abund[,c(1,4,5,6,7)]
lmObject <- lm(CV ~ Tot, data = CV_f)
mpi <- cbind(CV_f, predict(lmObject, interval = "prediction", level=0.5))
# See outliers
plotPoints <- mpi[which(!(mpi$CV > mpi$lwr & mpi$CV < mpi$upr)),]
good <- merge(plotPoints, M, by="OTUs")
ggplot(data=plotPoints, aes(x = Tot, y = CV)) + geom_point()
# Plot the CV against the occurence
ggplot(CV_f, aes(x=Tot, y = CV, color=moduleColor)) +
geom_point(aes(color=moduleColor)) +
geom_smooth(method='lm', se= FALSE) +
geom_ribbon(data=mpi, aes(ymin = lwr, ymax = upr), fill = "#36454f", alpha = 0.1) +
scale_y_continuous(trans = "log") +
scale_color_manual(values=c("#d73b3e","#bdb76b","#ffb6c1","#536878","#0a3d6e","#fad6a5","#ffa089","#00cc99","#ffdf80","#417dc1","#aa98a9","#d2b48c","#ff865a","#b2beb5"))