Blandine Trouche October 2023
### Extract amoA genes from the unbinned co-assemblies ###
conda activate anvio-7.1
# this code should be run in the home folder of the project
for group in A7D A7S A9S AK7 H3D H3T HAK HAS HKT MD
do
anvi-export-functions -c 03_CONTIGS/${group}-contigs.db \
--annotation-sources KOfam -o 08_FUNCTIONS/${group}_KOfam.txt
# Extracting the gene ids
grep "ammonia monooxygenase subunit A" 08_FUNCTIONS/${group}_KOfam.txt \
| awk '{print $1}' > 08_FUNCTIONS/1_amoA/${group}_amoA_geneid.txt
anvi-get-sequences-for-gene-calls -c 03_CONTIGS/${group}-contigs.db \
--gene-caller-ids 08_FUNCTIONS/1_amoA/${group}_amoA_geneid.txt \
--report-extended-deflines -o 08_FUNCTIONS/1_amoA/${group}_amoA_sequences.txt
done
cat 08_FUNCTIONS/1_amoA/*_amoA_sequences.txt > 08_FUNCTIONS/1_amoA/All_amoA_seq.fasta
# Dereplicating the sequences by clustering them at 100% with CD-HIT
conda activate cd-hit
cd-hit-est -i 08_FUNCTIONS/1_amoA/All_amoA_seq.fasta \
-o 08_FUNCTIONS/1_amoA/Derep_amoA_seq.fasta -c 1
# Extract amoA genes from reconstructed MAGs
cd $HOME/NON_REDUNDANT_BINS
conda activate anvio-7.1
anvi-export-functions -c 03_CONTIGS/NR_MAGs-contigs.db --annotation-sources KOfam \
-o 08_FUNCTIONS/Functions_KOfam_internal.txt
grep "ammonia monooxygenase subunit A" 08_FUNCTIONS/Functions_KOfam_internal.txt
anvi-get-sequences-for-gene-calls -c 03_CONTIGS/NR_MAGs-contigs.db \
--gene-caller-ids 85155,61587,63004,35832,4028,4582,109889,25473,19263,49045 \
--report-extended-deflines -o 08_FUNCTIONS/1_amoA/amoA_sequences_internal.txt
cd NON_REDUNDANT_BINS
conda activate anvio-7.1
for i in 03_CONTIGS_EXT/*-contigs.db
do
NAME=$(echo $i | sed 's|.*\/||')
NAME=$(echo $NAME | sed 's|-contigs.db||')
echo $NAME
anvi-export-functions -c $i --annotation-sources KOfam -o 08_FUNCTIONS/${NAME}_KOfam.txt
GENE=grep "ammonia monooxygenase subunit A" 08_FUNCTIONS/${NAME}_KOfam.txt | awk '{print $1}'
anvi-get-sequences-for-gene-calls -c 03_CONTIGS_EXT/${NAME}-contigs.db \
--gene-caller-ids $GENE -o 08_FUNCTIONS/1_amoA_ext/${NAME}_amoA.txt --report-extended-deflines
done
# concatenate all detected amoA sequences
cat 08_FUNCTIONS/1_amoA_ext/*_amoA.txt > 08_FUNCTIONS/1_amoA_ext/ext_genomes_amoA_seq.fasta
# this code should be run in the home folder of the project
### Tree data was obtained from the supplementary material of Alves et al., 2018
### It was downloaded and placed in folder AmoA_DB_Alves_2018
# Get tree model for EPA with RAxML
./raxml-ng/raxml-ng --evaluate \
--msa AmoA_DB_Alves_2018/AamoA.db_an96.roguesout_epa/AamoA.db_an96.aln.roguesout.fasta \
--tree AmoA_DB_Alves_2018/AamoA.db_an96.roguesout_epa/AamoA.db_tree.all.branches.newick \
--prefix info --model GTR+G+F
# Align query to reference
conda activate MAFFT
cat AmoA_DB_Alves_2018/AamoA.db_an96.roguesout_epa/AamoA.db_an96.aln.roguesout.fasta \
08_FUNCTIONS/1_amoA/Derep_amoA_seq.fasta \
NON_REDUNDANT_BINS/08_FUNCTIONS/1_amoA/amoA_sequences_internal.txt \
NON_REDUNDANT_BINS/08_FUNCTIONS/1_amoA_ext/ext_genomes_amoA_seq.fasta \
> 09_amoA_tree/amoA_db_ext_int.fasta
mafft 09_amoA_tree/amoA_db_ext_int.fasta > 09_amoA_tree/amoA_db_ext_int_aln.fasta
# Use EPA-ng to place in ref tree
conda activate epa-ng
cd 09_amoA_tree
epa-ng --split ../AmoA_DB_Alves_2018/AamoA.db_an96.roguesout_epa/AamoA.db_an96.aln.roguesout.fasta \
amoA_db_ext_int_aln.fasta
epa-ng --ref-msa reference.fasta \
--tree ../AmoA_DB_Alves_2018/AamoA.db_an96.roguesout_epa/AamoA.db_tree.all.branches.newick \
--query query.fasta --threads 8 \
--model ../raxml_model/info.raxml.bestModel
# creation of a jplace file as a result
mv epa_result.jplace epa_result_amoA_int_ext.jplace
conda activate gtdbtk-1.7.0
# From the home folder
cd NON_REDUNDANT_BINS
mkdir -p GTDB_tree
cd GTDB_tree
# I will use GTDB-tk's classify workflow
# Create symlinks to the fasta files in an input folder
mkdir -p 1_fasta
ln -s ../02_FASTA/*_Bin_*.fa 1_fasta/
ln -s ../02_FASTA_EXT/*/*.fa 1_fasta/
# Apply the classify workflow
gtdbtk classify_wf --genome_dir 1_fasta --extension fa --out_dir 2_classify_wf_output --cpus 12
# Redo the alignment and subset the tree
gtdbtk align --identify_dir 2_classify_wf_output \
--out_dir 3_align_output \
--cpus 8 \
--taxa_filter o__Nitrososphaerales,g__Pyrococcus
# unload gtdb-tk and activate Anvio to use IQ-TREE
conda deactivate
conda activate anvio-7.1
# Remove gaps from alignment
trimal -in 3_align_output/gtdbtk.ar122.msa.fasta \
-out 4_trimal_iqtree/gtdbtk_ar122_msa_anvio_clean.fasta \
-gt 0.50 &> 4_trimal_iqtree/logs/trimal.log 2>&1
# Make tree with the MSA file from gtdb_tk
iqtree -s 4_trimal_iqtree/gtdbtk_ar122_msa_anvio_clean.fasta \
-nt 8 -m WAG \
-bb 1000 &> 4_trimal_iqtree/logs/iqtree.log 2>&1
# go back to gtdb-tk and root the tree
conda deactivate
conda activate gtdbtk-1.7.0
gtdbtk root --input_tree 4_trimal_iqtree/gtdbtk_ar122_msa_anvio_clean.fasta.treefile \
--outgroup_taxon g__Pyrococcus \
--output_tree 4_trimal_iqtree/tree_nitro_with_ref_gtdb.treefile
# To do this I can use 3_align_output/gtdbtk.ar122.user_msa.fasta, which keeps only the aligned user sequences, or I can realign and use option --skip-gtdb-refs
conda activate gtdbtk-1.7.0
# Create symlinks to only the internal MAG fasta files in an input folder
mkdir -p 1_fasta_short
ln -s ../02_FASTA/*_Bin_*.fa 1_fasta_short/
# Apply the classify workflow
gtdbtk classify_wf --genome_dir 1_fasta_short --extension fa --out_dir 2_classify_wf_output_short --cpus 12
# unload gtdb-tk and activate Anvio to use IQ-TREE
conda deactivate
conda activate anvio-7.1
# Remove gaps from alignment
trimal -in 2_classify_wf_output_short/gtdbtk.ar122.user_msa.fasta \
-out 4_trimal_iqtree_short/NR_MAGs_only_clean.fasta \
-gt 0.50 &> 4_trimal_iqtree_short/logs/trimal.log 2>&1
# Make tree with the MSA file from gtdb_tk
iqtree -s 4_trimal_iqtree_short/NR_MAGs_only_clean.fasta \
-nt 8 -m WAG \
-bb 1000 &> 4_trimal_iqtree_short/logs/iqtree.log 2>&1