abyssMG

M3: amoA gene tree placement and phylogenomic tree

Blandine Trouche October 2023

1- amoA gene tree
2- Phylogenomic tree using GTDB-tk and IQ-TREE
- GTDB-tk to build phylogenomic tree of AOA MAGs (internal and reference)
- Phylogenomic tree of internal AOA MAGs only

1- amoA gene tree

amoA gene sequences from the coassemblies

### Extract amoA genes from the unbinned co-assemblies ###
conda activate anvio-7.1
# this code should be run in the home folder of the project

for group in A7D A7S A9S AK7 H3D H3T HAK HAS HKT MD
do
  anvi-export-functions -c 03_CONTIGS/${group}-contigs.db \
      --annotation-sources KOfam -o 08_FUNCTIONS/${group}_KOfam.txt

  # Extracting the gene ids
  grep "ammonia monooxygenase subunit A" 08_FUNCTIONS/${group}_KOfam.txt \
      | awk '{print $1}' >  08_FUNCTIONS/1_amoA/${group}_amoA_geneid.txt

  anvi-get-sequences-for-gene-calls -c 03_CONTIGS/${group}-contigs.db \
      --gene-caller-ids 08_FUNCTIONS/1_amoA/${group}_amoA_geneid.txt \
      --report-extended-deflines -o 08_FUNCTIONS/1_amoA/${group}_amoA_sequences.txt

done

cat 08_FUNCTIONS/1_amoA/*_amoA_sequences.txt > 08_FUNCTIONS/1_amoA/All_amoA_seq.fasta

# Dereplicating the sequences by clustering them at 100% with CD-HIT
conda activate cd-hit

cd-hit-est -i 08_FUNCTIONS/1_amoA/All_amoA_seq.fasta \
    -o 08_FUNCTIONS/1_amoA/Derep_amoA_seq.fasta -c 1

amoA gene sequences from the reconstructed MAGs

# Extract amoA genes from reconstructed MAGs
cd $HOME/NON_REDUNDANT_BINS
conda activate anvio-7.1

anvi-export-functions -c 03_CONTIGS/NR_MAGs-contigs.db --annotation-sources KOfam \
      -o 08_FUNCTIONS/Functions_KOfam_internal.txt

grep "ammonia monooxygenase subunit A" 08_FUNCTIONS/Functions_KOfam_internal.txt 

anvi-get-sequences-for-gene-calls -c 03_CONTIGS/NR_MAGs-contigs.db \
    --gene-caller-ids 85155,61587,63004,35832,4028,4582,109889,25473,19263,49045 \
    --report-extended-deflines -o 08_FUNCTIONS/1_amoA/amoA_sequences_internal.txt

amoA gene sequences from the external reference MAGs

cd NON_REDUNDANT_BINS
conda activate anvio-7.1

for i in 03_CONTIGS_EXT/*-contigs.db
do
  NAME=$(echo $i | sed 's|.*\/||')
  NAME=$(echo $NAME | sed 's|-contigs.db||')
  echo $NAME
  
  anvi-export-functions -c $i --annotation-sources KOfam -o 08_FUNCTIONS/${NAME}_KOfam.txt
  
  GENE=grep "ammonia monooxygenase subunit A" 08_FUNCTIONS/${NAME}_KOfam.txt | awk '{print $1}'
  
  anvi-get-sequences-for-gene-calls -c 03_CONTIGS_EXT/${NAME}-contigs.db \
      --gene-caller-ids $GENE -o 08_FUNCTIONS/1_amoA_ext/${NAME}_amoA.txt --report-extended-deflines 
      
done

# concatenate all detected amoA sequences
cat 08_FUNCTIONS/1_amoA_ext/*_amoA.txt > 08_FUNCTIONS/1_amoA_ext/ext_genomes_amoA_seq.fasta

Placing all sequences (internal and external) in the Alves et al tree

# this code should be run in the home folder of the project

### Tree data was obtained from the supplementary material of Alves et al., 2018
### It was downloaded and placed in folder AmoA_DB_Alves_2018

# Get tree model for EPA with RAxML
./raxml-ng/raxml-ng --evaluate \
    --msa  AmoA_DB_Alves_2018/AamoA.db_an96.roguesout_epa/AamoA.db_an96.aln.roguesout.fasta \
    --tree AmoA_DB_Alves_2018/AamoA.db_an96.roguesout_epa/AamoA.db_tree.all.branches.newick \
    --prefix info --model GTR+G+F
    
# Align query to reference
conda activate MAFFT

cat AmoA_DB_Alves_2018/AamoA.db_an96.roguesout_epa/AamoA.db_an96.aln.roguesout.fasta \
    08_FUNCTIONS/1_amoA/Derep_amoA_seq.fasta \
    NON_REDUNDANT_BINS/08_FUNCTIONS/1_amoA/amoA_sequences_internal.txt \
    NON_REDUNDANT_BINS/08_FUNCTIONS/1_amoA_ext/ext_genomes_amoA_seq.fasta \
    > 09_amoA_tree/amoA_db_ext_int.fasta

mafft 09_amoA_tree/amoA_db_ext_int.fasta > 09_amoA_tree/amoA_db_ext_int_aln.fasta

# Use EPA-ng to place in ref tree 
conda activate epa-ng
cd 09_amoA_tree

epa-ng --split ../AmoA_DB_Alves_2018/AamoA.db_an96.roguesout_epa/AamoA.db_an96.aln.roguesout.fasta \
    amoA_db_ext_int_aln.fasta

epa-ng --ref-msa reference.fasta \
      --tree ../AmoA_DB_Alves_2018/AamoA.db_an96.roguesout_epa/AamoA.db_tree.all.branches.newick \
      --query query.fasta --threads 8 \
      --model ../raxml_model/info.raxml.bestModel
      
# creation of a jplace file as a result
mv epa_result.jplace epa_result_amoA_int_ext.jplace

2- Phylogenomic tree using GTDB-tk and IQ-TREE

GTDB-tk to build phylogenomic tree of AOA MAGs (internal and reference)

conda activate gtdbtk-1.7.0

# From the home folder
cd NON_REDUNDANT_BINS
mkdir -p GTDB_tree
cd GTDB_tree

# I will use GTDB-tk's classify workflow 
# Create symlinks to the fasta files in an input folder
mkdir -p 1_fasta
ln -s ../02_FASTA/*_Bin_*.fa 1_fasta/
ln -s ../02_FASTA_EXT/*/*.fa 1_fasta/

# Apply the classify workflow
gtdbtk classify_wf --genome_dir 1_fasta --extension fa --out_dir 2_classify_wf_output --cpus 12

# Redo the alignment and subset the tree
gtdbtk align --identify_dir 2_classify_wf_output \
             --out_dir 3_align_output \
             --cpus 8 \
             --taxa_filter o__Nitrososphaerales,g__Pyrococcus

# unload gtdb-tk and activate Anvio to use IQ-TREE             
conda deactivate
conda activate anvio-7.1 

# Remove gaps from alignment
trimal -in 3_align_output/gtdbtk.ar122.msa.fasta \
       -out 4_trimal_iqtree/gtdbtk_ar122_msa_anvio_clean.fasta \
       -gt 0.50 &> 4_trimal_iqtree/logs/trimal.log 2>&1

# Make tree with the MSA file from gtdb_tk
iqtree -s 4_trimal_iqtree/gtdbtk_ar122_msa_anvio_clean.fasta \
       -nt 8 -m WAG \
       -bb 1000 &> 4_trimal_iqtree/logs/iqtree.log 2>&1

# go back to gtdb-tk and root the tree
conda deactivate
conda activate gtdbtk-1.7.0

gtdbtk root --input_tree 4_trimal_iqtree/gtdbtk_ar122_msa_anvio_clean.fasta.treefile \
            --outgroup_taxon g__Pyrococcus \
            --output_tree 4_trimal_iqtree/tree_nitro_with_ref_gtdb.treefile

Phylogenomic tree of internal AOA MAGs only

# To do this I can use 3_align_output/gtdbtk.ar122.user_msa.fasta, which keeps only the aligned user sequences, or I can realign and use option --skip-gtdb-refs
conda activate gtdbtk-1.7.0

# Create symlinks to only the internal MAG fasta files in an input folder
mkdir -p 1_fasta_short
ln -s ../02_FASTA/*_Bin_*.fa 1_fasta_short/

# Apply the classify workflow
gtdbtk classify_wf --genome_dir 1_fasta_short --extension fa --out_dir 2_classify_wf_output_short --cpus 12

# unload gtdb-tk and activate Anvio to use IQ-TREE             
conda deactivate
conda activate anvio-7.1 

# Remove gaps from alignment
trimal -in 2_classify_wf_output_short/gtdbtk.ar122.user_msa.fasta \
       -out 4_trimal_iqtree_short/NR_MAGs_only_clean.fasta \
       -gt 0.50 &> 4_trimal_iqtree_short/logs/trimal.log 2>&1

# Make tree with the MSA file from gtdb_tk
iqtree -s 4_trimal_iqtree_short/NR_MAGs_only_clean.fasta \
       -nt 8 -m WAG \
       -bb 1000 &> 4_trimal_iqtree_short/logs/iqtree.log 2>&1