... | ... | @@ -652,6 +652,49 @@ cd /scratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/bins |
|
|
dRep compare drep_output -g ./*.fa
|
|
|
```
|
|
|
|
|
|
##### Assessing MAGs #####
|
|
|
- Although expected, as stated somewhere above, "we are scientists and so, we need evidence!"
|
|
|
- I decided to confirm that the bins recovered via each of the assembly methods, and additionally via each of the mapping methods, would be different
|
|
|
- To get there, first we needed to link the CheckM and GTDBtk outputs to the binned MAGs
|
|
|
```
|
|
|
# Working directory
# The steps below read and write files using paths relative to this directory,
# so stop here if it cannot be entered instead of writing outputs elsewhere.
cd /scratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/results/Binning/checkm_output || exit 1
|
|
|
|
|
|
# making a list of all the samples
# Prints one sample name per line: the name of every *_output.txt file in the
# current directory with the "_output.txt" suffix removed.
# Only *_output.txt is matched (not *.txt) so that other .txt files sitting in
# the same directory are never mistaken for samples.
list_samples() {
  local report
  for report in *_output.txt; do
    [[ -e "$report" ]] || continue   # the glob matched nothing
    printf '%s\n' "${report%_output.txt}"
  done
}

list_samples > sample_list
|
|
|
|
|
|
# collating the checkm_output to a readable format
# checkm_table REPORT
#   Prints a tab-separated table with the header
#     bin, completion, contamination, strain_heterogeneity
#   followed by fields 1, 13, 14 and 15 of every table row in REPORT.
#   Table rows are the lines after the one containing "Bin Id" and before the
#   next line containing "INFO"; "-----" ruler lines and blank lines are
#   skipped. The header is printed even when no rows are found.
checkm_table() {
  awk '
    BEGIN {
      OFS = "\t"
      print "bin", "completion", "contamination", "strain_heterogeneity"
    }
    !in_table { if (/Bin Id/) in_table = 1; next }   # skip up to and including the column header
    /INFO/    { exit }                               # log lines after the table end it
    /^-----/  { next }
    NF        { print $1, $13, $14, $15 }
  ' "$1"
}

# One sample name per line; read -r keeps each name intact (no word splitting
# or glob expansion), and the || test keeps a last line without a newline.
while IFS= read -r sample || [[ -n "$sample" ]]; do
  checkm_table "${sample}_output.txt" > "${sample}_checkm.txt"
done < sample_list
|
|
|
|
|
|
# collecting the gtdbtk information
# gtdbtk_table SUMMARY_TSV
#   Prints the first two tab-separated columns of SUMMARY_TSV, renaming the
#   header cell "user_genome" to "bin". Splitting on tabs only keeps a second
#   column that contains spaces (e.g. a species name) in one piece.
gtdbtk_table() {
  awk -F'\t' -v OFS='\t' '
    NR == 1 && $1 == "user_genome" { $1 = "bin" }
    { print $1, $2 }
  ' "$1"
}

# merging files based on a common first column
# merge_tables CHECKM_TABLE GTDBTK_TABLE
#   For every row of GTDBTK_TABLE whose first column also occurs in
#   CHECKM_TABLE, prints: column 1, GTDBTK column 2, CHECKM columns 2-4.
#   Output order follows GTDBTK_TABLE.
merge_tables() {
  awk -F'\t' -v OFS='\t' '
    FILENAME == ARGV[1] { checkm[$1] = $2 OFS $3 OFS $4; next }
    $1 in checkm        { print $1, $2, checkm[$1] }
  ' "$1" "$2"
}

# keeping only those bins with completion of at least 60% and contamination below 10% (these numbers can be adjusted as needed)
# hq_bins MERGED_TABLE [MIN_COMPLETION=60] [MAX_CONTAMINATION=10]
#   Prints the rows whose column 3 is >= MIN_COMPLETION and whose column 4 is
#   < MAX_CONTAMINATION. Rows with non-numeric values in those columns (the
#   header row) are not printed.
hq_bins() {
  awk -F'\t' -v min_completion="${2:-60}" -v max_contamination="${3:-10}" '
    $3 ~ /^[0-9.]+$/ && $4 ~ /^[0-9.]+$/ &&
      $3 + 0 >= min_completion + 0 && $4 + 0 < max_contamination + 0
  ' "$1"
}

# One sample name per line; read -r keeps each name intact (no word splitting
# or glob expansion), and the || test keeps a last line without a newline.
while IFS= read -r sample || [[ -n "$sample" ]]; do
  gtdbtk_table "../gtdbtk_output/${sample}/classify/gtdbtk.bac120.summary.tsv" > "${sample}_gtdbtk.txt"
  merge_tables "${sample}_checkm.txt" "${sample}_gtdbtk.txt" > "${sample}_taxonomy_contamination.txt"
  hq_bins "${sample}_taxonomy_contamination.txt" > "${sample}_HQ_bins.txt"
done < sample_list
|
|
|
|
|
|
# The "$file"_HQ_bins.txt files were subsequently analysed on the Desktop in R using the "MAGS_2019_GDB_analyses.R" script
|
|
|
```
|
|
|
|
|
|
|
|
|
##### VizBin #####
|
|
|
- The binners did great for the sr- and hybrid assemblies; however, they fared poorly with the lr- (flye) assembly
|
|
|
- A sign of a good researcher is that they never trust their own data, and validate it twelve-ways-to-Sunday
|
... | ... | |