... | ... | @@ -873,7 +873,66 @@ for file in *.txt; do echo $file; egrep 'total | mapped' $file | head -n 2; done |
|
|
sed $'1 i\\\nsample\ttotal\tmapped\tpercent_mapped' | sed 's/ + 0 in total (QC-passed reads + QC-failed reads)//g' | \
|
|
|
sed s'/ + 0 mapped//g' | sed 's/ : N\/A)//g' | sed s'/(//g' | sed s'/.txt//g' | \
|
|
|
awk '{print$0=$1"\t"$2"\t"$3"\t"$4}' > mappability_index.tsv
|
|
|
|
|
|
|
|
|
#### Question: will the hybrid make a difference if the smaller contigs are removed
|
|
|
# Answer: lets disregard the smaller contigs, starting at 1000bp and then get the "mappability_index"
|
|
|
# getting read counts for only contigs > (greater than) 1Kbp
|
|
|
# samtools idxstats output:: contig_name, contig_length, mapped_reads, unmapped_reads; note: * at the end indicates number of unmapped reads
|
|
|
|
|
|
# metaG
|
|
|
samtools idxstats sr/bwa_mem/ONT3_MG_xx_Rashi_S11_reads-x-ONT3_MG_xx_Rashi_S11-megahit_contigs.bam > megahit_counts.txt
|
|
|
samtools idxstats lr/merged/no_barcode/no_barcode_reads-x-no_barcode-flye_contigs.bam > flye_counts.txt
|
|
|
|
|
|
samtools idxstats bwa_merged_metaspades_hybrid/bwa_merged_metaspades_hybrid.bam > hybrid_bwa_merged_counts.txt
|
|
|
samtools idxstats bwa_sr_metaspades_hybrid/bwa_sr_metaspades_hybrid.bam > hybrid_bwa_sr_counts.txt
|
|
|
samtools idxstats bwa_lr_metaspades_hybrid/bwa_lr_metaspades_hybrid.bam > hybrid_bwa_lr_counts.txt
|
|
|
|
|
|
samtools idxstats mmi_merged_metaspades_hybrid/mmi_merged_metaspades_hybrid.bam > hybrid_mmi_merged_counts.txt
|
|
|
samtools idxstats mmi_sr_metaspades_hybrid/mmi_sr_metaspades_hybrid.bam > hybrid_mmi_sr_counts.txt
|
|
|
samtools idxstats mmi_lr_metaspades_hybrid/mmi_lr_metaspades_hybrid.bam > hybrid_mmi_lr_counts.txt
|
|
|
|
|
|
# metaT
|
|
|
samtools idxstats ./metaT/sr/FastSelectHalf1_MT_Rashi_S12_reads-x-lr_no_barcode_sr_ONT3_MG_xx_Rashi_S11-metaspades_hybrid_contigs.bam > hybrid_metaT_sr_counts.txt
|
|
|
samtools idxstats ./metaT/sr/FastSelectHalf1_MT_Rashi_S12_reads-x-ONT3_MG_xx_Rashi_S11-megahit_contigs.bam > megahit_metaT_sr_counts.txt
|
|
|
|
|
|
# collating the sums for contigs larger than 1000 bp
|
|
|
for file in *counts.txt
|
|
|
do
|
|
|
echo "$file"
|
|
|
awk '$2 >= 1000 {sum += $3} END {print sum}' "$file" # sum of number of reads (3rd column) if 2nd column is greater/equal to 1000
|
|
|
awk '{sum+=$3+$4} END {print sum}' "$file" # sum of 3rd and 4th columns to get total reads
|
|
|
done | paste - - - | awk '$4=100*$2/$3' | \ # getting the percentage from columns 2 and 3 and writing to column-4
|
|
|
sed $'1 i\\\nsample\tmapped_reads\ttotal\tpercent_mapped' > mappability_over_1000bp.txt
|
|
|
|
|
|
# collating the sums for contigs larger than 1500 bp
|
|
|
for file in *counts.txt
|
|
|
do
|
|
|
echo "$file"
|
|
|
awk '$2 >= 1500 {sum += $3} END {print sum}' "$file"
|
|
|
awk '{sum+=$3+$4} END {print sum}' "$file"
|
|
|
done | paste - - - | awk '$4=100*$2/$3' | \
|
|
|
sed $'1 i\\\nsample\tmapped_reads\ttotal\tpercent_mapped' > mappability_over_1500bp.txt
|
|
|
|
|
|
# collating the sums for contigs larger than 2000 bp
|
|
|
for file in *counts.txt
|
|
|
do
|
|
|
echo "$file"
|
|
|
awk '$2 >= 2000 {sum += $3} END {print sum}' "$file"
|
|
|
awk '{sum+=$3+$4} END {print sum}' "$file"
|
|
|
done | paste - - - | awk '$4=100*$2/$3' | \
|
|
|
sed $'1 i\\\nsample\tmapped_reads\ttotal\tpercent_mapped' > mappability_over_2000bp.txt
|
|
|
|
|
|
# collating the sums for contigs larger than 5000 bp
|
|
|
for file in *counts.txt
|
|
|
do
|
|
|
echo "$file"
|
|
|
awk '$2 >= 5000 {sum += $3} END {print sum}' "$file"
|
|
|
awk '{sum+=$3+$4} END {print sum}' "$file"
|
|
|
done | paste - - - | awk '$4=100*$2/$3' | \
|
|
|
sed $'1 i\\\nsample\tmapped_reads\ttotal\tpercent_mapped' > mappability_over_5000bp.txt
|
|
|
```
|
|
|
|
|
|
##### Notes - for 2018_GDB data #####
|
|
|
1. Flye does not produce metabat bins. Reason == only 2 contigs (less than 1500 bp) have coverages over 1.
|
|
|
2. Should we adjust the "minCV" parameter for metabat?
|
... | ... | |