... | ... | @@ -845,7 +845,9 @@ agb.py -i methylation_aware_results/assembly/flye/lr/merged/no_barcode/ -a Flye |
|
|
- i.e. the number of reads mapping back to the assemblies
|
|
|
```
|
|
|
cd /scratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/results/mapping
|
|
|
conda activate anvio6
|
|
|
conda create -n analysis -c bioconda samtools
|
|
|
conda activate analysis
|
|
|
conda env export > /scratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/envs/analysis.yaml
|
|
|
srun -p interactive -t 4:00:00 --pty bash -i # interactive session
|
|
|
|
|
|
# getting the "mappability" using a launcher script for the following:
|
... | ... | @@ -931,6 +933,98 @@ do |
|
|
awk '{sum+=$3+$4} END {print sum}' "$file"
|
|
|
done | paste - - - | awk '$4=100*$2/$3' | \
|
|
|
sed $'1 i\\\nsample\tmapped_reads\ttotal\tpercent_mapped' > mappability_over_5000bp.txt
|
|
|
|
|
|
## Assessing the number of "unique_mapping_reads" from BAM files ##
|
|
|
-based on information here: https://bioinformatics.stackexchange.com/questions/508/obtaining-uniquely-mapped-reads-from-bwa-mem-alignment/4542#4542
|
|
|
samtools view -h megahit/megahit.bam | grep -v "XT:A:U"| wc -l
|
|
|
# 51826913
|
|
|
# total =
|
|
|
# percent_mapped =
|
|
|
samtools view flye/flye.bam | grep -v "XT:A:U" | uniq | wc -l
|
|
|
# 8351240
|
|
|
# total =
|
|
|
# percent_mapped =
|
|
|
samtools view bwa_merged_metaspades_hybrid/bwa_merged_metaspades_hybrid.bam | grep -v "XT:A:U" | wc -l
|
|
|
# 60114498
|
|
|
# total =
|
|
|
# percent_mapped =
|
|
|
|
|
|
NOTE: all of the above can be run using the **rules/ANALYSIS_RULES**
|
|
|
```
|
|
|
|
|
|
##### CRISPR #####
|
|
|
- A key feature of the long-read technology is the ability to sequence **repeat** regions
|
|
|
- So, CRISPRS are exactly that, and why don't we look for this tiny little pesky repeats that give short-reads such a bad name
|
|
|
- We decided to go with [CASC](https://github.com/dnasko/CASC) and [minced](https://github.com/ctSkennerton/minced)
|
|
|
- Rob Edwards from UCSD suggested trying the CRISPRDetect, but @SBB got stuck in ```perl-hell```, so we decided to put that away for another time
|
|
|
- NOTE: rather than following all of this long-winded way, use the "workflows/analysis.smk" for the CRISPR detection and quantification
|
|
|
|
|
|
```
|
|
|
cd /mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB
|
|
|
mkdir crispr
|
|
|
cd crispr
|
|
|
srun -p interactive -t 4:00:00 --pty bash -i
|
|
|
conda activate analysis
|
|
|
conda install -c anaconda perl
|
|
|
# downloading and installing CASC: https://github.com/dnasko/CASC
|
|
|
git clone https://github.com/dnasko/CASC.git
|
|
|
|
|
|
# installing locally
|
|
|
cd CASC
|
|
|
PERL Makefile.PL PREFIX=/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr
|
|
|
make
|
|
|
make test
|
|
|
make install
|
|
|
|
|
|
# updating the PATHs
|
|
|
export PATH=$PATH:/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/bin
|
|
|
export PERL5LIB=/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/lib/site_perl
|
|
|
|
|
|
# testing with 'flye'
|
|
|
casc -i
|
|
|
/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/results/assembly/flye.fa \
|
|
|
-o /mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/flye_crispr_output \
|
|
|
-n 12 --conservative
|
|
|
|
|
|
# running in a loop for all samples since it only takes 20 minutes per sample
|
|
|
for file in /mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/results/assembly/*.fa
|
|
|
do
|
|
|
casc -i "$file" \
|
|
|
-o /mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/$(basename "$file")_crispr_output -n 12 --conservative
|
|
|
done
|
|
|
|
|
|
# collating the number of CRISPR spacers
|
|
|
awk '$8 == "true" {sum += $2} END {print sum}' flye_crispr_output/flye.results.txt
|
|
|
# 28
|
|
|
awk '$8 == "true" {sum += $2} END {print sum}' bwa_lr_metaspades_hybrid.fa_crispr_output/bwa_lr_metaspades_hybrid.results.txt
|
|
|
# 61
|
|
|
awk '$8 == "true" {sum += $2} END {print sum}' bwa_merged_metaspades_hybrid.fa_crispr_output/bwa_merged_metaspades_hybrid.results.txt
|
|
|
# 61
|
|
|
|
|
|
for i in `cat files`
|
|
|
do
|
|
|
echo "$i"
|
|
|
awk '$8 == "true" {sum += $2} END {print sum}' "$i".fa_crispr_output/"$i".results.txt
|
|
|
done | paste - - | sed $'1 i\\\nassembly\tnumber_CRISPR_spacers' > casc_CRISPR_output.txt
|
|
|
|
|
|
##### MinCED #####
|
|
|
# https://github.com/ctSkennerton/minced
|
|
|
git clone https://github.com/ctSkennerton/minced.git
|
|
|
cd minced
|
|
|
conda install -c bioconda java-jdk
|
|
|
make
|
|
|
|
|
|
for file in /mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/results/assembly/*.fa
|
|
|
do
|
|
|
ln -s $file .
|
|
|
done
|
|
|
|
|
|
for file in *.fa
|
|
|
do
|
|
|
./minced "$file" "$file".txt "$file".gff
|
|
|
echo "$file" >> minced_CRISPR_output.txt
|
|
|
grep -c 'CRISPR' "$file".txt >> minced_CRISPR_output.txt
|
|
|
done
|
|
|
```
|
|
|
|
|
|
###############
|
... | ... | |