Susheel Busi · ad21d88e
--- a/ONT_pilot_w_GDB_samples.md
+++ b/ONT_pilot_w_GDB_samples.md
@@ -845,7 +845,9 @@ agb.py -i methylation_aware_results/assembly/flye/lr/merged/no_barcode/ -a Flye
 - i.e. the number of reads mapping back to the assemblies
 ```
 cd /scratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/results/mapping
-conda activate anvio6
+conda create -n analysis -c bioconda samtools
+conda activate analysis
+conda env export > /scratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/envs/analysis.yaml
 srun -p interactive -t 4:00:00 --pty bash -i    # interactive session

 # getting the "mappability" using a launcher script for the following:
@@ -931,6 +933,98 @@ do
  awk '{sum+=$3+$4} END {print sum}' "$file"
 done | paste - - - | awk '$4=100*$2/$3' | \
  sed $'1 i\\\nsample\tmapped_reads\ttotal\tpercent_mapped' > mappability_over_5000bp.txt
+
+## Assessing the number of "unique_mapping_reads" from BAM files ##
+-based on information here: https://bioinformatics.stackexchange.com/questions/508/obtaining-uniquely-mapped-reads-from-bwa-mem-alignment/4542#4542
+samtools view -h megahit/megahit.bam | grep -v "XT:A:U"| wc -l
+# 51826913
+# total = 
+# percent_mapped =
+samtools view flye/flye.bam | grep -v "XT:A:U" | uniq | wc -l
+# 8351240
+# total = 
+# percent_mapped =
+samtools view bwa_merged_metaspades_hybrid/bwa_merged_metaspades_hybrid.bam | grep -v "XT:A:U" | wc -l
+# 60114498
+# total = 
+# percent_mapped =
+
+NOTE: all of the above can be run using the **rules/ANALYSIS_RULES**
+```
+
+##### CRISPR #####
+- A key feature of the long-read technology is the ability to sequence **repeat** regions
+- So, CRISPRS are exactly that, and why don't we look for this tiny little pesky repeats that give short-reads such a bad name
+- We decided to go with [CASC](https://github.com/dnasko/CASC) and [minced](https://github.com/ctSkennerton/minced)
+- Rob Edwards from UCSD suggested trying the CRISPRDetect, but @SBB got stuck in ```perl-hell```, so we decided to put that away for another time
+- NOTE: rather than following all of this long-winded way, use the "workflows/analysis.smk" for the CRISPR detection and quantification
+
+```
+cd /mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB
+mkdir crispr
+cd crispr
+srun -p interactive -t 4:00:00 --pty bash -i
+conda activate analysis
+conda install -c anaconda perl
+# downloading and installing CASC: https://github.com/dnasko/CASC
+git clone https://github.com/dnasko/CASC.git
+
+# installing locally
+cd CASC
+PERL Makefile.PL PREFIX=/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr
+make
+make test
+make install
+
+# updating the PATHs
+export PATH=$PATH:/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/bin
+export PERL5LIB=/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/lib/site_perl
+
+# testing with 'flye'
+casc -i 
+/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/results/assembly/flye.fa \
+  -o /mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/flye_crispr_output \
+    -n 12 --conservative
+
+# running in a loop for all samples since it only takes 20 minutes per sample
+for file in /mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/results/assembly/*.fa
+do 
+  casc -i "$file" \
+  -o /mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/$(basename "$file")_crispr_output -n 12 --conservative
+done
+
+# collating the number of CRISPR spacers
+awk '$8 == "true" {sum += $2} END {print sum}' flye_crispr_output/flye.results.txt
+# 28
+awk '$8 == "true" {sum += $2} END {print sum}' bwa_lr_metaspades_hybrid.fa_crispr_output/bwa_lr_metaspades_hybrid.results.txt
+# 61
+awk '$8 == "true" {sum += $2} END {print sum}' bwa_merged_metaspades_hybrid.fa_crispr_output/bwa_merged_metaspades_hybrid.results.txt
+# 61
+
+for i in `cat files`
+do 
+  echo "$i"
+  awk '$8 == "true" {sum += $2} END {print sum}' "$i".fa_crispr_output/"$i".results.txt
+done | paste - - | sed $'1 i\\\nassembly\tnumber_CRISPR_spacers'  > casc_CRISPR_output.txt
+
+##### MinCED #####
+# https://github.com/ctSkennerton/minced
+git clone https://github.com/ctSkennerton/minced.git
+cd minced 
+conda install -c bioconda java-jdk
+make 
+
+for file in /mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/results/assembly/*.fa
+do
+  ln -s $file .
+done 
+
+for file in *.fa
+do 
+  ./minced "$file" "$file".txt "$file".gff
+  echo "$file" >> minced_CRISPR_output.txt
+  grep -c 'CRISPR' "$file".txt >> minced_CRISPR_output.txt
+done
 ```

 ###############