... | ... | @@ -396,14 +396,92 @@ sed -i 's/=[^=]*//2g' envs/flye_v2_7.yaml |
|
|
- files created are as below
|
|
|
1. [rules/checkpoint_ASSEMBLY_ANNOTATION_RULES](https://git-r3lab.uni.lu/susheel.busi/ont_pilot_gitlab/-/blob/checkpoint_snakefile/2019_GDB/rules/checkpoint_ASSEMBLY_ANNOTATION_RULES)
|
|
|
2. [workflows/checkpoint_assembly_annotation.smk](https://git-r3lab.uni.lu/susheel.busi/ont_pilot_gitlab/-/blob/checkpoint_snakefile/2019_GDB/workflows/checkpoint_assembly_annotation.smk)
|
|
|
|
|
|
```
|
|
|
################################
|
|
|
##### Checkpoint Snakefile #####
|
|
|
###############################
|
|
|
cp updated_SNAKEFILE checkpoint_SNAKEFILE
|
|
|
# edited the paths to the 'rules' and 'workflows' in the "checkpoint_SNAKEFILE", to include the above files
|
|
|
# test run the "checkpoint_SNAKEILF"
|
|
|
snakemake -np -s checkpoint_SNAKEFILE --use-conda # test-run seemed to work fine
|
|
|
```
|
|
|
|
|
|
## Chapter X - The miscellaneous or nearly-forgotten side projects
|
|
|
## Chapter X - Is the short-read metaspades part of the hybrid assembly contributing to the increase in protein counts compared to long-read alone?
|
|
|
- A random Tuesday morning conversation, led to the conclusion that we should also use the MetaSPADES assembler and assemble the short-reads
|
|
|
- This part of the now, super-long story is called, "sr assembly with metaspades"
|
|
|
```
|
|
|
###############################
|
|
|
# SR assembly with metaspades #
|
|
|
###############################
|
|
|
# based on cedric's suggestion - to contrast the mmseq results and eliminate the bias of the assembler
|
|
|
# using metaspades to assemble short-reads
|
|
|
# then will do mmseq, and compare megahit, metaspades and metaspades_hybrid
|
|
|
# saved the below analyses method in the "scripts" folders as "cdhit_comparisons.sh"
|
|
|
|
|
|
cd /scratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB
|
|
|
# edited the following files
|
|
|
1. rules/ASSEMBLY_ANNOTATION_rule # line 783 == added rule "assemble_short_reads_w_metaspades"
|
|
|
2. workflows/assembly_annotation.smk # line 126 == added "metaspades" as a new target
|
|
|
# line 90 == added 'metaspades' in TEST prodigal
|
|
|
3. cluster.json # addded parameters for the new rule
|
|
|
4. rules/MMSEQ_RULES # need to edit the "prepare_plot_files.sh" to include the new additions
|
|
|
5. workflows/mmseq.smk
|
|
|
```
|
|
|
|
|
|
##### MMSEQ #####
|
|
|
- mmseqs2 comparisons were performed as part of the snakefile
|
|
|
- Required adjustment of the plotting prep and R scripts to account for the new "metaspades" group
|
|
|
```
|
|
|
cp mmseq_plots.R [mmseq_plots_w_metaspades.R](https://git-r3lab.uni.lu/susheel.busi/ont_pilot_gitlab/-/blob/checkpoint_snakefile/2019_GDB/scripts/mmseq_plots_w_metaspades.R)
|
|
|
cp prepare_plot_files.sh [prepare_plot_files_w_metaspades.sh](https://git-r3lab.uni.lu/susheel.busi/ont_pilot_gitlab/-/blob/checkpoint_snakefile/2019_GDB/scripts/prepare_plot_files_w_metaspades.sh)
|
|
|
```
|
|
|
|
|
|
##### CD-HIT #####
|
|
|
- comparing metaspades_sr and metaspades_hybrid proteins to see which contributes more to the mmseq comparisons
|
|
|
```
|
|
|
cd /mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB
|
|
|
mkdir cd-hit && cd cd-hit
|
|
|
ln -s ../results/annotation/proteins/metaspades/ONT3_MG_xx_Rashi_S11/final.contigs.faa metaspades.faa
|
|
|
ln -s ../results/annotation/proteins/metaspades_hybrid/lr_no_barcode-sr_ONT3_MG_xx_Rashi_S11/contigs.faa metaspades_hybrid.faa
|
|
|
|
|
|
# getting the count of partially-called genes based on Prodigal output
|
|
|
grep -E -o "partial.{0,3}" metaspades.faa | grep '10\|01\|11' | wc -l > metaspades_partial_counts.txt
|
|
|
grep -E -o "partial.{0,3}" metaspades_hybrid.faa | grep '10\|01\|11' | wc -l > metaspades_hybrid_partial_counts.txt
|
|
|
|
|
|
# getting the counts into one file
|
|
|
for file in *.txt; do echo $file; cat $file; done >> counts
|
|
|
cat counts | paste - - > partial_gene_counts.txt
|
|
|
|
|
|
# editing fasta headers to make it easier after cd-hit clustering
|
|
|
conda activate bbmap
|
|
|
rename.sh in=metaspades.faa out=spades.faa prefix=spades ignorejunk=t
|
|
|
rename.sh in=metaspades_hybrid.faa out=hybrid.faa prefix=hybrid ignorejunk=t
|
|
|
|
|
|
si # interactive
|
|
|
conda activate cd-hit
|
|
|
cd-hit-2d -i spades.faa -i2 hybrid.faa -o spades_hybrid -c 0.9 -n 5 -d 0 -M 16000 -T 8
|
|
|
cd-hit-2d -i hybrid.faa -i2 spades.faa -o hybrid_spades -c 0.9 -n 5 -d 0 -M 16000 -T 8
|
|
|
|
|
|
# determining number of unique sequences
|
|
|
# according to http://weizhongli-lab.org/lab-wiki/doku.php?id=cd-hit-user-guide#cd-hit-2d
|
|
|
# CD-HIT-2D outputs two files: a fasta file of proteins in "db2" that are not similar to db1 and a text file that lists similar sequences between db1 & db2
|
|
|
grep -c '>' spades_hybrid # db2 == hybrid
|
|
|
# 63911
|
|
|
grep -c '>' hybrid_spades # db2 == spades
|
|
|
# 27526
|
|
|
|
|
|
# using auxilliary scripts to merge the cluster (output) files
|
|
|
# making plots for all .clstr files (http://weizhongli-lab.org/lab-wiki/doku.php?id=cd-hit-user-guide#cd-hit-2d)
|
|
|
for file in *.clstr
|
|
|
do
|
|
|
echo "$file"
|
|
|
plot_len1.pl "$file" \
|
|
|
1,2-4,5-9,10-19,20-49,50-99,100-299,500-99999 \
|
|
|
10-59,60-149,150-499,500-1999,2000-999999
|
|
|
done >> cluster_plots
|
|
|
```
|
|
|
|
|
|
## Chapter XI - The miscellaneous or nearly-forgotten side projects
|
|
|
- Due to the multifaceted nature of the best, i.e. the modular workflow, we tested several aspects separately
|
|
|
- For example: since we used two mappers bwa-mem and minimap for the reads, we binned each sample separtely based on the mapper
|
|
|
- Additionally, we needed to merge bam files for the "hybrid"-binning, so we compared bins using [sourmash](https://github.com/dib-lab/sourmash) and compared assemblies using [quast](https://github.com/ablab/quast)
|
... | ... | |