... | ... | @@ -269,6 +269,105 @@ tar xzf checkm_data_2015_01_16.tar.gz |
|
|
# Finally, everything ran beautifully - MACHINE VALIDATION COMPLETE!!
|
|
|
```
|
|
|
|
|
|
## Chapter VIII - Are we there yet? Nope, we've still got 2019
|
|
|
- While 2020 has turned out to be a crappy year all around, thanks to SARS-CoV-2,
|
|
|
- 2019 was a great year for GDB, who donated sufficient sample to run metaG, metaT, metaP (proteins) and also ONT (long-read) sequencing
|
|
|
- This was also the first time in our lab that we ran a MinION flowcell to its exhaustion, acquiring ~31GB of data, thanks to RH's efforts
|
|
|
- This section of the narrative will be titled, "the shit storm that wasn't!"
|
|
|
```
|
|
|
##### 2019 GDB DATA #####
|
|
|
# since the new guppy-3.6 was released with better accuracy, I downloaded and installed it locally
|
|
|
# downloaded from: https://community.nanoporetech.com/downloads
|
|
|
cd /home/users/sbusi/apps
|
|
|
wget https://mirror.oxfordnanoportal.com/software/analysis/ont-guppy_3.6.0_linux64.tar.gz
|
|
|
tar -xzvf ont-guppy_3.6.0_linux64.tar.gz # exported to PATH to be used later
|
|
|
|
|
|
# Preparing the data folders etc.
|
|
|
cd /scratch/users/sbusi/ONT/cedric_ont_basecalling
|
|
|
mkdir 2019_GDB
|
|
|
cd /scratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB
|
|
|
|
|
|
mkdir data
|
|
|
mkdir data/metaT
|
|
|
cp -vrf ../MODULAR_SNAKEFILE/config .
|
|
|
cp -vrf ../MODULAR_SNAKEFILE/cluster.json .
|
|
|
cp -vrf ../MODULAR_SNAKEFILE/envs .
|
|
|
cp -vrf ../MODULAR_SNAKEFILE/dbs .
|
|
|
cp -vrf ../MODULAR_SNAKEFILE/rules .
|
|
|
cp -vrf ../MODULAR_SNAKEFILE/scripts .
|
|
|
cp -vrf ../MODULAR_SNAKEFILE/src .
|
|
|
cp -vrf ../MODULAR_SNAKEFILE/workflows .
|
|
|
cp -vrf ../MODULAR_SNAKEFILE/workflows_SNAKEFILE .
|
|
|
cp -vrf ../MODULAR_SNAKEFILE/updated_SNAKEFILE .
|
|
|
|
|
|
mkdir data/raw
|
|
|
mkdir data/raw/short_reads
|
|
|
|
|
|
# symlinking to the raw ONT reads
|
|
|
cd /scratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/data/raw
|
|
|
|
|
|
ln -sd /scratch/users/claczny/ont/fecal_pilot-Aug2019_run/data/raw/DonerB_fecal \
|
|
|
/scratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/data/raw/. # just in case
|
|
|
|
|
|
cp -vrf /scratch/users/claczny/ont/fecal_pilot-Aug2019_run/data/raw/DonerB_fecal/S1_SizeSelected/20190828_0614_MN22103_FAK04421_24db0054/fast5/ \
|
|
|
/scratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/data/multifast5/S1_SizeSelected
|
|
|
cp -vrf /scratch/users/claczny/ont/fecal_pilot-Aug2019_run/data/raw/DonerB_fecal/S3_Gtube/20190828_1842_MN22103_FAK04421_7a557a8b/fast5/ \
|
|
|
/scratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/data/multifast5/S3_Gtube
|
|
|
|
|
|
# copying the metaG reads
|
|
|
cp -vrf /mnt/isilon/projects/lcsb_sequencing/transfer/bioecosystem/Rashi/2019/Nov/MG/fastq/*.gz \
|
|
|
/scratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/data/raw/short_reads/.
|
|
|
|
|
|
# copying the metaT reads
|
|
|
cp -vrf /mnt/isilon/projects/lcsb_sequencing/transfer/bioecosystem/Rashi/2019/Nov/MT/fastq/*Full* \
|
|
|
/scratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/data/metaT/.
|
|
|
cp -vrf /mnt/isilon/projects/lcsb_sequencing/transfer/bioecosystem/Rashi/2019/Nov/MT/fastq/*Half* \
|
|
|
/scratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/data/metaT/.
|
|
|
|
|
|
# Since there were several metaT samples to choose from (processed with different library preparations)
|
|
|
# Counted the number of reads in the metaT fastq.gz files to decide which sample to use
|
|
|
cd /scratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/data/metaT
|
|
|
for file in *.fastq.gz
|
|
|
do
|
|
|
echo "$file"; zcat "$file" | echo $((`wc -l`/4))
|
|
|
done
|
|
|
# FastSelectFull1_MT_Rashi_S14_R1_001.fastq.gz
|
|
|
# 40105481
|
|
|
# FastSelectFull1_MT_Rashi_S14_R2_001.fastq.gz
|
|
|
# 40105481
|
|
|
# FastSelectFull2_MT_Rashi_S15_R1_001.fastq.gz
|
|
|
# 38354199
|
|
|
# FastSelectFull2_MT_Rashi_S15_R2_001.fastq.gz
|
|
|
# 38354199
|
|
|
# FastSelectHalf1_MT_Rashi_S12_R1_001.fastq.gz
|
|
|
# 40251171
|
|
|
# FastSelectHalf1_MT_Rashi_S12_R2_001.fastq.gz
|
|
|
# 40251171
|
|
|
# FastSelectHalf2_MT_Rashi_S13_R1_001.fastq.gz
|
|
|
# 39937608
|
|
|
# FastSelectHalf2_MT_Rashi_S13_R2_001.fastq.gz
|
|
|
# 39937608
|
|
|
|
|
|
# Chose the "FastSelectHalf1" sample because it had the highest read count (40251171 reads per file)
|
|
|
|
|
|
## Edited the "config/CONFIG.yaml" file with the appropriate file paths
|
|
|
vi config/CONFIG.yaml
|
|
|
# since there are only 2 runs: S1_SizeSelected, S3_Gtube
|
|
|
# Also, had to adjust the rules/ASSEMBLY_ANNOTATION_RULES, and the workflows/assembly.smk files
|
|
|
# Adjusted the short-read "name" in the rules files from "NEB2_MG_S17" to "ONT3_MG_xx_Rashi_S11"
|
|
|
# Adjusted the config file to run the first "steps" line (below) to completion before moving on to the second "steps" line and beyond
|
|
|
# Included the workflows as steps in the config/CONFIG.yaml file
|
|
|
# steps: "mmseq metaT mapping binning taxonomy"
|
|
|
steps: "assembly_annotation metaT"
|
|
|
steps: "mapping mmseq binning taxonomy"
|
|
|
|
|
|
# Basecalling taking too long, so running separately
|
|
|
# created a "basecalling_snakefile" and an associated launcher script "run_basecalling_snakemake.sh"
|
|
|
cp src/snakemake_run_use_conda_FINAL.sh run_basecalling_snakemake.sh
|
|
|
./run_basecalling_snakemake.sh
|
|
|
```
|
|
|
|
|
|
|
|
|
## Chapter IX - The miscellaneous or nearly-forgotten side projects
|
|
|
- Due to the multifaceted nature of the beast, i.e. the modular workflow, we tested several aspects separately
|
|
|
- For example: since we used two mappers, bwa-mem and minimap, for the reads, we binned each sample separately based on the mapper
|
... | ... | |