Commit 60215475 authored by Shaman Narayanasamy

Naive assemblies

parent b89ed332
import os
## Config
configfile: "config.iterative_MG-gaia183.json"
## Parameters required from the config file
MEMCORE = os.environ.get("MEMCORE", config['memory_per_core_gb'])
THREADS = os.environ.get("THREADS", config['threads'])
## Input and output parameters
INPUT_DIR = os.environ.get("INPUT_DIR")
OUT_DIR = os.environ.get("OUT_DIR")
OUT_LOG = os.environ.get("OUT_LOG")
MG_ASSM = os.environ.get("MG_ASSM")
MT_ASSM = os.environ.get("MT_ASSM")
TMP = os.environ.get("TMP")
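## Workflow overview (derived from the rules below): merge the precomputed MG
## and MT contig sets with cap3, evaluate the merged assembly with metaQUAST,
## map the preprocessed MG and MT reads back to it with bwa mem + samtools,
## and collect flagstat mapping statistics.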
### General rule for the naive assemblies ###
rule ALL:
    input:
        "%s/MG_MT_cap3.merged.fa" % OUT_DIR,
        '%s/MT_IMP_x_MG_MT_cap3.merged.sorted_flagstat.txt' % OUT_DIR,
        '%s/MG_IMP_x_MG_MT_cap3.merged.sorted_flagstat.txt' % OUT_DIR,
        "%s/quast/report.txt" % OUT_DIR,
    shell:
        "echo 'DONE'"
### Rules for the initial assembly ###
rule MERGE_ASSEMBLIES:
    input:
        {MG_ASSM},
        {MT_ASSM}
    output:
        "%s/MG_MT_cap3.cat.fa" % OUT_DIR,
        "%s/MG_MT_cap3.merged.fa" % OUT_DIR
    benchmark:
        "%s/benchmarks/MERGE_ASSEMBLIES.json" % OUT_DIR
    log:
        OUT_LOG
    shell:
        """
        echo "[x] Merging initial assembly contigs [x]"
        cat {input} > {output[0]}
        # Options should be after the input file!
        cap3 {output[0]} -p 98 -o 100 \
        >> {log} 2>&1
        # Concatenate assembled contigs and singletons, then rename the contigs
        cat {output[0]}.cap.contigs {output[0]}.cap.singlets | \
        awk '/^>/{{print \">contig_\" ++i; next}}{{print}}' > {output[1]}
        """
### General rules for MG data
rule QUAST:
    input:
        "%s/MG_MT_cap3.merged.fa" % OUT_DIR
    output:
        "%s/quast/report.txt" % (OUT_DIR),
        "%s/quast/report.tsv" % (OUT_DIR),
        "%s/quast/report.tex" % (OUT_DIR)
    benchmark:
        "%s/benchmarks/QUAST.json" % (OUT_DIR)
    log:
        OUT_LOG
    shell:
        """
        METAQUAST="/mnt/nfs/projects/ecosystem_biology/local_tools/IMP/dependencies/quast/metaquast.py"
        python2.7 $METAQUAST -t {THREADS} -o {OUT_DIR}/quast \
        {input} \
        --max-ref-number 0 --no-check --fast -f \
        >> {log} 2>&1
        """
## Index the merged assembly using bwa
rule INDEX_DATABASE:
    input:
        "%s/MG_MT_cap3.merged.fa" % OUT_DIR
    output:
        "%s/MG_MT_cap3.merged.fa.amb" % OUT_DIR,
        "%s/MG_MT_cap3.merged.fa.bwt" % OUT_DIR,
        "%s/MG_MT_cap3.merged.fa.pac" % OUT_DIR,
        "%s/MG_MT_cap3.merged.fa.ann" % OUT_DIR,
        "%s/MG_MT_cap3.merged.fa.sa" % OUT_DIR
    benchmark:
        "%s/benchmarks/INDEX_DATABASE.json" % OUT_DIR
    log:
        OUT_LOG
    shell:
        """
        bwa index {input}
        """
rule MAP_IMP_MG_READS:
    input:
        "%s/MG.R1.preprocessed.fq" % INPUT_DIR,
        "%s/MG.R2.preprocessed.fq" % INPUT_DIR,
        "%s/MG.SE.preprocessed.fq" % INPUT_DIR,
        "%s/MG_MT_cap3.merged.fa" % OUT_DIR,
        "%s/MG_MT_cap3.merged.fa.amb" % OUT_DIR,
        "%s/MG_MT_cap3.merged.fa.bwt" % OUT_DIR,
        "%s/MG_MT_cap3.merged.fa.pac" % OUT_DIR,
        "%s/MG_MT_cap3.merged.fa.ann" % OUT_DIR,
        "%s/MG_MT_cap3.merged.fa.sa" % OUT_DIR
    output:
        '%s/MG_IMP_x_MG_MT_cap3.merged.sorted.bam' % OUT_DIR,
        '%s/MG_IMP_x_MG_MT_cap3.merged.sorted.bam.bai' % OUT_DIR,
    benchmark:
        "%s/benchmarks/MAP_MG_READS.json" % OUT_DIR
    log:
        OUT_LOG
    shell:
        """
        SAMHEADER="@RG\\tID:{{SAMPLE}}\\tSM:MG"
        PREFIX={OUT_DIR}/MG_IMP_x_MG_MT_cap3.merged
        # merge paired and se
        samtools merge -@ {THREADS} -f $PREFIX.merged.bam \
        <(bwa mem -v 1 -t {THREADS} -M -R \"$SAMHEADER\" {input[3]} {input[0]} {input[1]} | \
        samtools view -@ {THREADS} -bS -) \
        <(bwa mem -v 1 -t {THREADS} -M -R \"$SAMHEADER\" {input[3]} {input[2]} | \
        samtools view -@ {THREADS} -bS -)
        # sort (samtools 0.1.19 syntax: positional output prefix)
        samtools sort -@ {THREADS} -m {MEMCORE}G $PREFIX.merged.bam $PREFIX.sorted
        rm $PREFIX.merged.bam
        # index
        samtools index $PREFIX.sorted.bam
        """
### Obtain MG read mapping statistics
rule IMP_MG_MAP_STATS:
    input:
        "%s/MG_MT_cap3.merged.fa" % OUT_DIR,
        '%s/MG_IMP_x_MG_MT_cap3.merged.sorted.bam' % OUT_DIR,
    output:
        "%s/MG_MT_cap3.merged.fa.fai" % OUT_DIR,
        '%s/MG_IMP_x_MG_MT_cap3.merged.sorted_flagstat.txt' % OUT_DIR,
    log:
        OUT_LOG
    benchmark:
        "%s/benchmarks/ANALYSIS_MG_CALL_CONTIG_DEPTH.json" % OUT_DIR
    shell:
        """
        if [[ ! -f {input[0]}.fai ]]
        then
            echo "No fasta index! Creating one." >> {log}
            samtools faidx {input[0]}
        fi
        echo "flagstat" >> {log}
        samtools flagstat {input[1]} > {output[1]}
        """
### Map MT reads
rule MAP_IMP_MT_READS:
    input:
        "%s/MT.R1.preprocessed.fq" % INPUT_DIR,
        "%s/MT.R2.preprocessed.fq" % INPUT_DIR,
        "%s/MT.SE.preprocessed.fq" % INPUT_DIR,
        "%s/MG_MT_cap3.merged.fa" % OUT_DIR,
        "%s/MG_MT_cap3.merged.fa.amb" % OUT_DIR,
        "%s/MG_MT_cap3.merged.fa.bwt" % OUT_DIR,
        "%s/MG_MT_cap3.merged.fa.pac" % OUT_DIR,
        "%s/MG_MT_cap3.merged.fa.ann" % OUT_DIR,
        "%s/MG_MT_cap3.merged.fa.sa" % OUT_DIR
    output:
        '%s/MT_IMP_x_MG_MT_cap3.merged.sorted.bam' % OUT_DIR,
        '%s/MT_IMP_x_MG_MT_cap3.merged.sorted.bam.bai' % OUT_DIR,
    benchmark:
        "%s/benchmarks/MAP_MT_READS.json" % OUT_DIR
    log:
        OUT_LOG
    shell:
        """
        SAMHEADER="@RG\\tID:{{SAMPLE}}\\tSM:MT"
        PREFIX={OUT_DIR}/MT_IMP_x_MG_MT_cap3.merged
        # merge paired and se
        samtools merge -@ {THREADS} -f $PREFIX.merged.bam \
        <(bwa mem -v 1 -t {THREADS} -M -R \"$SAMHEADER\" {input[3]} {input[0]} {input[1]} | \
        samtools view -@ {THREADS} -bS -) \
        <(bwa mem -v 1 -t {THREADS} -M -R \"$SAMHEADER\" {input[3]} {input[2]} | \
        samtools view -@ {THREADS} -bS -)
        # sort (samtools 0.1.19 syntax: positional output prefix)
        samtools sort -@ {THREADS} -m {MEMCORE}G $PREFIX.merged.bam $PREFIX.sorted
        rm $PREFIX.merged.bam
        # index
        samtools index $PREFIX.sorted.bam
        """
### Obtain MT read mapping statistics
rule IMP_MT_MAP_STATS:
    input:
        "%s/MG_MT_cap3.merged.fa" % OUT_DIR,
        '%s/MT_IMP_x_MG_MT_cap3.merged.sorted.bam' % OUT_DIR,
    output:
        "%s/MG_MT_cap3.merged.fa.fai" % OUT_DIR,
        '%s/MT_IMP_x_MG_MT_cap3.merged.sorted_flagstat.txt' % OUT_DIR,
    log:
        OUT_LOG
    benchmark:
        "%s/benchmarks/ANALYSIS_MT_CALL_CONTIG_DEPTH.json" % OUT_DIR
    shell:
        """
        if [[ ! -f {input[0]}.fai ]]
        then
            echo "No fasta index! Creating one." >> {log}
            samtools faidx {input[0]}
        fi
        echo "flagstat" >> {log}
        samtools flagstat {input[1]} > {output[1]}
        """
{
  "threads": 8,
  "memory_total_gb": 256,
  "memory_per_core_gb": 32,
  "tmp_dir": "tmp",
  "imp_src": "src",
  "raws": {
    "Metagenomics": "",
    "Metatranscriptomics": ""
  },
  "sample": "test",
  "outputdir": "/output",
  "db_path": "/databases",
  "preprocessing_filtering": true,
  "assembler": "idba",
  "trimmomatic": {
    "pkg_url": "https://webdav-r3lab.uni.lu/public/R3lab/IMP/Trimmomatic-Src-0.32.zip",
    "adapter": "TruSeq3",
    "leading": 20,
    "minlen": 40,
    "palindrome_clip_threshold": 30,
    "simple_clip_threshold": 10,
    "trailing": 20,
    "seed_mismatch": 2,
    "window_size": 1,
    "window_quality": 3,
    "strictness": 0.5,
    "target_length": 40,
"jarfile": "/home/imp/lib/trimmomatic-1.32.jar"
  },
  "idba_ud": {
    "mink": 25,
    "maxk": 99,
    "step": 4,
    "perid": 0.98
  },
  "cap3": {
    "identity": 100,
    "overlap": 1000
  },
  "vizbin": {
    "dimension": 50,
    "kmer": 5,
    "size": 4,
    "theta": 0.5,
    "perp": 30,
    "cutoff": 1000,
    "jarfile": "/home/imp/lib/VizBin-dist.jar"
  },
  "human_filtering": {
    "filter": "hg38",
    "url": "http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz"
  },
  "sortmerna": {
    "pkg_url": "https://webdav-r3lab.uni.lu/public/R3lab/IMP/sortmerna.2.0.tgz",
    "scripts_path": "/home/imp/lib",
    "files": [
      "rfam-5.8s-database-id98",
      "silva-arc-16s-id95",
      "silva-bac-16s-id90",
      "silva-euk-18s-id95",
      "rfam-5s-database-id98",
      "silva-arc-23s-id98",
      "silva-bac-23s-id98",
      "silva-euk-28s-id98"
    ]
  },
  "prokka": {
    "pkg_url": "https://webdav-r3lab.uni.lu/public/R3lab/IMP/prokka-1.11.tar.gz",
    "databases": [
      "cm/Bacteria.i1i",
      "genus/Staphylococcus.phr",
      "hmm/CLUSTERS.hmm.h3f",
      "kingdom/Archaea/sprot.phr"
    ]
  },
  "kegg": {
    "db_ec2pthy": "http://rest.kegg.jp/link/ec/pathway",
    "db_hierarchy": "http://rest.kegg.jp/list/pathway"
  }
}
{
  "threads": 8,
  "memory_total_gb": 200,
  "memory_per_core_gb": 25,
  "tmp_dir": "tmp",
  "imp_src": "src",
  "raws": {
    "Metagenomics": "",
    "Metatranscriptomics": ""
  },
  "sample": "test",
  "outputdir": "/output",
  "db_path": "/databases",
  "preprocessing_filtering": true,
  "assembler": "idba",
  "trimmomatic": {
    "pkg_url": "https://webdav-r3lab.uni.lu/public/R3lab/IMP/Trimmomatic-Src-0.32.zip",
    "adapter": "TruSeq3",
    "leading": 20,
    "minlen": 40,
    "palindrome_clip_threshold": 30,
    "simple_clip_threshold": 10,
    "trailing": 20,
    "seed_mismatch": 2,
    "window_size": 1,
    "window_quality": 3,
    "strictness": 0.5,
    "target_length": 40,
    "jarfile": "/home/imp/lib/trimmomatic-0.32.jar"
  },
  "idba_ud": {
    "mink": 25,
    "maxk": 99,
    "step": 4,
    "perid": 0.98
  },
  "vizbin": {
    "dimension": 50,
    "kmer": 5,
    "size": 4,
    "theta": 0.5,
    "perp": 30,
    "cutoff": 1000,
    "jarfile": "/home/imp/lib/VizBin-dist.jar"
  },
  "human_filtering": {
    "filter": "hg38",
    "url": "http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz"
  },
  "sortmerna": {
    "pkg_url": "https://webdav-r3lab.uni.lu/public/R3lab/IMP/sortmerna.2.0.tgz",
    "scripts_path": "/home/imp/lib",
    "files": [
      "rfam-5.8s-database-id98",
      "silva-arc-16s-id95",
      "silva-bac-16s-id90",
      "silva-euk-18s-id95",
      "rfam-5s-database-id98",
      "silva-arc-23s-id98",
      "silva-bac-23s-id98",
      "silva-euk-28s-id98"
    ]
  },
  "prokka": {
    "pkg_url": "https://webdav-r3lab.uni.lu/public/R3lab/IMP/prokka-1.11.tar.gz",
    "databases": [
      "cm/Bacteria.i1i",
      "genus/Staphylococcus.phr",
      "hmm/CLUSTERS.hmm.h3f",
      "kingdom/Archaea/sprot.phr"
    ]
  },
  "kegg": {
    "db_ec2pthy": "http://rest.kegg.jp/link/ec/pathway",
    "db_hierarchy": "http://rest.kegg.jp/list/pathway"
  }
}
#!/bin/bash -l
source ./preload_modules.sh
INDIR=$1
OUTDIR=$2
OUTLOG=$3
MG_ASSM=$4
MT_ASSM=$5
TMPDIR=$6
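# Example invocation (all six arguments are positional; angle brackets are
# placeholders, not real paths):
#   ./execution.sh <INDIR> <OUTDIR> <OUTLOG> <MG_ASSM> <MT_ASSM> <TMPDIR>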
date
### Initialize first assembly ###
INPUT_DIR=$INDIR \
OUT_DIR=$OUTDIR \
OUT_LOG=$OUTLOG \
MG_ASSM=$MG_ASSM \
MT_ASSM=$MT_ASSM \
TMP=$TMPDIR \
snakemake -pr ALL
date
#!/bin/bash -l
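# Submit one naive-assembly job per sample to OAR (bigmem, idempotent,
# besteffort; 8 cores on one node) using the OARSUB command defined below.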
OARSUB="oarsub --notify "mail:shaman.narayanasamy@uni.lu" -t bigmem -t idempotent -t besteffort -l core=8/nodes=1,walltime=2"
#OARSUB="oarsub --notify "mail:shaman.narayanasamy@uni.lu" -t bigmem -p "network_address=\'gaia-183\'" --project "project_biocore" -l core=8/nodes=1,walltime=120"
declare -a SAMPLES=("SM" "HF1" "HF2" "HF3" "HF4" "HF5" "WW1" "WW2" "WW3" "WW4" "BG")
### Repeat for all the data sets
for S in "${SAMPLES[@]}"
do
INDIR="/scratch/users/snarayanasamy/IMP_MS_data/IMP_analysis/${S}/Preprocessing/"
OUTDIR="/scratch/users/snarayanasamy/IMP_MS_data/naive_assemblies/${S}"
OUTLOG="/scratch/users/snarayanasamy/IMP_MS_data/naive_assemblies/${S}/${S}_naive_assembly.log"
MG_ASSM="/scratch/users/snarayanasamy/IMP_MS_data/iterative_assemblies/MG_assemblies/${S}/MG_contigs_1.fa"
MT_ASSM="/scratch/users/snarayanasamy/IMP_MS_data/iterative_assemblies/MT_assemblies/${S}/MT_contigs_1.fa"
TMPDIR="/scratch/users/snarayanasamy/IMP_MS_data/naive_assemblies/${S}/tmp"
${OARSUB} -n "${S}_naive_assm" "./execution.sh $INDIR $OUTDIR $OUTLOG $MG_ASSM $MT_ASSM $TMPDIR"
#CMD="./execution.sh $INDIR $OUTDIR $OUTLOG $MG_ASSM $MT_ASSM $TMPDIR"
#echo $CMD
# exec ${CMD}
done
IMP_ENV=/mnt/nfs/projects/ecosystem_biology/local_tools/IMP/dependencies
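# Prepend the locally installed IMP dependencies (assemblers, mappers,
# samtools 0.1.19, bedtools2, etc.) to PATH for this workflow.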
export PATH=$IMP_ENV/fastuniq/source:$PATH
export PATH=$IMP_ENV/sortmerna-2.0:$PATH
export PATH=$IMP_ENV/sortmerna-2.0/scripts:$PATH
export PATH=$IMP_ENV/bwa-0.7.9a:$PATH
export PATH=$IMP_ENV/idba-1.1.1/bin:$PATH
export PATH=$IMP_ENV/megahit:$PATH
export PATH=$IMP_ENV/CAP3:$PATH
export PATH=$IMP_ENV/prokka/bin:$PATH
export PATH=$IMP_ENV/quast:$PATH
export PATH=$IMP_ENV/quast/libs/genemark/linux_64:$PATH
export PATH=$IMP_ENV/prokka/binaries/linux:$PATH
export PATH=$IMP_ENV/cd-hit-v4.6.1-2012-08-27_OpenMP:$PATH
# Samtools must be added to PATH using its full path!
export PATH=/mnt/nfs/projects/ecosystem_biology/local_tools/IMP/dependencies/samtools-0.1.19:$PATH
export PATH=/mnt/nfs/projects/ecosystem_biology/local_tools/IMP/dependencies/bedtools2/bin:$PATH
export PATH=$PATH:/mnt/nfs/projects/ecosystem_biology/local_tools/IMP/dependencies/bedtools2/bin
export PATH=$PATH:$IMP_ENV/bedtools2/bin
export PATH=/mnt/nfs/projects/ecosystem_biology/local_tools/IMP/dependencies/quast:$IMP_ENV:$PATH
#Source this file before executing the Snakefile
#module load Python
source /mnt/nfs/projects/ecosystem_biology/local_tools/IMP/bin/activate
module load lang/Java/1.7.0_21
#
#module load MEGAHIT
#module load BWA
#module load SAMtools
#module load BEDTools
#module load OpenBLAS
#module load Boost/1.53.0-ictce-5.3.0
#
#export PATH=$PATH:/mnt/nfs/projects/ecosystem_biology/local_tools/idba-1.1.1.icc/bin
#
#module load CAP3
#
##symbolic links for prokka db
#module load prokka
#
#export PATH=$PATH:/mnt/nfs/projects/ecosystem_biology/local_tools/tabix-0.2.6
#export PATH=$PATH:/mnt/nfs/projects/ecosystem_biology/local_tools/gkno_launcher/tools/freebayes/bin
#export PATH=$PATH:/mnt/nfs/projects/ecosystem_biology/local_tools/vcftools/bin
#export PERL5LIB=$PERL5LIB:/mnt/nfs/projects/ecosystem_biology/local_tools/vcftools/perl
#export PATH=$PATH:/mnt/nfs/projects/ecosystem_biology/local_tools/Platypus/Platypus_0.7.9.1
#
#module load R
#Rscript -e "install.packages('beanplot')"
#
#module list
#The Boost C++ Libraries were successfully built!
#
#The following directory should be added to compiler include paths:
#
# /mnt/src_nfs1/projects/ecosystem_biology/local_tools/IMP/dependencies/boost_1_54_0
#
#The following directory should be added to linker library paths:
#
# /mnt/src_nfs1/projects/ecosystem_biology/local_tools/IMP/dependencies/boost_1_54_0/stage/lib
#