Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
IMP
IMP_manuscript_analysis
Commits
60215475
Commit
60215475
authored
Oct 10, 2016
by
Shaman Narayanasamy
Browse files
Naive assemblies
parent
b89ed332
Changes
6
Hide whitespace changes
Inline
Side-by-side
naive_assembly/Snakefile
0 → 100644
View file @
60215475
import os
## Config
configfile: "config.iterative_MG-gaia183.json"
## Parameters required from the config file
MEMCORE = os.environ.get("MEMCORE", config['memory_per_core_gb'])
THREADS = os.environ.get("THREADS", config['threads'])
## Input and output parameters
INPUT_DIR = os.environ.get("INPUT_DIR")
OUT_DIR = os.environ.get("OUT_DIR")
OUT_LOG = os.environ.get("OUT_LOG")
MG_ASSM = os.environ.get("MG_ASSM")
MT_ASSM = os.environ.get("MT_ASSM")
TMP = os.environ.get("TMP")
### General rule for the iterative assemblies ###
rule ALL:
input:
"%s/MG_MT_cap3.merged.fa" % OUT_DIR,
'%s/MT_IMP_x_MG_MT_cap3.merged.sorted_flagstat.txt' % OUT_DIR,
'%s/MG_IMP_x_MG_MT_cap3.merged.sorted_flagstat.txt' % OUT_DIR,
"%s/quast/report.txt" % OUT_DIR,
shell:
"echo 'DONE'"
### Rules for the initial assembly ###
rule MERGE_ASSEMBLIES:
input:
{MG_ASSM},
{MT_ASSM}
output:
"%s/MG_MT_cap3.cat.fa" % OUT_DIR,
"%s/MG_MT_cap3.merged.fa" % OUT_DIR
benchmark:
"%s/benchmarks/MERGE_ASSEMBLIES.json" % OUT_DIR
log:
OUT_LOG
shell:
"""
echo "[x] Merging initial assembly contigs [x]"
cat {input} > {output[0]}
# Options should be after input file!
cap3 {output[0]} -p 98 -o 100 \
>> {log} 2>&1
# Concatenate assembled contigs, singletons and rename the contigs
cat {output[0]}.cap.contigs {output[0]}.cap.singlets | \
awk '/^>/{{print \">contig_\" ++i; next}}{{print}}' > {output[1]}
"""
### General rules MG data
rule QUAST:
input:
"%s/MG_MT_cap3.merged.fa" % OUT_DIR
output:
"%s/quast/report.txt" % (OUT_DIR),
"%s/quast/report.tsv" % (OUT_DIR),
"%s/quast/report.tex" % (OUT_DIR)
benchmark:
"%s/benchmarks/QUAST.json" % (OUT_DIR)
log:
OUT_LOG
shell:
"""
METAQUAST="/mnt/nfs/projects/ecosystem_biology/local_tools/IMP/dependencies/quast/metaquast.py"
python2.7 $METAQUAST -t {THREADS} -o {OUT_DIR}/quast \
{input} \
--max-ref-number 0 --no-check --fast -f \
>> {log} 2>&1
"""
## Index gene catalog using bwa
rule INDEX_DATABASE:
input:
"%s/MG_MT_cap3.merged.fa" % OUT_DIR
output:
"%s/MG_MT_cap3.merged.fa.amb" % OUT_DIR,
"%s/MG_MT_cap3.merged.fa.bwt" % OUT_DIR,
"%s/MG_MT_cap3.merged.fa.pac" % OUT_DIR,
"%s/MG_MT_cap3.merged.fa.ann" % OUT_DIR,
"%s/MG_MT_cap3.merged.fa.sa" % OUT_DIR
benchmark:
"%s/benchmarks/INDEX_DATABASE.json" % OUT_DIR
log:
OUT_LOG
shell:
"""
bwa index {input}
"""
rule MAP_IMP_MG_READS:
input:
"%s/MG.R1.preprocessed.fq" % INPUT_DIR,
"%s/MG.R2.preprocessed.fq" % INPUT_DIR,
"%s/MG.SE.preprocessed.fq" % INPUT_DIR,
"%s/MG_MT_cap3.merged.fa" % OUT_DIR,
"%s/MG_MT_cap3.merged.fa.amb" % OUT_DIR,
"%s/MG_MT_cap3.merged.fa.bwt" % OUT_DIR,
"%s/MG_MT_cap3.merged.fa.pac" % OUT_DIR,
"%s/MG_MT_cap3.merged.fa.ann" % OUT_DIR,
"%s/MG_MT_cap3.merged.fa.sa" % OUT_DIR
output:
'%s/MG_IMP_x_MG_MT_cap3.merged.sorted.bam' % OUT_DIR,
'%s/MG_IMP_x_MG_MT_cap3.merged.sorted.bam.bai' % OUT_DIR,
benchmark:
"%s/benchmarks/MAP_MG_READS.json" % OUT_DIR
log:
OUT_LOG
shell:
"""
SAMHEADER="@RG\\tID:{{SAMPLE}}\\tSM:MG"
PREFIX={OUT_DIR}/MG_IMP_x_MG_MT_cap3.merged
# merge paired and se
samtools merge -@ {THREADS} -f $PREFIX.merged.bam \
<(bwa mem -v 1 -t {THREADS} -M -R \"$SAMHEADER\" {input[3]} {input[0]} {input[1]} | \
samtools view -@ {THREADS} -bS -) \
<(bwa mem -v 1 -t {THREADS} -M -R \"$SAMHEADER\" {input[3]} {input[2]} | \
samtools view -@ {THREADS} -bS -)
# sort
samtools sort -@ {THREADS} -m {MEMCORE}G $PREFIX.merged.bam $PREFIX.sorted
rm $PREFIX.merged.bam
# index
samtools index $PREFIX.sorted.bam
"""
### Obtain MG read mapping statistics
rule IMP_MG_MAP_STATS:
input:
"%s/MG_MT_cap3.merged.fa" % OUT_DIR,
'%s/MG_IMP_x_MG_MT_cap3.merged.sorted.bam' % OUT_DIR,
output:
"%s/MG_MT_cap3.merged.fa.fai" % OUT_DIR,
'%s/MG_IMP_x_MG_MT_cap3.merged.sorted_flagstat.txt' % OUT_DIR,
log:
OUT_LOG
benchmark:
"%s/benchmarks/ANALYSIS_MG_CALL_CONTIG_DEPTH.json" % OUT_DIR
shell:
"""
if [[ ! -f {input[0]}.fai ]]
then
echo "No fasta index! Creating one." >> {log}
samtools faidx {input[0]}
fi
echo "flagstat" >> {log}
samtools flagstat {input[1]} > {output[1]}
"""
### Map MT reads
rule MAP_IMP_MT_READS:
input:
"%s/MT.R1.preprocessed.fq" % INPUT_DIR,
"%s/MT.R2.preprocessed.fq" % INPUT_DIR,
"%s/MT.SE.preprocessed.fq" % INPUT_DIR,
"%s/MG_MT_cap3.merged.fa" % OUT_DIR,
"%s/MG_MT_cap3.merged.fa.amb" % OUT_DIR,
"%s/MG_MT_cap3.merged.fa.bwt" % OUT_DIR,
"%s/MG_MT_cap3.merged.fa.pac" % OUT_DIR,
"%s/MG_MT_cap3.merged.fa.ann" % OUT_DIR,
"%s/MG_MT_cap3.merged.fa.sa" % OUT_DIR
output:
'%s/MT_IMP_x_MG_MT_cap3.merged.sorted.bam' % OUT_DIR,
'%s/MT_IMP_x_MG_MT_cap3.merged.sorted.bam.bai' % OUT_DIR,
benchmark:
"%s/benchmarks/MAP_MT_READS.json" % OUT_DIR
log:
OUT_LOG
shell:
"""
SAMHEADER="@RG\\tID:{{SAMPLE}}\\tSM:MT"
PREFIX={OUT_DIR}/MT_IMP_x_MG_MT_cap3.merged
# merge paired and se
samtools merge -@ {THREADS} -f $PREFIX.merged.bam \
<(bwa mem -v 1 -t {THREADS} -M -R \"$SAMHEADER\" {input[3]} {input[0]} {input[1]} | \
samtools view -@ {THREADS} -bS -) \
<(bwa mem -v 1 -t {THREADS} -M -R \"$SAMHEADER\" {input[3]} {input[2]} | \
samtools view -@ {THREADS} -bS -)
# sort
samtools sort -@ {THREADS} -m {MEMCORE}G $PREFIX.merged.bam $PREFIX.sorted
rm $PREFIX.merged.bam
# index
samtools index $PREFIX.sorted.bam
"""
### Obtain MT read mapping statistics
rule IMP_MT_MAP_STATS:
input:
"%s/MG_MT_cap3.merged.fa" % OUT_DIR,
'%s/MT_IMP_x_MG_MT_cap3.merged.sorted.bam' % OUT_DIR,
output:
"%s/MG_MT_cap3.merged.fa.fai" % OUT_DIR,
'%s/MT_IMP_x_MG_MT_cap3.merged.sorted_flagstat.txt' % OUT_DIR,
log:
OUT_LOG
benchmark:
"%s/benchmarks/ANALYSIS_MT_CALL_CONTIG_DEPTH.json" % OUT_DIR
shell:
"""
if [[ ! -f {input[0]}.fai ]]
then
echo "No fasta index! Creating one." >> {log}
samtools faidx {input[0]}
fi
echo "flagstat" >> {log}
samtools flagstat {input[1]} > {output[1]}
"""
naive_assembly/config.iterative_MG-gaia183.json
0 → 100644
View file @
60215475
{
"threads"
:
8
,
"memory_total_gb"
:
256
,
"memory_per_core_gb"
:
32
,
"tmp_dir"
:
"tmp"
,
"imp_src"
:
"src"
,
"raws"
:
{
"Metagenomics"
:
""
,
"Metatranscriptomics"
:
""
},
"sample"
:
"test"
,
"outputdir"
:
"/output"
,
"db_path"
:
"/databases"
,
"preprocessing_filtering"
:
true
,
"assembler"
:
"idba"
,
"trimmomatic"
:
{
"pkg_url"
:
"https://webdav-r3lab.uni.lu/public/R3lab/IMP/Trimmomatic-Src-0.32.zip"
,
"adapter"
:
"TruSeq3"
,
"leading"
:
20
,
"minlen"
:
40
,
"palindrome_clip_threshold"
:
30
,
"simple_clip_threshold"
:
10
,
"trailing"
:
20
,
"seed_mismatch"
:
2
,
"window_size"
:
1
,
"window_quality"
:
3
,
"strictness"
:
0.5
,
"target_length"
:
40
,
"jarfile"
:
"/home/imp/lib/trimmomatic-1.32.jar"
},
"idba_ud"
:
{
"mink"
:
25
,
"maxk"
:
99
,
"step"
:
4
,
"perid"
:
0.98
},
"cap3"
:
{
"identity"
:
100
,
"overlap"
:
1000
},
"vizbin"
:
{
"dimension"
:
50
,
"kmer"
:
5
,
"size"
:
4
,
"theta"
:
0.5
,
"perp"
:
30
,
"cutoff"
:
1000
,
"jarfile"
:
"/home/imp/lib/VizBin-dist.jar"
},
"human_filtering"
:
{
"filter"
:
"hg38"
,
"url"
:
"http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz"
},
"sortmerna"
:
{
"pkg_url"
:
"https://webdav-r3lab.uni.lu/public/R3lab/IMP/sortmerna.2.0.tgz"
,
"scripts_path"
:
"/home/imp/lib"
,
"files"
:
[
"rfam-5.8s-database-id98"
,
"silva-arc-16s-id95"
,
"silva-bac-16s-id90"
,
"silva-euk-18s-id95"
,
"rfam-5s-database-id98"
,
"silva-arc-23s-id98"
,
"silva-bac-23s-id98"
,
"silva-euk-28s-id98"
]
},
"prokka"
:
{
"pkg_url"
:
"https://webdav-r3lab.uni.lu/public/R3lab/IMP/prokka-1.11.tar.gz"
,
"databases"
:
[
"cm/Bacteria.i1i"
,
"genus/Staphylococcus.phr"
,
"hmm/CLUSTERS.hmm.h3f"
,
"kingdom/Archaea/sprot.phr"
]
},
"kegg"
:
{
"db_ec2pthy"
:
"http://rest.kegg.jp/link/ec/pathway"
,
"db_hierarchy"
:
"http://rest.kegg.jp/list/pathway"
}
}
naive_assembly/config.iterative_MG.json
0 → 100644
View file @
60215475
{
"threads"
:
8
,
"memory_total_gb"
:
200
,
"memory_per_core_gb"
:
25
,
"tmp_dir"
:
"tmp"
,
"imp_src"
:
"src"
,
"raws"
:
{
"Metagenomics"
:
""
,
"Metatranscriptomics"
:
""
},
"sample"
:
"test"
,
"outputdir"
:
"/output"
,
"db_path"
:
"/databases"
,
"preprocessing_filtering"
:
true
,
"assembler"
:
"idba"
,
"trimmomatic"
:
{
"pkg_url"
:
"https://webdav-r3lab.uni.lu/public/R3lab/IMP/Trimmomatic-Src-0.32.zip"
,
"adapter"
:
"TruSeq3"
,
"leading"
:
20
,
"minlen"
:
40
,
"palindrome_clip_threshold"
:
30
,
"simple_clip_threshold"
:
10
,
"trailing"
:
20
,
"seed_mismatch"
:
2
,
"window_size"
:
1
,
"window_quality"
:
3
,
"strictness"
:
0.5
,
"target_length"
:
40
,
"jarfile"
:
"/home/imp/lib/trimmomatic-0.32.jar"
},
"idba_ud"
:
{
"mink"
:
25
,
"maxk"
:
99
,
"step"
:
4
,
"perid"
:
0.98
},
"vizbin"
:
{
"dimension"
:
50
,
"kmer"
:
5
,
"size"
:
4
,
"theta"
:
0.5
,
"perp"
:
30
,
"cutoff"
:
1000
,
"jarfile"
:
"/home/imp/lib/VizBin-dist.jar"
},
"human_filtering"
:
{
"filter"
:
"hg38"
,
"url"
:
"http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz"
},
"sortmerna"
:
{
"pkg_url"
:
"https://webdav-r3lab.uni.lu/public/R3lab/IMP/sortmerna.2.0.tgz"
,
"scripts_path"
:
"/home/imp/lib"
,
"files"
:
[
"rfam-5.8s-database-id98"
,
"silva-arc-16s-id95"
,
"silva-bac-16s-id90"
,
"silva-euk-18s-id95"
,
"rfam-5s-database-id98"
,
"silva-arc-23s-id98"
,
"silva-bac-23s-id98"
,
"silva-euk-28s-id98"
]
},
"prokka"
:
{
"pkg_url"
:
"https://webdav-r3lab.uni.lu/public/R3lab/IMP/prokka-1.11.tar.gz"
,
"databases"
:
[
"cm/Bacteria.i1i"
,
"genus/Staphylococcus.phr"
,
"hmm/CLUSTERS.hmm.h3f"
,
"kingdom/Archaea/sprot.phr"
]
},
"kegg"
:
{
"db_ec2pthy"
:
"http://rest.kegg.jp/link/ec/pathway"
,
"db_hierarchy"
:
"http://rest.kegg.jp/list/pathway"
}
}
naive_assembly/execution.sh
0 → 100755
View file @
60215475
#!/bin/bash -l
source
./preload_modules.sh
INDIR
=
$1
OUTDIR
=
$2
OUTLOG
=
$3
MG_ASSM
=
$4
MT_ASSM
=
$5
TMPDIR
=
$6
date
### Initialize first assembly ###
INPUT_DIR
=
$INDIR
\
OUT_DIR
=
$OUTDIR
\
OUT_LOG
=
$OUTLOG
\
MG_ASSM
=
$MG_ASSM
\
MT_ASSM
=
$MT_ASSM
\
TMP
=
$TMPDIR
\
snakemake
-pr
ALL
date
naive_assembly/launcher.sh
0 → 100755
View file @
60215475
#!/bin/bash -l
OARSUB
=
"oarsub --notify "
mail:shaman.narayanasamy@uni.lu
" -t bigmem -t idempotent -t besteffort -l core=8/nodes=1,walltime=2"
#OARSUB="oarsub --notify "mail:shaman.narayanasamy@uni.lu" -t bigmem -p "network_address=\'gaia-183\'" --project "project_biocore" -l core=8/nodes=1,walltime=120"
declare
-a
SAMPLES
=(
"SM"
"HF1"
"HF2"
"HF3"
"HF4"
"HF5"
"WW1"
"WW2"
"WW3"
"WW4"
"BG"
)
### Repeat for all the data sets
for
S
in
"
${
SAMPLES
[@]
}
"
do
INDIR
=
"/scratch/users/snarayanasamy/IMP_MS_data/IMP_analysis/
${
S
}
/Preprocessing/"
OUTDIR
=
"/scratch/users/snarayanasamy/IMP_MS_data/naive_assemblies/
${
S
}
"
OUTLOG
=
"/scratch/users/snarayanasamy/IMP_MS_data/naive_assemblies/
${
S
}
/
${
S
}
_naive_assembly.log"
MG_ASSM
=
"/scratch/users/snarayanasamy/IMP_MS_data/iterative_assemblies/MG_assemblies/
${
S
}
/MG_contigs_1.fa"
MT_ASSM
=
"/scratch/users/snarayanasamy/IMP_MS_data/iterative_assemblies/MT_assemblies/
${
S
}
/MT_contigs_1.fa"
TMPDIR
=
"/scratch/users/snarayanasamy/IMP_MS_data/naive_assemblies/
${
S
}
/tmp"
${
OARSUB
}
-n
"
${
S
}
_naive_assm"
"./execution.sh
$INDIR
$OUTDIR
$OUTLOG
$MG_ASSM
$MT_ASSM
$TMPDIR
"
#CMD="./execution.sh $INDIR $OUTDIR $OUTLOG $MG_ASSM $MT_ASSM $TMPDIR"
echo
$CMD
# exec ${CMD}
done
naive_assembly/preload_modules.sh
0 → 100644
View file @
60215475
IMP_ENV
=
/mnt/nfs/projects/ecosystem_biology/local_tools/IMP/dependencies
export
PATH
=
$IMP_ENV
/fastuniq/source:
$PATH
export
PATH
=
$IMP_ENV
/sortmerna-2.0:
$PATH
export
PATH
=
$IMP_ENV
/sortmerna-2.0/scripts:
$PATH
export
PATH
=
$IMP_ENV
/bwa-0.7.9a:
$PATH
export
PATH
=
$IMP_ENV
/idba-1.1.1/bin:
$PATH
export
PATH
=
$IMP_ENV
/megahit:
$PATH
export
PATH
=
$IMP_ENV
/CAP3:
$PATH
export
PATH
=
$IMP_ENV
/prokka/bin:
$PATH
export
PATH
=
$IMP_ENV
/quast:
$PATH
export
PATH
=
$IMP_ENV
/quast/libs/genemark/linux_64:
$PATH
export
PATH
=
$IMP_ENV
/prokka/binaries/linux:
$PATH
export
PATH
=
$IMP_ENV
/cd-hit-v4.6.1-2012-08-27_OpenMP:
$PATH
# Samtools must be full path!
export
PATH
=
/mnt/nfs/projects/ecosystem_biology/local_tools/IMP/dependencies/samtools-0.1.19:
$PATH
export
PATH
=
/mnt/nfs/projects/ecosystem_biology/local_tools/IMP/dependencies/bedtools2/bin:
$PATH
export
PATH
=
$PATH
:/mnt/nfs/projects/ecosystem_biology/local_tools/IMP/dependencies/bedtools2/bin
export
PATH
=
$PATH
:
$IMP_ENV
/bedtools2/bin
export
PATH
=
/mnt/nfs/projects/ecosystem_biology/local_tools/IMP/dependencies/quast:
$IMP_ENV
:
$PATH
#source this file before execution of snakefile
#module load Python
source
/mnt/nfs/projects/ecosystem_biology/local_tools/IMP/bin/activate
module load lang/Java/1.7.0_21
#
#module load MEGAHIT
#module load BWA
#module load SAMtools
#module load BEDTools
#module load OpenBLAS
#module load Boost/1.53.0-ictce-5.3.0
#
#export PATH=$PATH:/mnt/nfs/projects/ecosystem_biology/local_tools/idba-1.1.1.icc/bin
#
#module load CAP3
#
##symbolic links for prokka db
#module load prokka
#
#export PATH=$PATH:/mnt/nfs/projects/ecosystem_biology/local_tools/tabix-0.2.6
#export PATH=$PATH:/mnt/nfs/projects/ecosystem_biology/local_tools/gkno_launcher/tools/freebayes/bin
#export PATH=$PATH:/mnt/nfs/projects/ecosystem_biology/local_tools/vcftools/bin
#export PERL5LIB=$PERL5LIB:/mnt/nfs/projects/ecosystem_biology/local_tools/vcftools/perl
#export PATH=$PATH:/mnt/nfs/projects/ecosystem_biology/local_tools/Platypus/Platypus_0.7.9.1
#
#module load R
#Rscript -e "install.packages('beanplot')"
#
#module list
#The Boost C++ Libraries were successfully built!
#
#The following directory should be added to compiler include paths:
#
# /mnt/src_nfs1/projects/ecosystem_biology/local_tools/IMP/dependencies/boost_1_54_0
#
#The following directory should be added to linker library paths:
#
# /mnt/src_nfs1/projects/ecosystem_biology/local_tools/IMP/dependencies/boost_1_54_0/stage/lib
#
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment