Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
IMP
IMP_manuscript_analysis
Commits
f723c65c
Commit
f723c65c
authored
Oct 10, 2016
by
Shaman Narayanasamy
Browse files
Merge branch 'master' of
ssh://git-r3lab-server.uni.lu:8022/IMP/IMP_manuscript_analysis
parents
4fab96fa
2e0c5b84
Changes
15
Hide whitespace changes
Inline
Side-by-side
.gitignore
View file @
f723c65c
...
...
@@ -83,3 +83,8 @@ prodigal_analysis/prodigal_summary.tsv
quast_analysis/
additional_analyses/HMP_gene_catalog/IMP2IGC_mapping.txt
additional_analyses/HMP_gene_catalog/OARlogs/
iterative_assembly/MG_assembly/oarlogs/
metaquast_analysis/oarlogs/*
naive_assembly/.snakemake/*
naive_assembly/oarlogs/*
additional_analyses/*~
iterative_assembly/MG_assembly/Snakefile_CAMI
0 → 100644
View file @
f723c65c
import os
# Take the current step and calculate next step target
STEP = os.environ.get("STEP", "1")
STEPp1 = int(STEP) + 1
configfile: "config.iterative_MG.json"
#configfile: "../config.imp.json"
## Parameters required from the config file
MEMCORE = os.environ.get("MEMCORE", config['memory_per_core_gb'])
THREADS = os.environ.get("THREADS", config['threads'])
## Input and output parameters
OUT_LOG = os.environ.get("OUT_LOG")
INPUT_DIR = os.environ.get("INPUT_DIR")
OUT_DIR = os.environ.get("OUT_DIR")
TMP = os.environ.get("TMP")
### General rule for the iterative assemblies ###
rule ALL:
input:
"%s/mg_contigs_merged_%s.fa" % (OUT_DIR, STEPp1),
"%s/quast_1/report.txt" % OUT_DIR,
"%s/quast_1/report.tsv" % OUT_DIR,
"%s/quast_1/report.tex" % OUT_DIR,
"%s/quast_%s/report.txt" % (OUT_DIR, STEPp1),
"%s/quast_%s/report.tsv" % (OUT_DIR, STEPp1),
"%s/quast_%s/report.tex" % (OUT_DIR, STEPp1)
shell:
"echo 'DONE'"
### Rules for the initial assembly ###
rule INITIAL_ASSEMBLY:
input:
"%s/mg.r1.preprocessed.fq" % INPUT_DIR,
"%s/mg.r2.preprocessed.fq" % INPUT_DIR,
"%s/mg.se.preprocessed.fq" % INPUT_DIR
output:
"%s/mg_contigs_1.fa" % OUT_DIR
benchmark:
"%s/benchmarks/INITIAL_ASSEMBLY.json" % OUT_DIR
log:
OUT_LOG
priority: 50
shell:
"""
echo "[x] Running assembly {STEP} [x]"
mkdir {TMP}
echo "[x] Interleave mg fastq files"
TMPD=$(mktemp -d -t --tmpdir={TMP} "XXXXXX")
fq2fa --merge {input[0]} {input[1]} $TMPD/merged.fa
idba_ud -r $TMPD/merged.fa \
-l {input[2]} -o {OUT_DIR}/assembly_1 \
--mink {config[idba_ud][mink]} --maxk {config[idba_ud][maxk]} \
--step {config[idba_ud][step]} --num_threads {THREADS} \
--similar {config[idba_ud][perid]} --pre_correction \
>> {log} 2>&1
cp {OUT_DIR}/assembly_1/contig.fa {output}
"""
rule MERGE_INITIAL_ASSEMBLY:
input:
"%s/mg_contigs_1.fa" % OUT_DIR,
output:
"%s/mg_contigs_cat_1.fa" % OUT_DIR,
"%s/mg_contigs_merged_1.fa" % OUT_DIR
benchmark:
"%s/benchmarks/MERGE_INITIAL_ASSEMBLY.json" % OUT_DIR
log:
OUT_LOG
shell:
"""
echo "[x] Merging initial assembly contigs [x]"
cat {input} > {output[0]}
# Options should be after input file!
cap3 {output[0]} -p 98 -o 100 \
>> {log} 2>&1
# Concatenate assembled contigs, singletons and rename the contigs
cat {output[0]}.cap.contigs {output[0]}.cap.singlets | \
awk '/^>/{{print \">contig_\" ++i; next}}{{print}}' > {output[1]}
"""
rule QUAST_INITIAL_ASSEMBLY:
input:
"%s/mg_contigs_cat_1.fa" % OUT_DIR,
"%s/mg_contigs_merged_1.fa" % OUT_DIR
output:
"%s/quast_1/report.txt" % OUT_DIR,
"%s/quast_1/report.tsv" % OUT_DIR,
"%s/quast_1/report.tex" % OUT_DIR
benchmark:
"%s/benchmarks/QUAST_1.json" % (OUT_DIR)
log:
OUT_LOG
shell:
"""
METAQUAST="/mnt/nfs/projects/ecosystem_biology/local_tools/IMP/dependencies/quast/metaquast.py"
python2.7 $METAQUAST -t {THREADS} -o {OUT_DIR}/quast_1 \
{input[0]} \
--max-ref-number 0 --no-check --fast \
>> {log} 2>&1
METAQUAST="/mnt/nfs/projects/ecosystem_biology/local_tools/IMP/dependencies/quast/metaquast.py"
python2.7 $METAQUAST -t {THREADS} -o {OUT_DIR}/quast_1 \
{input[1]} \
--max-ref-number 0 --no-check --fast -f\
>> {log} 2>&1
"""
### General rules for iterations of assemblies ###
rule EXTRACT_UNMAPPED:
input:
"%s/mg_contigs_merged_%s.fa" % (OUT_DIR, STEP),
"%s/mg.r1.preprocessed.fq" % INPUT_DIR,
"%s/mg.r2.preprocessed.fq" % INPUT_DIR,
"%s/mg.se.preprocessed.fq" % INPUT_DIR
output:
'%s/mg.r1.unmapped_%s.fq' % (OUT_DIR, STEP),
'%s/mg.r2.unmapped_%s.fq' % (OUT_DIR, STEP),
'%s/mg.se.unmapped_%s.fq' % (OUT_DIR, STEP)
benchmark:
"%s/benchmarks/EXTRACT_UNMAPPED_%s.json" % (OUT_DIR, STEP)
log:
OUT_LOG
shell:
"""
echo "[x] Extracting unmapped from assembly {STEP} [x]"
echo "[x] Indexing assembly {STEP} [x]"
bwa index {input[0]}
echo "[x] Extracting unmapped reads [x]"
TMP_FILE=$(mktemp --tmpdir={TMP} -t "alignment_{STEP}-XXXX.bam")
BUFFER=$(mktemp --tmpdir={TMP} -t "alignment_buffer_{STEP}-XXXX.bam")
bwa mem -v 1 -t {THREADS} {input[0]} {input[1]} {input[2]} | samtools view -@ {THREADS} -bS - > $TMP_FILE
samtools merge -@ {THREADS} -u - \
<(samtools view -@ {THREADS} -u -f 4 -F 264 $TMP_FILE) \
<(samtools view -@ {THREADS} -u -f 8 -F 260 $TMP_FILE) \
<(samtools view -@ {THREADS} -u -f 12 -F 256 $TMP_FILE) | \
samtools view -@ {THREADS} -bF 0x800 - | samtools sort -o -@ {THREADS} -m {MEMCORE}G -n - $BUFFER | \
bamToFastq -i stdin -fq {output[0]} -fq2 {output[1]} \
>> {log} 2>&1
bwa mem -v 1 -t {THREADS} {input[0]} {input[3]} | \
samtools view -@ {THREADS} -bS - | samtools view -@ {THREADS} -uf 4 - | \
bamToFastq -i stdin -fq {output[2]} \
>> {log} 2>&1
"""
rule ASSEMBLY:
input:
"%s/mg.r1.unmapped_%s.fq" % (OUT_DIR, STEP),
"%s/mg.r2.unmapped_%s.fq" % (OUT_DIR, STEP),
"%s/mg.se.unmapped_%s.fq" % (OUT_DIR, STEP)
output:
"%s/mg_contigs_%s.fa" % (OUT_DIR, STEPp1)
benchmark:
"%s/benchmarks/ASSEMBLY_%s.json" % (OUT_DIR, STEPp1)
log:
OUT_LOG
shell:
"""
echo "[x] Running assembly {STEPp1} [x]"
echo "[x] Interleave mg fastq files"
TMPD=$(mktemp -d -t --tmpdir={TMP} "XXXXXX")
fq2fa --merge {input[0]} {input[1]} $TMPD/merged.fa
idba_ud -r $TMPD/merged.fa \
-l {input[2]} -o {OUT_DIR}/assembly_{STEPp1} \
--mink {config[idba_ud][mink]} --maxk {config[idba_ud][maxk]} \
--step {config[idba_ud][step]} --num_threads {THREADS} \
--similar {config[idba_ud][perid]} --pre_correction \
>> {log} 2>&1
cp {OUT_DIR}/assembly_{STEPp1}/contig.fa {output}
"""
rule MERGE_ASSEMBLIES:
input:
"%s/mg_contigs_merged_%s.fa" % (OUT_DIR, STEP),
"%s/mg_contigs_%s.fa" % (OUT_DIR, STEPp1)
output:
"%s/mg_contigs_cat_%s.fa" % (OUT_DIR, STEPp1),
"%s/mg_contigs_merged_%s.fa" % (OUT_DIR, STEPp1)
benchmark:
"%s/benchmarks/MERGE_ASSEMBLIES_%s.json" % (OUT_DIR, STEPp1)
log:
OUT_LOG
shell:
"""
echo "[x] Merging assemblies {STEP} and {STEPp1} [x]"
NAME={OUT_DIR}/mg.assembly
cat {input} > {output[0]}
# Options should be after input file!
cap3 {output[0]} -p 98 -o 100 \
>> {log} 2>&1
# Concatenate assembled contigs, singletons and rename the contigs
cat {output[0]}.cap.contigs {output[0]}.cap.singlets | \
awk '/^>/{{print \">contig_\" ++i; next}}{{print}}' > {output[1]}
"""
rule QUAST:
input:
"%s/mg_contigs_cat_%s.fa" % (OUT_DIR, STEPp1),
"%s/mg_contigs_merged_%s.fa" % (OUT_DIR, STEPp1)
output:
"%s/quast_%s/report.txt" % (OUT_DIR, STEPp1),
"%s/quast_%s/report.tsv" % (OUT_DIR, STEPp1),
"%s/quast_%s/report.tex" % (OUT_DIR, STEPp1)
benchmark:
"%s/benchmarks/QUAST_%s.json" % (OUT_DIR, STEPp1)
log:
OUT_LOG
shell:
"""
METAQUAST="/mnt/nfs/projects/ecosystem_biology/local_tools/IMP/dependencies/quast/metaquast.py"
python2.7 $METAQUAST -t {THREADS} -o {OUT_DIR}/quast_{STEPp1} \
{input[0]} \
--max-ref-number 0 --no-check --fast \
>> {log} 2>&1
METAQUAST="/mnt/nfs/projects/ecosystem_biology/local_tools/IMP/dependencies/quast/metaquast.py"
python2.7 $METAQUAST -t {THREADS} -o {OUT_DIR}/quast_{STEPp1} \
{input[1]} \
--max-ref-number 0 --no-check --fast -f \
>> {log} 2>&1
"""
iterative_assembly/MG_assembly/config.iterative_MG-gaia183.json
0 → 100644
View file @
f723c65c
{
"threads"
:
20
,
"memory_total_gb"
:
640
,
"memory_per_core_gb"
:
32
,
"tmp_dir"
:
"tmp"
,
"imp_src"
:
"src"
,
"raws"
:
{
"Metagenomics"
:
""
,
"Metatranscriptomics"
:
""
},
"sample"
:
"test"
,
"outputdir"
:
"/output"
,
"db_path"
:
"/databases"
,
"preprocessing_filtering"
:
true
,
"assembler"
:
"idba"
,
"trimmomatic"
:
{
"pkg_url"
:
"https://webdav-r3lab.uni.lu/public/R3lab/IMP/Trimmomatic-Src-0.32.zip"
,
"adapter"
:
"TruSeq3"
,
"leading"
:
20
,
"minlen"
:
40
,
"palindrome_clip_threshold"
:
30
,
"simple_clip_threshold"
:
10
,
"trailing"
:
20
,
"seed_mismatch"
:
2
,
"window_size"
:
1
,
"window_quality"
:
3
,
"strictness"
:
0.5
,
"target_length"
:
40
,
"jarfile"
:
"/home/imp/lib/trimmomatic-1.32.jar"
},
"idba_ud"
:
{
"mink"
:
25
,
"maxk"
:
99
,
"step"
:
4
,
"perid"
:
0.98
"cap3"
:
{
"identity"
:
100
,
"overlap"
:
1000
},
"vizbin"
:
{
"dimension"
:
50
,
"kmer"
:
5
,
"size"
:
4
,
"theta"
:
0.5
,
"perp"
:
30
,
"cutoff"
:
1000
,
"jarfile"
:
"/home/imp/lib/VizBin-dist.jar"
},
"human_filtering"
:
{
"filter"
:
"hg38"
,
"url"
:
"http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz"
},
"sortmerna"
:
{
"pkg_url"
:
"https://webdav-r3lab.uni.lu/public/R3lab/IMP/sortmerna.2.0.tgz"
,
"scripts_path"
:
"/home/imp/lib"
,
"files"
:
[
"rfam-5.8s-database-id98"
,
"silva-arc-16s-id95"
,
"silva-bac-16s-id90"
,
"silva-euk-18s-id95"
,
"rfam-5s-database-id98"
,
"silva-arc-23s-id98"
,
"silva-bac-23s-id98"
,
"silva-euk-28s-id98"
]
},
"prokka"
:
{
"pkg_url"
:
"https://webdav-r3lab.uni.lu/public/R3lab/IMP/prokka-1.11.tar.gz"
,
"databases"
:
[
"cm/Bacteria.i1i"
,
"genus/Staphylococcus.phr"
,
"hmm/CLUSTERS.hmm.h3f"
,
"kingdom/Archaea/sprot.phr"
]
},
"kegg"
:
{
"db_ec2pthy"
:
"http://rest.kegg.jp/link/ec/pathway"
,
"db_hierarchy"
:
"http://rest.kegg.jp/list/pathway"
}
}
iterative_assembly/MG_assembly/execution_CAMI.sh
0 → 100755
View file @
f723c65c
#!/bin/bash -l
source
../preload_modules.sh
INDIR
=
$1
OUTDIR
=
$2
OUTLOG
=
$3
TMPDIR
=
$4
date
### Initialize first assembly ###
INPUT_DIR
=
$INDIR
\
OUT_DIR
=
$OUTDIR
\
OUT_LOG
=
$OUTLOG
\
TMP
=
$TMPDIR
\
snakemake
-s
Snakefile_CAMI ALL
INPUT_DIR
=
$INDIR
\
OUT_DIR
=
$OUTDIR
\
OUT_LOG
=
$OUTLOG
\
TMP
=
$TMPDIR
\
snakemake
-s
Snakefile_CAMI EXTRACT_UNMAPPED
for
ITER
in
{
2..5
}
do
echo
"Iteration
$ITER
"
INPUT_DIR
=
$INDIR
\
OUT_DIR
=
$OUTDIR
\
OUT_LOG
=
$OUTLOG
\
TMP
=
$TMPDIR
\
STEP
=
${
ITER
}
snakemake
-s
Snakefile_CAMI ALL
done
### Join tables
OUT_DIR
=
$OUTDIR
cat
<
(
cat
${
OUT_DIR
}
/quast_1/transposed_report.tsv
)
\
<
(
tail
-n1
${
OUT_DIR
}
/quast_[2-6]/transposed_report.tsv |
\
grep
-v
"==>"
|
grep
-v
"^$"
)
>
${
OUT_DIR
}
/full_report.tsv
ls
${
OUT_DIR
}
/MG_contigs_cat_[1-6].fa | xargs
-I
{}
getN50sizemax
{}
|
\
cut
-f1
,2,3,4,6 |
\
sed
-e
's/NUM=//g'
|
\
sed
-e
's/AVG=//g'
|
\
sed
-e
's/N50=//g'
|
\
sed
-e
's/GENOMESIZE=//g'
|
\
sed
-e
's/MAX=//g'
\
>>
${
OUT_DIR
}
/uncollapsed_tmp.tsv
paste
<
(
ls
${
OUT_DIR
}
/MG_contigs_cat_[1-6].fa
)
${
OUT_DIR
}
/uncollapsed_tmp.tsv
>
${
OUT_DIR
}
/uncollapsed_tmp2.tsv
cat
<
(
echo
-e
"assembly
\t
contig_no
\t
avg_contig_len
\t
N50
\t
genome_len
\t
max_contig_len"
)
<
(
cat
${
OUT_DIR
}
/uncollapsed_tmp2.tsv
)
>
${
OUT_DIR
}
/uncollapsed_contigs_stats.tsv
ls
${
OUT_DIR
}
/MG_contigs_merged_[1-6].fa | xargs
-I
{}
getN50sizemax
{}
|
\
cut
-f2
,4 |
\
sed
-e
's/AVG=//g'
|
\
sed
-e
's/GENOMESIZE=//g'
\
>>
${
OUT_DIR
}
/merged_contigs_tmp.tsv
paste
<
(
cat
full_report.tsv
)
<
(
cat
<
(
echo
-e
"avg_contig_len
\t
genome_len"
)
<
(
cat
${
OUT_DIR
}
/merged_contigs_tmp.tsv
))
>
${
OUT_DIR
}
/collapsed_contigs_stats.tsv
rm
-rf
${
OUT_DIR
}
/uncollapsed_tmp2.tsv
${
OUT_DIR
}
/uncollapsed_tmp.tsv
${
OUT_DIR
}
/merged_contigs_tmp.tsv
### Count reads within the fastq files
INPUT_DIR
=
$INDIR
wc
-l
${
INPUT_DIR
}
/MG.R1.preprocessed.fq |
grep
-v
"total"
>
${
OUT_DIR
}
/pairs-raw_counts.tsv
wc
-l
${
OUT_DIR
}
/MG.R1.unmapped_[1-6].fq |
grep
-v
"total"
>>
${
OUT_DIR
}
/pairs-raw_counts.tsv
wc
-l
${
INPUT_DIR
}
/MG.SE.preprocessed.fq |
grep
-v
"total"
>
${
OUT_DIR
}
/singles-raw_counts.tsv
wc
-l
${
OUT_DIR
}
/MG.SE.unmapped_[1-6].fq |
grep
-v
"total"
>>
${
OUT_DIR
}
/singles-raw_counts.tsv
awk
'{print $2, $1/4}'
${
OUT_DIR
}
/pairs-raw_counts.tsv
>
${
OUT_DIR
}
/pair_counts.tsv
awk
'{print $2, $1/4}'
${
OUT_DIR
}
/singles-raw_counts.tsv
>
${
OUT_DIR
}
/single_counts.tsv
date
iterative_assembly/MG_assembly/launcher_CAMI.sh
0 → 100755
View file @
f723c65c
#!/bin/bash -l
#OARSUB="oarsub --notify "mail:shaman.narayanasamy@uni.lu" -t bigmem -t idempotent -t besteffort -l core=24/nodes=1,walltime=120"
OARSUB
=
"oarsub --notify "
mail:shaman.narayanasamy@uni.lu
" -t bigmem -p "
network_address
=
\'
gaia-183
\'
" --project "
project_biocore
" -l core=20/nodes=1,walltime=120"
#OARSUB=""
#
#oarsub --notify "mail:shaman.narayanasamy@uni.lu" -n "iterative_assm_HMP" -t bigsmp -t idempotent -t besteffort -l nodes=1/core=24,walltime=120 ./execution_X310763260.sh
#
#oarsub --notify "mail:shaman.narayanasamy@uni.lu" -n "iterative_assm_A02" -t bigsmp -t idempotent -t besteffort -l nodes=1/core=24,walltime=120 ./execution_A02.sh
#
#oarsub --notify "mail:shaman.narayanasamy@uni.lu" -n "simDat-iterative_assm" -t bigsmp -t idempotent -t besteffort -l nodes=1/core=24,walltime=120 ./execution_simDat.sh
#
#declare -a SAMPLES=("CAMI_low" "CAMI_medium" "CAMI_high")
declare
-a
SAMPLES
=(
"CAMI_medium"
"CAMI_high"
)
### Repeat for all the data sets
for
S
in
"
${
SAMPLES
[@]
}
"
do
INDIR
=
"/scratch/users/snarayanasamy/IMP_MS_data/IMP_analysis/CAMI/
${
S
}
-idba/Preprocessing/"
OUTDIR
=
"/scratch/users/snarayanasamy/IMP_MS_data/iterative_assemblies/CAMI/MG_assemblies/
${
S
}
-iterative_MG"
OUTLOG
=
"/scratch/users/snarayanasamy/IMP_MS_data/iterative_assemblies/CAMI/MG_assemblies/
${
S
}
-iterative_MG/
${
S
}
_iterative_MG.log"
TMPDIR
=
"/scratch/users/snarayanasamy/IMP_MS_data/iterative_assemblies/CAMI/MG_assemblies/
${
S
}
-iterative_MG/tmp"
${
OARSUB
}
-n
"
${
S
}
_MG_iterative"
"./execution_CAMI.sh
$INDIR
$OUTDIR
$OUTLOG
$TMPDIR
"
#CMD="./execution.sh $INDIR $OUTDIR $OUTLOG $TMPDIR"
#echo $CMD
#exec ${CMD}
done
metaquast_analysis/execution.sh
View file @
f723c65c
...
...
@@ -13,16 +13,18 @@ MOCAT_MG=$7
IMP_MT
=
$8
METAMOS_MT
=
$9
MOCAT_MT
=
${
10
}
NAIVE
=
${
11
}
OUTDIR
=
${
1
1
}
OUTDIR
=
${
1
2
}
date
#source ../preload_modules.sh
/mnt/gaiagpfs/projects/ecosystem_biology/local_tools/quast/metaquast.py
\
-o
$OUTDIR
-t
12
\
-l
"IMP, IMP-megahit, MetAmos_MGMT, MOCAT_MGMT, IMP_MG, MetAmos_MG, MOCAT_MG, IMP_MT, MetAmos_MT, MOCAT_MT"
\
-o
$OUTDIR
-t
6
\
-l
"IMP, IMP-megahit,
Naive,
MetAmos_MGMT, MOCAT_MGMT, IMP_MG, MetAmos_MG, MOCAT_MG, IMP_MT, MetAmos_MT, MOCAT_MT"
\
-f
$IMP
\
$IMP_MEGAHIT
\
$NAIVE
\
$METAMOS
\
$MOCAT
\
$IMP_MG
\
...
...
metaquast_analysis/execution_SM.sh
View file @
f723c65c
...
...
@@ -13,17 +13,19 @@ MOCAT_MG=$7
IMP_MT
=
$8
METAMOS_MT
=
$9
MOCAT_MT
=
${
10
}
NAIVE
=
${
11
}
OUTDIR
=
${
1
1
}
OUTDIR
=
${
1
2
}
date
#source ../preload_modules.sh
/mnt/gaiagpfs/projects/ecosystem_biology/local_tools/quast/metaquast.py
\
-o
$OUTDIR
-t
12
-f
\
-l
"IMP, IMP-megahit, MetAmos_MGMT, MOCAT_MGMT, IMP_MG, MetAmos_MG, MOCAT_MG, IMP_MT, MetAmos_MT, MOCAT_MT"
\
-l
"IMP, IMP-megahit,
Naive,
MetAmos_MGMT, MOCAT_MGMT, IMP_MG, MetAmos_MG, MOCAT_MG, IMP_MT, MetAmos_MT, MOCAT_MT"
\
-R
/mnt/nfs/projects/ecosystem_biology/test_datasets/CelajEtAl/73_species/
\
${
IMP
}
\
${
IMP_MEGAHIT
}
\
${
NAIVE
}
\
${
METAMOS
}
\
${
MOCAT
}
\
${
IMP_MG
}
\
...
...
metaquast_analysis/launcher.sh
View file @
f723c65c
...
...
@@ -2,31 +2,33 @@
#OARSUB="oarsub --notify "mail:shaman.narayanasamy@uni.lu" -t bigmem -t idempotent -t besteffort -l core=8/nodes=1,walltime=120"
#OARSUB="oarsub --notify "mail:shaman.narayanasamy@uni.lu" -l core=12/nodes=1,walltime=48 -t bigmem -t besteffort -t idempotent"
OARSUB
=
"oarsub --notify "
mail:shaman.narayanasamy@uni.lu
" -l nodes=1,walltime=120"
### MGMT assemblies
#IMP="/scratch/users/snarayanasamy/IMP_MS_data/IMP_analysis/SM/Assembly/MGMT.assembly.merged.fa"
#IMP_MEGAHIT="/scratch/users/snarayanasamy/IMP_MS_data/IMP_analysis/SM_megahit/Assembly/MGMT.assembly.merged.fa"
#METAMOS="/scratch/users/snarayanasamy/IMP_MS_data/metAmosAnalysis/SM/MGMT/default/SM/Assemble/out/soapdenovo.31.asm.contig"
#MOCAT="/scratch/users/snarayanasamy/IMP_MS_data/MOCAT_analysis/Combined/SM/SM_MOCAT_MGMT"
#
### MG assemblies
#IMP_MG="/scratch/users/snarayanasamy/IMP_MS_data/iterative_assemblies/MG_assemblies/SM/MG_contigs_merged_2.fa"
#METAMOS_MG="/scratch/users/snarayanasamy/IMP_MS_data/metAmosAnalysis/SM/MG/default/SM/Assemble/out/soapdenovo.31.asm.contig"
#MOCAT_MG="/scratch/users/snarayanasamy/IMP_MS_data/MOCAT_analysis/MG/SM/SM_MOCAT_MG"
#
### MT assemblies
#IMP_MT="/scratch/users/snarayanasamy/IMP_MS_data/iterative_assemblies/MT_assemblies/SM/MT_contigs_merged_2.fa"
#METAMOS_MT="/scratch/users/snarayanasamy/IMP_MS_data/metAmosAnalysis/SM/MT/default/SM/Assemble/out/soapdenovo.31.asm.contig"
#MOCAT_MT="/scratch/users/snarayanasamy/IMP_MS_data/MOCAT_analysis/MT/SM/SM_MOCAT_MT"
#
#OUTDIR="/scratch/users/snarayanasamy/IMP_MS_data/metaquast_analysis/SM"
#
#${OARSUB} -n "SM_metaquast" "./execution_SM.sh $IMP $IMP_MEGAHIT $METAMOS $MOCAT $IMP_MG $METAMOS_MG $MOCAT_MG $IMP_MT $METAMOS_MT $MOCAT_MT $OUTDIR"
#declare -a SAMPLES=("HF1" "HF2" "HF3" "HF4" "HF5" "WW1" "WW2" "WW3" "WW4" "BG")
#OARSUB="oarsub --notify "mail:shaman.narayanasamy@uni.lu" -l nodes=1,walltime=120"
OARSUB
=
"oarsub --notify "
mail:shaman.narayanasamy@uni.lu
" -t bigmem -p "
network_address
=
\'
gaia-183
\'
" --project "
project_biocore
" -l core=6/nodes=1,walltime=72"
## MGMT assemblies
IMP
=
"/scratch/users/snarayanasamy/IMP_MS_data/IMP_analysis/SM/Assembly/MGMT.assembly.merged.fa"
IMP_MEGAHIT
=
"/scratch/users/snarayanasamy/IMP_MS_data/IMP_analysis/SM_megahit/Assembly/MGMT.assembly.merged.fa"
NAIVE
=
"/scratch/users/snarayanasamy/IMP_MS_data/naive_assemblies/SM/MG_MT_cap3.merged.fa"
METAMOS
=
"/scratch/users/snarayanasamy/IMP_MS_data/metAmosAnalysis/SM/MGMT/default/SM/Assemble/out/soapdenovo.31.asm.contig"
MOCAT
=
"/scratch/users/snarayanasamy/IMP_MS_data/MOCAT_analysis/Combined/SM/SM_MOCAT_MGMT"
## MG assemblies
IMP_MG
=
"/scratch/users/snarayanasamy/IMP_MS_data/iterative_assemblies/MG_assemblies/SM/MG_contigs_merged_2.fa"
METAMOS_MG
=
"/scratch/users/snarayanasamy/IMP_MS_data/metAmosAnalysis/SM/MG/default/SM/Assemble/out/soapdenovo.31.asm.contig"
MOCAT_MG
=
"/scratch/users/snarayanasamy/IMP_MS_data/MOCAT_analysis/MG/SM/SM_MOCAT_MG"
## MT assemblies
IMP_MT
=
"/scratch/users/snarayanasamy/IMP_MS_data/iterative_assemblies/MT_assemblies/SM/MT_contigs_merged_2.fa"
METAMOS_MT
=
"/scratch/users/snarayanasamy/IMP_MS_data/metAmosAnalysis/SM/MT/default/SM/Assemble/out/soapdenovo.31.asm.contig"
MOCAT_MT
=
"/scratch/users/snarayanasamy/IMP_MS_data/MOCAT_analysis/MT/SM/SM_MOCAT_MT"
OUTDIR
=
"/scratch/users/snarayanasamy/IMP_MS_data/metaquast_analysis/SM"
${
OARSUB
}
-n
"SM_metaquast"
"./execution_SM.sh
$IMP
$IMP_MEGAHIT
$METAMOS
$MOCAT
$IMP_MG
$METAMOS_MG
$MOCAT_MG
$IMP_MT
$METAMOS_MT
$MOCAT_MT
$NAIVE
$OUTDIR
"
declare
-a
SAMPLES
=(
"HF1"
"HF2"
"HF3"
"HF4"
"HF5"
"WW1"
"WW2"
"WW3"
"WW4"
"BG"
)
#declare -a SAMPLES=("WW2" "WW3" "WW4")
declare
-a
SAMPLES
=(
"WW1"
)
#
declare -a SAMPLES=("WW1")
## Repeat for all the data sets
for
S
in
"
${
SAMPLES
[@]
}
"
...
...
@@ -45,6 +47,7 @@ do
IMP
=
"/scratch/users/snarayanasamy/IMP_MS_data/IMP_analysis/
${
S
}
/Assembly/MGMT.assembly.merged.fa"
IMP_MEGAHIT
=
"/scratch/users/snarayanasamy/IMP_MS_data/IMP_analysis/
${
S
}
_megahit/Assembly/MGMT.assembly.merged.fa"
METAMOS
=
"/scratch/users/snarayanasamy/IMP_MS_data/metAmosAnalysis/
${
S1
}
/MGMT/default/
${
S
}
/Assemble/out/soapdenovo.31.asm.contig"
NAIVE
=
"/scratch/users/snarayanasamy/IMP_MS_data/naive_assemblies/
${
S
}
/MG_MT_cap3.merged.fa"
MOCAT
=
"/scratch/users/snarayanasamy/IMP_MS_data/MOCAT_analysis/Combined/
${
S1
}
/
${
S
}
_MOCAT_MGMT"
## MG assemblies
...
...
@@ -59,5 +62,5 @@ do
OUTDIR
=
"/scratch/users/snarayanasamy/IMP_MS_data/metaquast_analysis/
${
S
}
"
${
OARSUB
}
-n
"
${
S
}
_metaquast"
"./execution.sh
$IMP
$IMP_MEGAHIT
$METAMOS
$MOCAT
$IMP_MG
$METAMOS_MG
$MOCAT_MG
$IMP_MT
$METAMOS_MT
$MOCAT_MT
$OUTDIR
"
${
OARSUB
}
-n
"
${
S
}
_metaquast"
"./execution.sh
$IMP
$IMP_MEGAHIT
$METAMOS
$MOCAT
$IMP_MG
$METAMOS_MG
$MOCAT_MG
$IMP_MT
$METAMOS_MT
$MOCAT_MT
$NAIVE
$OUTDIR
"
done
naive_assembly/Snakefile
0 → 100644
View file @
f723c65c
import os
## Config
configfile: "config.iterative_MG-gaia183.json"
## Parameters required from the config file
MEMCORE = os.environ.get("MEMCORE", config['memory_per_core_gb'])
THREADS = os.environ.get("THREADS", config['threads'])
## Input and output parameters
INPUT_DIR = os.environ.get("INPUT_DIR")
OUT_DIR = os.environ.get("OUT_DIR")
OUT_LOG = os.environ.get("OUT_LOG")
MG_ASSM = os.environ.get("MG_ASSM")
MT_ASSM = os.environ.get("MT_ASSM")
TMP = os.environ.get("TMP")
### General rule for the iterative assemblies ###
rule ALL:
input:
"%s/MG_MT_cap3.merged.fa" % OUT_DIR,
'%s/MT_IMP_x_MG_MT_cap3.merged.sorted_flagstat.txt' % OUT_DIR,
'%s/MG_IMP_x_MG_MT_cap3.merged.sorted_flagstat.txt' % OUT_DIR,
"%s/quast/report.txt" % OUT_DIR,
shell:
"echo 'DONE'"
### Rules for the initial assembly ###
rule MERGE_ASSEMBLIES:
input:
{MG_ASSM},
{MT_ASSM}
output:
"%s/MG_MT_cap3.cat.fa" % OUT_DIR,
"%s/MG_MT_cap3.merged.fa" % OUT_DIR
benchmark:
"%s/benchmarks/MERGE_ASSEMBLIES.json" % OUT_DIR
log:
OUT_LOG
shell:
"""
echo "[x] Merging initial assembly contigs [x]"
cat {input} > {output[0]}
# Options should be after input file!
cap3 {output[0]} -p 98 -o 100 \
>> {log} 2>&1
# Concatenate assembled contigs, singletons and rename the contigs
cat {output[0]}.cap.contigs {output[0]}.cap.singlets | \
awk '/^>/{{print \">contig_\" ++i; next}}{{print}}' > {output[1]}
"""
### General rules MG data
rule QUAST:
input:
"%s/MG_MT_cap3.merged.fa" % OUT_DIR
output: