Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
IMP
IMP_manuscript_analysis
Commits
c345607e
Commit
c345607e
authored
Oct 10, 2016
by
Shaman Narayanasamy
Browse files
Edit gitignore and add CAMI assembly files
parent
60215475
Changes
5
Show whitespace changes
Inline
Side-by-side
.gitignore
View file @
c345607e
...
...
@@ -83,3 +83,8 @@ prodigal_analysis/prodigal_summary.tsv
quast_analysis/
additional_analyses/HMP_gene_catalog/IMP2IGC_mapping.txt
additional_analyses/HMP_gene_catalog/OARlogs/
iterative_assembly/MG_assembly/oarlogs/
metaquast_analysis/oarlogs/*
naive_assembly/.snakemake/*
naive_assembly/oarlogs/*
additional_analyses/*~
iterative_assembly/MG_assembly/Snakefile_CAMI
0 → 100644
View file @
c345607e
import os
# Take the current step and calculate next step target
STEP = os.environ.get("STEP", "1")
STEPp1 = int(STEP) + 1
configfile: "config.iterative_MG.json"
#configfile: "../config.imp.json"
## Parameters required from the config file
MEMCORE = os.environ.get("MEMCORE", config['memory_per_core_gb'])
THREADS = os.environ.get("THREADS", config['threads'])
## Input and output parameters
OUT_LOG = os.environ.get("OUT_LOG")
INPUT_DIR = os.environ.get("INPUT_DIR")
OUT_DIR = os.environ.get("OUT_DIR")
TMP = os.environ.get("TMP")
### General rule for the iterative assemblies ###
rule ALL:
input:
"%s/mg_contigs_merged_%s.fa" % (OUT_DIR, STEPp1),
"%s/quast_1/report.txt" % OUT_DIR,
"%s/quast_1/report.tsv" % OUT_DIR,
"%s/quast_1/report.tex" % OUT_DIR,
"%s/quast_%s/report.txt" % (OUT_DIR, STEPp1),
"%s/quast_%s/report.tsv" % (OUT_DIR, STEPp1),
"%s/quast_%s/report.tex" % (OUT_DIR, STEPp1)
shell:
"echo 'DONE'"
### Rules for the initial assembly ###
rule INITIAL_ASSEMBLY:
input:
"%s/mg.r1.preprocessed.fq" % INPUT_DIR,
"%s/mg.r2.preprocessed.fq" % INPUT_DIR,
"%s/mg.se.preprocessed.fq" % INPUT_DIR
output:
"%s/mg_contigs_1.fa" % OUT_DIR
benchmark:
"%s/benchmarks/INITIAL_ASSEMBLY.json" % OUT_DIR
log:
OUT_LOG
priority: 50
shell:
"""
echo "[x] Running assembly {STEP} [x]"
mkdir {TMP}
echo "[x] Interleave mg fastq files"
TMPD=$(mktemp -d -t --tmpdir={TMP} "XXXXXX")
fq2fa --merge {input[0]} {input[1]} $TMPD/merged.fa
idba_ud -r $TMPD/merged.fa \
-l {input[2]} -o {OUT_DIR}/assembly_1 \
--mink {config[idba_ud][mink]} --maxk {config[idba_ud][maxk]} \
--step {config[idba_ud][step]} --num_threads {THREADS} \
--similar {config[idba_ud][perid]} --pre_correction \
>> {log} 2>&1
cp {OUT_DIR}/assembly_1/contig.fa {output}
"""
rule MERGE_INITIAL_ASSEMBLY:
input:
"%s/mg_contigs_1.fa" % OUT_DIR,
output:
"%s/mg_contigs_cat_1.fa" % OUT_DIR,
"%s/mg_contigs_merged_1.fa" % OUT_DIR
benchmark:
"%s/benchmarks/MERGE_INITIAL_ASSEMBLY.json" % OUT_DIR
log:
OUT_LOG
shell:
"""
echo "[x] Merging initial assembly contigs [x]"
cat {input} > {output[0]}
# Options should be after input file!
cap3 {output[0]} -p 98 -o 100 \
>> {log} 2>&1
# Concatenate assembled contigs, singletons and rename the contigs
cat {output[0]}.cap.contigs {output[0]}.cap.singlets | \
awk '/^>/{{print \">contig_\" ++i; next}}{{print}}' > {output[1]}
"""
rule QUAST_INITIAL_ASSEMBLY:
input:
"%s/mg_contigs_cat_1.fa" % OUT_DIR,
"%s/mg_contigs_merged_1.fa" % OUT_DIR
output:
"%s/quast_1/report.txt" % OUT_DIR,
"%s/quast_1/report.tsv" % OUT_DIR,
"%s/quast_1/report.tex" % OUT_DIR
benchmark:
"%s/benchmarks/QUAST_1.json" % (OUT_DIR)
log:
OUT_LOG
shell:
"""
METAQUAST="/mnt/nfs/projects/ecosystem_biology/local_tools/IMP/dependencies/quast/metaquast.py"
python2.7 $METAQUAST -t {THREADS} -o {OUT_DIR}/quast_1 \
{input[0]} \
--max-ref-number 0 --no-check --fast \
>> {log} 2>&1
METAQUAST="/mnt/nfs/projects/ecosystem_biology/local_tools/IMP/dependencies/quast/metaquast.py"
python2.7 $METAQUAST -t {THREADS} -o {OUT_DIR}/quast_1 \
{input[1]} \
--max-ref-number 0 --no-check --fast -f\
>> {log} 2>&1
"""
### General rules for iterations of assemblies ###
rule EXTRACT_UNMAPPED:
input:
"%s/mg_contigs_merged_%s.fa" % (OUT_DIR, STEP),
"%s/mg.r1.preprocessed.fq" % INPUT_DIR,
"%s/mg.r2.preprocessed.fq" % INPUT_DIR,
"%s/mg.se.preprocessed.fq" % INPUT_DIR
output:
'%s/mg.r1.unmapped_%s.fq' % (OUT_DIR, STEP),
'%s/mg.r2.unmapped_%s.fq' % (OUT_DIR, STEP),
'%s/mg.se.unmapped_%s.fq' % (OUT_DIR, STEP)
benchmark:
"%s/benchmarks/EXTRACT_UNMAPPED_%s.json" % (OUT_DIR, STEP)
log:
OUT_LOG
shell:
"""
echo "[x] Extracting unmapped from assembly {STEP} [x]"
echo "[x] Indexing assembly {STEP} [x]"
bwa index {input[0]}
echo "[x] Extracting unmapped reads [x]"
TMP_FILE=$(mktemp --tmpdir={TMP} -t "alignment_{STEP}-XXXX.bam")
BUFFER=$(mktemp --tmpdir={TMP} -t "alignment_buffer_{STEP}-XXXX.bam")
bwa mem -v 1 -t {THREADS} {input[0]} {input[1]} {input[2]} | samtools view -@ {THREADS} -bS - > $TMP_FILE
samtools merge -@ {THREADS} -u - \
<(samtools view -@ {THREADS} -u -f 4 -F 264 $TMP_FILE) \
<(samtools view -@ {THREADS} -u -f 8 -F 260 $TMP_FILE) \
<(samtools view -@ {THREADS} -u -f 12 -F 256 $TMP_FILE) | \
samtools view -@ {THREADS} -bF 0x800 - | samtools sort -o -@ {THREADS} -m {MEMCORE}G -n - $BUFFER | \
bamToFastq -i stdin -fq {output[0]} -fq2 {output[1]} \
>> {log} 2>&1
bwa mem -v 1 -t {THREADS} {input[0]} {input[3]} | \
samtools view -@ {THREADS} -bS - | samtools view -@ {THREADS} -uf 4 - | \
bamToFastq -i stdin -fq {output[2]} \
>> {log} 2>&1
"""
rule ASSEMBLY:
input:
"%s/mg.r1.unmapped_%s.fq" % (OUT_DIR, STEP),
"%s/mg.r2.unmapped_%s.fq" % (OUT_DIR, STEP),
"%s/mg.se.unmapped_%s.fq" % (OUT_DIR, STEP)
output:
"%s/mg_contigs_%s.fa" % (OUT_DIR, STEPp1)
benchmark:
"%s/benchmarks/ASSEMBLY_%s.json" % (OUT_DIR, STEPp1)
log:
OUT_LOG
shell:
"""
echo "[x] Running assembly {STEPp1} [x]"
echo "[x] Interleave mg fastq files"
TMPD=$(mktemp -d -t --tmpdir={TMP} "XXXXXX")
fq2fa --merge {input[0]} {input[1]} $TMPD/merged.fa
idba_ud -r $TMPD/merged.fa \
-l {input[2]} -o {OUT_DIR}/assembly_{STEPp1} \
--mink {config[idba_ud][mink]} --maxk {config[idba_ud][maxk]} \
--step {config[idba_ud][step]} --num_threads {THREADS} \
--similar {config[idba_ud][perid]} --pre_correction \
>> {log} 2>&1
cp {OUT_DIR}/assembly_{STEPp1}/contig.fa {output}
"""
rule MERGE_ASSEMBLIES:
input:
"%s/mg_contigs_merged_%s.fa" % (OUT_DIR, STEP),
"%s/mg_contigs_%s.fa" % (OUT_DIR, STEPp1)
output:
"%s/mg_contigs_cat_%s.fa" % (OUT_DIR, STEPp1),
"%s/mg_contigs_merged_%s.fa" % (OUT_DIR, STEPp1)
benchmark:
"%s/benchmarks/MERGE_ASSEMBLIES_%s.json" % (OUT_DIR, STEPp1)
log:
OUT_LOG
shell:
"""
echo "[x] Merging assemblies {STEP} and {STEPp1} [x]"
NAME={OUT_DIR}/mg.assembly
cat {input} > {output[0]}
# Options should be after input file!
cap3 {output[0]} -p 98 -o 100 \
>> {log} 2>&1
# Concatenate assembled contigs, singletons and rename the contigs
cat {output[0]}.cap.contigs {output[0]}.cap.singlets | \
awk '/^>/{{print \">contig_\" ++i; next}}{{print}}' > {output[1]}
"""
rule QUAST:
input:
"%s/mg_contigs_cat_%s.fa" % (OUT_DIR, STEPp1),
"%s/mg_contigs_merged_%s.fa" % (OUT_DIR, STEPp1)
output:
"%s/quast_%s/report.txt" % (OUT_DIR, STEPp1),
"%s/quast_%s/report.tsv" % (OUT_DIR, STEPp1),
"%s/quast_%s/report.tex" % (OUT_DIR, STEPp1)
benchmark:
"%s/benchmarks/QUAST_%s.json" % (OUT_DIR, STEPp1)
log:
OUT_LOG
shell:
"""
METAQUAST="/mnt/nfs/projects/ecosystem_biology/local_tools/IMP/dependencies/quast/metaquast.py"
python2.7 $METAQUAST -t {THREADS} -o {OUT_DIR}/quast_{STEPp1} \
{input[0]} \
--max-ref-number 0 --no-check --fast \
>> {log} 2>&1
METAQUAST="/mnt/nfs/projects/ecosystem_biology/local_tools/IMP/dependencies/quast/metaquast.py"
python2.7 $METAQUAST -t {THREADS} -o {OUT_DIR}/quast_{STEPp1} \
{input[1]} \
--max-ref-number 0 --no-check --fast -f \
>> {log} 2>&1
"""
iterative_assembly/MG_assembly/config.iterative_MG-gaia183.json
0 → 100644
View file @
c345607e
{
"threads"
:
20
,
"memory_total_gb"
:
640
,
"memory_per_core_gb"
:
32
,
"tmp_dir"
:
"tmp"
,
"imp_src"
:
"src"
,
"raws"
:
{
"Metagenomics"
:
""
,
"Metatranscriptomics"
:
""
},
"sample"
:
"test"
,
"outputdir"
:
"/output"
,
"db_path"
:
"/databases"
,
"preprocessing_filtering"
:
true
,
"assembler"
:
"idba"
,
"trimmomatic"
:
{
"pkg_url"
:
"https://webdav-r3lab.uni.lu/public/R3lab/IMP/Trimmomatic-Src-0.32.zip"
,
"adapter"
:
"TruSeq3"
,
"leading"
:
20
,
"minlen"
:
40
,
"palindrome_clip_threshold"
:
30
,
"simple_clip_threshold"
:
10
,
"trailing"
:
20
,
"seed_mismatch"
:
2
,
"window_size"
:
1
,
"window_quality"
:
3
,
"strictness"
:
0.5
,
"target_length"
:
40
,
"jarfile"
:
"/home/imp/lib/trimmomatic-1.32.jar"
},
"idba_ud"
:
{
"mink"
:
25
,
"maxk"
:
99
,
"step"
:
4
,
"perid"
:
0.98
"cap3"
:
{
"identity"
:
100
,
"overlap"
:
1000
},
"vizbin"
:
{
"dimension"
:
50
,
"kmer"
:
5
,
"size"
:
4
,
"theta"
:
0.5
,
"perp"
:
30
,
"cutoff"
:
1000
,
"jarfile"
:
"/home/imp/lib/VizBin-dist.jar"
},
"human_filtering"
:
{
"filter"
:
"hg38"
,
"url"
:
"http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz"
},
"sortmerna"
:
{
"pkg_url"
:
"https://webdav-r3lab.uni.lu/public/R3lab/IMP/sortmerna.2.0.tgz"
,
"scripts_path"
:
"/home/imp/lib"
,
"files"
:
[
"rfam-5.8s-database-id98"
,
"silva-arc-16s-id95"
,
"silva-bac-16s-id90"
,
"silva-euk-18s-id95"
,
"rfam-5s-database-id98"
,
"silva-arc-23s-id98"
,
"silva-bac-23s-id98"
,
"silva-euk-28s-id98"
]
},
"prokka"
:
{
"pkg_url"
:
"https://webdav-r3lab.uni.lu/public/R3lab/IMP/prokka-1.11.tar.gz"
,
"databases"
:
[
"cm/Bacteria.i1i"
,
"genus/Staphylococcus.phr"
,
"hmm/CLUSTERS.hmm.h3f"
,
"kingdom/Archaea/sprot.phr"
]
},
"kegg"
:
{
"db_ec2pthy"
:
"http://rest.kegg.jp/link/ec/pathway"
,
"db_hierarchy"
:
"http://rest.kegg.jp/list/pathway"
}
}
iterative_assembly/MG_assembly/execution_CAMI.sh
0 → 100755
View file @
c345607e
#!/bin/bash -l
source
../preload_modules.sh
INDIR
=
$1
OUTDIR
=
$2
OUTLOG
=
$3
TMPDIR
=
$4
date
### Initialize first assembly ###
INPUT_DIR
=
$INDIR
\
OUT_DIR
=
$OUTDIR
\
OUT_LOG
=
$OUTLOG
\
TMP
=
$TMPDIR
\
snakemake
-s
Snakefile_CAMI ALL
INPUT_DIR
=
$INDIR
\
OUT_DIR
=
$OUTDIR
\
OUT_LOG
=
$OUTLOG
\
TMP
=
$TMPDIR
\
snakemake
-s
Snakefile_CAMI EXTRACT_UNMAPPED
for
ITER
in
{
2..5
}
do
echo
"Iteration
$ITER
"
INPUT_DIR
=
$INDIR
\
OUT_DIR
=
$OUTDIR
\
OUT_LOG
=
$OUTLOG
\
TMP
=
$TMPDIR
\
STEP
=
${
ITER
}
snakemake
-s
Snakefile_CAMI ALL
done
### Join tables
OUT_DIR
=
$OUTDIR
cat
<
(
cat
${
OUT_DIR
}
/quast_1/transposed_report.tsv
)
\
<
(
tail
-n1
${
OUT_DIR
}
/quast_[2-6]/transposed_report.tsv |
\
grep
-v
"==>"
|
grep
-v
"^$"
)
>
${
OUT_DIR
}
/full_report.tsv
ls
${
OUT_DIR
}
/MG_contigs_cat_[1-6].fa | xargs
-I
{}
getN50sizemax
{}
|
\
cut
-f1
,2,3,4,6 |
\
sed
-e
's/NUM=//g'
|
\
sed
-e
's/AVG=//g'
|
\
sed
-e
's/N50=//g'
|
\
sed
-e
's/GENOMESIZE=//g'
|
\
sed
-e
's/MAX=//g'
\
>>
${
OUT_DIR
}
/uncollapsed_tmp.tsv
paste
<
(
ls
${
OUT_DIR
}
/MG_contigs_cat_[1-6].fa
)
${
OUT_DIR
}
/uncollapsed_tmp.tsv
>
${
OUT_DIR
}
/uncollapsed_tmp2.tsv
cat
<
(
echo
-e
"assembly
\t
contig_no
\t
avg_contig_len
\t
N50
\t
genome_len
\t
max_contig_len"
)
<
(
cat
${
OUT_DIR
}
/uncollapsed_tmp2.tsv
)
>
${
OUT_DIR
}
/uncollapsed_contigs_stats.tsv
ls
${
OUT_DIR
}
/MG_contigs_merged_[1-6].fa | xargs
-I
{}
getN50sizemax
{}
|
\
cut
-f2
,4 |
\
sed
-e
's/AVG=//g'
|
\
sed
-e
's/GENOMESIZE=//g'
\
>>
${
OUT_DIR
}
/merged_contigs_tmp.tsv
paste
<
(
cat
full_report.tsv
)
<
(
cat
<
(
echo
-e
"avg_contig_len
\t
genome_len"
)
<
(
cat
${
OUT_DIR
}
/merged_contigs_tmp.tsv
))
>
${
OUT_DIR
}
/collapsed_contigs_stats.tsv
rm
-rf
${
OUT_DIR
}
/uncollapsed_tmp2.tsv
${
OUT_DIR
}
/uncollapsed_tmp.tsv
${
OUT_DIR
}
/merged_contigs_tmp.tsv
### Count reads within the fastq files
INPUT_DIR
=
$INDIR
wc
-l
${
INPUT_DIR
}
/MG.R1.preprocessed.fq |
grep
-v
"total"
>
${
OUT_DIR
}
/pairs-raw_counts.tsv
wc
-l
${
OUT_DIR
}
/MG.R1.unmapped_[1-6].fq |
grep
-v
"total"
>>
${
OUT_DIR
}
/pairs-raw_counts.tsv
wc
-l
${
INPUT_DIR
}
/MG.SE.preprocessed.fq |
grep
-v
"total"
>
${
OUT_DIR
}
/singles-raw_counts.tsv
wc
-l
${
OUT_DIR
}
/MG.SE.unmapped_[1-6].fq |
grep
-v
"total"
>>
${
OUT_DIR
}
/singles-raw_counts.tsv
awk
'{print $2, $1/4}'
${
OUT_DIR
}
/pairs-raw_counts.tsv
>
${
OUT_DIR
}
/pair_counts.tsv
awk
'{print $2, $1/4}'
${
OUT_DIR
}
/singles-raw_counts.tsv
>
${
OUT_DIR
}
/single_counts.tsv
date
iterative_assembly/MG_assembly/launcher_CAMI.sh
0 → 100755
View file @
c345607e
#!/bin/bash -l
#OARSUB="oarsub --notify "mail:shaman.narayanasamy@uni.lu" -t bigmem -t idempotent -t besteffort -l core=24/nodes=1,walltime=120"
OARSUB
=
"oarsub --notify "
mail:shaman.narayanasamy@uni.lu
" -t bigmem -p "
network_address
=
\'
gaia-183
\'
" --project "
project_biocore
" -l core=20/nodes=1,walltime=120"
#OARSUB=""
#
#oarsub --notify "mail:shaman.narayanasamy@uni.lu" -n "iterative_assm_HMP" -t bigsmp -t idempotent -t besteffort -l nodes=1/core=24,walltime=120 ./execution_X310763260.sh
#
#oarsub --notify "mail:shaman.narayanasamy@uni.lu" -n "iterative_assm_A02" -t bigsmp -t idempotent -t besteffort -l nodes=1/core=24,walltime=120 ./execution_A02.sh
#
#oarsub --notify "mail:shaman.narayanasamy@uni.lu" -n "simDat-iterative_assm" -t bigsmp -t idempotent -t besteffort -l nodes=1/core=24,walltime=120 ./execution_simDat.sh
#
#declare -a SAMPLES=("CAMI_low" "CAMI_medium" "CAMI_high")
declare
-a
SAMPLES
=(
"CAMI_medium"
"CAMI_high"
)
### Repeat for all the data sets
for
S
in
"
${
SAMPLES
[@]
}
"
do
INDIR
=
"/scratch/users/snarayanasamy/IMP_MS_data/IMP_analysis/CAMI/
${
S
}
-idba/Preprocessing/"
OUTDIR
=
"/scratch/users/snarayanasamy/IMP_MS_data/iterative_assemblies/CAMI/MG_assemblies/
${
S
}
-iterative_MG"
OUTLOG
=
"/scratch/users/snarayanasamy/IMP_MS_data/iterative_assemblies/CAMI/MG_assemblies/
${
S
}
-iterative_MG/
${
S
}
_iterative_MG.log"
TMPDIR
=
"/scratch/users/snarayanasamy/IMP_MS_data/iterative_assemblies/CAMI/MG_assemblies/
${
S
}
-iterative_MG/tmp"
${
OARSUB
}
-n
"
${
S
}
_MG_iterative"
"./execution_CAMI.sh
$INDIR
$OUTDIR
$OUTLOG
$TMPDIR
"
#CMD="./execution.sh $INDIR $OUTDIR $OUTLOG $TMPDIR"
#echo $CMD
#exec ${CMD}
done
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment