Commit c345607e authored by Shaman Narayanasamy's avatar Shaman Narayanasamy
Browse files

Edit gitignore and add CAMI assembly files

parent 60215475
...@@ -83,3 +83,8 @@ prodigal_analysis/prodigal_summary.tsv
quast_analysis/
additional_analyses/HMP_gene_catalog/IMP2IGC_mapping.txt
additional_analyses/HMP_gene_catalog/OARlogs/
iterative_assembly/MG_assembly/oarlogs/
metaquast_analysis/oarlogs/*
naive_assembly/.snakemake/*
naive_assembly/oarlogs/*
additional_analyses/*~
import os

# Iterative-assembly Snakefile: STEP is the current iteration number
# (defaults to "1" when the STEP environment variable is unset);
# STEPp1 names the outputs produced by this run.
# Take the current step and calculate next step target
STEP = os.environ.get("STEP", "1")
STEPp1 = int(STEP) + 1

configfile: "config.iterative_MG.json"
#configfile: "../config.imp.json"

## Parameters required from the config file
# Environment variables, when set, override the config-file values.
MEMCORE = os.environ.get("MEMCORE", config['memory_per_core_gb'])
THREADS = os.environ.get("THREADS", config['threads'])

## Input and output parameters
# All four are expected to be exported by the calling script
# (see execution_CAMI.sh); each is None when unset.
OUT_LOG = os.environ.get("OUT_LOG")
INPUT_DIR = os.environ.get("INPUT_DIR")
OUT_DIR = os.environ.get("OUT_DIR")
TMP = os.environ.get("TMP")
### General rule for the iterative assemblies ###
# Top-level target: requires the next iteration's merged contigs plus the
# QUAST reports for both the initial assembly (quast_1) and the next step
# (quast_STEPp1). Requesting ALL therefore drives one full iteration.
rule ALL:
    input:
        "%s/mg_contigs_merged_%s.fa" % (OUT_DIR, STEPp1),
        "%s/quast_1/report.txt" % OUT_DIR,
        "%s/quast_1/report.tsv" % OUT_DIR,
        "%s/quast_1/report.tex" % OUT_DIR,
        "%s/quast_%s/report.txt" % (OUT_DIR, STEPp1),
        "%s/quast_%s/report.tsv" % (OUT_DIR, STEPp1),
        "%s/quast_%s/report.tex" % (OUT_DIR, STEPp1)
    shell:
        "echo 'DONE'"
### Rules for the initial assembly ###
rule INITIAL_ASSEMBLY:
input:
"%s/mg.r1.preprocessed.fq" % INPUT_DIR,
"%s/mg.r2.preprocessed.fq" % INPUT_DIR,
"%s/mg.se.preprocessed.fq" % INPUT_DIR
output:
"%s/mg_contigs_1.fa" % OUT_DIR
benchmark:
"%s/benchmarks/INITIAL_ASSEMBLY.json" % OUT_DIR
log:
OUT_LOG
priority: 50
shell:
"""
echo "[x] Running assembly {STEP} [x]"
mkdir {TMP}
echo "[x] Interleave mg fastq files"
TMPD=$(mktemp -d -t --tmpdir={TMP} "XXXXXX")
fq2fa --merge {input[0]} {input[1]} $TMPD/merged.fa
idba_ud -r $TMPD/merged.fa \
-l {input[2]} -o {OUT_DIR}/assembly_1 \
--mink {config[idba_ud][mink]} --maxk {config[idba_ud][maxk]} \
--step {config[idba_ud][step]} --num_threads {THREADS} \
--similar {config[idba_ud][perid]} --pre_correction \
>> {log} 2>&1
cp {OUT_DIR}/assembly_1/contig.fa {output}
"""
# Collapse redundant contigs of the first assembly with cap3, then
# concatenate the cap3 contigs and singletons and rename them contig_1..N.
rule MERGE_INITIAL_ASSEMBLY:
    input:
        "%s/mg_contigs_1.fa" % OUT_DIR,
    output:
        "%s/mg_contigs_cat_1.fa" % OUT_DIR,
        "%s/mg_contigs_merged_1.fa" % OUT_DIR
    benchmark:
        "%s/benchmarks/MERGE_INITIAL_ASSEMBLY.json" % OUT_DIR
    log:
        OUT_LOG
    shell:
        """
        echo "[x] Merging initial assembly contigs [x]"
        cat {input} > {output[0]}
        # Options should be after input file!
        # NOTE(review): -p 98 -o 100 here, but config["cap3"] says
        # identity 100 / overlap 1000 — confirm which is intended.
        cap3 {output[0]} -p 98 -o 100 \
        >> {log} 2>&1
        # Concatenate assembled contigs, singletons and rename the contigs
        cat {output[0]}.cap.contigs {output[0]}.cap.singlets | \
        awk '/^>/{{print \">contig_\" ++i; next}}{{print}}' > {output[1]}
        """
rule QUAST_INITIAL_ASSEMBLY:
input:
"%s/mg_contigs_cat_1.fa" % OUT_DIR,
"%s/mg_contigs_merged_1.fa" % OUT_DIR
output:
"%s/quast_1/report.txt" % OUT_DIR,
"%s/quast_1/report.tsv" % OUT_DIR,
"%s/quast_1/report.tex" % OUT_DIR
benchmark:
"%s/benchmarks/QUAST_1.json" % (OUT_DIR)
log:
OUT_LOG
shell:
"""
METAQUAST="/mnt/nfs/projects/ecosystem_biology/local_tools/IMP/dependencies/quast/metaquast.py"
python2.7 $METAQUAST -t {THREADS} -o {OUT_DIR}/quast_1 \
{input[0]} \
--max-ref-number 0 --no-check --fast \
>> {log} 2>&1
METAQUAST="/mnt/nfs/projects/ecosystem_biology/local_tools/IMP/dependencies/quast/metaquast.py"
python2.7 $METAQUAST -t {THREADS} -o {OUT_DIR}/quast_1 \
{input[1]} \
--max-ref-number 0 --no-check --fast -f\
>> {log} 2>&1
"""
### General rules for iterations of assemblies ###
# Map the preprocessed reads back onto the current merged assembly and keep
# only reads (pairs) that did not map; these seed the next assembly round.
rule EXTRACT_UNMAPPED:
    input:
        "%s/mg_contigs_merged_%s.fa" % (OUT_DIR, STEP),
        "%s/mg.r1.preprocessed.fq" % INPUT_DIR,
        "%s/mg.r2.preprocessed.fq" % INPUT_DIR,
        "%s/mg.se.preprocessed.fq" % INPUT_DIR
    output:
        '%s/mg.r1.unmapped_%s.fq' % (OUT_DIR, STEP),
        '%s/mg.r2.unmapped_%s.fq' % (OUT_DIR, STEP),
        '%s/mg.se.unmapped_%s.fq' % (OUT_DIR, STEP)
    benchmark:
        "%s/benchmarks/EXTRACT_UNMAPPED_%s.json" % (OUT_DIR, STEP)
    log:
        OUT_LOG
    shell:
        """
        echo "[x] Extracting unmapped from assembly {STEP} [x]"
        echo "[x] Indexing assembly {STEP} [x]"
        bwa index {input[0]}
        echo "[x] Extracting unmapped reads [x]"
        TMP_FILE=$(mktemp --tmpdir={TMP} -t "alignment_{STEP}-XXXX.bam")
        BUFFER=$(mktemp --tmpdir={TMP} -t "alignment_buffer_{STEP}-XXXX.bam")
        # Align the pairs once and keep the BAM for the flag-based splits below.
        bwa mem -v 1 -t {THREADS} {input[0]} {input[1]} {input[2]} | samtools view -@ {THREADS} -bS - > $TMP_FILE
        # Union of the three "pair not fully mapped" cases (SAM flags):
        #   -f 4  -F 264 : read unmapped, mate mapped, not secondary
        #   -f 8  -F 260 : read mapped, mate unmapped, not secondary
        #   -f 12 -F 256 : both mates unmapped
        # then drop supplementary alignments (0x800), name-sort, and emit
        # paired FASTQ. NOTE(review): `samtools sort -o ... - $BUFFER` is the
        # pre-1.0 samtools calling convention (-o = write to stdout, trailing
        # output prefix) — confirm the installed samtools supports it.
        samtools merge -@ {THREADS} -u - \
        <(samtools view -@ {THREADS} -u -f 4 -F 264 $TMP_FILE) \
        <(samtools view -@ {THREADS} -u -f 8 -F 260 $TMP_FILE) \
        <(samtools view -@ {THREADS} -u -f 12 -F 256 $TMP_FILE) | \
        samtools view -@ {THREADS} -bF 0x800 - | samtools sort -o -@ {THREADS} -m {MEMCORE}G -n - $BUFFER | \
        bamToFastq -i stdin -fq {output[0]} -fq2 {output[1]} \
        >> {log} 2>&1
        # Single-end reads: keep only unmapped records (flag 4).
        bwa mem -v 1 -t {THREADS} {input[0]} {input[3]} | \
        samtools view -@ {THREADS} -bS - | samtools view -@ {THREADS} -uf 4 - | \
        bamToFastq -i stdin -fq {output[2]} \
        >> {log} 2>&1
        """
# Assemble the reads left unmapped by iteration STEP into the contigs of
# iteration STEPp1. Mirrors INITIAL_ASSEMBLY but consumes the unmapped
# read sets and writes into assembly_STEPp1 (TMP already exists by now).
rule ASSEMBLY:
    input:
        "%s/mg.r1.unmapped_%s.fq" % (OUT_DIR, STEP),
        "%s/mg.r2.unmapped_%s.fq" % (OUT_DIR, STEP),
        "%s/mg.se.unmapped_%s.fq" % (OUT_DIR, STEP)
    output:
        "%s/mg_contigs_%s.fa" % (OUT_DIR, STEPp1)
    benchmark:
        "%s/benchmarks/ASSEMBLY_%s.json" % (OUT_DIR, STEPp1)
    log:
        OUT_LOG
    shell:
        """
        echo "[x] Running assembly {STEPp1} [x]"
        echo "[x] Interleave mg fastq files"
        TMPD=$(mktemp -d -t --tmpdir={TMP} "XXXXXX")
        # idba_ud takes one interleaved FASTA for the pairs (-r).
        fq2fa --merge {input[0]} {input[1]} $TMPD/merged.fa
        idba_ud -r $TMPD/merged.fa \
        -l {input[2]} -o {OUT_DIR}/assembly_{STEPp1} \
        --mink {config[idba_ud][mink]} --maxk {config[idba_ud][maxk]} \
        --step {config[idba_ud][step]} --num_threads {THREADS} \
        --similar {config[idba_ud][perid]} --pre_correction \
        >> {log} 2>&1
        cp {OUT_DIR}/assembly_{STEPp1}/contig.fa {output}
        """
rule MERGE_ASSEMBLIES:
input:
"%s/mg_contigs_merged_%s.fa" % (OUT_DIR, STEP),
"%s/mg_contigs_%s.fa" % (OUT_DIR, STEPp1)
output:
"%s/mg_contigs_cat_%s.fa" % (OUT_DIR, STEPp1),
"%s/mg_contigs_merged_%s.fa" % (OUT_DIR, STEPp1)
benchmark:
"%s/benchmarks/MERGE_ASSEMBLIES_%s.json" % (OUT_DIR, STEPp1)
log:
OUT_LOG
shell:
"""
echo "[x] Merging assemblies {STEP} and {STEPp1} [x]"
NAME={OUT_DIR}/mg.assembly
cat {input} > {output[0]}
# Options should be after input file!
cap3 {output[0]} -p 98 -o 100 \
>> {log} 2>&1
# Concatenate assembled contigs, singletons and rename the contigs
cat {output[0]}.cap.contigs {output[0]}.cap.singlets | \
awk '/^>/{{print \">contig_\" ++i; next}}{{print}}' > {output[1]}
"""
rule QUAST:
input:
"%s/mg_contigs_cat_%s.fa" % (OUT_DIR, STEPp1),
"%s/mg_contigs_merged_%s.fa" % (OUT_DIR, STEPp1)
output:
"%s/quast_%s/report.txt" % (OUT_DIR, STEPp1),
"%s/quast_%s/report.tsv" % (OUT_DIR, STEPp1),
"%s/quast_%s/report.tex" % (OUT_DIR, STEPp1)
benchmark:
"%s/benchmarks/QUAST_%s.json" % (OUT_DIR, STEPp1)
log:
OUT_LOG
shell:
"""
METAQUAST="/mnt/nfs/projects/ecosystem_biology/local_tools/IMP/dependencies/quast/metaquast.py"
python2.7 $METAQUAST -t {THREADS} -o {OUT_DIR}/quast_{STEPp1} \
{input[0]} \
--max-ref-number 0 --no-check --fast \
>> {log} 2>&1
METAQUAST="/mnt/nfs/projects/ecosystem_biology/local_tools/IMP/dependencies/quast/metaquast.py"
python2.7 $METAQUAST -t {THREADS} -o {OUT_DIR}/quast_{STEPp1} \
{input[1]} \
--max-ref-number 0 --no-check --fast -f \
>> {log} 2>&1
"""
{
"threads": 20,
"memory_total_gb": 640,
"memory_per_core_gb": 32,
"tmp_dir": "tmp",
"imp_src": "src",
"raws": {
"Metagenomics": "",
"Metatranscriptomics": ""
},
"sample": "test",
"outputdir": "/output",
"db_path": "/databases",
"preprocessing_filtering": true,
"assembler": "idba",
"trimmomatic": {
"pkg_url": "https://webdav-r3lab.uni.lu/public/R3lab/IMP/Trimmomatic-Src-0.32.zip",
"adapter": "TruSeq3",
"leading": 20,
"minlen": 40,
"palindrome_clip_threshold": 30,
"simple_clip_threshold": 10,
"trailing": 20,
"seed_mismatch": 2,
"window_size": 1,
"window_quality": 3,
"strictness": 0.5,
"target_length": 40,
"jarfile": "/home/imp/lib/trimmomatic-1.32.jar"
},
"idba_ud": {
"mink": 25,
"maxk": 99,
"step": 4,
"perid": 0.98
"cap3": {
"identity": 100,
"overlap": 1000
},
"vizbin": {
"dimension": 50,
"kmer": 5,
"size": 4,
"theta": 0.5,
"perp": 30,
"cutoff": 1000,
"jarfile": "/home/imp/lib/VizBin-dist.jar"
},
"human_filtering": {
"filter": "hg38",
"url": "http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz"
},
"sortmerna": {
"pkg_url": "https://webdav-r3lab.uni.lu/public/R3lab/IMP/sortmerna.2.0.tgz",
"scripts_path": "/home/imp/lib",
"files": [
"rfam-5.8s-database-id98",
"silva-arc-16s-id95",
"silva-bac-16s-id90",
"silva-euk-18s-id95",
"rfam-5s-database-id98",
"silva-arc-23s-id98",
"silva-bac-23s-id98",
"silva-euk-28s-id98"
]
},
"prokka": {
"pkg_url": "https://webdav-r3lab.uni.lu/public/R3lab/IMP/prokka-1.11.tar.gz",
"databases": [
"cm/Bacteria.i1i",
"genus/Staphylococcus.phr",
"hmm/CLUSTERS.hmm.h3f",
"kingdom/Archaea/sprot.phr"
]
},
"kegg": {
"db_ec2pthy": "http://rest.kegg.jp/link/ec/pathway",
"db_hierarchy": "http://rest.kegg.jp/list/pathway"
}
}
#!/bin/bash -l
# Run the full CAMI iterative-assembly pipeline for one sample and collate
# summary statistics afterwards.
# Usage: execution_CAMI.sh INDIR OUTDIR OUTLOG TMPDIR
source ../preload_modules.sh

INDIR=$1
OUTDIR=$2
OUTLOG=$3
TMPDIR=$4

date

### Initialize first assembly (STEP defaults to 1 inside the Snakefile) ###
INPUT_DIR=$INDIR \
OUT_DIR=$OUTDIR \
OUT_LOG=$OUTLOG \
TMP=$TMPDIR \
snakemake -s Snakefile_CAMI ALL

INPUT_DIR=$INDIR \
OUT_DIR=$OUTDIR \
OUT_LOG=$OUTLOG \
TMP=$TMPDIR \
snakemake -s Snakefile_CAMI EXTRACT_UNMAPPED

### Iterative assemblies: STEP=N consumes assembly N and produces N+1 ###
for ITER in {2..5}
do
  echo "Iteration $ITER"
  INPUT_DIR=$INDIR \
  OUT_DIR=$OUTDIR \
  OUT_LOG=$OUTLOG \
  TMP=$TMPDIR \
  STEP=${ITER} snakemake -s Snakefile_CAMI ALL
done

### Join tables
OUT_DIR=$OUTDIR
cat <(cat ${OUT_DIR}/quast_1/transposed_report.tsv) \
<(tail -n1 ${OUT_DIR}/quast_[2-6]/transposed_report.tsv | \
grep -v "==>" | grep -v "^$") > ${OUT_DIR}/full_report.tsv

# NOTE(review): the Snakefile writes lower-case names (mg_contigs_cat_*.fa,
# mg.r1.*, mg.se.*); the original globbed upper-case MG_* paths, which match
# nothing on a case-sensitive filesystem. Lower-cased throughout below.
ls ${OUT_DIR}/mg_contigs_cat_[1-6].fa | xargs -I{} getN50sizemax {} | \
cut -f1,2,3,4,6 | \
sed -e 's/NUM=//g' | \
sed -e 's/AVG=//g' | \
sed -e 's/N50=//g' | \
sed -e 's/GENOMESIZE=//g' | \
sed -e 's/MAX=//g' \
>> ${OUT_DIR}/uncollapsed_tmp.tsv
paste <(ls ${OUT_DIR}/mg_contigs_cat_[1-6].fa) ${OUT_DIR}/uncollapsed_tmp.tsv > ${OUT_DIR}/uncollapsed_tmp2.tsv
cat <(echo -e "assembly\tcontig_no\tavg_contig_len\tN50\tgenome_len\tmax_contig_len") <(cat ${OUT_DIR}/uncollapsed_tmp2.tsv) > ${OUT_DIR}/uncollapsed_contigs_stats.tsv

ls ${OUT_DIR}/mg_contigs_merged_[1-6].fa | xargs -I{} getN50sizemax {} | \
cut -f2,4 | \
sed -e 's/AVG=//g' | \
sed -e 's/GENOMESIZE=//g' \
>> ${OUT_DIR}/merged_contigs_tmp.tsv
# Bug fix: full_report.tsv was written to ${OUT_DIR} above, but the original
# read it from the current working directory.
paste <(cat ${OUT_DIR}/full_report.tsv) <(cat <(echo -e "avg_contig_len\tgenome_len") <(cat ${OUT_DIR}/merged_contigs_tmp.tsv)) > ${OUT_DIR}/collapsed_contigs_stats.tsv
rm -rf ${OUT_DIR}/uncollapsed_tmp2.tsv ${OUT_DIR}/uncollapsed_tmp.tsv ${OUT_DIR}/merged_contigs_tmp.tsv

### Count reads within the fastq files (4 lines per FASTQ record)
INPUT_DIR=$INDIR
wc -l ${INPUT_DIR}/mg.r1.preprocessed.fq | grep -v "total" > ${OUT_DIR}/pairs-raw_counts.tsv
wc -l ${OUT_DIR}/mg.r1.unmapped_[1-6].fq | grep -v "total" >> ${OUT_DIR}/pairs-raw_counts.tsv
wc -l ${INPUT_DIR}/mg.se.preprocessed.fq | grep -v "total" > ${OUT_DIR}/singles-raw_counts.tsv
wc -l ${OUT_DIR}/mg.se.unmapped_[1-6].fq | grep -v "total" >> ${OUT_DIR}/singles-raw_counts.tsv
awk '{print $2, $1/4}' ${OUT_DIR}/pairs-raw_counts.tsv > ${OUT_DIR}/pair_counts.tsv
awk '{print $2, $1/4}' ${OUT_DIR}/singles-raw_counts.tsv > ${OUT_DIR}/single_counts.tsv

date
#!/bin/bash -l
# Submit one OAR job per CAMI sample, each running execution_CAMI.sh.
#
# The oarsub command is kept as a bash array so the quoted arguments
# (notification address, -p property expression, project name) survive
# expansion intact; the original flat-string variable relied on fragile
# nested quoting plus unquoted word splitting. The -p value intentionally
# contains literal single quotes around the host name, exactly as the
# original delivered them to oarsub.
#OARSUB=(oarsub --notify "mail:shaman.narayanasamy@uni.lu" -t bigmem -t idempotent -t besteffort -l core=24/nodes=1,walltime=120)
OARSUB=(oarsub --notify "mail:shaman.narayanasamy@uni.lu" -t bigmem -p "network_address='gaia-183'" --project "project_biocore" -l core=20/nodes=1,walltime=120)

# Earlier one-off submissions kept for reference:
#oarsub --notify "mail:shaman.narayanasamy@uni.lu" -n "iterative_assm_HMP" -t bigsmp -t idempotent -t besteffort -l nodes=1/core=24,walltime=120 ./execution_X310763260.sh
#oarsub --notify "mail:shaman.narayanasamy@uni.lu" -n "iterative_assm_A02" -t bigsmp -t idempotent -t besteffort -l nodes=1/core=24,walltime=120 ./execution_A02.sh
#oarsub --notify "mail:shaman.narayanasamy@uni.lu" -n "simDat-iterative_assm" -t bigsmp -t idempotent -t besteffort -l nodes=1/core=24,walltime=120 ./execution_simDat.sh

#declare -a SAMPLES=("CAMI_low" "CAMI_medium" "CAMI_high")
declare -a SAMPLES=("CAMI_medium" "CAMI_high")

### Repeat for all the data sets
for S in "${SAMPLES[@]}"
do
  INDIR="/scratch/users/snarayanasamy/IMP_MS_data/IMP_analysis/CAMI/${S}-idba/Preprocessing/"
  OUTDIR="/scratch/users/snarayanasamy/IMP_MS_data/iterative_assemblies/CAMI/MG_assemblies/${S}-iterative_MG"
  OUTLOG="/scratch/users/snarayanasamy/IMP_MS_data/iterative_assemblies/CAMI/MG_assemblies/${S}-iterative_MG/${S}_iterative_MG.log"
  TMPDIR="/scratch/users/snarayanasamy/IMP_MS_data/iterative_assemblies/CAMI/MG_assemblies/${S}-iterative_MG/tmp"
  "${OARSUB[@]}" -n "${S}_MG_iterative" "./execution_CAMI.sh $INDIR $OUTDIR $OUTLOG $TMPDIR"
  #CMD="./execution.sh $INDIR $OUTDIR $OUTLOG $TMPDIR"
  #echo $CMD
  #exec ${CMD}
done
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment