Commit ecaef2e6 authored by Shaman Narayanasamy

Merge branch 'declutter_refine_plus_test' into 'master'

Declutter refine plus test

See merge request shaman.narayanasamy/LAO-time-series!3
parents 64cad40b af319194
@@ -84,5 +84,8 @@
   },
   "metaquast": {
     "max_genomes": 2
-  }
+  },
+  "nonpareil": {
+    "read_samples": 1000000
+  }
 }
#!/bin/bash -l
#OAR -n test_01
#OAR -l nodes=1/core=8,walltime=120
####
## Launcher to run the binning, taking the output files of IMP as input.
####
source /home/users/smartinezarbas/git/gitlab/LAO-time-series/src/preload_modules.sh
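## snakemake flags used below: -j 8 allows up to 8 parallel jobs, -p prints the
## shell commands as they run, -f forces re-creation of the named target, and
## -s points to the Snakefile.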
THREADS=1 snakemake -j 8 -pf workflow_binning.done -s workflows/Binning
#!/bin/bash -l
#source /home/snarayanasamy/Work/tools/parallel_functions.sh
#IMP="/home/snarayanasamy/Work/tools/IMP/IMP"
#CONF="/home/snarayanasamy/Work/tools/IMP/conf/LAO_binning_bigbug_config.imp.json"
#DB="/home/snarayanasamy/Work/tools/IMP/db"
ROOTOUT="/scratch/users/snarayanasamy/LAO_TS"
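## Assumed format of LAO_time_series_gaia.txt (inferred from the loop below):
## one sample per line, sample ID in the first whitespace-delimited field;
## lines starting with '#' are treated as comments and skipped.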
grep -v "^#" LAO_time_series_gaia.txt | while read -r DATA
do
    SAMPLE=$(echo "$DATA" | cut -d' ' -f1)
    INDIR="${ROOTOUT}/${SAMPLE}"
    OUTDIR="${ROOTOUT}/${SAMPLE}"
    CMD="./execution.sh $SAMPLE $INDIR"
    echo "$CMD"
done
#!/bin/bash -l
#CMD_LAUNCH="oarsub --notify "mail:shaman.narayanasamy@uni.lu" -l core=12/nodes=1,walltime=120 -t bigmem -t idempotent -t besteffort"
#CMD_LAUNCH="oarsub --notify "mail:shaman.narayanasamy@uni.lu" -l core=12/nodes=1,walltime=120 -t idempotent -t besteffort -t bigsmp -p "network_address='gaia-80'""
#CMD_LAUNCH="oarsub --notify \"mail:shaman.narayanasamy@uni.lu\" -l core=12/nodes=1,walltime=120 -t idempotent -t besteffort -t bigmem -p \"network_address='gaia-74'\""
## For an actual run
#CMD_LAUNCH="oarsub --notify "mail:shaman.narayanasamy@uni.lu" -l core=12/nodes=1,walltime=72 -p "memnode='48'""
# For dryrun
CMD_LAUNCH="oarsub --notify "mail:shaman.narayanasamy@uni.lu" -l core=1/nodes=1,walltime=2 -p "memnode='48'""
#source /home/snarayanasamy/Work/tools/parallel_functions.sh
#IMP="/home/snarayanasamy/Work/tools/IMP/IMP"
#CONF="/home/snarayanasamy/Work/tools/IMP/conf/LAO_bigbug_config.imp.json"
#DB="/home/snarayanasamy/Work/tools/IMP/db"
ROOTOUT="/scratch/users/snarayanasamy/LAO_TS_IMP-v1.3"
grep -v "^#" LAO_time_series_gaia.txt | while read -r DATA
do
    SAMPLE=$(echo "$DATA" | cut -d' ' -f1)
    INDIR="${ROOTOUT}/${SAMPLE}"
    OUTDIR="${ROOTOUT}/${SAMPLE}"
    CMD="./execution.sh $SAMPLE $INDIR"
    echo "$CMD"
    echo "${CMD_LAUNCH[@]} -n \"${SAMPLE}_binny\" \"$CMD\""
    "${CMD_LAUNCH[@]}" -n "${SAMPLE}_binny" "$CMD"
done
#!/bin/bash -l
ROOTOUT="/scratch/users/snarayanasamy/LAO_TS_IMP-v1.3"
grep -v "^#" LAO_time_series_gaia.txt | while read -r DATA
do
    SAMPLE=$(echo "$DATA" | cut -d' ' -f1)
    INDIR="${ROOTOUT}/${SAMPLE}"
    OUTDIR="${ROOTOUT}/${SAMPLE}"
    CMD="./execution.sh $SAMPLE $INDIR"
    echo "$CMD"
    $CMD
done
#!/bin/bash -l
## This will be launched using the gaia73 node
#OAR -n DataCheck_LAO
#OAR -l core=120,walltime=120
#nOAR -t besteffort
#nOAR -t idempotent
source src/preload_modules.sh
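## Recovery sequence, as scripted below: --unlock clears a stale Snakemake
## working-directory lock, --touch refreshes the timestamps of outputs that
## already exist so they are not needlessly recomputed, then the real run follows.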
THREADS=12 snakemake -krpfs workflows/DataCheck -j 10 --unlock
THREADS=12 snakemake -krpfs workflows/DataCheck -j 10 --touch
THREADS=12 snakemake -krpfs workflows/DataCheck -j 10
## D45 failed; rerunning
#THREADS=12 snakemake -krpfs workflows/DataCheck
@@ -3,7 +3,9 @@
 ## This will be launched using the gaia73 node
 #OAR -n Database
-#OAR -l core=12,walltime=72
+#OAR -l core=24,walltime=36
 source src/preload_modules.sh
-THREADS=1 snakemake -krps workflows/Databases -j 12
+#THREADS=1 snakemake -krps workflows/Databases -j 12
+THREADS=1 snakemake -ps workflows/Databases -f Annotations/ALL-mgmt.assembly.merged.gff -j 24
#!/bin/bash -l
### Run for LAO1
CMD="dry-run.sh \
/mnt/nfs/projects/ecosystem_biology/LAO/2011-02-23/Metagenomics/SequenceReads/Raw/LAO_MetGen_20110223_01prime_R1.fastq \
/mnt/nfs/projects/ecosystem_biology/LAO/2011-02-23/Metagenomics/SequenceReads/Raw/LAO_MetGen_20110223_01prime_R2.fastq \
/scratch/users/snarayanasamy/LAO_20110223_analysis/20110223_01prime \
20110223_01prime \
config_normalNode.json"
$CMD
### Run for LAO2
CMD="dry-run.sh \
/mnt/nfs/projects/ecosystem_biology/LAO/2011-02-23/Metagenomics/SequenceReads/Raw/LAO_MetGen_20110223_02prime_R1.fastq \
/mnt/nfs/projects/ecosystem_biology/LAO/2011-02-23/Metagenomics/SequenceReads/Raw/LAO_MetGen_20110223_02prime_R2.fastq \
/scratch/users/snarayanasamy/LAO_20110223_analysis/20110223_02prime \
20110223_02prime \
config_normalNode.json"
$CMD
### Run for LAO3
CMD="dry-run.sh \
/mnt/nfs/projects/ecosystem_biology/LAO/2011-02-23/Metagenomics/SequenceReads/Raw/LAO_MetGen_20110223_03prime_R1.fastq \
/mnt/nfs/projects/ecosystem_biology/LAO/2011-02-23/Metagenomics/SequenceReads/Raw/LAO_MetGen_20110223_03prime_R2.fastq \
/scratch/users/snarayanasamy/LAO_20110223_analysis/20110223_03prime \
20110223_03prime \
config_normalNode.json"
$CMD
### Run for LAO4
CMD="dry-run.sh \
/mnt/nfs/projects/ecosystem_biology/LAO/2011-02-23/Metagenomics/SequenceReads/Raw/LAO_MetGen_20110223_04prime_R1.fastq \
/mnt/nfs/projects/ecosystem_biology/LAO/2011-02-23/Metagenomics/SequenceReads/Raw/LAO_MetGen_20110223_04prime_R2.fastq \
/scratch/users/snarayanasamy/LAO_20110223_analysis/20110223_04prime \
20110223_04prime \
config_normalNode.json"
$CMD
### Run for LAO5
CMD="dry-run.sh \
/mnt/nfs/projects/ecosystem_biology/LAO/2011-02-23/Metagenomics/SequenceReads/Raw/LAO_MetGen_20110223_05prime_R1.fastq \
/mnt/nfs/projects/ecosystem_biology/LAO/2011-02-23/Metagenomics/SequenceReads/Raw/LAO_MetGen_20110223_05prime_R2.fastq \
/scratch/users/snarayanasamy/LAO_20110223_analysis/20110223_05prime \
20110223_05prime \
config_normalNode.json"
$CMD
### Run for LAO9
CMD="dry-run.sh \
/mnt/nfs/projects/ecosystem_biology/LAO/2011-02-23/Metagenomics/SequenceReads/Raw/LAO_MetGen_20110223_09prime_R1.fastq \
/mnt/nfs/projects/ecosystem_biology/LAO/2011-02-23/Metagenomics/SequenceReads/Raw/LAO_MetGen_20110223_09prime_R2.fastq \
/scratch/users/snarayanasamy/LAO_20110223_analysis/20110223_09prime \
20110223_09prime \
config_normalNode.json"
$CMD
### Run for LAO13
CMD="dry-run.sh \
/mnt/nfs/projects/ecosystem_biology/LAO/2011-02-23/Metagenomics/SequenceReads/Raw/LAO_MetGen_20110223_13prime_R1.fastq \
/mnt/nfs/projects/ecosystem_biology/LAO/2011-02-23/Metagenomics/SequenceReads/Raw/LAO_MetGen_20110223_13prime_R2.fastq \
/scratch/users/snarayanasamy/LAO_20110223_analysis/20110223_13prime \
20110223_13prime \
config_normalNode.json"
$CMD
### Run for all LAO samples pooled
CMD="dry-run.sh \
/mnt/nfs/projects/ecosystem_biology/LAO/2011-02-23/Metagenomics/SequenceReads/Raw/LAO_MetGen_20110223_pooled_R1.fastq \
/mnt/nfs/projects/ecosystem_biology/LAO/2011-02-23/Metagenomics/SequenceReads/Raw/LAO_MetGen_20110223_pooled_R2.fastq \
/scratch/users/snarayanasamy/LAO_20110223_analysis/20110223_pooled \
20110223_pooled \
config_bigmemNode.json"
$CMD
#!/bin/bash -l
### This script takes the arguments needed by Snakemake and passes them to the
### appropriate Snakemake command.
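## Usage (inferred from the variables below): ./execution.sh <SAMPLE> <DIR>
## where <DIR> is used as both the input and the output directory.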
source src/preload_modules.sh
date
# SAMPLE=${1} INPUTDIR=${2} OUTPUTDIR=${2} snakemake --cleanup-metadata Binning/ORFS.hmmer.essential.out Binning/ORFS.hmm.orfs.essential.hits Binning/ORFS.hmm.orfs.essential.hits_reduced Binning/ORFS-contig_links.bed Binning/ORFS.hmm.orfs.essential.hits.protids Binning/ORFS.hmm.orfs.essential.hits.faa
# SAMPLE=${1} INPUTDIR=${2} OUTPUTDIR=${2} snakemake --touch
#SAMPLE=${1} INPUTDIR=${2} OUTPUTDIR=${2} snakemake -np binning.done
SAMPLE=${1} INPUTDIR=${2} OUTPUTDIR=${2} snakemake -rpf binning.done
date
#!/bin/bash -l
### This script takes the arguments needed by Snakemake and passes them to the
### appropriate Snakemake command.
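## Usage (inferred from the variables below): ./execution.sh <SAMPLE> <DIR>;
## CONFIG, SAMPLE, INPUTDIR and OUTPUTDIR are handed to Snakemake as
## environment-variable assignments prefixed to the command.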
source src/preload_modules.sh
date
CONFIG=config_bigsmp82.json SAMPLE=${1} INPUTDIR=${2} OUTPUTDIR=${2} snakemake -rps workflows/BinRefinement -j 41 --rerun-incomplete
date
#!/bin/bash -l
# This command launches all of the jobs on the cluster in parallel, using the OAR scheduler.
CMD_LAUNCH=(oarsub --notify "mail:shaman.narayanasamy@uni.lu" -l core=12/nodes=1,walltime=120 -t idempotent -t besteffort -p "memnode='72'")
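## The launcher is a bash array so the quoted oarsub arguments survive
## expansion; a flat string with nested quotes relies on word splitting and
## breaks as soon as an argument contains a space.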
#echo "$CMD_LAUNCH"
#
#### Run for LAO1
#$CMD_LAUNCH -n "LAO1" \
# "execution.sh \
# /mnt/nfs/projects/ecosystem_biology/LAO/2011-02-23/Metagenomics/SequenceReads/Raw/LAO_MetGen_20110223_01prime_R1.fastq \
# /mnt/nfs/projects/ecosystem_biology/LAO/2011-02-23/Metagenomics/SequenceReads/Raw/LAO_MetGen_20110223_01prime_R2.fastq \
# /scratch/users/snarayanasamy/LAO_20110223_analysis/20110223_01prime \
# 20110223_01prime \
# config_normalNode.json"
#
#### Run for LAO2
#$CMD_LAUNCH -n "LAO2" \
# "execution.sh \
# /mnt/nfs/projects/ecosystem_biology/LAO/2011-02-23/Metagenomics/SequenceReads/Raw/LAO_MetGen_20110223_02prime_R1.fastq \
# /mnt/nfs/projects/ecosystem_biology/LAO/2011-02-23/Metagenomics/SequenceReads/Raw/LAO_MetGen_20110223_02prime_R2.fastq \
# /scratch/users/snarayanasamy/LAO_20110223_analysis/20110223_02prime \
# 20110223_02prime \
# config_normalNode.json"
#
#### Run for LAO3
#$CMD_LAUNCH -n "LAO3" \
# "execution.sh \
# /mnt/nfs/projects/ecosystem_biology/LAO/2011-02-23/Metagenomics/SequenceReads/Raw/LAO_MetGen_20110223_03prime_R1.fastq \
# /mnt/nfs/projects/ecosystem_biology/LAO/2011-02-23/Metagenomics/SequenceReads/Raw/LAO_MetGen_20110223_03prime_R2.fastq \
# /scratch/users/snarayanasamy/LAO_20110223_analysis/20110223_03prime \
# 20110223_03prime \
# config_normalNode.json"
#
#### Run for LAO4
#$CMD_LAUNCH -n "LAO4" \
# "execution.sh \
# /mnt/nfs/projects/ecosystem_biology/LAO/2011-02-23/Metagenomics/SequenceReads/Raw/LAO_MetGen_20110223_04prime_R1.fastq \
# /mnt/nfs/projects/ecosystem_biology/LAO/2011-02-23/Metagenomics/SequenceReads/Raw/LAO_MetGen_20110223_04prime_R2.fastq \
# /scratch/users/snarayanasamy/LAO_20110223_analysis/20110223_04prime \
# 20110223_04prime \
# config_normalNode.json"
#
#### Run for LAO5
#$CMD_LAUNCH -n "LAO5" \
# "execution.sh \
# /mnt/nfs/projects/ecosystem_biology/LAO/2011-02-23/Metagenomics/SequenceReads/Raw/LAO_MetGen_20110223_05prime_R1.fastq \
# /mnt/nfs/projects/ecosystem_biology/LAO/2011-02-23/Metagenomics/SequenceReads/Raw/LAO_MetGen_20110223_05prime_R2.fastq \
# /scratch/users/snarayanasamy/LAO_20110223_analysis/20110223_05prime \
# 20110223_05prime \
# config_normalNode.json"
#
#### Run for LAO9
#$CMD_LAUNCH -n "LAO9" \
# "execution.sh \
# /mnt/nfs/projects/ecosystem_biology/LAO/2011-02-23/Metagenomics/SequenceReads/Raw/LAO_MetGen_20110223_09prime_R1.fastq \
# /mnt/nfs/projects/ecosystem_biology/LAO/2011-02-23/Metagenomics/SequenceReads/Raw/LAO_MetGen_20110223_09prime_R2.fastq \
# /scratch/users/snarayanasamy/LAO_20110223_analysis/20110223_09prime \
# 20110223_09prime \
# config_normalNode.json"
#
#### Run for LAO13
#$CMD_LAUNCH -n "LAO13" \
# "execution.sh \
# /mnt/nfs/projects/ecosystem_biology/LAO/2011-02-23/Metagenomics/SequenceReads/Raw/LAO_MetGen_20110223_13prime_R1.fastq \
# /mnt/nfs/projects/ecosystem_biology/LAO/2011-02-23/Metagenomics/SequenceReads/Raw/LAO_MetGen_20110223_13prime_R2.fastq \
# /scratch/users/snarayanasamy/LAO_20110223_analysis/20110223_13prime \
# 20110223_13prime \
# config_normalNode.json"
#CMD_LAUNCH_BIGMEM=(oarsub --notify "mail:shaman.narayanasamy@uni.lu" -l core=24/nodes=1,walltime=120 -t bigmem -t idempotent -t besteffort)
#
#### Run for all LAO samples pooled
#$CMD_LAUNCH_BIGMEM -n "LAOPool" \
# "execution.sh \
# /mnt/nfs/projects/ecosystem_biology/LAO/2011-02-23/Metagenomics/SequenceReads/Raw/LAO_MetGen_20110223_pooled_R1.fastq \
# /mnt/nfs/projects/ecosystem_biology/LAO/2011-02-23/Metagenomics/SequenceReads/Raw/LAO_MetGen_20110223_pooled_R2.fastq \
# /scratch/users/snarayanasamy/LAO_20110223_analysis/20110223_pooled \
# 20110223_pooled \
# config_bigmemNode.json"
"${CMD_LAUNCH[@]}" -n "LAOPool" \
"execution.sh \
/mnt/nfs/projects/ecosystem_biology/LAO/2011-02-23/Metagenomics/SequenceReads/Raw/LAO_MetGen_20110223_pooled_R1.fastq \
/mnt/nfs/projects/ecosystem_biology/LAO/2011-02-23/Metagenomics/SequenceReads/Raw/LAO_MetGen_20110223_pooled_R2.fastq \
/scratch/users/snarayanasamy/LAO_20110223_analysis/20110223_pooled \
20110223_pooled \
config_normalNode.json"
 rule binning_clustering:
     input:
-        "Analysis/results/MGMT_results.Rdat",
-        "Binning/ORFS.hmm.orfs.essential.hits_reduced",
-        "Binning/ORFS-contig_links.bed"
+        "%s/{ts_sample}/Analysis/results/MGMT_results.Rdat" % TS_DIR,
+        "{ts_sample}/Binning/ORFS.hmm.orfs.essential.hits_reduced",
+        "{ts_sample}/Binning/ORFS-contig_links.bed"
     output:
-        expand("Binning/reachabilityDistanceEstimates.{pk}.{nn}.tsv \
-        Binning/clusterFirstScan.{pk}.{nn}.tsv \
-        Binning/bimodalClusterCutoffs.{pk}.{nn}.tsv \
-        Binning/contigs2clusters.{pk}.{nn}.tsv \
-        Binning/contigs2clusters.{pk}.{nn}.RDS \
-        Binning/clusteringWS.{pk}.{nn}.Rdata \
-        Binning/finalClusterMap.{pk}.{nn}.pdf".split(),pk=config["binning"]["pk"],nn=config["binning"]["nn"])
+        expand("{{ts_sample}}/Binning/reachabilityDistanceEstimates.{pk}.{nn}.tsv \
+        {{ts_sample}}/Binning/clusterFirstScan.{pk}.{nn}.tsv \
+        {{ts_sample}}/Binning/bimodalClusterCutoffs.{pk}.{nn}.tsv \
+        {{ts_sample}}/Binning/contigs2clusters.{pk}.{nn}.tsv \
+        {{ts_sample}}/Binning/contigs2clusters.{pk}.{nn}.RDS \
+        {{ts_sample}}/Binning/clusteringWS.{pk}.{nn}.Rdata \
+        {{ts_sample}}/Binning/finalClusterMap.{pk}.{nn}.pdf".split(), pk = config["binning"]["pk"], nn = config["binning"]["nn"])
     shell:
         """
-        mkdir -p Binning/clusterFiles
+        mkdir -p {wildcards.ts_sample}/Binning/clusterFiles
         ## Run the R script for clustering the contigs
-        Rscript {SRCDIR}/binny.R {input} {config[binning][pk]} {config[binning][nn]}
+        Rscript {SRCDIR}/binny.R {input} {config[binning][pk]} {config[binning][nn]} {wildcards.ts_sample}
         """
 rule essential_gene_info:
     input:
-        expand("Binning/clusteringWS.{pk}.{nn}.Rdata".split(),pk=config["binning"]["pk"],nn=config["binning"]["nn"]),
-        "Binning/ORFS-contig_links.bed",
-        "Binning/ORFS.hmm.orfs.essential.hits_reduced"
+        expand("{{ts_sample}}/Binning/clusteringWS.{pk}.{nn}.Rdata".split(),pk=config["binning"]["pk"],nn=config["binning"]["nn"]),
+        "{ts_sample}/Binning/ORFS-contig_links.bed",
+        "{ts_sample}/Binning/ORFS.hmm.orfs.essential.hits_reduced"
     output:
-        "Binning/essMarkerGenes/markersAll.tsv",
+        "{ts_sample}/Binning/essMarkerGenes/markersAll.tsv",
     shell:
         """
-        mkdir -p Binning/essMarkerGenes
+        mkdir -p {wildcards.ts_sample}/Binning/essMarkerGenes
         ## Run the R script to extract the table
-        Rscript {SRCDIR}/get_essGeneInfo.R {input} {SAMPLE}
+        Rscript {SRCDIR}/get_essGeneInfo.R {input} {wildcards.ts_sample}
         """
 rule extract_essential_genes:
     input:
-        "Binning/essMarkerGenes/markersAll.tsv",
-        "Analysis/annotation/prokka.faa",
+        "{ts_sample}/Binning/essMarkerGenes/markersAll.tsv",
+        "%s/{ts_sample}/Analysis/annotation/prokka.faa" % TS_DIR
     output:
-        "Binning/getMarkerGenes.done"
+        "{ts_sample}/Binning/getMarkerGenes.done"
     shell:
         """
         declare -a MARKER=("TIGR01011" "TIGR01049" "TIGR01169" "TIGR00487" "TIGR01044" "TIGR00959" \
@@ -16,15 +16,15 @@ rule extract_essential_genes:
         for marker in ${{MARKER[@]}}
         do
-            if [[ -s "Binning/essMarkerGenes/marker-${{marker}}.tsv" ]]
+            if [[ -s "{wildcards.ts_sample}/Binning/essMarkerGenes/marker-${{marker}}.tsv" ]]
             then
                 echo "Extracting genes for essential gene ID: ${{marker}}"
-                perl {SRCDIR}/fastaProteinExtractAddSampleCluster.pl {input[1]} Binning/essMarkerGenes/marker-${{marker}}.tsv > Binning/essMarkerGenes/marker-${{marker}}.faa
+                perl {SRCDIR}/fastaProteinExtractAddSampleCluster.pl {input[1]} {wildcards.ts_sample}/Binning/essMarkerGenes/marker-${{marker}}.tsv > {wildcards.ts_sample}/Binning/essMarkerGenes/marker-${{marker}}.faa
             else
                 echo "${{marker}} not present! Generating fake files..."
-                touch Binning/essMarkerGenes/marker-${{marker}}.tsv
-                touch Binning/essMarkerGenes/marker-${{marker}}.faa
+                touch {wildcards.ts_sample}/Binning/essMarkerGenes/marker-${{marker}}.tsv
+                touch {wildcards.ts_sample}/Binning/essMarkerGenes/marker-${{marker}}.faa
             fi
         done
-        touch Binning/getMarkerGenes.done
+        touch {output}
         """
 rule binning_essential_genes:
     input:
         #"Binning/prokka.faa",
-        "Analysis/annotation/prokka.faa",
-        "Analysis/annotation/annotation.filt.gff",
+        "%s/{ts_sample}/Analysis/annotation/prokka.faa" % TS_DIR,
+        "%s/{ts_sample}/Analysis/annotation/annotation.filt.gff" % TS_DIR,
         "%s/hmm/essential_genes/essential.hmm" % DBPATH
     output:
-        "Binning/ORFS.hmmer.essential.out",
-        "Binning/ORFS.hmm.orfs.essential.hits",
-        "Binning/ORFS.hmm.orfs.essential.hits_reduced",
-        "Binning/ORFS-contig_links.bed",
-        "Binning/ORFS.hmm.orfs.essential.hits.protids",
-        "Binning/ORFS.hmm.orfs.essential.hits.faa"
+        "{ts_sample}/Binning/ORFS.hmmer.essential.out",
+        "{ts_sample}/Binning/ORFS.hmm.orfs.essential.hits",
+        "{ts_sample}/Binning/ORFS.hmm.orfs.essential.hits_reduced",
+        "{ts_sample}/Binning/ORFS-contig_links.bed",
+        "{ts_sample}/Binning/ORFS.hmm.orfs.essential.hits.protids",
+        "{ts_sample}/Binning/ORFS.hmm.orfs.essential.hits.faa"
     shell:
         """
         TBL_TMP=$(mktemp --tmpdir={TMPDIR} -t "XXXX_hmmer.essential.hits")
......
 rule binning_plot_completeness:
     input:
-        "Binning/CompletenessSummary.csv"
+        "{ts_sample}/Binning/CompletenessSummary.csv"
     output:
-        "Binning/analysis_essential_genes.pdf"
+        "{ts_sample}/Binning/analysis_essential_genes.pdf"
     shell:
         """
         # module load lang/R/3.2.0-ictce-7.3.5-bare
-        Rscript {SRCDIR}/plotting_completenessresults.R {input} Binning/
+        Rscript {SRCDIR}/plotting_completenessresults.R {input} {wildcards.ts_sample}/Binning/
         """
 rule binning_separate_bins:
     input:
-        expand("Binning/contigs2clusters.{pk}.{nn}.tsv",pk=config["binning"]["pk"],nn=config["binning"]["nn"]),
-        "Assembly/mgmt.assembly.merged.fa",
-        "Binning/ORFS-contig_links.bed",
-        "Analysis/annotation/prokka.faa",
-        "Analysis/annotation/annotation.filt.gff"
+        expand("{{ts_sample}}/Binning/contigs2clusters.{pk}.{nn}.tsv",pk=config["binning"]["pk"],nn=config["binning"]["nn"]),
+        "%s/{ts_sample}/Assembly/mgmt.assembly.merged.fa" % TS_DIR,
+        "%s/{ts_sample}/Binning/ORFS-contig_links.bed" % TS_DIR,
+        "%s/{ts_sample}/Analysis/annotation/prokka.faa" % TS_DIR,
+        "%s/{ts_sample}/Analysis/annotation/annotation.filt.gff" % TS_DIR
     output:
-        "Binning/separate_bins.done"
+        "{ts_sample}/Binning/separate_bins.done"
     shell:
         """
         for cluster in $(tail -n+2 {input[0]} | cut -f2 | sort | uniq)
         do
             # Create directory for the cluster
-            mkdir -p Binning/clusterFiles/${{cluster}}
+            mkdir -p {wildcards.ts_sample}/Binning/clusterFiles/${{cluster}}
             # Define headerfiles
-            headerfile="Binning/clusterFiles/${{cluster}}/cluster.${{cluster}}.contigids"
-            protids="Binning/clusterFiles/${{cluster}}/cluster.${{cluster}}.protids"
+            headerfile="{wildcards.ts_sample}/Binning/clusterFiles/${{cluster}}/cluster.${{cluster}}.contigids"
+            protids="{wildcards.ts_sample}/Binning/clusterFiles/${{cluster}}/cluster.${{cluster}}.protids"
             echo "Obtaining contigs for cluster ${{cluster}}"
             awk -v c=${{cluster}} '{{ if($2==c) {{print $1}} }}' {input[0]} > ${{headerfile}}
-            pullseq -i {input[1]} -n ${{headerfile}} > Binning/clusterFiles/${{cluster}}/cluster.${{cluster}}.fa
+            pullseq -i {input[1]} -n ${{headerfile}} > {wildcards.ts_sample}/Binning/clusterFiles/${{cluster}}/cluster.${{cluster}}.fa
             echo "Obtaining amino acid sequences for cluster ${{cluster}}"
             awk 'FNR==NR{{a[$1]=$1;next}}{{if(a[$1]) print $4}}' ${{headerfile}} {input[2]} > ${{protids}}
-            pullseq -i {input[3]} -n ${{protids}} > Binning/clusterFiles/${{cluster}}/cluster.${{cluster}}.faa
+            pullseq -i {input[3]} -n ${{protids}} > {wildcards.ts_sample}/Binning/clusterFiles/${{cluster}}/cluster.${{cluster}}.faa
             echo "Obtaining functional annotations (gff format) for cluster/bin ${{cluster}}"
-            grep -wFf ${{headerfile}} {input[4]} > Binning/clusterFiles/${{cluster}}/cluster.${{cluster}}.gff
+            grep -wFf ${{headerfile}} {input[4]} > {wildcards.ts_sample}/Binning/clusterFiles/${{cluster}}/cluster.${{cluster}}.gff
         done
         echo "Complete separating bins"
         touch {output}
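## pullseq (-i FASTA -n ID_FILE), used above, extracts the sequences whose
## headers are listed in ID_FILE: first the per-cluster contigs, then the
## per-cluster protein sequences.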
......
 rule binning_split_completeness:
     input:
-        "Binning/ORFS.hmm.orfs.essential.hits"
+        "{ts_sample}/Binning/ORFS.hmm.orfs.essential.hits"
     output:
-        "Binning/split_completeness.done"
+        "{ts_sample}/Binning/split_completeness.done"
     shell:
         """
-        for IDFILE in $(find Binning/clusterFiles -name "*.protids" -maxdepth 1 | sort)
+        for IDFILE in $(find {wildcards.ts_sample}/Binning/clusterFiles -name "*.protids" -maxdepth 1 | sort)
         do
-            OUTFILE="Binning/clusterFiles/$(basename "${{IDFILE}}" .protids).essential.hits"
+            OUTFILE="{wildcards.ts_sample}/Binning/clusterFiles/$(basename "${{IDFILE}}" .protids).essential.hits"
             awk 'FNR==NR{{a[$1]=$1;next}}{{if(a[$1]) print $0}}' $IDFILE {input[0]} | uniq > $OUTFILE
         done
         touch {output}
......
 rule binning_summarize_completeness:
     input:
-        "Binning/split_completeness.done"
+        "{ts_sample}/Binning/split_completeness.done"
     output:
-        "Binning/CompletenessSummary.csv"
+        "{ts_sample}/Binning/CompletenessSummary.csv"
     shell:
         """
-        {SRCDIR}/getCompletenessAnalysisOverview.sh Binning/clusterFiles {output}
-        # IN_DIR="Binning/clusterFiles"
-        # OUT_FILE={output}
-        # COUNTER=0
-        # echo "STATUS: Collecting completeness results..."
-        # echo "#ClusterName,num. uniques,num. multiples" > ${{OUT_FILE}}
-        # for IN_FILE in $(find $IN_DIR -maxdepth 1 -name "*essential.hits" | sort)
-        # do
-        #     COUNTER=$(($COUNTER + 1))
-        #     (echo -ne "$(basename "$IN_FILE" .essential.hits)," && grep -v "^#" $IN_FILE | awk '{{print $4}}' | sort | uniq -c | awk 'BEGIN {{ uniques=0; multis=0 }} {{if($1 ==1) uniques += 1; else multis +=1;}} END {{ print uniques "," multis }}') >> ${{OUT_FILE}}
-        # done
-        #
-        # echo "done (collecting ${{COUNTER}} completeness results)."
+        {SRCDIR}/getCompletenessAnalysisOverview.sh {wildcards.ts_sample}/Binning/clusterFiles {output}
         """
 rule generate_plots:
     input:
     output:
     shell:
         """
         """