Commit de9b0799 authored by Valentina Galata's avatar Valentina Galata
Browse files

new workflow: gene calling (related to issue #113)

parent de8808fb
work_dir: "/mnt/lscratch/users/vgalata/gene_calling_benchmark"
# reference genomes
refs:
GCF_011456075.1_ASM1145607v1: "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/011/456/075/GCF_011456075.1_ASM1145607v1"
GCA_000006765.1_ASM676v1: "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/006/765/GCA_000006765.1_ASM676v1"
GCA_000005845.2_ASM584v2: "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/005/845/GCA_000005845.2_ASM584v2"
GCA_014334155.1_ASM1433415v1: "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/014/334/155/GCA_014334155.1_ASM1433415v1"
# # file extensions for reference genomes
# refs_ext:
# fna: "_genomic.fna.gz"
# cds: "_cds_from_genomic.fna.gz"
# faa: "_translated_cds.faa.gz"
diamond:
threads: 5
blast:
threads: 5
\ No newline at end of file
# Pipeline for gene calling performance analysis using ref. data (indep. of other workflows)
#
# Example call:
# snakemake -s workflow_gcall/Snakefile --configfile config/gcall/config.yaml --use-conda --conda-prefix /scratch/users/vgalata/miniconda3/ONT_pilot --cores 1 -rpn
##############################
# MODULES
import os
import re
import pandas
##############################
# CONFIG
# Paths
SRC_DIR = srcdir("scripts")
ENV_DIR = srcdir("envs")
# working directory
workdir:
config["work_dir"]
##################################################
# RULES
##################################################
rule all:
input:
# blastn
blastn="blastn/ref_vs_prodigal.tsv"
rule download_ref:
output:
"refs/{acc}_{ext}"
log:
"refs/{acc}_{ext}.log"
wildcard_constraints:
acc="|".join(config["refs"].keys())
params:
url=lambda wildcards: os.path.join(config["refs"][wildcards.acc], "%s_%s.gz" % (wildcards.acc, wildcards.ext))
message:
"Gene calling: download reference {wildcards.acc} ({wildcards.ext})"
shell:
"(date && wget {params.url} -O {output}.gz && gunzip {output}.gz && date) &> {log}"
rule concat_ref:
input:
expand("refs/{acc}_{{ext}}", acc=config["refs"].keys())
output:
"refs/ref_{ext}"
message:
"Gene calling: concat references ({wildcards.ext})"
shell:
"cat {input} > {output}"
rule prodigal_ref:
input:
"refs/ref_genomic.fna"
output:
"refs/ref_genomic.prodigal.faa"
log:
"refs/ref_genomic.prodigal.log"
conda:
os.path.join(ENV_DIR, "prodigal.yaml")
message:
"Gene calling: run Prodigal on references"
shell:
"(date && prodigal -a {output} -p meta -i {input} && date) &> {log}"
rule prodigal_genes_ref:
input:
fna="refs/ref_genomic.fna",
faa="refs/ref_genomic.prodigal.faa"
output:
"refs/ref_genomic.prodigal.fa"
params:
prefix=""
conda:
os.path.join(ENV_DIR, "biopython.yaml")
message:
"Gene calling: extract reference genes from Prodigal proteins"
script:
os.path.join(SRC_DIR, "genes_from_prodigal_prots.py")
rule blastn_db_ref:
input:
"refs/ref_cds_from_genomic.fna"
output:
expand(
"refs/ref_cds_from_genomic.blast.{ext}",
ext=["ndb","nhr","nin","not","nsq","ntf","nto"]
)
log:
"refs/ref_cds_from_genomic.blast.log"
threads:
config["blast"]["threads"]
conda:
os.path.join(ENV_DIR, "blast.yaml")
message:
"Gene calling: reference BLAST nucl. DB"
shell:
"(date && db={output[0]} && makeblastdb -dbtype 'nucl' -in {input} -input_type 'fasta' -out ${{db%.*}} -logfile {log} && date) &> {log}"
rule diamond_db_ref:
input:
"refs/ref_translated_cds.faa"
output:
"refs/ref_translated_cds.diamond.dmnd"
log:
"refs/ref_translated_cds.diamond.dmnd.log"
threads:
config["diamond"]["threads"]
conda:
os.path.join(ENV_DIR, "diamond.yaml")
message:
"Gene calling: reference DIAMOND DB"
shell:
"(date && db={output} && diamond makedb --in {input} --db ${{db%.*}} --threads {threads} && date) &> {log}"
rule blastn_ref_prodigal:
input:
fasta=rules.prodigal_genes_ref.output,
db=rules.blastn_db_ref.output
output:
asn="blastn/ref_vs_prodigal.asn",
tsv="blastn/ref_vs_prodigal.tsv"
log:
"blastn/ref_vs_prodigal.log"
threads:
config["blast"]["threads"]
params:
blastn="-task 'megablast' -perc_identity 50", # pident: setting low to filter by it later
outfmt="6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen"
conda:
os.path.join(ENV_DIR, "blast.yaml")
message:
"Gene calling: BLASTn search (db=references, query=prodigal)"
shell: # TODO
"(date && "
"db={input.db[0]} && "
"blastn -query {input.fasta} -db ${{db%.*}} -out {output.asn} -outfmt 11 -num_threads {threads} {params.blastn} && "
# --max-target-seqs: Maximum number of aligned sequences to keep
# setting to > 5 since it is recommended and might be helpful to understand diff. in proteins between assemblies and references
"blast_formatter -archive {output.asn} -out {output.tsv} -max_target_seqs 10 -outfmt \'{params.outfmt}\' && "
"date) &> {log}"
# rule run_diamond_on_proteins:
# input:
# proteins=os.path.join(DATA_DIR, "data/translated_cds.faa"),
# db=rules.create_diamond_db.output
# output: os.path.join(RESULTS_DIR, "diamond/genome.daa")
# params:
# outfmt="100"
# log: os.path.join(RESULTS_DIR, "logs/genome.diamond.log")
# conda: "envs/diamond.yaml"
# threads: config["diamond"]["threads"]
# shell:
# """
# (date &&\
# diamond blastp -q {input.proteins} --db {input.db} -p {threads} --outfmt {params.outfmt} --out {output} --id 90 \
# --query-cover 90 --subject-cover 90 --more-sensitive &&\
# date) &> >(tee {log})
# """
# rule annotation_diamond_tsv:
# input: os.path.join(RESULTS_DIR, "diamond/genome.daa")
# output: os.path.join(RESULTS_DIR, "diamond/genome.tsv")
# log: os.path.join(RESULTS_DIR, "logs/genome_annotation.diamond.log")
# params:
# # NOTE: For the used conda version, need to specify the entire column layout (outfmt) explicitly
# outfmt="6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen"
# conda: "envs/diamond.yaml"
# shell:
# """
# (date &&\
# diamond view --daa $(echo {input} | sed 's/\.daa$//') --max-target-seqs 1 --outfmt {params.outfmt} --out {output} &&\
# date) &> >(tee {log})
# """
../../workflow_zymo/envs/biopython.yaml
\ No newline at end of file
../../workflow_zymo/envs/blast.yaml
\ No newline at end of file
../../workflow/envs/diamond.yaml
\ No newline at end of file
../../workflow/envs/prodigal.yaml
\ No newline at end of file
../../workflow_zymo/scripts/genes_from_prodigal_prots.py
\ No newline at end of file
../../workflow/scripts/utils.py
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment