Commit 804a5f50 authored by Laura Denies

update test module

parent 4fea8c56
 pathofact:
-  sample: ["test_sample"] # requires user input
-  project: PathoFact_update_trial_new # requires user input
-  datadir: ../test_dataset # requires user input
+  sample: ["SRR2582233","SRR2582234","SRR2582237","SRR2582238","SRR2582241","SRR2582243","SRR2582246","SRR2582247","SRR2582248","SRR2582251","SRR2582252","SRR2582253","SRR2582255","SRR2582257","SRR2582258"] # requires user input
+  project: PathoFact_v2 # requires user input
+  datadir: /work/projects/ecosystem_biology/local_tools/PathoFact_update/Datasets/CDI # requires user input
   workflow: "complete"
-  size_fasta: 1000
+  size_fasta: 10000
   scripts: "scripts"
   signalp: "/work/projects/ecosystem_biology/local_tools/SignalP/signalp-5.0b/bin" # requires user input
   deepvirfinder: "submodules/DeepVirFinder/dvf.py"
......
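A minimal sketch (not part of the commit) of how the config values above map onto the file layout the workflow expects: input contigs are read per sample from `{datadir}/{sample}.fna` and results are written under `{datadir}/{project}/` (see the `Prodigal` rule and `rule all` below). The paths and samples here are illustrative, copied from the updated config.

```bash
# Illustration only: expected inputs/outputs for two of the samples listed above.
datadir="/work/projects/ecosystem_biology/local_tools/PathoFact_update/Datasets/CDI"  # example value
project="PathoFact_v2"                                                                # example value
for sample in SRR2582233 SRR2582234; do
    echo "input : ${datadir}/${sample}.fna"
    echo "output: ${datadir}/${project}/Prodigal/${sample}.contig"
done
```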
# Prepare fasta
import glob
import os

configfile: "config.yaml"

rule all:
    input:
        expand(
            "{datadir}/{project}/Prodigal/{sample}.contig",
            datadir=config["pathofact"]["datadir"],
            project=config["pathofact"]["project"],
            sample=config["pathofact"]["sample"]
        )

# Generate ORFs and GFF
# Generate unique ID number for each sequence
rule Prodigal:
    input:
        "{datadir}/{sample}.fna"
    output:
        ORF="{datadir}/{project}/Prodigal/{sample}.faa",
        GFF="{datadir}/{project}/Prodigal/{sample}.gff"
    message:
        "Generates ORFs and GFF"
    params:
        outdir="{datadir}"
    conda:
        "../../envs/Prodigal.yaml"
    shell:
        """
        prodigal -i {input} -o {output.GFF} -a {output.ORF} -f gff -p meta
        """

rule mapping_file:
    input:
        ORF="{datadir}/{project}/Prodigal/{sample}.faa",
        GFF="{datadir}/{project}/Prodigal/{sample}.gff"
    output:
        "{datadir}/{project}/Prodigal/{sample}.contig"
    message:
        "Generate mapping file"
    params:
        outdir="{datadir}"
    shell:
        """
        # Rename each protein header to its unique ORF ID (keep only the ID= value)
        sed -i 's/[^>]*ID=//;s/;.*//' {input.ORF}
        # Drop comment lines from the GFF
        sed -i '/^#/d' {input.GFF}
        # Contig-to-ORF mapping: contig ID (column 1) and feature ID (from column 9)
        cut -f 1,9 {input.GFF} | cut -d';' -f1 | sed 's/ID=//' > {output}
        """
@@ -91,8 +91,8 @@ checkpoint splitting:
         "{datadir}/{project}/renamed/{sample}_ID.faa"
     output:
         splits=directory("{datadir}/{project}/splitted/{sample}/")
-#    log:
-#        "{datadir}/{project}/splitted/{sample}.log"
+    log:
+        "{datadir}/{project}/splitted/{sample}.log"
     params:
         outdir="{datadir}",
         runtime=config["pathofact"]["runtime"]["short"],
......
@@ -4,22 +4,11 @@ This is a data set to test the `PathoFact` pipeline.
See `README.md` in parent directory for `PathoFact` set-up instructions.
## Input files
How input files were created:
```bash
# *.fna, *.faa and *.gff
rsync -avP /work/projects/ecosystem_biology/local_tools/IMP3/test/testRAW/run150320/Analysis/annotation/prokka.gff test_sample.gff
rsync -avP /work/projects/ecosystem_biology/local_tools/IMP3/test/testRAW/run150320/Analysis/annotation/prokka.fna test_sample.fna
rsync -avP /work/projects/ecosystem_biology/local_tools/IMP3/test/testRAW/run150320/Analysis/annotation/prokka.faa test_sample.faa
# set-up: modify files, create other required files
./set-up.sh
```
## Test run
Add the path to your SignalP v5.0 installation to the config file.
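For example (hypothetical install path; adjust to your system), the `signalp` key in `config.yaml` can be updated in place:

```bash
# Point the "signalp" key at a local SignalP v5.0 "bin" directory (example path).
sed -i 's|^\([[:space:]]*\)signalp:.*|\1signalp: "/path/to/signalp-5.0b/bin" # requires user input|' config.yaml
```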
```bash
# activate env
conda activate PathoFact
......
@@ -33,9 +33,9 @@ PF = pathofact(
     [
         "{datadir}/{project}/AMR_MGE_prediction_{sample}_report.tsv",
         "{datadir}/{project}/Toxin_gene_library_{sample}_report.tsv",
-        "{datadir}/{project}/Toxin_prediction_{sample}_report.csv",
-        "{datadir}/{project}/Virulence_prediction_{sample}_report.csv",
-        "{datadir}/{project}/PathoFact_{sample}_predictions.csv"
+        "{datadir}/{project}/Toxin_prediction_{sample}_report.tsv",
+        "{datadir}/{project}/Virulence_prediction_{sample}_report.tsv",
+        "{datadir}/{project}/PathoFact_{sample}_predictions.tsv"
     ],
     datadir=config["pathofact"]["datadir"],
     project=config["pathofact"]["project"],
@@ -48,9 +48,9 @@ CHECKS = expand(
     [
         "{datadir}/{project}/AMR_MGE_prediction_{sample}_report.tsv.checked",
         "{datadir}/{project}/Toxin_gene_library_{sample}_report.tsv.checked",
-        "{datadir}/{project}/Toxin_prediction_{sample}_report.csv.checked",
-        "{datadir}/{project}/Virulence_prediction_{sample}_report.csv.checked",
-        "{datadir}/{project}/PathoFact_{sample}_predictions.csv.checked"
+        "{datadir}/{project}/Toxin_prediction_{sample}_report.tsv.checked",
+        "{datadir}/{project}/Virulence_prediction_{sample}_report.tsv.checked",
+        "{datadir}/{project}/PathoFact_{sample}_predictions.tsv.checked"
     ],
     datadir=config["pathofact"]["datadir"],
     project=config["pathofact"]["project"],
......
#!/bin/bash -l
# modify *.faa: rm record descriptions
sed -i '/^>/ s/ .*//' test_sample.faa
# *.gff to *.contig: contig ID, feature ID
sed -n '/^##gff/,/^##FASTA/p' test_sample.gff | grep -v '^#' | cut -f1,9 | cut -d';' -f1 | sed 's/ID=//' > test_sample.contig
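For illustration (hypothetical FASTA record, not produced by the script), the header clean-up keeps only the sequence ID; the GFF-to-contig conversion is the same `cut`/`sed` chain used by `rule mapping_file` above.

```bash
# Header clean-up: keep only the record ID, drop the Prodigal description.
printf '>contig_1_1 # 3 # 98 # 1 # ID=1_1\n' | sed '/^>/ s/ .*//'
# -> >contig_1_1
```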