Commit e8081aa9 authored by Valentina Galata's avatar Valentina Galata
Browse files

report: tab w/ unique mmseqs2 prots incl metat cov (issue #121)

parent fc407c63
# Report pipeline: to be used after the main workflow (i.e. workflow/)
#
# Example call:
# snakemake -s workflow_report/Snakefile --configfile config/gdb/config.yaml --use-conda --conda-prefix /scratch/users/vgalata/miniconda3/ONT_pilot --cores 1 -rpn
##############################
# MODULES
......@@ -18,12 +15,19 @@ include:
##############################
# TARGETS & RULES
# flags: whether specific steps should be done
FLAG_METAT = "metat" in META_TYPES
FLAG_EXTRA_AMR = "rgi" in config["steps_annotation"] and "cov" in config["steps_analysis"] and FLAG_METAT
FLAG_EXTRA_UPROT = "mmseqs2" in config["steps_analysis"] and "cov" in config["steps_analysis"] and FLAG_METAT
rule all:
input:
# report
report=os.path.join(RESULTS_DIR, "report/report.html"),
# AMR (for GDB)
amr=[os.path.join(RESULTS_DIR, "report/amr.tsv")] if "rgi" in config["steps_annotation"] and "cov" in config["steps_analysis"] and "metat" in META_TYPES else [] # TODO
# extra: AMR (for GDB)
amr=[os.path.join(RESULTS_DIR, "report/amr.tsv")] if FLAG_EXTRA_AMR else [],
# extra: unique proteins from mmseqs2 clusters (GDB)
uprot=[os.path.join(RESULTS_DIR, "report/mmseqs2_uniq.tsv")] if FLAG_EXTRA_UPROT else []
include:
"rules/collect_data.smk"
......@@ -53,6 +57,48 @@ rule amr_report:
script:
os.path.join(SRC_DIR, "amr.R")
rule uniq_mmseqs2_report:
input:
clusters=os.path.join(RESULTS_DIR, "analysis/mmseqs2/clusters.tsv"),
gcov_metat=expand(
os.path.join(RESULTS_DIR, "mapping/metat/{rtype_tool}.cov.pergene"),
rtype_tool=["{r}/{t}/ASSEMBLY.POLISHED.sr".format(r=r, t=t) for r, t in READ_ASSEMBLERS]
)
output:
tab=os.path.join(RESULTS_DIR, "report/mmseqs2_uniq.tsv")
threads: 1
run:
from scripts.utils import mmseqs2_tsv, mmseqs2_summary
# read in ave. gene coverage
cov = []
for ifile_path in input.gcov_metat:
cov_df = pandas.read_csv(ifile_path, sep="\t", header=0)
cov_df["tool"] = os.path.basename(os.path.dirname(ifile_path))
cov.append(cov_df)
# concat to single dataframe
cov = pandas.concat(cov, ignore_index=True)
# raw names = <tool>__<prot id>
cov["tool_prot_id"] = cov["tool"] + "__" + cov["prot_id"]
cov.set_index(keys="tool_prot_id", drop=True, verify_integrity=True, inplace=True)
# flag for unique genes/proteins
cov["mmseqs2_uniq"] = False
# number of proteins per cluster and tool
counts = mmseqs2_tsv(input.clusters)
# clusters with proteins from one tool only (unique clusters)
uniq_cls = counts.index[counts.aggregate(axis="columns", func=lambda x: pandas.notnull(x).sum() == 1)]
# flag unique proteins, i.e. those from unique clusters
with open(input.clusters, "r") as ifile:
for line in ifile:
cluster_name, cluster_member = line.strip().split("\t")
if cluster_name in uniq_cls:
cov.loc[cluster_member,"mmseqs2_uniq"] = True
# keep only cov. for unique proteins
cov = cov.loc[cov["mmseqs2_uniq"],]
cov.to_csv(output[0], sep="\t", header=True, index=True, index_label="tool_prot_id")
rule render_report:
input:
rmd=os.path.join(SRC_DIR, "report.Rmd"),
......@@ -60,10 +106,10 @@ rule render_report:
### preprocessing
nanostats=os.path.join(RESULTS_DIR, "qc/metag/lr/NanoStats.tsv"),
fastp_metag=[os.path.join(RESULTS_DIR, "preproc/metag/sr/fastp.tsv")],
fastp_metat=[os.path.join(RESULTS_DIR, "preproc/metat/sr/fastp.tsv")] if "metat" in META_TYPES else [],
fastp_metat=[os.path.join(RESULTS_DIR, "preproc/metat/sr/fastp.tsv")] if FLAG_METAT else [],
### mappability
mappability_metag=[os.path.join(RESULTS_DIR, "mapping/metag/mappability.tsv")],
mappability_metat=[os.path.join(RESULTS_DIR, "mapping/metat/mappability.tsv")] if "metat" in META_TYPES else [],
mappability_metat=[os.path.join(RESULTS_DIR, "mapping/metat/mappability.tsv")] if FLAG_METAT else [],
### annotation
prodigal_gcounts=os.path.join(RESULTS_DIR, "annotation/prodigal/summary.gene.counts.tsv"),
prodigal_glength=os.path.join(RESULTS_DIR, "annotation/prodigal/summary.gene.length.tsv"),
......@@ -81,10 +127,10 @@ rule render_report:
# mashmap=TODO
mummer=[os.path.join(RESULTS_DIR, "analysis/mummer/summary.dnadiff.tsv")] if "mummer" in config["steps_analysis"] else [],
cdhit_metag=[os.path.join(RESULTS_DIR, "analysis/cdhit/summary.metag.tsv")] if "cdhit" in config["steps_analysis"] else [],
cdhit_metat=[os.path.join(RESULTS_DIR, "analysis/cdhit/summary.metat.tsv")] if "cdhit" in config["steps_analysis"] and "metat" in META_TYPES else [],
cdhit_metat=[os.path.join(RESULTS_DIR, "analysis/cdhit/summary.metat.tsv")] if "cdhit" in config["steps_analysis"] and FLAG_METAT else [],
diamond_db=[os.path.join(RESULTS_DIR, "analysis/diamond/summary.db.tsv")] if "diamond" in config["steps_analysis"] else [], # TODO: filter hits ???
diamond_metag=[os.path.join(RESULTS_DIR, "analysis/diamond/summary.metag.tsv")] if "diamond" in config["steps_analysis"] else [],
diamond_metat=[os.path.join(RESULTS_DIR, "analysis/diamond/summary.metat.tsv")] if "diamond" in config["steps_analysis"] and "metat" in META_TYPES else [],
diamond_metat=[os.path.join(RESULTS_DIR, "analysis/diamond/summary.metat.tsv")] if "diamond" in config["steps_analysis"] and FLAG_METAT else [],
mmseqs2=[os.path.join(RESULTS_DIR, "analysis/mmseqs2/summary.tsv")] if "mmseqs2" in config["steps_analysis"] else [],
covseg=[os.path.join(RESULTS_DIR, "mapping/summary.segsum.metag.tsv")] if "cov" in config["steps_analysis"] else [],
### taxonomy
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment