Commit 88dfe576 authored by Valentina Galata's avatar Valentina Galata
Browse files

report: gdb: metap stats (issue #97)

parent c04449d4
......@@ -35,7 +35,7 @@ data:
r2: "/mnt/isilon/projects/ecosystem_biology/ONT_pilot/GDB_2019/metat/sr/FastSelectFull1_MT_Rashi_S14_R2_001.fastq.gz" # leave empty if no data, i.e. ""
# Meta-proteomics
metap:
"" # TODO
"/mnt/lscratch/users/vgalata/gdb/results/metap/20210323" # folder w/ metaP peptide/protein reports
############################################################
# TOOLS
......
......@@ -347,3 +347,49 @@ def mmseqs2_tool_summary(counts):
else: # total number of proteins (over all clusters)
summary.loc[tool1,tool2] = counts[tool1].sum(skipna=True)
return summary
# metaP
def proc_metap_sec_accs(s, get_tools=False):
"""
Process secondary accessions from metaP table
"""
import re
import pandas
if pandas.isnull(s) or s == "": # empty
s = []
else: # process
s = s.split(", ") # split IDs into a list
if get_tools: # extract tool name from IDs
s = [x.split("__")[0] for x in s if re.search("__", x)]
return s
def proc_metap_tab(metap_file, mult_tools=False):
"""
Read in and process a metaP (protein) report
"""
import pandas
# read in
metap = pandas.read_csv(metap_file, sep="\t", header=0)
if mult_tools:
# assembly tools from accession IDs; empty if no tool (because not an assembly protein) or accession(s)
metap["Main Accession Tool"] = metap["Main Accession"].map(lambda x: ",".join(set(proc_metap_sec_accs(x, True))))
metap["Secondary Accessions Tools"] = metap["Secondary Accessions"].map(lambda x: ",".join(set(proc_metap_sec_accs(x, True))))
# rows containing at least one protein ID from the assemblies
metap["Assembly Proteins"] = (metap["Main Accession Tool"] != "") | (metap["Secondary Accessions Tools"] != "")
# rows with unique/exclusive assembly proteins
# unique/exclusive = (same assembly tool for main and all sec. accessions OR no sec. accessions) AND (is assembly prot.)
metap["Unique"] = ((metap["Main Accession Tool"] == metap["Secondary Accessions Tools"]) | (metap["Secondary Accessions Tools"] == "")) & metap["Assembly Proteins"]
return metap
def collect_metap_prots(metap):
"""
Collect all protein accessions from a metaP (protein) report
"""
import pandas
# init: main accessions
metap_prots = set(metap["Main Accession"])
# add: sec. accessions
for i in metap.index:
metap_prots.update(set(set(proc_metap_sec_accs(metap.loc[i,"Secondary Accessions"], False))))
return metap_prots
\ No newline at end of file
......@@ -19,6 +19,7 @@ include:
FLAG_METAT = "metat" in META_TYPES
FLAG_EXTRA_AMR = "rgi" in config["steps_annotation"] and "cov" in config["steps_analysis"] and FLAG_METAT
FLAG_EXTRA_UPROT = "mmseqs2" in config["steps_analysis"] and "cov" in config["steps_analysis"] and FLAG_METAT
FLAG_EXTRA_METAP = "metap" in config["data"].keys() and FLAG_EXTRA_AMR and FLAG_EXTRA_UPROT
rule all:
input:
......@@ -27,8 +28,9 @@ rule all:
# extra: AMR (for GDB)
amr=[os.path.join(RESULTS_DIR, "report/amr.tsv")] if FLAG_EXTRA_AMR else [],
# extra: unique proteins from mmseqs2 clusters (GDB)
uprot=[os.path.join(RESULTS_DIR, "report/mmseqs2_uniq.tsv")] if FLAG_EXTRA_UPROT else []
uprot=[os.path.join(RESULTS_DIR, "report/mmseqs2_uniq.tsv")] if FLAG_EXTRA_UPROT else [],
# extra: metaP-based stats (GDB)
metap=[os.path.join(RESULTS_DIR, "report/metap.txt")] if FLAG_EXTRA_METAP else [],
include:
"rules/collect_data.smk"
......@@ -99,6 +101,77 @@ rule uniq_mmseqs2_report:
cov = cov.loc[cov["mmseqs2_uniq"],]
cov.to_csv(output[0], sep="\t", header=True, index=True, index_label="tool_prot_id")
rule metap_report_ids_amr:
input:
os.path.join(RESULTS_DIR, "report/amr.tsv")
output:
temp(os.path.join(RESULTS_DIR, "report/metap_ids_amr.txt"))
shell:
"tail -n +2 {input} | awk -F'\\t' '{{print $10\"__\"$1}}' | sed 's/^SR MEGAHIT/megahit/;s/^SR metaSPAdes/metaspades/;s/^LR Flye/flye/;s/LR Raven/raven/;s/^HY metaSPAdes/metaspadeshybrid/;s/^HY OPERA-MS (MH)/operamsmegahit/;s/^HY OPERA-MS (mSPA)/operamsmetaspades/' > {output}"
rule metap_report_ids_mmseqs2uniq:
input:
os.path.join(RESULTS_DIR, "report/mmseqs2_uniq.tsv")
output:
temp(os.path.join(RESULTS_DIR, "report/metap_ids_mmseqs2uniq.txt"))
shell:
"tail -n +2 {input} | cut -f1 > {output}"
rule metap_report:
input:
ids_amr=os.path.join(RESULTS_DIR, "report/metap_ids_amr.txt"),
ids_uniq=os.path.join(RESULTS_DIR, "report/metap_ids_mmseqs2uniq.txt"),
# metaP
metap=expand(
os.path.join(config["data"]["metap"], "{fname}"),
fname=[
"proteins_1_1_Default_Protein_Report.txt", # all proteins from all assemblies
"flye_1_1_Default_Protein_Report.txt",
"raven_1_1_Default_Protein_Report.txt",
"megahit_1_1_Default_Protein_Report.txt",
"metaspades_1_1_Default_Protein_Report.txt",
"metaspadeshybrid_1_1_Default_Protein_Report.txt",
"operamsmegahit_1_1_Default_Protein_Report.txt",
"operamsmetaspades_1_1_Default_Protein_Report.txt"
]
) if FLAG_EXTRA_METAP else []
output:
os.path.join(RESULTS_DIR, "report/metap.txt")
run:
from scripts.utils import proc_metap_sec_accs, proc_metap_tab, collect_metap_prots
prot_lists = {
"amr": list(pandas.read_csv(input.ids_amr, header=None).iloc[:,0]),
"unique": list(pandas.read_csv(input.ids_uniq, header=None).iloc[:,0])
}
metap_files = dict()
for fname in input.metap:
metap_files[os.path.basename(fname).split("_")[0]] = fname
with open(output[0], "w") as ofile:
for metap_tool, metap_file in metap_files.items():
# read (and process)
metap = proc_metap_tab(metap_file, mult_tools=(metap_tool=="proteins"))
# print stats
if metap_tool=="proteins":
ofile.write("\n{}: {}; {} rows w/ assembly proteins, {} rows w/ exclusive assembly proteins\n".format(
metap_file,
metap.shape,
sum(metap["Assembly Proteins"]),
sum(metap["Unique"])
))
else:
ofile.write("\n{}: {}\n".format(metap_file, metap.shape))
# collect all assembly protein IDs
metap_prots = collect_metap_prots(metap)
# metaP cov. for given prot. ID lists
for list_name, list_values in prot_lists.items():
if metap_tool != "proteins":
list_values = ["__".join(pid.split("__")[1:]) for pid in list_values if re.match(metap_tool+"__", pid)]
list_covered = [pid for pid in list_values if pid in metap_prots]
n_total = len(list_values)
n_cov = len(list_covered)
n_pct = 100 * n_cov / n_total
ofile.write("Protein ID list {}: {} of {} ({:.2f}%) covered by {}\n".format(list_name, n_cov, n_total, n_pct, metap_file))
rule render_report:
input:
rmd=os.path.join(SRC_DIR, "report.Rmd"),
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment