Commit eb8dc9d2 authored by Valentina Galata's avatar Valentina Galata
Browse files

report: table and stats: assembly intersection and metat cov

parent 00ba04e1
# About
Notes for samples `GDB`, and genes/proteins and protein clusters "covered" by metaT (ave. cov. >= 10) OR belong to protein clusters covering all tools/assemblies.
# Number of (metaT covered) unique genes/proteins
File `report/mmseqs2_highconf.tsv` is created by `workflow_report`.
```python
import pandas
df = pandas.read_csv("/scratch/users/vgalata/gdb/results/report/mmseqs2_highconf.tsv", sep="\t", header=0)
# proteins/protein clusters covering all assemblies
df_all = df.loc[df["mmseqs2_all"],["tool_prot_id", "mmseqs2_cluster"]]
# proteins/protein clusters NOT covering all assemblies but with ave. metaT cov. >= 10
df_cov = df.loc[~(df["mmseqs2_all"]) & (df["ave_cov"] >= 10),["tool_prot_id", "mmseqs2_cluster"]]
# print stats
print(
"All assemblies: %d proteins, %d clusters\nmetaT cov. >= 10: %d proteins, %d clusters" % (
df_all.shape[0],
len(set(df_all.mmseqs2_cluster)),
df_cov.shape[0],
len(set(df_cov.mmseqs2_cluster))
)
)
# All assemblies: 428107 proteins, 51459 clusters
# metaT cov. >= 10: 137054 proteins, 54747 clusters
```
\ No newline at end of file
......@@ -19,6 +19,7 @@ include:
FLAG_METAT = "metat" in META_TYPES
FLAG_EXTRA_AMR = "rgi" in config["steps_annotation"] and "cov" in config["steps_analysis"] and FLAG_METAT
FLAG_EXTRA_UPROT = "mmseqs2" in config["steps_analysis"] and "cov" in config["steps_analysis"] and FLAG_METAT
FLAG_EXTRA_HPROT = FLAG_EXTRA_UPROT
FLAG_EXTRA_METAP = "metap" in config["data"].keys() and FLAG_EXTRA_AMR and FLAG_EXTRA_UPROT
rule all:
......@@ -31,6 +32,8 @@ rule all:
uprot=[os.path.join(RESULTS_DIR, "report/mmseqs2_uniq.tsv")] if FLAG_EXTRA_UPROT else [],
# extra: metaP-based stats (GDB)
metap=[os.path.join(RESULTS_DIR, "report/metap.txt")] if FLAG_EXTRA_METAP else [],
# extra: gene/prot info to list "high-confidence" protein (clusters) (GDB)
hprot=[os.path.join(RESULTS_DIR, "report/mmseqs2_highconf.tsv")] if FLAG_EXTRA_HPROT else []
include:
"rules/collect_data.smk"
......@@ -172,6 +175,51 @@ rule metap_report:
n_pct = 100 * n_cov / n_total
ofile.write("Protein ID list {}: {} of {} ({:.2f}%) covered by {}\n".format(list_name, n_cov, n_total, n_pct, metap_file))
rule highconf_mmseqs2_report:
input:
clusters=os.path.join(RESULTS_DIR, "analysis/mmseqs2/clusters.tsv"),
gcov_metat=expand(
os.path.join(RESULTS_DIR, "mapping/metat/{rtype_tool}.cov.pergene"),
rtype_tool=["{r}/{t}/ASSEMBLY.POLISHED.sr".format(r=r, t=t) for r, t in READ_ASSEMBLERS]
)
output:
tab=os.path.join(RESULTS_DIR, "report/mmseqs2_highconf.tsv")
threads: 1
run:
from scripts.utils import mmseqs2_tsv, mmseqs2_summary
# read in ave. gene coverage
cov = []
for ifile_path in input.gcov_metat:
cov_df = pandas.read_csv(ifile_path, sep="\t", header=0)
cov_df["tool"] = os.path.basename(os.path.dirname(ifile_path))
cov.append(cov_df)
# concat to single dataframe
cov = pandas.concat(cov, ignore_index=True)
# raw names = <tool>__<prot id>
cov["tool_prot_id"] = cov["tool"] + "__" + cov["prot_id"]
cov.set_index(keys="tool_prot_id", drop=True, verify_integrity=True, inplace=True)
# number of proteins per cluster and tool
counts = mmseqs2_tsv(input.clusters)
# clusters with proteins covering all tools/assemblies
all_cls = counts.index[counts.aggregate(axis="columns", func=lambda x: pandas.notnull(x).sum() == counts.shape[1])]
# get protein cluster IDs
mmseqs2_cluster = dict.fromkeys(list(cov.index), None)
with open(input.clusters, "r") as ifile:
for line in ifile:
cluster_name, cluster_member = line.strip().split("\t")
mmseqs2_cluster[cluster_member] = cluster_name
cov["mmseqs2_cluster"] = cov.index.map(mmseqs2_cluster)
assert cov.mmseqs2_cluster.isnull().sum() == 0
# flag proteins from clusters covering all tools/assemblies
cov["mmseqs2_all"] = cov["mmseqs2_cluster"].apply(func=lambda x: x in all_cls)
# save
cov.to_csv(output[0], sep="\t", header=True, index=True, index_label="tool_prot_id")
rule render_report:
input:
rmd=os.path.join(SRC_DIR, "report.Rmd"),
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment