Commit eeac8cad authored by Valentina Galata
Browse files

notes: high conf prots: more stats

parent d0c44b5b
......@@ -13,19 +13,36 @@ df = pandas.read_csv("/scratch/users/vgalata/gdb/results/report/mmseqs2_highconf
# proteins/protein clusters covering all assemblies
df_all = df.loc[df["mmseqs2_all"],["tool_prot_id", "mmseqs2_cluster"]]
# proteins/protein clusters NOT covering all assemblies but with ave. metaT cov. >= 10
# proteins/protein clusters NOT covering all assemblies BUT with ave. metaT cov. >= 10
df_cov = df.loc[~(df["mmseqs2_all"]) & (df["ave_cov"] >= 10),["tool_prot_id", "mmseqs2_cluster"]]
# proteins/protein clusters covering all assemblies AND with ave. metaT cov. >= 10
df_allcov = df.loc[(df["mmseqs2_all"]) & (df["ave_cov"] >= 10),["tool_prot_id", "mmseqs2_cluster"]]
# print stats
total_prots = df.shape[0]
total_clusters = len(set(df.mmseqs2_cluster))
all_prots = df_all.shape[0]
all_clusters = len(set(df_all.mmseqs2_cluster))
cov_prots = df_cov.shape[0]
cov_clusters = len(set(df_cov.mmseqs2_cluster))
allcov_prots = df_allcov.shape[0]
allcov_clusters = len(set(df_allcov.mmseqs2_cluster))
print(
"All assemblies: %d proteins, %d clusters\nmetaT cov. >= 10: %d proteins, %d clusters" % (
df_all.shape[0],
len(set(df_all.mmseqs2_cluster)),
df_cov.shape[0],
len(set(df_cov.mmseqs2_cluster))
"In total: %d proteins, %d clusters\n\
Percentage values are computed w.r.t. total number of proteins/clusters:\n\
1) Clusters representing ALL assemblies: %d proteins (%.2f%%), %d clusters (%.2f%%)\n\
2) Clusters NOT repr. all assemblies AND metaT cov. >= 10: %d proteins (%.2f%%), %d clusters (%.2f%%)\n\
3) Clusters repr. ALL assemblies AND metaT cov. >= 10: %d proteins (%.2f%%, %.2f%% of 1)), %d clusters (%.2f%%, %.2f%% of 1))\n" % (
total_prots, total_clusters,
all_prots, 100 * all_prots / total_prots, all_clusters, 100 * all_clusters / total_clusters,
cov_prots, 100 * cov_prots / total_prots, cov_clusters, 100 * cov_clusters / total_clusters,
allcov_prots, 100 * allcov_prots / total_prots, 100 * allcov_prots / all_prots, allcov_clusters, 100 * allcov_clusters / total_clusters, 100 * allcov_clusters / all_clusters
)
)
# All assemblies: 428107 proteins, 51459 clusters
# metaT cov. >= 10: 137054 proteins, 54747 clusters
# In total: 1863851 proteins, 602303 clusters
# Percentage values are computed w.r.t. total number of proteins/clusters:
# 1) Clusters representing ALL assemblies: 428107 proteins (22.97%), 51459 clusters (8.54%)
# 2) Clusters NOT repr. all assemblies AND metaT cov. >= 10: 137054 proteins (7.35%), 54747 clusters (9.09%)
# 3) Clusters repr. ALL assemblies AND metaT cov. >= 10: 106660 proteins (5.72%, 24.91% of 1)), 13435 clusters (2.23%, 26.11% of 1))
```
\ No newline at end of file
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment