Commit 6e6e390a authored by Valentina Galata's avatar Valentina Galata
Browse files

report: mmseqs2 (issue #119)

parent db966e98
......@@ -302,4 +302,48 @@ def ave_gene_cov(cov_file, genes):
last_contig_id = contig_id
last_contig_cov = []
last_contig_cov.append(float(coverage))
return genes
\ No newline at end of file
return genes
def mmseqs2_tsv(ifile_path):
import pandas
# number of proteins per cluster and tool
counts = dict()
with open(ifile_path, "r") as ifile:
for line in ifile:
cluster_name, cluster_member = line.strip().split("\t")
cluster_member_tool = cluster_member.split("__")[0]
if cluster_name not in counts:
counts[cluster_name] = dict()
if cluster_member_tool not in counts[cluster_name]:
counts[cluster_name][cluster_member_tool] = 0
counts[cluster_name][cluster_member_tool] += 1
counts = pandas.DataFrame.from_dict(counts, orient='index')
return counts
def mmseqs2_summary(counts):
import pandas
# number of clusters and cluster members, i.e. proteins, per tool combination
summary = dict()
for cluster in counts.index:
c_comb = ",".join(sorted(counts.columns[ pandas.notnull(counts.loc[cluster,:]) ]))
c_sum = counts.loc[cluster,:].sum(skipna=True)
if c_comb not in summary:
summary[c_comb] = {"clusters": 0, "members": 0}
summary[c_comb]["clusters"] += 1
summary[c_comb]["members"] += c_sum
summary = pandas.DataFrame.from_dict(summary, orient='index')
return summary
def mmseqs2_tool_summary(counts):
import pandas
# number of proteins clustered w/ proteins from another tool (per tool)
summary = pandas.DataFrame(0, index=counts.columns, columns=counts.columns)
for tool1 in counts.columns:
for tool2 in counts.columns:
if tool1 != tool2: # shared w/ other tool
summary.loc[tool1,tool2] = counts.loc[(pandas.notnull(counts[tool1])) & (pandas.notnull(counts[tool2])), tool1].sum(skipna=True)
else: # total number of proteins (over all clusters)
summary.loc[tool1,tool2] = counts[tool1].sum(skipna=True)
return summary
......@@ -85,6 +85,7 @@ rule render_report:
diamond_db=[os.path.join(RESULTS_DIR, "analysis/diamond/summary.db.tsv")] if "diamond" in config["steps_analysis"] else [], # TODO: filter hits ???
diamond_metag=[os.path.join(RESULTS_DIR, "analysis/diamond/summary.metag.tsv")] if "diamond" in config["steps_analysis"] else [],
diamond_metat=[os.path.join(RESULTS_DIR, "analysis/diamond/summary.metat.tsv")] if "diamond" in config["steps_analysis"] and "metat" in META_TYPES else [],
mmseqs2=[os.path.join(RESULTS_DIR, "analysis/mmseqs2/summary.tsv")] if "mmseqs2" in config["steps_analysis"] else [],
covseg=[os.path.join(RESULTS_DIR, "mapping/summary.segsum.metag.tsv")] if "cov" in config["steps_analysis"] else [],
### taxonomy
# kraken2=TODO
......@@ -105,6 +106,7 @@ rule render_report:
pdf_diamond=os.path.join(RESULTS_DIR, "report/fig_diamond.pdf"),
pdf_diamond2=os.path.join(RESULTS_DIR, "report/fig_diamond2.pdf"),
pdf_cdhit=os.path.join(RESULTS_DIR, "report/fig_cdhit.pdf"),
pdf_mmseqs2=os.path.join(RESULTS_DIR, "report/fig_mmseqs2.pdf"),
pdf_mash=os.path.join(RESULTS_DIR, "report/fig_mash.pdf"),
pdf_fastani=os.path.join(RESULTS_DIR, "report/fig_fastani.pdf"),
pdf_mummer=os.path.join(RESULTS_DIR, "report/fig_mummer.pdf"),
......
......@@ -428,6 +428,17 @@ rule collect_cdhit:
for f in {input}; do tail -n +2 ${{f}}; done >> {output}
"""
rule collect_mmseqs2:
input:
os.path.join(RESULTS_DIR, "analysis/mmseqs2/clusters.tsv")
output:
os.path.join(RESULTS_DIR, "analysis/mmseqs2/summary.tsv")
run:
from scripts.utils import mmseqs2_tsv, mmseqs2_summary
counts = mmseqs2_tsv(input[0])
summary = mmseqs2_summary(counts)
summary.to_csv(output[0], sep="\t", header=True, index=True, index_label="combination")
rule collect_segsum:
input:
expand(
......
......@@ -27,7 +27,7 @@ ASM_TOOL_NAMES <- c(
"metaspadeshybrid"="HY metaSPAdes"
)
# ASM_TOOL_COLORS <- ggsci::pal_d3("category10", alpha=1)(10)[c(4,2,3,9,1,10,5)]
ASM_TOOL_COLORS <- ggsci::pal_jama("default", alpha=1)(7)[c(3,6,2,4,1,5, 7)]
ASM_TOOL_COLORS <- ggsci::pal_jama("default", alpha=1)(7)[c(3,6,2,4,1,5,7)]
names(ASM_TOOL_COLORS) <- names(ASM_TOOL_NAMES)
ASM_TOOL_SHAPES <- c(
"flye"=24,
......@@ -107,4 +107,8 @@ DIAMOND_VAR_LABELLER <- function(x){
} else {
return(x)
}
}
\ No newline at end of file
}
# mmseqs2
MMSEQS2_UPSETR_HIGHLIGHTS <- ggsci::pal_jama("default", alpha=1)(4)[c(4,2)]
names(MMSEQS2_UPSETR_HIGHLIGHTS) <- c("all", "unique")
\ No newline at end of file
......@@ -311,6 +311,17 @@ if("cdhit" %in% snakemake@config$steps_analysis){
FIGS$cdhit <- NULL
}
## MMSEQS2
if("mmseqs2" %in% snakemake@config$steps_analysis){
TABS$mmseqs2 <- read_mmseqs(snakemake@input$mmseqs2)
FIGS$mmseqs2 <- list()
FIGS$mmseqs2$overlap <- plot_mmseqs2_overlap(TABS$mmseqs2)
} else {
TABS$mmseqs2 <- NULL
FIGS$mmseqs2 <- NULL
}
# Contig cov. seg.
if("cov" %in% snakemake@config$steps_analysis){
TABS$covseg <- read_covseg(snakemake@input$covseg)
......
......@@ -184,6 +184,14 @@ if(!is.null(FIGS$cdhit)){
knitr::kable(TABS$cdhit, caption='CD-HIT (number of proteins in tool2 but not in tool1)')
```
## MMSEQS2
```{r figures-mmseqs2-overlap, echo=FALSE, fig.width=10, fig.height=7, fig.cap='Protein clustering w/ MMSEQS2', fig.topcaption=TRUE}
if(!is.null(TABS$mmseqs2)){
print(FIGS$mmseqs2$overlap)
}
```
## CRISPR
```{r figures-crispr-casc, echo=FALSE, fig.width=10, fig.height=7, fig.cap='CRISPR prediction (CASC)', fig.topcaption=TRUE}
......@@ -216,13 +224,13 @@ if(!is.null(FIGS$barrnap)){
## AMR prediction
```{r figures-rgi, echo=FALSE, fig.width=10, fig.height=5, fig.cap='AMR prediction w/ RGI'}
```{r figures-rgi, echo=FALSE, fig.width=10, fig.height=5, fig.cap='AMR prediction w/ RGI', fig.topcaption=TRUE}
if(!is.null(FIGS$rgi)){
print(FIGS$rgi$total)
}
```
```{r figures-rgi-overlap, echo=FALSE, fig.width=10, fig.height=7, fig.cap='AMR prediction w/ RGI'}
```{r figures-rgi-overlap, echo=FALSE, fig.width=10, fig.height=7, fig.cap='AMR prediction w/ RGI', fig.topcaption=TRUE}
if(!is.null(TABS$rgi)){
print(FIGS$rgi$overlap_strict)
}
......
......@@ -9,13 +9,13 @@ sink(file=file(snakemake@log$out, open="wt"), type=c("output", "message"))
suppressMessages(library(testit)) # assertions
suppressMessages(library(rmarkdown)) # Rmarkdown
suppressMessages(library(dplyr)) # tables
suppressMessages(library(ggplot2)) # plotting
suppressMessages(library(ggsci)) # color palettes
suppressMessages(library(reshape2)) # reshaping dataframes
suppressMessages(library(scales)) # plot scales
suppressMessages(library(pheatmap)) # heatmaps
suppressMessages(library(grid)) # for heatmaps
suppressMessages(library(viridis)) # color palettes
# suppressMessages(library(ggplot2)) # plotting
# suppressMessages(library(ggsci)) # color palettes
# suppressMessages(library(reshape2)) # reshaping dataframes
# suppressMessages(library(scales)) # plot scales
# suppressMessages(library(pheatmap)) # heatmaps
# suppressMessages(library(grid)) # for heatmaps
# suppressMessages(library(viridis)) # color palettes
# custom
source(snakemake@params$const)
......@@ -183,6 +183,15 @@ if(!is.null(FIGS$cdhit)){
}
dev.off()
# MMSEQS2
pdf(snakemake@output$pdf_mmseqs2, width=10, height=7)
if(!is.null(FIGS$mmseqs2)){
for(pp in FIGS$mmseqs2){ print(pp) }
} else {
print(draw_empty_plot())
}
dev.off()
# Contig cov. seg.
pdf(snakemake@output$pdf_covseg, width=10, height=10)
if(!is.null(FIGS$covseg)){
......
......@@ -2,12 +2,14 @@
## IMPORT
suppressMessages(library(UpSetR)) # subset plots
suppressMessages(library(ggplot2)) # plotting
suppressMessages(library(reshape2)) # reshaping dataframes
suppressMessages(library(scales)) # plot scales
suppressMessages(library(pheatmap)) # heatmaps
suppressMessages(library(grid)) # for heatmaps
suppressMessages(library(viridis)) # color palettes
suppressMessages(library(ggsci)) # color palettes
##############################
# INPUT
......@@ -315,6 +317,26 @@ read_covseg <- function(fname){
return(df)
}
read_mmseqs <- function(fname){
print(sprintf("Reading: %s", fname))
df <- read.csv(
file=fname,
sep="\t",
header=TRUE,
row.names=1,
check.names=FALSE,
stringsAsFactors=FALSE
)
rownames(df) <- sapply(
rownames(df),
function(x){
# "&" separator for UpSetR plots
paste(ASM_TOOL_NAMES[unlist(strsplit(x,","))], collapse="&")
}
)
return(df)
}
##############################
# PLOTS
......@@ -465,7 +487,7 @@ plot_rgi_overlap <- function(df, ctype, col){
nsets=length(ASM_TOOL_NAMES),
# y-label title
mainbar.y.label=sprintf("Intersection size (%s hits, %s)", ctype, col),
# text size
# text size: intersection size title, intersection size tick labels, set size title, set size tick labels, set names, numbers above bars
text.scale = c(1.2, 1.2, 1.2, 1.2, 1.2, 1.2)#,
# colors
# set.metadata=list(
......@@ -599,6 +621,35 @@ plot_diamondDB_density2d <- function(df){
return(pp)
}
plot_mmseqs2_overlap <- function(df){
# cluster summary: number of proteins in clusters grouped by represented tools
upsetr_input <- UpSetR::fromExpression(t(df %>% select(members))[1,,drop=TRUE])
testit::assert(all( colnames(upsetr_input) %in% ASM_TOOL_NAMES ))
upsetr_input <- upsetr_input[,ASM_TOOL_NAMES,drop=FALSE]
UpSetR::upset(
upsetr_input,
nsets=ncol(upsetr_input),
nintersects=20,
order.by="freq",
decreasing=TRUE,
# titles
mainbar.y.label="Total number of proteins in clusters",
sets.x.label="Total number of proteins per assembly",
# text size: intersection size title, intersection size tick labels, set size title, set size tick labels, set names, numbers above bars
text.scale = c(1.2, 1.2, 1.2, 1.2, 1.2, 1),
# highlight those w/ only one set
queries=c(
# unique proteins
lapply(
colnames(upsetr_input),
function(x){ list(query=intersects, params=list(x), active=TRUE, color=MMSEQS2_UPSETR_HIGHLIGHTS["unique"]) }
),
# all tools
list(list(query=intersects, params=as.list(colnames(upsetr_input)), active=TRUE, color=MMSEQS2_UPSETR_HIGHLIGHTS["all"]))
)
)
}
#' Contig coverage and segmentation plot
#' @input df_cov Coverage data.frame incl. contig ID, base, coverage and state
#' @input cid Contig ID
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment