Commit 43d6b1c4 authored by Valentina Galata's avatar Valentina Galata
Browse files

figures: added workflow for manuscript figures (not finished, issue #123)

parent 302b8adf
# working directory: will contain the results (should be writeable)
work_dir: "/scratch/users/vgalata/ONT_pilot_manuscript"
# samples: name and path to their results
samples:
gdb: "/scratch/users/vgalata/gdb/results"
nwc: "/scratch/users/vgalata/nwc/results"
rumen: "/scratch/users/sbusi/rumen/results"
zymo: "/scratch/users/vgalata/zymo/results"
# List of assemblers for different read types (same as in sample config files)
assemblers:
sr: ["megahit", "metaspades"]
lr: ["flye", "raven"]
hy: ["metaspadeshybrid", "operamsmegahit", "operamsmetaspades"]
\ No newline at end of file
# Manuscript figure pipeline: to be used after workflow/ and workflow_report/
##############################
# MODULES
import os
import re
import pandas
# from scripts.utils import assembler_pairs
##############################
# CONFIG
# Paths
SRC_DIR = srcdir("scripts")
ENV_DIR = srcdir("envs")
# default executable for snakmake
shell.executable("bash")
# working directory
workdir:
config["work_dir"]
##############################
# TARGETS & RULES
rule all:
input:
fig_gdb_rgi_aro3004454_flye="figures/fig_gdb_rgi_aro3004454_flye.pdf"
rule fig_gdb_rgi_aro3004454_flye_metaT:
input:
os.path.join(config["samples"]["gdb"], "mapping/metat/lr/flye/ASSEMBLY.POLISHED.sr.cov.perbase")
output:
temp("figures/fig_gdb_lr_flye_contig_4930:1.0-5516.0_1-1000_metatcov.tsv")
shell:
# metaT cov. of contig contig_4930:1.0-5516.0 for bases 1-1000
"grep -F \"contig_4930:1.0-5516.0\" {input} | awk -F'\\t' '$1 == \"contig_4930:1.0-5516.0\" && $2 >= 1 && $2 <= 1000 {{print $0}}' > {output}"
rule fig_gdb_rgi_aro3004454_flye:
input:
metat_cov=rules.fig_gdb_rgi_aro3004454_flye_metaT.output
output:
pdf="figures/fig_gdb_rgi_aro3004454_flye.pdf"
log:
"figures/fig_gdb_rgi_aro3004454_flye.log"
conda:
os.path.join(ENV_DIR, "r.yaml")
script:
os.path.join(SRC_DIR, "fig_gdb_rgi_aro3004454_flye.R")
\ No newline at end of file
../../workflow_report/envs/r.yaml
\ No newline at end of file
../../workflow_report/scripts/const.R
\ No newline at end of file
#!/usr/bin/Rscript
# Figure: GDB sample, RGI results, ARO:30004454 hit in Flye assembly (LR)
############################## LOG
sink(file=file(snakemake@log[[1]], open="wt"), type=c("output", "message"))
############################## LIBS
suppressMessages(library(testit)) # assertions
suppressMessages(library(dplyr)) # tables
suppressMessages(library(ggplot2)) # plotting
suppressMessages(library(ggsci)) # color palettes
suppressMessages(library(reshape2)) # reshaping dataframes
suppressMessages(library(scales)) # plot scales
suppressMessages(library(viridis)) # color palettes
# # custom
# source(snakemake@params$const)
# source(snakemake@params$utils)
############################## DATA
# metaT coverage: bases 1-1000 of contig contig_4930:1.0 from Flye assembly from GDB sample
metat_cov <- read.csv(snakemake@input$metat_cov, header=FALSE, sep="\t", col.names=c("contigID", "pos", "cov"), stringsAsFactors=FALSE)
# gene/protein sequences to be plotted
seqs <- data.frame(
sid=c("contig_4930:1.0-5516.0_1", "ARO:30004454", "contig_4930:1.0-5516.0_1NEW"),
start=c(182,182,420),
end=c(448,806,806),
nucl=c(
"ATGCAATTCACAAAGATTGATATAAATAATTGGACACGAAAAGAGTATTTCGACCACTATTTTGACAATACGCCCTGCACATATAGTATGACGGTAAAACTCGATATTTCTAAGTTGAAAAAGGATGGAAAAAAATTATATCCGACTCTCTTATATGGGGTTACAACAATCCTCAATCGACACGAAGAGTTCAGGACGGCATTAGATAAAAACGGACAGGTAGGTGTTTTTTTCAGAAATGCTGCCTTGCTACACAATCTTTCATAA",
"ATGCAATTCACAAAGATTGATATAAATAATTGGACACGAAAAGAGTATTTCGACCACTATTTTGGCAATACGCCCTGCACATATAGTATGACGGTAAAACTCGATATTTCTAAGTTGAAAAAGGATGGAAAAAAGTTATACCCAACTCTTTTATATGGAGTTACAACGATCATCAATCGACATGAAGAGTTCAGGACCGCATTAGATGAAAACGGACAGGTAGGCGTTTTT-TCAGAAATGCTGCCTTGCTACACAGTTTTTCATAAGGAAACTGAAACCTTTTCGAGTATTTGGACTGAGTTTACAGCAGACTATACTGAGTTTCTTCAGAACTATCAAAAGGATATAGACGCTTTTGGTGAACGAATGGGAATGTCCGCAAAGCCTAATCCTCCGGAAAACACTTTCCCTGTTTCTATGATACCGTGGACAAGCTTTGAAGGCTTTAACTTAAATCTAAAAAAAGGATATGACTATCTACTGCCGATATTTACGTTTGGGAAGTATTATGAGGAGGGCGGAAAATACTATATTCCCTTATCGATTCAAGTGCATCATGCCGTTTGTGACGGCTTTCATGTTTGCCGTTTTTTGGATGAATTACAAGACTTGCTGAATAAATAA",
"ATGCTGCCTTGCTACACAATCTTTCATAAGGAAACTGAAACCTTTTCGAGTATTTGGACTGAGTTTACAGCAGACTATACTGAGTTTCTTCAGAACTATCAAAAGGATATAGACGCTTATGGTGAACGAAAGGGAATGTTCGCAAAGCCTAATCCTCCGGAAAACACTTTCCCTGTTTCTATGATACCGTGGACAAGCTTTGAAGGCTTTAACTTAAATCTAAAAAAGGGATATGACTATCTACTGCCGATATTTACGTTTGGGAAGTATTATGAGGATGGCGGAAAATACTATATTCCCTTATCGATTCAAGTGCATCATGCCGTTTGTGATGGCTTTCATGTTTGCCGTTTTTTGGATGAATTACAAGACTTGCTGAATAAATAA"
),
prot=c(
"MQFTKIDINNWTRKEYFDHYFDNTPCTYSMTVKLDISKLKKDGKKLYPTLLYGVTTILNRHEEFRTALDKNGQVGVFFRNAALLHNLS*",
"MQFTKIDINNWTRKEYFDHYFGNTPCTYSMTVKLDISKLKKDGKKLYPTLLYGVTTIINRHEEFRTALDENGQVGVFSEMLPCYTVFHKETETFSSIWTEFTADYTEFLQNYQKDIDAFGERMGMSAKPNPPENTFPVSMIPWTSFEGFNLNLKKGYDYLLPIFTFGKYYEEGGKYYIPLSIQVHHAVCDGFHVCRFLDELQDLLNK*",
"MLPCYTIFHKETETFSSIWTEFTADYTEFLQNYQKDIDAYGERKGMFAKPNPPENTFPVSMIPWTSFEGFNLNLKKGYDYLLPIFTFGKYYEDGGKYYIPLSIQVHHAVCDGFHVCRFLDELQDLLNK*"
),
stringsAsFactors=FALSE
)
rownames(seqs) <- seqs$sid
# add seq. letters to metaT coverage
for(sid in seqs$sid){
# if(sid == "ARO:30004454"){ next }
sid_nucl <- unlist(strsplit(seqs[sid,"nucl"], split=""))
sid_prot <- unlist(strsplit(seqs[sid,"prot"], split=""))
sid_start <- seqs[sid,"start"]
sid_end <- seqs[sid,"end"]
# print(sprintf("%s: %d, %d, %d", sid, sid_start, sid_end, length(sid_nucl)))
metat_cov[,sid] <- NA
# metat_cov[sid_start:sid_end,sid] <- sid_nucl # nucl
metat_cov[seq(sid_start+ifelse(sid=="contig_4930:1.0-5516.0_1NEW",0,1), sid_end, 3),sid] <- sid_prot # prot
}
# data to plot seq.s
metat_cov_seqs <- reshape2::melt(
metat_cov[,setdiff(colnames(metat_cov),c("contigID", "cov"))],
id.vars="pos",
na.rm=TRUE
)
metat_cov_seqs$y <- as.numeric(metat_cov_seqs$variable) * -1
############################## PLOT
# custom theme
custom_theme <-
theme_classic() +
theme(
panel.grid.major=element_blank(),
panel.grid.minor=element_blank(),
legend.position="none"
)
# AA colors
aa_colors <- c(pal_d3(palette="category20")(20), "black")
names(aa_colors) <- c(setdiff(sort(unique(metat_cov_seqs$value)), "*"),"*")
# Nucl. colors
# nc_colors <- c(pal_d3(palette="category20")(4), "black")
# names(nc_colors) <- c("A", "T", "C", "G", "-")
# Plot
p_cov <-
ggplot(data=metat_cov %>% filter(pos >= 160, pos <= 830), aes(x=pos, y=cov)) +
# overlap area
geom_polygon(data=data.frame(x=c(182+238+29,182+238,182+238,182+238+29), y=c(-6,-6,30,30)), aes(x=x, y=y), fill="gray", alpha=0.3) +
# cov. baseline
geom_hline(yintercept=0, linetype="dotted") +
# cov.
geom_step() +
# seq.s
geom_label(
data=metat_cov_seqs,
aes(y=y, label=value, fill=value),
color="white", size=2, fontface="bold",
label.padding=unit(0.1, "lines"), label.size=0, label.r=unit(0, "lines")
) +
# seq. annotations
annotate(
geom="segment",
x=c(182, 182, 404), y=c(1.5, -4.5, -4.5),
xend=c(182, 182, 417), yend=c(-0.5, -2.5, -3),
arrow=arrow(length=unit(2, "mm"))
) +
annotate(
geom="text",
x=c(182, 182, 402), y=c(2, -5, -5),
label=levels(metat_cov_seqs$variable),
hjust=c("left", "left", "right"),
size=4
) +
# colors
scale_fill_manual(values=aa_colors) +
# axis range
scale_x_continuous(limits=c(160, 830), expand=c(0, 0)) +
scale_y_continuous(limits=c(-6,30), expand=c(0, 0)) +
# titles
labs(
x="Contig position (LR, Flye, contig_4930:1.0-5516.0)",
y="metaT coverage"
) +
# theme
custom_theme
# PDF
pdf(snakemake@output$pdf, width=20, height=5)
print(p_cov)
dev.off()
\ No newline at end of file
../../workflow_report/scripts/utils.R
\ No newline at end of file
../../workflow/scripts/utils.py
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment