utils.R 4.03 KB
Newer Older
1
2
#!/usr/bin/Rscript

Valentina Galata's avatar
Valentina Galata committed
3
## IMPORT
4
5
suppressMessages(library(ggsci)) # colors

Valentina Galata's avatar
Valentina Galata committed
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
##############################
# INPUT
# Read a tab-separated statistics table and give its columns display names.
#
# Args:
#   fname: path to a tab-separated file with a header row.
# Returns:
#   data.frame; the headers "stat" and "value" are renamed to "Statistic" and
#   "Value". Any other header is kept unchanged (previously such headers were
#   silently turned into NA column names).
read_nanostats <- function(fname){
    df <- read.csv(
        file=fname,
        sep="\t",
        header=TRUE,
        check.names=FALSE,
        stringsAsFactors=FALSE
    )
    df_cols <- c("stat"="Statistic", "value"="Value")
    # translate only the headers that have an entry in the lookup;
    # indexing the lookup with an unknown header would yield NA
    cols <- colnames(df)
    known <- cols %in% names(df_cols)
    cols[known] <- unname(df_cols[cols[known]])
    colnames(df) <- cols
    return(df)
}

21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# Read a tab-separated QUAST table (first column = row names) and bring it
# into a fixed layout.
#
# Args:
#   fname: path to a tab-separated file with a header row.
# Returns:
#   data.frame with rows selected/ordered by QUAST_VARS and columns
#   selected/ordered by the display names in ASM_TOOL_NAMES.
# Fails with a testit assertion error if a column header is not one of
# names(ASM_TOOL_NAMES).
read_quast <- function(fname){
    df <- read.csv(
        fname,
        header=TRUE, sep="\t",
        row.names=1,
        stringsAsFactors=FALSE, check.names=FALSE
    )
    # every column header has to be a known assembler ID
    testit::assert(all(colnames(df) %in% names(ASM_TOOL_NAMES)))
    # assembler ID -> display name
    names(df) <- ASM_TOOL_NAMES[colnames(df)]
    # fixed row and column order
    df[QUAST_VARS, ASM_TOOL_NAMES]
}

# Read a tab-separated PlasFlow table and relabel it for display.
#
# Args:
#   fname: path to a tab-separated file with a header row; the code uses
#          its `tool` and `label` columns.
# Returns:
#   data.frame restricted to rows whose `label` is one of
#   names(PLASFLOW_NAMES$labels); `tool` and `label` hold the display names
#   looked up in ASM_TOOL_NAMES and PLASFLOW_NAMES$labels.
# Fails with a testit assertion error if `tool` contains a value that is not
# one of names(ASM_TOOL_NAMES).
read_plasflow <- function(fname){
    df <- read.csv(
        fname,
        header=TRUE, sep="\t",
        check.names=FALSE, stringsAsFactors=FALSE
    )

    # assembler ID -> display name (all IDs must be known)
    testit::assert(all(df$tool %in% names(ASM_TOOL_NAMES)))
    df$tool <- ASM_TOOL_NAMES[df$tool]

    # drop rows with labels that have no display name, then translate the rest
    label_map <- PLASFLOW_NAMES$labels
    known_label <- df$label %in% names(label_map)
    df <- df[known_label,]
    df$label <- label_map[df$label]

    df
}

51
52
53
54
55
56
57
58
59
60
61
62
63
64
# Read a tab-separated RGI table and relabel it for display.
#
# Args:
#   fname: path to a tab-separated file with a header row; the code uses
#          its `col` column.
# Returns:
#   data.frame in which headers matching names(ASM_TOOL_NAMES) are replaced
#   by the corresponding display names (all other headers are kept) and the
#   values of `col` are replaced by their display names from RGI_NAMES$col.
# Fails with a testit assertion error if `col` contains a value that is not
# one of names(RGI_NAMES$col).
read_rgi <- function(fname){
    df <- read.csv(
        file=fname,
        sep="\t",
        header=TRUE,
        stringsAsFactors=FALSE,
        check.names=FALSE
    )
    # vectorised header translation: only assembler IDs are replaced
    # (replaces a per-element sapply()/ifelse() with the same result)
    cols <- colnames(df)
    is_tool <- cols %in% names(ASM_TOOL_NAMES)
    cols[is_tool] <- unname(ASM_TOOL_NAMES[cols[is_tool]])
    colnames(df) <- cols
    testit::assert(all(df$col %in% names(RGI_NAMES$col)))
    df$col <- RGI_NAMES$col[df$col]
    return(df)
}

65
##############################
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# PLOTS

# Draw an UpSet plot showing which `label` values are shared between tools.
#
# Args:
#   df:    data.frame with columns `type`, `col`, `label` and one column per
#          display name in ASM_TOOL_NAMES.
#   ctype: value of `type` to keep.
#   col:   value of `col` to keep.
# For every tool the set consists of the labels of the kept rows whose value
# in that tool's column is > 0.
plot_rgi_overlap <- function(df, ctype, col){
    # rows of the requested type / column
    keep <- df$type == ctype & df$col == col
    hits <- df[keep,]

    # one label set per tool, named by the tool's display name
    tool_sets <- lapply(ASM_TOOL_NAMES, function(tool){
        hits[hits[,tool] > 0,"label"]
    })
    names(tool_sets) <- ASM_TOOL_NAMES[names(tool_sets)]

    # per-set metadata used to color the matrix rows by tool
    tool_meta <- data.frame(
        sets=names(tool_sets),
        Tool=names(tool_sets),
        stringsAsFactors=FALSE
    )
    row_colors <- list(type="matrix_rows", column="Tool", colors=ASM_TOOL_COLORS, alpha=0.7)

    UpSetR::upset(
        data=UpSetR::fromList(tool_sets),
        # overlap order
        order.by="degree",
        decreasing=FALSE,
        # y-label title
        mainbar.y.label=sprintf("Intersection size (%s hits, %s)", ctype, col),
        # text size (same factor for all six text elements)
        text.scale=rep(1.5, 6),
        # colors
        set.metadata=list(data=tool_meta, plots=list(row_colors))
    )
}

##############################
# CONST

###############
97
# Assemblers
98
99
100

# names: assembler ID (as found in the input tables) -> display name
# NOTE: the element order is used for column ordering and color assignment
ASM_TOOL_NAMES <- c(
    "flye"="Flye",
    "megahit"="MEGAHIT",
    "metaspadeshybrid"="metaSPAdes (H)",
    "metaspades"="metaSPAdes",
    "operams"="OPERA-MS"
)
107
108
109
# colors: one ggsci "nejm" palette color per assembler, named by display name
ASM_TOOL_COLORS <- stats::setNames(
    ggsci::pal_nejm("default", alpha=1)(length(ASM_TOOL_NAMES)),
    ASM_TOOL_NAMES
)
110

111
###############
112
# Gene tools
113
114
115
116
117
118
# GENE_TOOL_NAMES <- c(
#     "prodigal_partial"="Prodigal (partial)",
#     "prodigal_total"="Prodigal (total)",
#     "cdhit_unique"="CD-HIT (unique)",
#     "cdhit_total"="CD-HIT (total)"
# )
119

120
###############
121
122
123
124
# CRISPR tools: tool ID -> display name
CRISPR_TOOL_NAMES <- c(
    "minced"="MinCED",
    "casc"="CasC"
)

127
###############
128
# PlasFlow
129
130

# names
131
132
133
134
135
136
# Display names used for PlasFlow results:
#   statstype: statistic ID -> axis/legend text
#   labels:    classification label -> display name
PLASFLOW_NAMES <- list(
    statstype=c(
        count="Sequence count",
        sum="Cumulative sequence length [bp]",
        count_pct="Sequence count [%]",
        sum_pct="Cumulative sequence length [%]"
    ),
    labels=c(
        chromosome="Chromosome",
        plasmid="Plasmid",
        unclassified="Unclassified"
    )
)
144
# colors
145
# One color per PlasFlow label: entries 2-4 of a 4-color ggsci "nejm"
# palette, named by the label display names
PLASFLOW_COLORS <- list(
    labels=ggsci::pal_nejm("default", alpha=1)(4)[c(2,3,4)]
)
names(PLASFLOW_COLORS$labels) <- PLASFLOW_NAMES$labels
149

150
###############
151
152
# RGI
# col: column ID (as found in the `col` column of the RGI table) -> display name
RGI_NAMES <- list(
    col=c(
        "Best_Hit_ARO"="ARO term",
        "ARO"="ARO",
        "Drug Class"="Drug class",
        "Resistance Mechanism"="Resistance mechanism",
        "AMR Gene Family"="Gene family"
    )
)

162
###############
163
164
165
166
167
168
169
170
# QUAST
# row names of the QUAST table to keep, in display order
QUAST_VARS <- c(
    "# contigs",
    "Largest contig",
    "Total length",
    "N50",
    "L50",
    "# N's per 100 kbp"
)