Commit b62202cf authored by Valentina Galata's avatar Valentina Galata
Browse files

cleanup: rm old notes

parent b7671b52
#!/bin/bash -l
#SBATCH -J ONT_TMP
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 10
#SBATCH --time=0-00:30:00
#SBATCH -p bigmem
#SBATCH --qos=qos-bigmem

# Run Kraken2 on the metaSPAdes assembly of the Zymo mock community.

# input/output paths
idir="/scratch/users/sbusi/ONT/mock/reference_genomes"
odir="/scratch/users/vgalata/Zymo/mash_screen_test"
mkdir -p "${odir}"  # make sure the output dir exists before linking into it

# assembly: symlink the assembly FASTA into the output dir
asm="${odir}/metaspades.fna"
ln -sf "$(realpath "${idir}/metaspades.fna")" "${asm}"

# kraken2 analysis
# conda: Kraken version 2.0.9-beta
conda activate /home/users/vgalata/miniconda3/envs/ONT_pilot/pipeline/012cd3de

# kraken2
# NOTE(review): 10 cores are requested via "#SBATCH -c 10" but only 5 threads
# are used here -- confirm whether --threads should match -c.
kdb="/scratch/users/bkunath/Kraken2/maxikraken2_1903_140GB/"
kraken2 --db "${kdb}" --threads 5 --use-names --report "${odir}/kraken2_metaspades.report.tsv" --output "${odir}/kraken2_metaspades.labels.tsv" "${asm}"
#!/usr/bin/env bash
# Mash sketch/screen of a Flye assembly against the Zymo reference genomes.

# input/output paths
idir="/scratch/users/sbusi/ONT/mock/reference_genomes"
odir="/scratch/users/vgalata/Zymo/mash_screen_test"
mkdir -p "${odir}"

# reference: concatenate eukaryotic and prokaryotic reference genomes
ref="${odir}/ref.fna"
cat "${idir}/eukaryotes.fna" "${idir}/prokaryotes.fna" > "${ref}"

# assembly: symlink the assembly FASTA into the output dir
asm="${odir}/flye.fna"
ln -sf "$(realpath "${idir}/flye.fna")" "${asm}"

# mash analysis
# conda: Mash version 2.2.2
conda activate /home/users/vgalata/miniconda3/envs/ONT_pilot/pipeline/d8ed275a

# mash sketch (-i: sketch each sequence individually; -S: fixed seed for reproducibility)
mash sketch -p 3 -k 31 -s 1000 -S 42 -i -o "${ref}" "${ref}"
mash sketch -p 3 -k 31 -s 1000 -S 42 -i -o "${asm}" "${asm}"

# mash screen in both directions (-i -1: no identity cutoff, report everything)
mash screen -p 3 -i -1 "${ref}.msh" "${asm}" > "${odir}/mash_screen_ref_flye.tsv"
mash screen -p 3 -i -1 "${asm}.msh" "${ref}" > "${odir}/mash_screen_flye_ref.tsv"

# top-10 summary: split the "shared/total" hash column on "/" and sort numerically
cut -f2,5 "${odir}/mash_screen_flye_ref.tsv" | sed "s@/@\t@" | sort -n -k1,1 | head -n 10 > "${odir}/mash_screen_flye_ref.top10.txt"
#!/usr/bin/Rscript
## NOTE
# Plot metaG/metaT per-base coverage of one contig together with its genes,
# highlighting genes unique to this assembly (inputs are produced by the
# companion shell script that filters bedtools-genomecov output per contig).
## ARGS
# 1: metaG coverage table (contig, start, end, coverage; tab-separated)
# 2: metaT coverage table (same format)
# 3: "unique" genes table (gene, start, end, strand, info)
# 4: all genes table (same format)
# 5: plot title
# 6: output PDF path
ARGS <- commandArgs(trailingOnly=TRUE)
COV_G <- ARGS[1]
COV_T <- ARGS[2]
UNIQ <- ARGS[3]
GENES <- ARGS[4]
TITLE <- ARGS[5]
PDF <- ARGS[6]
# expected column names of the input tables (headerless TSVs)
cols_cov <- c("contig", "start", "end", "coverage")
cols_gen <- c("gene", "start", "end", "strand", "info")
## IMPORT
suppressMessages(library(ggplot2)) # plotting
suppressMessages(library(reshape2)) # reshaping dataframes
suppressMessages(library(scales)) # plot scales
suppressMessages(library(viridis)) # color palette
## DATA
# Coverage
df_cov_G <- read.csv(file=COV_G, sep="\t", header=FALSE, stringsAsFactors=FALSE, col.names=cols_cov)
df_cov_T <- read.csv(file=COV_T, sep="\t", header=FALSE, stringsAsFactors=FALSE, col.names=cols_cov)
# Genes
df_uniq <- read.csv(file=UNIQ, sep="\t", header=FALSE, stringsAsFactors=FALSE, col.names=cols_gen)
df_genes <- read.csv(file=GENES, sep="\t", header=FALSE, stringsAsFactors=FALSE, col.names=cols_gen)
# flag genes that appear in the "unique" set
df_genes$uniq <- df_genes$gene %in% df_uniq$gene
# Build a coverage plot: the coverage track as black bars, gene positions as
# shaded rectangles coloured by the "uniq" flag, a red line at 10x coverage,
# and a log10-scaled y axis. Returns the ggplot object.
cov_plot <- function(df_c, df_g){
  y_top <- max(df_c$coverage)
  gene_layer <- geom_rect(data=df_g, aes(xmin=start, xmax=end, ymin=0, ymax=y_top, fill=uniq, colour=uniq), alpha=0.3)
  cov_layer <- geom_rect(aes(xmin=start, xmax=end, ymin=0, ymax=coverage), fill="black", colour=NA, alpha=1)
  plt <- ggplot(data=df_c) +
    gene_layer +
    cov_layer +
    geom_hline(yintercept=10, colour="red") +
    scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x))) +
    labs(title=TITLE, x="base", y="coverage") +
    theme_bw()
  return(plt)
}
# build one plot per coverage track (metaG, then metaT) and write both
# as separate pages of a single wide PDF
pp_G <- cov_plot(df_cov_G, df_genes)
pp_T <- cov_plot(df_cov_T, df_genes)
pdf(PDF, width=60, height=4)
print(pp_G)
print(pp_T)
dev.off()
#!/bin/bash -l
#SBATCH -J ONT_GDB
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 15
#SBATCH --time=0-01:00:00
#SBATCH -p batch
#SBATCH --qos=qos-batch

# NOTE: script to compute and plot metaT and metaG coverage for a specific assembly
# and genes not found in another assembly
# NOTE: this is for testing purposes only !!!

# Looking for conda env.s
# grep -r --include="*.yaml" "samtools" ~/miniconda3/envs/ONT_pilot/pipeline/
# Contig cov. and length:
# sort -n -k2,2 ../results/mapping/metat/${rtype2}/${tool2}/ASSEMBLY.FILTERED.sr.cov.ave.txt | tail -n 10
# sort -n -k2,2 ../results/mapping/metat/${rtype2}/${tool2}/ASSEMBLY.FILTERED.sr.cov.ave.txt | tail -n 10 | cut -f1 -d" " | while read f; do grep "${f}" ../results/mapping/metat/${rtype2}/${tool2}/ASSEMBLY.FILTERED.sr.idxstats.txt; done

# TEST: lr_flye, contig_446, sr_metaspades
# TEST: lr_flye_sr, contig_446, sr_metaspades
cd /mnt/lscratch/users/vgalata/ont_pilot/cdhit_cov_example || exit 1

# parameters
rtype1="sr"; tool1="metaspades"
rtype2="lr"; tool2="flye_sr"
contig="contig_446"
threads=15

# inputs
sr1="../results/preproc/metag/sr/R1.fastp.fastq.gz"
sr2="../results/preproc/metag/sr/R2.fastp.fastq.gz"
asm="../results/assembly/${rtype2}/${tool2}/ASSEMBLY.FILTERED.fasta"
idx="../results/assembly/${rtype2}/${tool2}/ASSEMBLY.FILTERED"
faa="../results/annotation/prodigal/${rtype2}/${tool2}/proteins.faa"
uniq="../results/analysis/cdhit/${rtype1}_${tool1}__${rtype2}_${tool2}.faa"
# outputs
bam_metaG="${rtype2}_${tool2}.metaG.bam"
bam_metaT="../results/mapping/metat/${rtype2}/${tool2}/ASSEMBLY.FILTERED.sr.bam"
cov_metaG="${rtype2}_${tool2}.metaG.cov"
cov_metaT="${rtype2}_${tool2}.metaT.cov"
ids_all="${rtype2}_${tool2}_${contig}.ids"
ids_uniq="${rtype2}_${tool2}_${contig}.uniq.ids"
pdf="${rtype2}_${tool2}_${contig}.pdf"

# BAMs
# metaG, SR: as in the rules
# metaT, SR: already exists
conda activate /home/users/vgalata/miniconda3/envs/ONT_pilot/pipeline/7070f59f
bwa mem -t "${threads}" "${idx}" "${sr1}" "${sr2}" | samtools view -@ "${threads}" -SbT "${asm}" | samtools sort -@ "${threads}" -m 4G -T "${bam_metaG%.*}" > "${bam_metaG}"
conda deactivate

# genome coverage in required format
# https://bedtools.readthedocs.io/en/latest/content/tools/genomecov.html
conda activate /scratch/users/vgalata/miniconda3/ONT_pilot/pipeline/121c497d
bedtools genomecov -bg -ibam "${bam_metaG}" > "${cov_metaG}"
bedtools genomecov -bg -ibam "${bam_metaT}" > "${cov_metaT}"
conda deactivate

# keep only the contig of interest; pass the contig ID via "awk -v" instead of
# splicing the shell variable into the awk program (quoting/injection-safe)
awk -F"\t" -v c="${contig}" '$1==c {print $0}' "${cov_metaG}" > "${cov_metaG}.${contig}"
awk -F"\t" -v c="${contig}" '$1==c {print $0}' "${cov_metaT}" > "${cov_metaT}.${contig}"

# all/uniq genes of the selected contig (strip the FASTA header prefix, turn
# the prodigal " # " separators into tabs)
grep "${contig}_" "${faa}" | sed "s/^>${contig}_//;s/ # /\t/g" > "${ids_all}"
grep "^>${rtype2}__${tool2}__${contig}_" "${uniq}" | sed "s/^>${rtype2}__${tool2}__${contig}_//;s/ # /\t/g" | sort -n -k1,1 > "${ids_uniq}"

# plot
conda activate /home/users/vgalata/miniconda3/envs/ONT_pilot/pipeline/e231f1b8
Rscript /mnt/irisgpfs/users/vgalata/projects/ONT_pilot/notes/cdhit_uniq_cov.R "${cov_metaG}.${contig}" "${cov_metaT}.${contig}" "${ids_uniq}" "${ids_all}" "Proteins in ${rtype2}_${tool2} but not in ${rtype1}_${tool1}, contig ${contig}" "${pdf}"
conda deactivate
\ No newline at end of file
#!/usr/bin/Rscript
# Plot per-assembly-pair statistics of "unique" proteins, read from the TSV
# produced by notes/cdhit_uniq_stats.py.
# print(snakemake)
## LOG FILE
# sink(file=file(snakemake@log$out, open="wt"), type="output")
# sink(file=file(snakemake@log$err, open="wt"), type="message")
## NOTE
# meant to be run from the repository root (relative "notes/" paths below)
## IMPORT
suppressMessages(library(testit)) # assertions
# suppressMessages(library(tools))
# suppressMessages(library(gtools))
# suppressMessages(library(rjson))
# suppressMessages(library(rmarkdown)) #Rmarkdown
# suppressMessages(library(knitr))
# suppressMessages(library(DT))
# suppressMessages(require(grid))
suppressMessages(library(ggplot2)) # plotting
suppressMessages(library(reshape2)) # reshaping dataframes
suppressMessages(library(scales)) # plot scales
suppressMessages(library(pheatmap)) # heatmaps
suppressMessages(require(grid)) # for heatmaps
suppressMessages(library(viridis)) # color palette
# custom
# source("workflow_report/scripts/utils.R")
# names: map internal tool IDs to display names; the order here also fixes
# the plotting order of the factor levels below
ASM_TOOL_NAMES <- c(
"flye"="Flye",
"wtdbg2"="wtdbg2",
"canu"="Canu",
"flye_sr"="Flye + racon(SR)",
"wtdbg2_sr"="wtdbg2 + racon(SR)",
"canu_sr"="Canu + racon(SR)",
"megahit"="MEGAHIT",
"metaspades"="metaSPAdes",
"operams"="OPERA-MS",
"metaspadeshybrid"="metaSPAdes (H)",
"operams_sr"="OPERA-MS + racon(SR)",
"metaspadeshybrid_sr"="metaSPAdes (H) + racon(SR)",
"imp3"="IMP3 (MG/MT)"
)
# colors: NEJM palette, opaque (PALETTE1) vs. slightly transparent (PALETTE2)
# NOTE(review): the PALETTE1/PALETTE2 mix below looks deliberate (e.g. LR-only
# vs. polished variants share a hue) -- confirm before changing
ASM_TOOL_PALETTE1 <- ggsci::pal_nejm("default", alpha=1)(8)
ASM_TOOL_PALETTE2 <- ggsci::pal_nejm("default", alpha=0.8)(8)
ASM_TOOL_COLORS <- c(
"Flye"=ASM_TOOL_PALETTE2[1],
"wtdbg2"=ASM_TOOL_PALETTE2[2],
"Canu"=ASM_TOOL_PALETTE2[3],
"Flye + racon(SR)"=ASM_TOOL_PALETTE1[1],
"wtdbg2 + racon(SR)"=ASM_TOOL_PALETTE1[2],
"Canu + racon(SR)"=ASM_TOOL_PALETTE1[3],
"MEGAHIT"=ASM_TOOL_PALETTE1[4],
"metaSPAdes"=ASM_TOOL_PALETTE1[5],
"OPERA-MS"=ASM_TOOL_PALETTE2[6],
"metaSPAdes (H)"=ASM_TOOL_PALETTE2[7],
"OPERA-MS + racon(SR)"=ASM_TOOL_PALETTE1[6],
"metaSPAdes (H) + racon(SR)"=ASM_TOOL_PALETTE1[7],
"IMP3 (MG/MT)"=ASM_TOOL_PALETTE1[8]
)
# reorder the colour vector to match the display-name order
ASM_TOOL_COLORS <- ASM_TOOL_COLORS[ASM_TOOL_NAMES]
## DATA
df <- read.csv(
  file="notes/cdhit_uniq_stats_GDB.tsv",
  sep="\t",
  header=TRUE,
  check.names=FALSE,
  stringsAsFactors=FALSE
)
# map tool IDs to display names and fix the plotting order via ordered factors
df$tool1 <- ASM_TOOL_NAMES[df$tool1]
df$tool2 <- ASM_TOOL_NAMES[df$tool2]
df$tool1 <- factor(df$tool1, ordered=TRUE, levels=ASM_TOOL_NAMES)
df$tool2 <- factor(df$tool2, ordered=TRUE, levels=ASM_TOOL_NAMES)

## PLOT
# BUGFIX: "cdhit_theme" was used below but is never defined in this script
# (the source() of the shared utils is commented out above), which aborted
# the loop with "object 'cdhit_theme' not found"; fall back to theme_bw()
# whenever it is not available in the session.
plot_theme <- if (exists("cdhit_theme")) cdhit_theme else theme_bw()
pps <- list()
# one bar plot per statistic, facetted by the reference assembly (tool1)
for(pn in c("gene_total_count", "gene_ave_length", "cluster_gene_total_count", "cluster_gene_total_pct", "cluster_gene_ave_count", "cluster_gene_ave_length")){
  pps[[pn]] <-
    ggplot(data=df, aes_string(x="tool2", y=pn, fill="tool2")) +
    geom_col() +
    facet_wrap(vars(tool1), ncol=1) +
    scale_fill_manual(values=ASM_TOOL_COLORS, guide=NULL) +
    labs(
      subtitle=pn,
      x="Assembly 2",
      y="Proteins from Assembly 2 not in Assembly 1"
    ) +
    plot_theme
}

## PDF
pdf("notes/cdhit_uniq_stats_GDB.pdf", width=16, height=16)
for(pp in pps){ print(pp) }
dev.off()
#!/usr/bin/env python
# Example call: python -u notes/cdhit_uniq_stats.py 10 "/scratch/users/vgalata/GDB/results/analysis/cdhit/" | tee notes/cdhit_uniq_stats_GDB.tsv
import os
import re
import sys
import glob
import pandas
from multiprocessing import Pool
def parse_faa(faa_file):
    """Parse a prodigal-style FASTA of "unique" proteins into a DataFrame.

    The file name is expected to look like
    "<rtype1>_<tool1>__<rtype2>_<tool2>.faa" (assembly 1 = the assembly the
    proteins were compared against); each header line looks like
    ">{rtype2}__{tool2}__{contig}_{gene} # {start} # {end} # {strand} # {info}".

    Returns a pandas.DataFrame with one row per protein (columns: rtype1,
    tool1, rtype2, tool2, contig_id, gene_id, start, end, strand, partial,
    length).
    """
    records = []
    # assembly 1 is encoded in the file name, not in the headers
    rtype1 = os.path.basename(faa_file).split("__")[0].split("_")[0]
    tool1 = "_".join(os.path.basename(faa_file).split("__")[0].split("_")[1:])
    with open(faa_file, "r") as ifile:
        for line in ifile:
            if not re.match("^>", line):
                continue  # sequence line, skip
            # BUGFIX: strip the trailing newline BEFORE splitting -- it used
            # to leak into the last parsed field (the info/partial column)
            line = re.sub("^>", "", line.rstrip("\n"))
            sid, sstart, send, sstrand, sinfo = line.split(" # ")
            srtype, stool, sid = sid.split("__")
            scontigid = "_".join(sid.split("_")[:-1])
            sgeneid = sid.split("_")[-1]
            spartial = sinfo.split(";")[1]  # e.g. "partial=00"
            slen = int(send) - int(sstart) + 1  # inclusive coords -> bp length
            records.append({
                "rtype1": rtype1,
                "tool1": tool1,
                "rtype2": srtype,
                "tool2": stool,
                "contig_id": scontigid,
                "gene_id": sgeneid,
                "start": int(sstart),
                "end": int(send),
                "strand": int(sstrand),
                "partial": spartial,
                "length": slen
            })
    return pandas.DataFrame(records)
def find_clusters(df):
    """Mark runs of >= 2 consecutive genes on the same contig as "clusters".

    Expects the single-comparison DataFrame produced by parse_faa(). Adds two
    columns:
      * cluster       -- True for every gene that belongs to a run of
                         consecutive genes on the same contig
      * cluster_label -- label assigned at each run start (stays 0 for rows
                         that merely extend a run)

    Returns the sorted, annotated DataFrame.
    """
    # sanity check: all rows must belong to the same assembly comparison
    assert len(set(df.rtype1)) == 1 and len(set(df.tool1)) == 1 and len(set(df.rtype2)) == 1 and len(set(df.tool2)) == 1
    # BUGFIX: gene_id is a string, so the previous plain sort ordered "10"
    # before "2" and broke the consecutive-gene detection below; sort the
    # gene IDs numerically via a temporary integer column.
    df = df.assign(_gid=df["gene_id"].astype(int))
    df = df.sort_values(by=["contig_id", "_gid"], axis=0, inplace=False).drop(columns="_gid")
    df = df.assign(cluster_label=0)
    df = df.assign(cluster=False)
    last_i = ""
    last_cid = ""
    last_gid = -1
    cluster = 0
    for i in df.index:
        i_cid = df.loc[i, "contig_id"]
        i_gid = int(df.loc[i, "gene_id"])
        # same contig, consecutive genes -> both rows belong to the run
        if last_cid == i_cid and last_gid + 1 == i_gid:
            df.loc[last_i, "cluster"] = True
            df.loc[i, "cluster"] = True
        else:
            # start of a new (potential) run
            cluster += 1
            df.loc[i, "cluster_label"] = cluster
        last_i = i
        last_cid = i_cid
        last_gid = i_gid
    return df
def print_stats(df):
    """Print one TSV row of summary statistics for a cluster-annotated frame.

    Tab-separated columns: rtype1, tool1, rtype2, tool2, total gene count,
    average gene length, cluster count, clustered-gene count, clustered-gene
    percentage, average genes per cluster, average clustered-gene length.
    """
    clustered = df.loc[df.cluster, :]
    # overall stats
    total_genes = df.shape[0]
    ave_len = df["length"].mean()
    # cluster stats (labels/flags as assigned by find_clusters)
    n_clusters = len(set(clustered["cluster_label"]))
    n_clustered = sum(df.cluster)
    ave_genes_per_cluster = clustered.groupby("cluster_label")["gene_id"].agg(["size"]).mean().values[0]
    ave_clustered_len = clustered["length"].mean()
    # single TSV line on stdout (one row per processed file)
    print("{rt1}\t{to1}\t{rt2}\t{to2}\t{gtotal}\t{gavelen:.2f}\t{ctotal}\t{cgtotal}\t{cgpct:.2f}\t{cgavecount:.2f}\t{cgavelen:.2f}".format(
        rt1=df.rtype1[0], to1=df.tool1[0], rt2=df.rtype2[0], to2=df.tool2[0],
        gtotal=total_genes, gavelen=ave_len,
        ctotal=n_clusters,
        cgtotal=n_clustered, cgpct=100 * n_clustered / total_genes,
        cgavecount=ave_genes_per_cluster,
        cgavelen=ave_clustered_len
    ))
    return
def proc_input(faa_file):
    # Worker entry point: parse one FAA file, detect gene clusters and print
    # the single stats row (called from the multiprocessing pool in main).
    df = parse_faa(faa_file)
    df = find_clusters(df)
    print_stats(df)
if __name__ == "__main__":
    # CLI: <number of worker processes> <directory containing *.faa files>
    # files to process (sorted for a stable processing order;
    # NOTE(review): rows are printed by concurrent workers, so the output
    # row order is not guaranteed -- confirm this is acceptable)
    faa_files = sorted(glob.glob(os.path.join(sys.argv[2], "*.faa")))
    # header
    print("\t".join([
        "rtype1", "tool1", "rtype2", "tool2",
        "gene_total_count", "gene_ave_length",
        "cluster_total_count", "cluster_gene_total_count", "cluster_gene_total_pct",
        "cluster_gene_ave_count", "cluster_gene_ave_length"
    ]))
    # stats: one row per input file, computed in parallel
    with Pool(int(sys.argv[1])) as p:
        p.map(proc_input, faa_files)
\ No newline at end of file
# Align proteins from one assembly to the proteins of another assembly
# (reciprocal DIAMOND blastp at >=90% identity and >=90% query/subject cover)
# diamond CMD: http://www.diamondsearch.org/index.php?pages/command_line_options/

# conda
conda activate /home/users/vgalata/miniconda3/envs/ONT_pilot/pipeline/857cb426

# data
prot1="/scratch/users/vgalata/GDB/results/annotation/prodigal/lr/flye/proteins.faa"
prot2="/scratch/users/vgalata/GDB/results/annotation/prodigal/lr/canu/proteins.faa"
db1="./lr_flye_proteins"
db2="./lr_canu_proteins"
daa1="./lr_flye__lr_canu.daa"
daa2="./lr_canu__lr_flye.daa"
tsv1="./lr_flye__lr_canu.tsv"
tsv2="./lr_canu__lr_flye.tsv"
outfmt="6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen"

# DBs (very fast - seconds, will add "*.dmnd" as extension)
diamond makedb --in "${prot1}" --db "${db1}" --threads 5
diamond makedb --in "${prot2}" --db "${db2}" --threads 5

# alignment (fast - minutes): each protein set searched against the other DB
diamond blastp -q "${prot2}" --db "${db1}" --out "${daa1}" --threads 5 --outfmt 100 --id 90 --query-cover 90 --subject-cover 90 --more-sensitive
diamond blastp -q "${prot1}" --db "${db2}" --out "${daa2}" --threads 5 --outfmt 100 --id 90 --query-cover 90 --subject-cover 90 --more-sensitive

# tsv (extension stripped: diamond view appends ".daa" itself)
# NOTE: ${outfmt} must stay UNQUOTED so it splits into separate arguments
diamond view --daa "${daa1%.*}" --max-target-seqs 1 --threads 5 --outfmt ${outfmt} --out "${tsv1}"
diamond view --daa "${daa2%.*}" --max-target-seqs 1 --threads 5 --outfmt ${outfmt} --out "${tsv2}"

# append qcov, scov, query/subject length ratio (sqrt(x^2) == abs(x); awk has no abs())
awk -F"\t" '{qcov=100*sqrt(($8-$7+1)^2)/$13; scov=100*sqrt(($10-$9+1)^2)/$14; qslr=$13/$14; print $0"\t"qcov"\t"scov"\t"qslr}' "${tsv1}" > "${tsv1}.more"
awk -F"\t" '{qcov=100*sqrt(($8-$7+1)^2)/$13; scov=100*sqrt(($10-$9+1)^2)/$14; qslr=$13/$14; print $0"\t"qcov"\t"scov"\t"qslr}' "${tsv2}" > "${tsv2}.more"

# stats
# (cut fields 1,3,15,16,17 = qseqid, pident, qcov, scov, qslr; the last one is
#  cut but unused in the filter)
echo -e "
flye: $(grep -c '^>' "${prot1}") proteins
canu: $(grep -c '^>' "${prot2}") proteins
unique matches in flye: $(cut -f1,3,15,16,17 "${tsv1}.more" | awk -F'\t' '$2>=90.0 && $3>=90.0 && $4>=90 {print $1}' | sort | uniq | wc -l)
unique matches in canu: $(cut -f1,3,15,16,17 "${tsv2}.more" | awk -F'\t' '$2>=90.0 && $3>=90.0 && $4>=90 {print $1}' | sort | uniq | wc -l)
"
\ No newline at end of file
#!/usr/bin/env python
# NOTE: This is a test script to filter DIAMOND results and get subject annotations from the corresponding database FASTA file
# NOTE: requirements: pandas, Biopython (use RGI's conda env.)
# NOTE: Used FASTA has 167,761,270 entries
#
# TEST:
# conda env.: /home/users/vgalata/miniconda3/envs/ONT_pilot/pipeline/33951b18
# input:
# grep "^>" /work/projects/ecosystem_biology/local_tools/databases/uniprot_trembl.fasta > /scratch//users/vgalata/uniprot_trembl.fasta.tsv
# sed -i "s/^>//" /scratch//users/vgalata/uniprot_trembl.fasta.tsv
# sed -i "s/\s\+/\t/" /scratch//users/vgalata/uniprot_trembl.fasta.tsv
# python diamond_annot.py /scratch/users/vgalata/GDB/results/annotation/diamond/sr/megahit/proteins.tsv /scratch//users/vgalata/uniprot_trembl.fasta.tsv /scratch/users/vgalata/GDB/results/annotation/diamond/sr/megahit/proteins.filtered.annot.tsv
# python diamond_annot.py /scratch/users/vgalata/GDB/results/annotation/diamond/lr/flye/proteins.tsv /scratch//users/vgalata/uniprot_trembl.fasta.tsv /scratch/users/vgalata/GDB/results/annotation/diamond/lr/flye/proteins.filtered.annot.tsv
import sys
from datetime import datetime
from utils import parse_diamond_tsv
# from Bio import SeqIO
def mynow():
    """Return the current wall-clock time as an "HH:MM:SS" string."""
    return "{:%H:%M:%S}".format(datetime.now())
if __name__ == "__main__":
    # CLI: <diamond hits TSV> <DB id->description TSV> <output TSV>
    # parse hits (parse_diamond_tsv comes from the project-local utils module;
    # NOTE(review): it apparently adds a "qs_len_ratio" column -- confirm in utils)
    hits = parse_diamond_tsv(sys.argv[1])
    print("{} read in {} hits\n{}\n".format(mynow(), hits.shape[0], hits.head()))
    # filter hits by query/subject length ratio
    # NOTE(review): presumably qs_len_ratio = qlen/slen, i.e. keep queries at
    # most half the subject length -- verify against parse_diamond_tsv
    hits = hits.loc[hits.qs_len_ratio <= 0.5,:]
    print("{} kept in {} hits\n{}\n".format(mynow(), hits.shape[0], hits.head()))
    # add annotation: collect descriptions only for subjects that were hit
    sseqids = set(hits.sseqid)
    sdescr = dict.fromkeys(sseqids, None)
    progress = 0
    with open(sys.argv[2], "r") as ifile:
        for line in ifile:
            line_id, line_descr = line.rstrip().split("\t")
            if line_id in sseqids:
                sdescr[line_id] = line_descr
            progress += 1
            # the DB header table is huge (~167M entries, see header note):
            # report progress periodically
            if progress % 100000 == 0:
                print ("Processed {} DB entries".format(progress), end="\r")
    print("{} collected subject info".format(mynow()))
    hits = hits.assign(sdescr=hits.sseqid.apply(lambda x: sdescr[x]))
    print("{} added subject info\n{}\n".format(mynow(), hits.head()))
    # write to file
    hits.to_csv(sys.argv[3], sep="\t", header=True, index=False, index_label=False)
#!/usr/bin/env python
# python diamond_annot_clusters.py /scratch/users/vgalata/GDB/results/annotation/diamond/sr/megahit/proteins.filtered.annot.tsv
# python diamond_annot_clusters.py /scratch/users/vgalata/GDB/results/annotation/diamond/lr/flye/proteins.filtered.annot.tsv
import sys
import pandas
if __name__ == "__main__":
    # CLI: <filtered+annotated diamond TSV> [<subject-must-match flag>]
    qids = pandas.read_csv(sys.argv[1], sep="\t", header=0, usecols=["qseqid", "sseqid"])
    # NOTE(review): qseqid is sorted lexicographically, so gene "10" sorts
    # before gene "2" on the same contig -- this can hide consecutive pairs
    # in the detection below; confirm whether a numeric sort is wanted.
    qids.sort_values(by=["qseqid"], axis=0, inplace=True)
    qids.set_index(keys="qseqid", drop=False, inplace=True, verify_integrity=True)
    qids["cluster_label"] = 0
    qids["cluster"] = False
    last_qid = ""
    last_cid = ""
    last_sid = -1
    cluster = 0
    # BUGFIX: "bool(sys.argv[2])" was True for ANY non-empty string (even
    # "False"/"0") and raised IndexError when the argument was omitted;
    # parse the flag explicitly and default to False.
    smatters = len(sys.argv) > 2 and sys.argv[2].lower() in ("1", "true", "yes", "y")
    for qid in qids.qseqid:
        # query IDs look like "<contig>_<gene-number>"
        qid_sid = int(qid.split("_")[-1])
        qid_cid = "_".join(qid.split("_")[:-1])
        # same contig, consecutive genes, (same subject or it does not matter)
        if last_cid == qid_cid and last_sid + 1 == qid_sid and (qids.loc[last_qid, "sseqid"] == qids.loc[qid, "sseqid"] or not smatters):
            qids.loc[last_qid, "cluster"] = True
            qids.loc[qid, "cluster"] = True
        else:
            # start of a new (potential) cluster
            cluster += 1
            qids.loc[qid, "cluster_label"] = cluster
        last_qid = qid
        last_cid = qid_cid
        last_sid = qid_sid
    # summary on stdout
    print("{}:\nlooking for consecutive genes (clusters) where {}\n{} total hits\n{} ({:.2f}%) queries are in clusters\n{} clusters w/ average length of {:.2f} genes".format(
        sys.argv[1],
        "subject ID does not matter" if not smatters else "subject ID does matter",
        qids.shape[0],
        qids.cluster.sum(),
        100 * qids.cluster.sum() / qids.shape[0],
        len(set(qids.loc[qids.cluster,"cluster_label"])),
        qids.loc[qids.cluster,:].groupby("cluster_label")["sseqid"].agg(["size"]).mean().values[0]
    ))
\ No newline at end of file
# FASTQ -> FASTA
# (GNU sed: of every 4 FASTQ lines keep line 1 with "@" replaced by ">" and
#  line 2, i.e. header + sequence; "1~4"/"2~4" addressing is GNU-sed only)
zcat /scratch/users/vgalata/GDB/results/preproc/metat/sr/R1.fastp.fastq.gz | sed -n '1~4s/^@/>/p;2~4p' > R1.fastp.fasta
zcat /scratch/users/vgalata/GDB/results/preproc/metat/sr/R2.fastp.fastq.gz | sed -n '1~4s/^@/>/p;2~4p' > R2.fastp.fasta
#
# Rfam covariance models and clan information for infernal's cmscan
cm="/scratch/users/vgalata/GDB/dbs/infernal/Rfam.cm"
clain="/scratch/users/vgalata/GDB/dbs/infernal/Rfam.clanin"
# scan R1 reads against Rfam
# NOTE(review): R2 is converted above but not scanned here -- intentional?
date && cmscan --cpu 5 --noali --anytrunc --nohmmonly --rfam --cut_ga --fmt 2 --oclan --oskip --clanin ${clain} -o R1.fastp.out --tblout R1.fastp.tblout ${cm} R1.fastp.fasta && date
# conda activate mashmap
cores=28
# estimate how many contigs from assembly_2 can be mapped to contigs in assembly_1
# splitting is allowed, report best hits for each query
# one query can have multiple hits
for pident in 80 90; do
  echo -e "pct\tmapped\ttotal\tquery\tref" > "mashmap.asm.cov.id${pident}.tsv"
  find data_GDB/results/assembly/ -type f -name "ASSEMBLY.FILTERED.fasta" | sort | while read -r asm1; do
    find data_GDB/results/assembly/ -type f -name "ASSEMBLY.FILTERED.fasta" | sort | while read -r asm2; do
      if [[ "${asm1}" != "${asm2}" ]]; then
        mashmap -r "${asm1}" -q "${asm2}" -s 1000 --pi "${pident}" -f map -t "${cores}" -o mashmap.12.out &> mashmap.12.log
        # mapped = number of distinct query contigs with at least one mapping
        mapped=$(cut -d" " -f1 mashmap.12.out | sort | uniq | wc -l)
        total=$(grep -c '^>' "${asm2}")
        mapped_pct=$(awk -v m="${mapped}" -v t="${total}" 'BEGIN{printf("%.2f\n",100*m/t)}')
        echo -e "${mapped_pct}\t${mapped}\t${total}\t$(basename "$(dirname "${asm2}")")\t$(basename "$(dirname "${asm1}")")"
      fi
    done
  done >> "mashmap.asm.cov.id${pident}.tsv"
done
# one-to-one mappings between two assemblies
# results are filtered to get best hit per query and reference, splitting is not allowed
for pident in 80 90; do
  echo -e "pct\tmapped\ttotal\tquery\tref" > "mashmap.asm.map.id${pident}.tsv"
  find data_GDB/results/assembly/ -type f -name "ASSEMBLY.FILTERED.fasta" | sort | while read -r asm1; do
    find data_GDB/results/assembly/ -type f -name "ASSEMBLY.FILTERED.fasta" | sort | while read -r asm2; do
      if [[ "${asm1}" != "${asm2}" ]]; then
        # -f one-to-one: best hit per query AND reference; --noSplit: whole-contig mappings only
        mashmap -r "${asm1}" -q "${asm2}" -s 1000 --pi "${pident}" -f one-to-one --noSplit -t "${cores}" -o mashmap.12.out &> mashmap.12.log
        # mapped = number of distinct query contigs with a mapping
        mapped=$(cut -d" " -f1 mashmap.12.out | sort | uniq | wc -l)
        total=$(grep -c '^>' "${asm2}")
        mapped_pct=$(awk -v m="${mapped}" -v t="${total}" 'BEGIN{printf("%.2f\n",100*m/t)}')
        echo -e "${mapped_pct}\t${mapped}\t${total}\t$(basename "$(dirname "${asm2}")")\t$(basename "$(dirname "${asm1}")")"
      fi
    done
  done >> "mashmap.asm.map.id${pident}.tsv"
done
# map long reads to assemblies
# fastq_lr="data_GDB/results/preproc/metag/lr/lr.fastq.gz"
# r_total=$(( $(zcat ${fastq_lr} | wc -l)/4 ))
# for pident in 70 80 90; do
# echo -e "reads_pct\treads_mapped\treads_total\tcontigs_pct\tcontigs_mapped\tcontigs_total\tref