From 5098527dd138d2e7020c3d6a58d6ceaed2b67ce5 Mon Sep 17 00:00:00 2001 From: Valentina Galata Date: Fri, 12 Feb 2021 10:59:05 +0100 Subject: [PATCH 1/2] rna filtering: added bbmap/bbduk option (issue #12) [ci skip] --- config/config.imp.yaml | 3 +- schemas/schema.imp.yaml | 10 +- workflow/envs/IMP_preprocessing.yaml | 2 +- .../rules/Preprocessing/rna.filtering.smk | 105 +++++++++++++----- workflow/rules/ini/databases.smk | 18 ++- 5 files changed, 94 insertions(+), 44 deletions(-) diff --git a/config/config.imp.yaml b/config/config.imp.yaml index 8e06f21..d09c0cf 100755 --- a/config/config.imp.yaml +++ b/config/config.imp.yaml @@ -39,7 +39,8 @@ trimmomatic: nextseq: false filtering: filter: hg38 -sortmerna: +rnafiltering: + tool: "sortmerna" # sortmerna (default), bbduk (from bbmap tools) files: - rfam-5.8s-database-id98 - silva-arc-16s-id95 diff --git a/schemas/schema.imp.yaml b/schemas/schema.imp.yaml index a9447f8..1a32074 100644 --- a/schemas/schema.imp.yaml +++ b/schemas/schema.imp.yaml @@ -112,11 +112,17 @@ properties: filter: type: string pattern: "^[^_.]+$" - sortmerna: + rnafiltering: type: object properties: - files: + files: type: array + tool: + type: string + enum: + - "sortmerna" + - "bbduk" + default: "sortmerna" assembly: type: object properties: diff --git a/workflow/envs/IMP_preprocessing.yaml b/workflow/envs/IMP_preprocessing.yaml index 81e65d0..8a3fab6 100644 --- a/workflow/envs/IMP_preprocessing.yaml +++ b/workflow/envs/IMP_preprocessing.yaml @@ -43,6 +43,7 @@ dependencies: - readline=8.0=hf8c457e_0 - setuptools=42.0.1=py37_0 - sortmerna=2.1b=he860b03_4 + - bbmap=38.90=h1296035_0 - sqlite=3.30.1=hcee41ef_0 - tk=8.6.10=hed695b0_0 - trimmomatic=0.32=0 @@ -66,4 +67,3 @@ dependencies: - xz=5.2.4=h14c3975_1001 - zlib=1.2.11=h516909a_1006 - zstd=1.4.3=h3b9ef0a_0 - diff --git a/workflow/rules/Preprocessing/rna.filtering.smk b/workflow/rules/Preprocessing/rna.filtering.smk index 1116c58..1d60d28 100755 --- a/workflow/rules/Preprocessing/rna.filtering.smk +++ b/workflow/rules/Preprocessing/rna.filtering.smk @@ -1,6 +1,6 @@ -FASTAFILES = expand("{path}/{files}.fasta", files=config["sortmerna"]["files"], path=DBPATH + "/sortmerna") +FASTAFILES = expand("{path}/{files}.fasta", files=config["rnafiltering"]["files"], path=DBPATH + "/sortmerna") -FASTAINDEXED = expand("{path}/idx/{files}", files=config["sortmerna"]["files"], path=DBPATH + "/sortmerna") +FASTAINDEXED = expand("{path}/idx/{files}", files=config["rnafiltering"]["files"], path=DBPATH + "/sortmerna") REF = ':'.join('%s,%s' % (a, b) for a, b in zip(FASTAFILES, FASTAINDEXED)) @@ -18,31 +18,76 @@ FILTER_RNA_SHELL = """ rm $TMP_R12* """ -rule filter_rna: - input: - 'Preprocessing/mt.r1.trimmed.fq', - 'Preprocessing/mt.r2.trimmed.fq', - 'Preprocessing/mt.se.trimmed.fq', - expand( - "{db}/sortmerna/idx/{files}.{ext}", - files=config["sortmerna"]["files"], - ext=['bursttrie_0.dat', 'kmer_0.dat', 'pos_0.dat', 'stats'], - db=DBPATH) - output: - 'Preprocessing/mt.r1.trimmed.rna_filtered.fq', - 'Preprocessing/mt.r2.trimmed.rna_filtered.fq', - 'Preprocessing/mt.se.trimmed.rna_filtered.fq', - 'Preprocessing/mt.r1.trimmed.rna.fq', - 'Preprocessing/mt.r2.trimmed.rna.fq', - 'Preprocessing/mt.se.trimmed.rna.fq' - threads: 8 - params: - ref = REF, - sortmernamem = str(SORTME_MEM), - runtime = "48:00:00", - mem = MEMCORE - conda: ENVDIR + "/IMP_preprocessing.yaml" - log: "logs/preprocessing_filter_rna.log" - message: "filter_rna: filtering rRNA from mt reads." - shell: - FILTER_RNA_SHELL +if config["rnafiltering"]["tool"] == "sortmerna": + rule filter_rna_sortmerna: + input: + 'Preprocessing/mt.r1.trimmed.fq', + 'Preprocessing/mt.r2.trimmed.fq', + 'Preprocessing/mt.se.trimmed.fq', + expand( + "{db}/sortmerna/idx/{files}.{ext}", + files=config["rnafiltering"]["files"], + ext=['bursttrie_0.dat', 'kmer_0.dat', 'pos_0.dat', 'stats'], + db=DBPATH) + output: + 'Preprocessing/mt.r1.trimmed.rna_filtered.fq', + 'Preprocessing/mt.r2.trimmed.rna_filtered.fq', + 'Preprocessing/mt.se.trimmed.rna_filtered.fq', + 'Preprocessing/mt.r1.trimmed.rna.fq', + 'Preprocessing/mt.r2.trimmed.rna.fq', + 'Preprocessing/mt.se.trimmed.rna.fq' + threads: 8 + params: + ref = REF, + sortmernamem = str(SORTME_MEM), + runtime = "48:00:00", + mem = MEMCORE + conda: + os.path.join(ENVDIR, "IMP_preprocessing.yaml") + log: + "logs/preprocessing_filter_rna.log" + message: + "filter_rna: sortmerna: filtering rRNA from mt reads." + shell: + FILTER_RNA_SHELL +elif config["rnafiltering"]["tool"] == "bbduk": + rule filter_rna_bbduk: + input: + r1='Preprocessing/mt.r1.trimmed.fq', + r2='Preprocessing/mt.r2.trimmed.fq', + se='Preprocessing/mt.se.trimmed.fq', + refs=expand(os.path.join(DBPATH, "sortmerna/{files}.fasta"), files=config["rnafiltering"]["files"]) + output: + # reads unmapped to references + r1u='Preprocessing/mt.r1.trimmed.rna_filtered.fq', + r2u='Preprocessing/mt.r2.trimmed.rna_filtered.fq', + seu='Preprocessing/mt.se.trimmed.rna_filtered.fq', + # reads mapped to references + r1m='Preprocessing/mt.r1.trimmed.rna.fq', + r2m='Preprocessing/mt.r2.trimmed.rna.fq', + sem='Preprocessing/mt.se.trimmed.rna.fq', + # stats + stats_pe='Preprocessing/mt.pe.trimmed.rna_filtered.stats.txt', + stats_se='Preprocessing/mt.se.trimmed.rna_filtered.stats.txt' + threads: 8 + params: + # sortmernamem = str(SORTME_MEM), # TODO + runtime = "24:00:00", + mem = MEMCORE, + params="k=27 qin=auto qout=auto rcomp=t ziplevel=9 ordered=t" + conda: + os.path.join(ENVDIR, "IMP_preprocessing.yaml") + log: + "logs/preprocessing_filter_rna.log" + message: + "filter_rna: bbmap/bbduk: filtering rRNA from mt reads." + shell: + # bbmap repo: https://github.com/BioInfoTools/BBMap/ + # bbduk guide: https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/bbduk-guide/ + """ + ( + refs=$(echo \"{input.refs}\" | tr ' ' ',') && + bbduk.sh in={input.r1} in2={input.r2} out={output.r1u} out2={output.r2u} outm={output.r1m} outm2={output.r2m} ref=${{refs}} refstats={output.stats_pe} threads={threads} {params.params} && + bbduk.sh in={input.se} out={output.seu} outm={output.sem} ref=${{refs}} refstats={output.stats_se} threads={threads} {params.params} + ) &> {log} + """ diff --git a/workflow/rules/ini/databases.smk b/workflow/rules/ini/databases.smk index 15d21be..7c218c3 100755 --- a/workflow/rules/ini/databases.smk +++ b/workflow/rules/ini/databases.smk @@ -12,7 +12,7 @@ rule trimmomatic_adapters: rule sortmerna_databases: output: - expand("sortmerna/{files}.fasta", files=config["sortmerna"]["files"]) + expand(os.path.join(DBPATH, "sortmerna/{files}.fasta"), files=config["rnafiltering"]["files"]) shell: """ TMPD=$(mktemp -d -t --tmpdir={tmp} "XXXXXX") @@ -22,23 +22,21 @@ rule sortmerna_databases: mv $TMPD/rRNA_databases/*.fasta sortmerna/. rm -rf $TMPD """.format( - pkg_url=config["sortmerna"]["pkg_url"], + pkg_url=config["rnafiltering"]["pkg_url"], tmp=TMPDIR ) - rule sortmerna_databases_index: input: - expand("sortmerna/{files}.fasta", files=config["sortmerna"]["files"]) + expand(os.path.join(DBPATH, "sortmerna/{files}.fasta"), files=config["rnafiltering"]["files"]) output: expand( - "sortmerna/idx/{files}.{ext}", - files=config["sortmerna"]["files"], - ext=['bursttrie_0.dat', 'kmer_0.dat', 'pos_0.dat', 'stats']) + os.path.join(DBPATH, "sortmerna/idx/{files}.{ext}"), + files=config["rnafiltering"]["files"], + ext=['bursttrie_0.dat', 'kmer_0.dat', 'pos_0.dat', 'stats'] + ) run: - fastaindexed = expand( - "sortmerna/idx/{files}", - files=config["sortmerna"]["files"]) + fastaindexed = [os.path.splitext(f)[0] for f in input] ref = ':'.join('%s,%s' % (a, b) for a, b in zip(input, fastaindexed)) shell("mkdir -p sortmerna") shell("indexdb_rna --ref {ref}") -- GitLab From 99ce25b0680d18253b971fe08a633b3615cdb602 Mon Sep 17 00:00:00 2001 From: Valentina Galata Date: Thu, 4 Mar 2021 09:37:58 +0100 Subject: [PATCH 2/2] rna filtering: changed bbmap/bbduk params (issue #12) [ci skip] --- .../rules/Preprocessing/rna.filtering.smk | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/workflow/rules/Preprocessing/rna.filtering.smk b/workflow/rules/Preprocessing/rna.filtering.smk index 1d60d28..1befabb 100755 --- a/workflow/rules/Preprocessing/rna.filtering.smk +++ b/workflow/rules/Preprocessing/rna.filtering.smk @@ -51,6 +51,8 @@ if config["rnafiltering"]["tool"] == "sortmerna": shell: FILTER_RNA_SHELL elif config["rnafiltering"]["tool"] == "bbduk": + # bbmap repo: https://github.com/BioInfoTools/BBMap/ + # bbduk manual: https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/bbduk-guide/ rule filter_rna_bbduk: input: r1='Preprocessing/mt.r1.trimmed.fq', @@ -67,14 +69,18 @@ elif config["rnafiltering"]["tool"] == "bbduk": r2m='Preprocessing/mt.r2.trimmed.rna.fq', sem='Preprocessing/mt.se.trimmed.rna.fq', # stats - stats_pe='Preprocessing/mt.pe.trimmed.rna_filtered.stats.txt', - stats_se='Preprocessing/mt.se.trimmed.rna_filtered.stats.txt' + stats_pe='Preprocessing/mt.pe.trimmed.refstats.txt', + stats_se='Preprocessing/mt.se.trimmed.refstats.txt' threads: 8 params: - # sortmernamem = str(SORTME_MEM), # TODO - runtime = "24:00:00", + runtime = "12:00:00", mem = MEMCORE, - params="k=27 qin=auto qout=auto rcomp=t ziplevel=9 ordered=t" + # from manual: ca. 85% of the machine’s physical memory + mem2="-Xmx%dg" % round(8 * int(MEMCORE[:-1]) * 0.85), + # filtering (trade-off between sens and spec, affects memory/runtime) + filt="k=27 qhdist=1 mkh=1", # (issue #12) + # incl. rev. complement, quality = auto, do not zip, keep read order + misc="rcomp=t qin=auto qout=auto ziplevel=1 ordered=t" conda: os.path.join(ENVDIR, "IMP_preprocessing.yaml") log: @@ -82,12 +88,10 @@ elif config["rnafiltering"]["tool"] == "bbduk": message: "filter_rna: bbmap/bbduk: filtering rRNA from mt reads." shell: - # bbmap repo: https://github.com/BioInfoTools/BBMap/ - # bbduk guide: https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/bbduk-guide/ """ ( refs=$(echo \"{input.refs}\" | tr ' ' ',') && - bbduk.sh in={input.r1} in2={input.r2} out={output.r1u} out2={output.r2u} outm={output.r1m} outm2={output.r2m} ref=${{refs}} refstats={output.stats_pe} threads={threads} {params.params} && - bbduk.sh in={input.se} out={output.seu} outm={output.sem} ref=${{refs}} refstats={output.stats_se} threads={threads} {params.params} + bbduk.sh in={input.r1} in2={input.r2} out={output.r1u} out2={output.r2u} outm={output.r1m} outm2={output.r2m} ref=${{refs}} refstats={output.stats_pe} threads={threads} {params.mem2} {params.filt} {params.misc} && + bbduk.sh in={input.se} out={output.seu} outm={output.sem} ref=${{refs}} refstats={output.stats_se} threads={threads} {params.mem2} {params.filt} {params.misc} ) &> {log} """ -- GitLab