Changes

Susheel Busi · e7b156d6
--- a/ONT_pilot_w_GDB_samples.md
+++ b/ONT_pilot_w_GDB_samples.md
@@ -451,11 +451,11 @@ for file in *.txt; do echo $file; cat $file; done >> counts
 cat counts | paste - - > partial_gene_counts.txt

 # editing fasta headers (prefixing each with its source assembly) to make sequences easier to tell apart after cd-hit clustering
+si    # interactive
 conda activate bbmap
 rename.sh in=metaspades.faa out=spades.faa prefix=spades ignorejunk=t
 rename.sh in=metaspades_hybrid.faa out=hybrid.faa prefix=hybrid ignorejunk=t

-si    # interactive
 conda activate cd-hit
 cd-hit-2d -i spades.faa -i2 hybrid.faa -o spades_hybrid -c 0.9 -n 5 -d 0 -M 16000 -T 8
 cd-hit-2d -i hybrid.faa -i2 spades.faa -o hybrid_spades -c 0.9 -n 5 -d 0 -M 16000 -T 8
@@ -463,10 +463,17 @@ cd-hit-2d -i hybrid.faa -i2 spades.faa -o hybrid_spades -c 0.9 -n 5 -d 0 -M 1600
 # determining number of unique sequences
 # according to http://weizhongli-lab.org/lab-wiki/doku.php?id=cd-hit-user-guide#cd-hit-2d
 # CD-HIT-2D outputs two files: a fasta file of proteins in "db2" that are not similar to db1 and a text file that lists similar sequences between db1 & db2
+# non-mod-basecalled
+grep -c '>' spades_hybrid   # db2 == hybrid
+# 63911 == no.unique in hybrid
+grep -c '>' hybrid_spades   # db2 == spades
+# 27526 == no.unique in spades
+
+# "methylation-aware"_basecalled
 grep -c '>' spades_hybrid   # db2 == hybrid
-# 63911
+# 61828 == no.unique in hybrid
 grep -c '>' hybrid_spades   # db2 == spades
-# 27526
+# 30807 == no.unique in spades

 # using auxiliary scripts to merge the cluster (output) files
 # making plots for all .clstr files (http://weizhongli-lab.org/lab-wiki/doku.php?id=cd-hit-user-guide#cd-hit-2d)