... | ... | @@ -451,11 +451,11 @@ for file in *.txt; do echo $file; cat $file; done >> counts |
|
|
# Join every two consecutive lines of counts into one tab-separated row.
# counts is read through a redirect rather than piped from cat (no extra process).
paste - - < counts > partial_gene_counts.txt
|
|
|
|
|
|
# editing fasta headers (adding a per-assembly prefix) so sequences from each assembly are easier to tell apart after cd-hit clustering
|
|
|
si # interactive
|
|
|
conda activate bbmap
|
|
|
# Rewrite the headers of each protein FASTA into a new file, one run per
# assembly; prefix= is set to the assembly name (spades / hybrid).
# NOTE(review): presumably the prefix ends up in every renamed header so the
# source assembly can still be identified after clustering — confirm against
# the BBMap rename.sh usage text.
# NOTE(review): ignorejunk=t is assumed to be needed because the input is
# amino-acid rather than nucleotide sequence — TODO confirm.
rename.sh in=metaspades.faa out=spades.faa prefix=spades ignorejunk=t
|
|
|
rename.sh in=metaspades_hybrid.faa out=hybrid.faa prefix=hybrid ignorejunk=t
|
|
|
|
|
|
si # interactive
|
|
|
conda activate cd-hit
|
|
|
# Compare the two renamed protein sets in both directions with cd-hit-2d:
# the first run passes spades.faa as -i and hybrid.faa as -i2 (output
# spades_hybrid); the second run swaps the inputs (output hybrid_spades).
# Flags, per the cd-hit user guide (NOTE(review): confirm against the
# installed cd-hit version):
#   -c 0.9    sequence identity threshold
#   -n 5      word length
#   -d 0      name sequences in the .clstr file by the FASTA header up to the first whitespace
#   -M 16000  memory limit in MB
#   -T 8      number of threads
cd-hit-2d -i spades.faa -i2 hybrid.faa -o spades_hybrid -c 0.9 -n 5 -d 0 -M 16000 -T 8
|
|
|
cd-hit-2d -i hybrid.faa -i2 spades.faa -o hybrid_spades -c 0.9 -n 5 -d 0 -M 16000 -T 8
|
... | ... | @@ -463,10 +463,17 @@ cd-hit-2d -i hybrid.faa -i2 spades.faa -o hybrid_spades -c 0.9 -n 5 -d 0 -M 1600 |
|
|
# determining number of unique sequences
|
|
|
# according to http://weizhongli-lab.org/lab-wiki/doku.php?id=cd-hit-user-guide#cd-hit-2d
|
|
|
# CD-HIT-2D outputs two files: a fasta file of proteins in "db2" that are not similar to db1 and a text file that lists similar sequences between db1 & db2
|
|
|
# non-mod-basecalled
|
|
|
# Count FASTA records in each cd-hit-2d output. The pattern is anchored to the
# start of the line ('^>') so only header lines are counted, not any line that
# merely contains a '>' character.
grep -c '^>' spades_hybrid # db2 == hybrid
|
|
|
# 63911 == no.unique in hybrid
|
|
|
grep -c '^>' hybrid_spades # db2 == spades
|
|
|
# 27526 == no.unique in spades
|
|
|
|
|
|
# "methylation-aware"_basecalled
|
|
|
# Count FASTA records in each cd-hit-2d output. The pattern is anchored to the
# start of the line ('^>') so only header lines are counted, not any line that
# merely contains a '>' character.
grep -c '^>' spades_hybrid # db2 == hybrid
|
|
|
# 63911
|
|
|
# 61828 == no.unique in hybrid
|
|
|
grep -c '^>' hybrid_spades # db2 == spades
|
|
|
# 27526
|
|
|
# 30807 == no.unique in spades
|
|
|
|
|
|
# using auxiliary scripts to merge the cluster (output) files
|
|
|
# making plots for all .clstr files (http://weizhongli-lab.org/lab-wiki/doku.php?id=cd-hit-user-guide#cd-hit-2d)
|
... | ... | |