Commit 71334906 authored by AHB's avatar AHB
Browse files

added marker gene scripts

parent 60b37a69
# this script is used in the MuSt project to find the taxonomic annotation and mOTU of genes (of one COG) found in a BLAST search
#inputs: as argument in the call - COG, the COG ID of the genes of interest
# uses - the mapping files "mOTU.v1.padded.motu.linkage.map", "mOTU.v1.map.txt" and "mOTU-LG.v1.annotations.txt" of the mOTU database and
# - a prepared file with the best hit output of BLASTN "<COG>.bestHits.tsv"
#output: a table with the query gene and the taxonomy of the best hit (written to a file named "<COG>.bestHitPhylogeny.tsv")
#
#written by Anna Heintz-Buschart (March 2015)
args<-commandArgs(TRUE)
COG <- args[1]
phyl <- read.delim("../../mOTU.v1.padded.motu.linkage.map",stringsAsFactors=F)
map <- read.delim("../../mOTU.v1.map.txt",stringsAsFactors=F)
mapPhyl <- read.delim("../../mOTU-LG.v1.annotations.txt",stringsAsFactors=F)
bh <- read.delim(paste(COG,".bestHits.tsv",sep=""),header=F,comment.char="#",stringsAsFactors=F)
colnames(bh) <- c("queryID", "subjectID", "percIdentity", "alignmentLength", "mismatches", "gapOpens", "q.start", "q.end", "s.start", "s.end", "evalue", "bitScore")
bh$subjectID <- sapply(bh$subjectID,function(x) paste(unlist(strsplit(x,split=".",fixed=T))[-c(length(unlist(strsplit(x,split=".",fixed=T)))-1:0)],sep=".",collapse="."))
bh <- merge(bh,map,by.x=2,by.y=1,all.x=T)
bh <- merge(bh,phyl[,c(1,11)],by.x="mOTU",by.y=1,all.x=T)
bh <- merge(bh,mapPhyl,by.x=ncol(bh),by.y=1,all.x=T)
write.table(bh,paste(COG,".bestHitPhylogeny.tsv",sep=""),quote=F,sep="\t",row.names=F)
# this script is used in the MuSt project to find the names of the genes (of one COG) in the mOTU padded database which represent the mOTUs present in a sample
#inputs: as argument in the call - COG, the COG ID of the genes of interest
# uses - the mapping files "mOTU.v1.padded.motu.linkage.map" and "mOTU.v1.map.txt" of the mOTU database and
# - a prepared file with the names of the present mOTUs "mOTUs.present.all"
#output: a list of genenames (written to a file named with the COG ID followed by ".present.mOTU.genenames.txt")
#
#written by Anna Heintz-Buschart (February 2015)
args<-commandArgs(TRUE)
COG <- args[1]
phyl <- read.delim("../../mOTU.v1.padded.motu.linkage.map",stringsAsFactors=F)
map <- read.delim("../../mOTU.v1.map.txt",stringsAsFactors=F)
pres <- read.delim("mOTUs.present.all",header=F,stringsAsFactors=F)
phyl2 <- phyl[ phyl$mOTU.species.annotation %in% pres$V1,]
mp <- merge(map,phyl2,by.x="mOTU",by.y=1)
coord <- read.delim(paste("../../",COG,"/",COG,".mOTU.v1.padded.coord",sep=""),header=F,stringsAsFactors=F)
co <- gsub(" ",".",coord$V1)
coord <- read.delim(paste("../../",COG,"/",COG,".mOTU.v1.padded.coord",sep=""),header=F,stringsAsFactors=F,sep=" ")
coord$name <- co
mp2 <- merge(mp[mp$MarkerGene ==COG,],coord[,c(1,4)],by.x="SequenceID",by.y=1)
write.table(mp2$name,paste(COG,".present.mOTU.genenames.txt",sep=""),col.names=F,quote=F,sep="\t",row.names=F)
# this script is used in the MuSt project to find the taxonomic annotation and mOTU of genes (of one COG) found in a BLAST search
#inputs: as argument in the call - COG, the COG ID of the genes of interest
# uses - the mapping files "mOTU.v1.padded.motu.linkage.map", "mOTU.v1.map.txt" and "mOTU-LG.v1.annotations.txt" of the mOTU database and
# - a prepared file with the best hit output of BLASTN "<COG>.bestPresentHits.tsv"
#output: a table with the query gene and the taxonomy of the best hit (written to a file named "<COG>.bestPresentHitPhylogeny.tsv")
#
#written by Anna Heintz-Buschart (March 2015)
args<-commandArgs(TRUE)
COG <- args[1]
phyl <- read.delim("../../mOTU.v1.padded.motu.linkage.map",stringsAsFactors=F)
map <- read.delim("../../mOTU.v1.map.txt",stringsAsFactors=F)
mapPhyl <- read.delim("../../mOTU-LG.v1.annotations.txt",stringsAsFactors=F)
bh <- read.delim(paste(COG,".bestPresentHits.tsv",sep=""),header=F,comment.char="#",stringsAsFactors=F)
colnames(bh) <- c("queryID", "subjectID", "percIdentity", "alignmentLength", "mismatches", "gapOpens", "q.start", "q.end", "s.start", "s.end", "evalue", "bitScore")
bh$subjectID <- sapply(bh$subjectID,function(x) paste(unlist(strsplit(x,split=".",fixed=T))[-c(length(unlist(strsplit(x,split=".",fixed=T)))-1:0)],sep=".",collapse="."))
bh <- merge(bh,map,by.x=2,by.y=1,all.x=T)
bh <- merge(bh,phyl[,c(1,11)],by.x="mOTU",by.y=1,all.x=T)
bh <- merge(bh,mapPhyl,by.x=ncol(bh),by.y=1,all.x=T)
write.table(bh,paste(COG,".bestPresentHitPhylogeny.tsv",sep=""),quote=F,sep="\t",row.names=F)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment