Commit f81585c9 authored by AHB's avatar AHB
Browse files

more scripts

parent 324068b3
#this script gets all genes of related genomes, along with their expression levels on the different omic levels and the best annotation
#### reads are summed up by functions and DESeq-normalized
#### and creates a number of workspace with the data
# the script uses allClusterInfo.RDS, which contains statistics on all bins of all samples
# the function getExprData() defined in lines 23-68 does the interfacing with MongoDB
# the loop in line 76-109 does the gathering by same taxonomic annotation
# written by Anna Heintz-Buschart, September 2015 - this version is from January 2016, shortened to the part that was actually used in the MuSt
clusterInfo <- readRDS("allClusterInfo.RDS")
library(rmongodb)
mongo <- mongo.create()
db <- "mydb"
coll <- "mydb.must"
if(mongo.is.connected(mongo)) print(paste("found connection to mongod -",mongo.count(mongo,coll),"documents in collection"))
library(DESeq2)
#####
getExprData <- function(sampleID,clusOI){
#matches for pipelines:
matchFun <- mongo.bson.from.JSON(paste('{"$match": {"genes.bestAnnotation": {"$exists": "true"}} }',sep=""))
matchCluster <- mongo.bson.from.JSON(paste('{"$match": {"cluster": "',clusOI,'"} }',sep=""))
matchSample <- mongo.bson.from.JSON(paste('{"$match": {"sample": "',sampleID,'"} }',sep=""))
matchProt <- mongo.bson.from.JSON(paste('{"$match": {"genes.proteinIdentification": "uniquely"} }',sep=""))
#unwinds for pipelines:
unwindGenes <- mongo.bson.from.JSON('{"$unwind": "$genes"}')
unwindProteins <- mongo.bson.from.JSON('{"$unwind": "$genes.proteinIdentification"}')
#grouping for pipelines:
groupGeneA <- mongo.bson.from.JSON('{"$group": {"_id": "$_id", "gene": {"$push": "$genes.gene"},
"aveCovDNA":{"$push":"$genes.aveCovDNA"},"aveCovRNA":{"$push":"$genes.aveCovRNAfw"},
"readsRNA":{"$push":"$genes.readsRNAfw"},"fun":{"$push":"$genes.bestAnnotation"}}}')
groupGeneProt <- mongo.bson.from.JSON('{"$group": {"_id": "$_id", "gene": {"$push": "$genes.gene"},
"proteinIdentification":{"$push":"$genes.proteinIdentification"},
"proteinArea":{"$push":"$genes.proteinArea"}}}')
#projections for pipelines:
projectGeneA <- mongo.bson.from.JSON('{"$project": {"_id": 0, "fun":1, "gene":1, "aveCovDNA":1,"aveCovRNA":1,"readsRNA":1}}')
projectGeneProt <- mongo.bson.from.JSON('{"$project": {"_id": 0, "gene":1,"proteinIdentification":1,"proteinArea":1}}')
if(!mongo.is.connected(mongo)) {
stop("Mongo is not connected.")
}else{
#aggregation pipelines:
genesA <- mongo.bson.to.list(mongo.aggregation(mongo,coll,list(matchCluster,matchSample,unwindGenes,matchFun,groupGeneA,
projectGeneA)))$result
if(length(genesA)==0){
warning("No genes found")
} else {
tFeat <- do.call(rbind,lapply(genesA,function(x) cbind(x$gene,x$fun,x$aveCovDNA,x$aveCovRNA,x$readsRNA)))
tFeat[sapply(tFeat[,2],length)>1,2] <- sapply(tFeat[sapply(tFeat[,2],length)>1,2],function(x) paste(x,sep=";",collapse=";"))
tFeat <- data.frame("gene"=unlist(tFeat[,1]),"bestAnno"=unlist(tFeat[,2]),"aveCovDNA"=as.numeric(unlist(tFeat[,3])),
"aveCovRNA"=as.numeric(unlist(tFeat[,4])),"readsRNA"=as.numeric(unlist(tFeat[,5])),stringsAsFactors=F)
genesP <- mongo.bson.to.list(mongo.aggregation(mongo,coll,list(matchCluster,matchSample,unwindGenes,matchFun,matchProt,groupGeneProt,
projectGeneProt)))$result
pFeat <- do.call(rbind,lapply(genesP,function(x) cbind(x$gene, x$proteinArea)))
pFeat[sapply(pFeat[,2],length)>1,2] <- sapply(pFeat[sapply(pFeat[,2],length)>1,2], mean)
pFeat <- data.frame("gene"=unlist(pFeat[,1]),"proteinArea"=as.numeric(unlist(pFeat[,2])),stringsAsFactors=F)
aFeat <- merge(tFeat,pFeat,by=1,all.x=T)
aFeat[is.na(aFeat)] <- 0
if(!"proteinArea" %in% colnames(aFeat)) aFeat$proteinArea <- 0
return(aFeat)
}
}
}
motuDup <- names(table(sapply(grep(";",clusterInfo$motuPresent,invert=T,value=T),
function(x)unlist(strsplit(x,"\\("))[1]))
[table(sapply(grep(";",clusterInfo$motuPresent,invert=T,value=T),
function(x)unlist(strsplit(x,"\\("))[1]))>1])
for(currMotu in motuDup){
clOI <- clusterInfo[grepl(paste(currMotu,"\\(",sep=""),clusterInfo$motuPresent)&!grepl(";",clusterInfo$motuPresent),c("cluster","sample")]
first <- T
for(i in 1:nrow(clOI)){
sI <- clOI$sample[i]
clI <- clOI$cluster[i]
if(!is.na(sI)&!is.na(clI)){
cwd <- paste(sI,clI,sep="_")
if(first){
exprall <- data.frame("cluster"=cwd,getExprData(sI,clI),stringsAsFactors=F)
first <- F
}else{
texp <- data.frame("cluster"=cwd,getExprData(sI,clI),stringsAsFactors=F)
exprall <- rbind(exprall,texp)
}
}
}
pres <- tapply(exprall$gene,list(exprall$bestAnno,exprall$cluster),function(x)as.numeric(length(x)>0))
pres[is.na(pres)] <-0
if(ncol(pres)>1&nrow(pres)>1){
readR <- tapply(exprall$readsRNA,list(exprall$bestAnno,exprall$cluster),sum)
readR[is.na(readR)] <-0
gps <- c(grep("P",colnames(pres)),grep("G",colnames(pres)))
if(length(gps)>2){
readRA <- readR[,gps]
readRA <- readRA[apply(dcov[,gps],1,function(x)all(x>0)),]
readDDS <- DESeqDataSetFromMatrix(readRA,data.frame("sample"=colnames(readRA)),~sample)
ts2 <- estimateSizeFactors(readDDS)
readRN <- counts(ts2,normalized=T)
save.image(paste(currMotu,"_WS.Rdata",sep=""))
}
}
}
mongo.destroy(mongo)
#this script gets all genes with a function of interest as best annotation
#### and creates a number of plots which show from which genomes this function is expressed in the different samples and on the different omic levels
#### it also returns a workspace with the data
# it takes 3 ARGUMENTs when called: the function of interest, which mOTU annotation (best hit out of the mOTUs found at reads level ("mOTUpresent") or
#### of all ("mOTUbest")) to use and whether to order the plots by whether the donors of the samples have T1DM ("T1DM") or belong to a group defined in another file ("BG")
# the name of the function of interest is used to create a directory which houses all the output plots
# the script is constructed in two parts: the part that accesses the database and the part that makes the plots. The plotting needs a lot of additional informations
### and also the script 140510_heatmap2.R. The database access is found in lines 14-15, and 34-76. The rest is plotting.
# written by Anna Heintz-Buschart, this version is from October 2015
args<-commandArgs(TRUE)
funOI <- args[1]
####
motuCh <- args[2]
ord <- args[3]
source("140510_heatmap2.R")
clusterInfo <- readRDS("allClusterInfo.RDS")
motu <- readRDS("motuAnnotation.RDS")
bigGroup <- readRDS("bigGroup.RDS")
mapStats <- readRDS("mappedReads.RDS")
protStats <- read.delim("allStats.tsv",stringsAsFactors=F,row.names=1)
colnames(protStats) <- gsub(".V","-V",colnames(protStats),fixed=T)
miMetaA <- readRDS("miMetaCombi.RDS")
library(gplots)
library(RColorBrewer)
library(vegan)
########
library(rmongodb)
getExprTab <- function(funOI){
if(mongo.is.connected(mongo)) {
unwindGenes <- mongo.bson.from.JSON('{"$unwind": "$genes"}')
matchAnno <- mongo.bson.from.JSON(paste('{"$match": {"genes.bestAnnotation": "',funOI,'"} }',sep=""))
matchProt <- mongo.bson.from.JSON(paste('{"$match": {"genes.proteinIdentification": "uniquely"} }',sep=""))
groupDNARNAname <- mongo.bson.from.JSON('{"$group": {"_id": "$_id", "aveCovDNA": {"$push": "$genes.aveCovDNA"},"sample":{"$push":"$sample"},"gene":{"$push":"$genes.gene"},"cluster":{"$push": "$cluster"},"aveCovRNA":{"$push": "$genes.aveCovRNAfw"}}}')
projectDNARNAname <- mongo.bson.from.JSON('{"$project": {"_id": 0, "gene":1,"aveCovDNA": 1, "cluster" :1,"aveCovRNA":1,"sample":1}}')
groupProtname <- mongo.bson.from.JSON('{"$group": {"_id": "$_id", "gene": {"$push": "$genes.gene"},"sample":{"$push":"$sample"},
"proteinIdentification":{"$push":"$genes.proteinIdentification"},
"proteinArea":{"$push":"$genes.proteinArea"}}}')
projectProtname <- mongo.bson.from.JSON('{"$project": {"_id": 0, "gene":1,"proteinIdentification":1,"proteinArea":1,"sample":1}}')
genesA <- mongo.bson.to.list(mongo.aggregation(mongo,coll,list(matchAnno,unwindGenes,matchAnno,groupDNARNAname,projectDNARNAname)))$result
genesP <- mongo.bson.to.list(mongo.aggregation(mongo,coll,list(matchAnno,unwindGenes,matchAnno,matchProt,groupProtname,projectProtname)))$result
if(length(genesA)==0){
warning("No genes found")
return(NULL)
}
res <- do.call(rbind,lapply(genesA,function(x) cbind(x$sample,x$gene,x$cluster,x$aveCovDNA,x$aveCovRNA)))
res <- data.frame("sample"=res[,1],"gene"=res[,2],"cluster"=res[,3],"aveCovDNA"=as.numeric(res[,4]),"aveCovRNA"=as.numeric(res[,5]),
stringsAsFactors=F)
if(length(genesP)>0){
pFeat <- do.call(rbind,lapply(genesP,function(x) cbind(x$sample,x$gene, x$proteinArea)))
pFeat[sapply(pFeat[,3],length)>1,3] <- sapply(pFeat[sapply(pFeat[,3],length)>1,3], mean)
pFeat <- data.frame("sample"=unlist(pFeat[,1]),"gene"=unlist(pFeat[,2]),"proteinArea"=as.numeric(unlist(pFeat[,3])),stringsAsFactors=F)
aFeat <- merge(res,pFeat,by=c(1,2),all.x=T)
}else{
aFeat <- res
aFeat$proteinArea <- 0
}
aFeat[is.na(aFeat)] <- 0
return(aFeat)
} else {
stop("Mongo is not connected.")
}
}
mongo <- mongo.create()
db <- "mydb"
coll <- "mydb.must"
exTab <- getExprTab(funOI)
mongo.destroy(mongo)
#######
mapFac <- mapStats[,2:3]
rownames(mapFac) <- mapStats[,1]
mapFac$DNAreadsOnContigs <- 1/(mapStats[,2]/mean(mapStats[,2]))
mapFac$RNAreadsFwOnGenes <- 1/(mapStats[,3]/mean(mapStats[,3]))
miMeta <- miMetaA[-10,]
McolA <- c(brewer.pal(7,"Blues")[3:7],brewer.pal(7,"Oranges")[3:7],brewer.pal(6,"Purples")[3:6],brewer.pal(8,"Greens")[3:8]) #all 4 families
Mcol <- McolA[-c(5,11)]
visFacA <- c(rep(1:4,each=3),rep(5,times=2),rep(6:9,each=3),rep(10:11,each=2),rep(12:13,each=3),rep(14:15,each=2),rep(16:20,each=3)) #all 4 families
visFac <- c(rep(1:3,each=3),rep(4,times=2),rep(5,times=3),rep(6,times=2),rep(7:8,each=3),rep(9,times=2),10:13,rep(14,times=2),15,rep(16,times=2),17,rep(18,times=2))
rownames(miMeta) <- gsub("-0","-",gsub("M-","M",rownames(miMeta)))
indiv <- c(paste("M01.",c(1:4),sep=""),paste("M02.",c(1:5),sep=""),paste("M03.",c(3:5),sep=""),
paste("M04.",c(1:6),sep=""))
hmgrey <- colorRampPalette(brewer.pal(9,"Greys"),bias=2.5)(256)
DNAhm <- colorRampPalette(c("white",rgb(0,204,204,maxColorValue=255),rgb(0,102,102,maxColorValue=255)))(256)
RNAhm <- colorRampPalette(c("white",rgb(204,0,204,maxColorValue=255),rgb(102,0,102,maxColorValue=255)))(256)
Prothm <- colorRampPalette(c("white",rgb(204,204,0,maxColorValue=255),rgb(102,102,0,maxColorValue=255)))(256)
rathm <- colorRampPalette(c(rgb(0,102,102,maxColorValue=255),rgb(0,204,204,maxColorValue=255),"white",rgb(204,0,204,maxColorValue=255),rgb(102,0,102,maxColorValue=255)))(256)
plotExprClus <- function(exprTab,cI=clusterInfo,oOI=funOI,motuChoice=motuCh,retVal="plot"){
require(RColorBrewer)
for(lib in sort(unique(exprTab$sample))){
popRNACov <- aggregate(exprTab$aveCovRNA[exprTab$sample==lib],list(exprTab$cluster[exprTab$sample==lib]),sum)
colnames(popRNACov) <- c("cluster","cumCovRNA")
popGenes <- aggregate(exprTab$aveCovRNA[exprTab$sample==lib],list(exprTab$cluster[exprTab$sample==lib]),length)
colnames(popGenes) <- c("cluster","geneNo")
pops <- merge(popRNACov,cI[cI$sample==lib,c("cluster","aveCov","uniqueEss",motuChoice)],by=1,all.x=T)
pops$motuUnanimous <- ifelse(grepl(";",pops[[motuChoice]])|pops[[motuChoice]]=="","uncertain",
sapply(pops[[motuChoice]],function(x) unlist(strsplit(x,split="\\("))[1]))
#pops$motuUnanimous[pops$motuUnanimous==""] <- ""
if(any(!pops$cluster %in% c("N","S"))){
exTab <- pops[!pops$cluster %in% c("N","S"),]
exTab <- merge(exTab,popGenes,by="cluster")
exTab <- exTab[order(exTab$cumCovRNA,decreasing=T),]
exTab$cluster <- paste(exTab$cluster," (",exTab$geneNo,ifelse(exTab$geneNo==1," gene)"," genes)"),"\n",ifelse(exTab$motuUnanimous %in% c("uncertain",""),exTab$motuUnanimous,gsub("SpeciesCluster of ","",sapply(exTab$motuUnanimous,function(x)motu$SpeciesCluster[motu$ID==x]))),sep="")
exTab <- exTab[!is.na(exTab$cumCovRNA),]
if(any(pops$cluster %in% c("N","S"))){
exTab <- rbind(exTab[,1:4],c(paste(sum(popGenes$geneNo[popGenes$cluster %in% c("N","S")]),
ifelse(sum(popGenes$geneNo[popGenes$cluster %in% c("N","S")])==1,"other gene","other genes")),
sum(popRNACov$cumCovRNA[popRNACov$cluster %in% c("N","S")]),1))
}
}else{
exTab <- data.frame("cluster"=paste(sum(popGenes$geneNo[popGenes$cluster %in% c("N","S")]),
ifelse(sum(popGenes$geneNo[popGenes$cluster %in% c("N","S")])==1,"other gene","other genes")),
"cumCovRNA"=sum(popRNACov$cumCovRNA[popRNACov$cluster %in% c("N","S")]),"aveCov"=NA,"uniqueEss"=0)
}
if("plot" %in% retVal){
maxy <- 1.1*max(as.numeric(c(exTab$cumCovRNA)))
par(mar=c(3,10,0.5,0.5),mgp=c(1.9,0.6,0))
barplot(as.numeric(exTab$cumCovRNA),names.arg=exTab$cluster,las=2,xlim=c(0,maxy),horiz=T,cex.names=0.6,
xlab=paste("metaT coverage depth of",oOI),
col=colorRampPalette(brewer.pal(11,"Spectral"))(109)[109:1][as.numeric(exTab$uniqueEss)+1],
cex.axis=0.8)
mtext(paste("populations in sample",lib),2,8.7,cex=1.2)
if(any(popGenes$cluster %in% c("N","S"))){
popCount <- nrow(exTab)-1
geneCount <- popGenes$geneNo[popGenes$cluster %in% c("N","S")]
barplot(cbind(matrix(0,nrow=sum(geneCount),ncol=popCount),
sort(exprTab$aveCovRNA[exprTab$sample==lib&exprTab$cluster %in% c("N","S")],decreasing=T)),
names.arg=rep("",popCount+1),axes=F,add=T,horiz=T)
}
if(any(!grepl("other",exTab$cluster))){
maxx <- 1.1*max(as.numeric(exTab$aveCov[!grepl("other",exTab$cluster)]))
maxy <- 1.1*max(as.numeric(exTab$cumCovRNA[!grepl("other",exTab$cluster)]))
par(mar=c(3,3,1.1,0.5))
plot(as.numeric(exTab$aveCov[!grepl("other",exTab$cluster)]),as.numeric(exTab$cumCovRNA[!grepl("other",exTab$cluster)]),
las=1,xlim=c(0,maxx),ylim=c(0,maxy),xlab="metaG coverage depth of cluster",ylab=paste("metaT coverage depth of",oOI),
col=colorRampPalette(brewer.pal(11,"Spectral"))(109)[109:1][as.numeric(exTab$uniqueEss[!grepl("other",exTab$cluster)])+1],
cex.axis=0.8,cex=sqrt(as.numeric(exTab$geneNo[!grepl("other",exTab$cluster)])),pch=16)
mtext(lib,3,0,cex=1.2)
text(as.numeric(exTab$aveCov[!grepl("other",exTab$cluster)]),as.numeric(exTab$cumCovRNA[!grepl("other",exTab$cluster)]),
labels=exTab$cluster[!grepl("other",exTab$cluster)],adj=c(-0.2,-0.2),cex=0.8,
col=colorRampPalette(brewer.pal(11,"Spectral"))(109)[109:1][as.numeric(exTab$uniqueEss[!grepl("other",exTab$cluster)])+1])
}
}
}
if("max" %in% retVal) retList <- exprTab$cluster[which.max(exprTab$aveCovRNA)]
if("exprTabMotu" %in% retVal) retList <- merge(exprTab,cI[,c("sample","cluster",motuChoice)],by=c("sample","cluster"))
return(retList)
}
outDir <- paste("./",funOI,sep="")
dir.create(outDir)
pdf(paste(outDir,"/metaTcovAllClusters.pdf",sep=""),width=3.5,height=3.5,pointsize=8)
resL <- plotExprClus(exTab,retVal=c("exprTabMotu","plot"))
dev.off()
if(!is.null(resL)){
resL$motuUnanimous <- ifelse(grepl(";",resL$motuPresent)|resL$motuPresent == "","uncertain",
sapply(resL$motuPresent,function(x) unlist(strsplit(x,split="\\("))[1]))
resL$motuUnanimous[is.na(resL$motuUnanimous)] <- "uncertain"
resLclus <- tapply(resL$cluster,list(resL$motuUnanimous,resL$sample),function(x) length(unique(x)))
resLclus[is.na(resLclus)] <- 0
resLgene <- tapply(resL$aveCovDNA,list(resL$motuUnanimous,resL$sample),length)
resLgene[is.na(resLgene)] <- 0
mapFac <- mapFac[rownames(mapFac) %in% unique(resL$sample),]
visFac <- visFac[sort(unique(clusterInfo$sample)) %in% unique(resL$sample)]
resLDNA <- tapply(resL$aveCovDNA,list(resL$motuUnanimous,resL$sample),sum)
resLDNA[is.na(resLDNA)] <- 0
resLDNA <- t(apply(resLDNA,1,function(x) x*mapFac$DNAreadsOnContigs))
resLRNA <- tapply(resL$aveCovRNA,list(resL$motuUnanimous,resL$sample),sum)
resLRNA[is.na(resLRNA)] <- 0
resLRNA <- t(apply(resLRNA,1,function(x) x*mapFac$RNAreadsFwOnGenes))
resLProtein <- tapply(resL$protein,list(resL$motuUnanimous,resL$sample),sum)
resLProtein[is.na(resLProtein)] <- 0
protFac <- as.vector(t(protStats[rownames(protStats)=="totalArea",colnames(protStats) %in% resL$sample]))
resLProtein <- t(apply(resLProtein,1,function(x) x/protFac))
if(nrow(resLgene)>1){
resLgene <- resLgene[order(rowSums(resLRNA),decreasing=T),]
resLclus <- resLclus[order(rowSums(resLRNA),decreasing=T),]
resLDNA <- resLDNA[order(rowSums(resLRNA),decreasing=T),]
resLProtein <- resLProtein[order(rowSums(resLRNA),decreasing=T),]
resLRNA <- resLRNA[order(rowSums(resLRNA),decreasing=T),]
trsl <- sapply(rownames(resLgene),function(x) if(x=="uncertain") x else(motu$SpeciesCluster[motu$ID == x]))
trsl <- gsub("SpeciesCluster of ","",trsl)
pdf(paste(outDir,"/heatmaps_all.pdf",sep=""),height=4.9,width=7.8,pointsize=8)
if(any(resLgene>0)){
if(ord=="T1DM"){
ordervec <- c(which(miMeta$DIABETESTY1[visFac]=="Yes"),which(miMeta$DIABETESTY1[visFac]!="Yes"))
}else if(ord=="BG"){
ordervec <- c(which(bigGroup[visFac]==1),which(bigGroup[visFac]==2))
}else{
ordervec <- c(1:ncol(resLgene))
}
heatmap.2a(resLgene[,ordervec],keyName="number of genes",
trace="n",Colv="none",col=hmgrey,Rowv="none",density.info="n",labRow=trsl,dendrogram="none",margins=c(4,25),
ColSideColors=rbind(Mcol[visFac],c("black","white")[1+as.numeric(miMeta$DIABETESTY1[visFac]=="No")])[,ordervec])
}
if(any(resLclus>0)){
heatmap.2a(resLclus[,ordervec],keyName="number of clusters",
trace="n",Colv="none",col=hmgrey,Rowv="none",density.info="n",labRow=trsl,dendrogram="none",margins=c(4,25),
ColSideColors=rbind(Mcol[visFac],c("black","white")[1+as.numeric(miMeta$DIABETESTY1[visFac]=="No")])[,ordervec])
}
if(any(resLDNA>0)){
heatmap.2a(resLDNA[,ordervec],keyName="cluster cov metaG",
trace="n",Colv="none",col=DNAhm,Rowv="none",density.info="n",labRow=trsl,dendrogram="none",margins=c(4,25),
ColSideColors=rbind(Mcol[visFac],c("black","white")[1+as.numeric(miMeta$DIABETESTY1[visFac]=="No")])[,ordervec])
}
if(any(resLRNA>0)){
heatmap.2a(resLRNA[,ordervec],keyName="gene cov metaT",
trace="n",Colv="none",col=RNAhm,Rowv="none",density.info="n",labRow=trsl,dendrogram="none",margins=c(4,25),
ColSideColors=rbind(Mcol[visFac],c("black","white")[1+as.numeric(miMeta$DIABETESTY1[visFac]=="No")])[,ordervec])
}
if(any(resLRNA>0)&any(resLDNA>0)){
resRat <- log10(resLRNA[,ordervec]+0.001)-log10(resLDNA[,ordervec]+0.001)
heatmap.2a(resRat[,ordervec],keyName="log10 metaT/metaG",symbreaks=T,
trace="n",Colv="none",col=rathm,Rowv="none",density.info="n",labRow=trsl,dendrogram="none",margins=c(4,25),
ColSideColors=rbind(Mcol[visFac],c("black","white")[1+as.numeric(miMeta$DIABETESTY1[visFac]=="No")])[,ordervec])
}
if(any(resLProtein>0)){
heatmap.2a(resLProtein[,ordervec],keyName="protein rel quant",
trace="n",Colv="none",col=Prothm,Rowv="none",density.info="n",labRow=trsl,dendrogram="none",margins=c(4,25),
ColSideColors=rbind(Mcol[visFac],c("black","white")[1+as.numeric(miMeta$DIABETESTY1[visFac]=="No")])[,ordervec])
}
dev.off()
top <- apply(resLRNA,1,function(x) max(x/colSums(resLRNA),na.rm=T))>0.1
pdf(paste(outDir,"/heatmaps_10perc.pdf",sep=""),height=4.9,width=7.8,pointsize=8)
if(any(resLgene[top,]>0)){
heatmap.2a(resLgene[top,ordervec],keyName="number of genes",
trace="n",Colv="none",col=hmgrey,Rowv="none",density.info="n",labRow=trsl[top],dendrogram="none",margins=c(4,25),
ColSideColors=rbind(Mcol[visFac],c("black","white")[1+as.numeric(miMeta$DIABETESTY1[visFac]=="No")])[,ordervec])
}
if(any(resLclus[top,]>0)){
heatmap.2a(resLclus[top,ordervec],keyName="number of clusters",
trace="n",Colv="none",col=hmgrey,Rowv="none",density.info="n",labRow=trsl[top],dendrogram="none",margins=c(4,25),
ColSideColors=rbind(Mcol[visFac],c("black","white")[1+as.numeric(miMeta$DIABETESTY1[visFac]=="No")])[,ordervec])
}
if(any(resLDNA[top,]>0)){
heatmap.2a(resLDNA[top,ordervec],keyName="cluster cov metaG",
trace="n",Colv="none",col=DNAhm,Rowv="none",density.info="n",labRow=trsl[top],dendrogram="none",margins=c(4,25),
ColSideColors=rbind(Mcol[visFac],c("black","white")[1+as.numeric(miMeta$DIABETESTY1[visFac]=="No")])[,ordervec])
}
if(any(resLRNA[top,]>0)){
heatmap.2a(resLRNA[top,ordervec],keyName="gene cov metaT",
trace="n",Colv="none",col=RNAhm,Rowv="none",density.info="n",labRow=trsl[top],dendrogram="none",margins=c(4,25),
ColSideColors=rbind(Mcol[visFac],c("black","white")[1+as.numeric(miMeta$DIABETESTY1[visFac]=="No")])[,ordervec])
}
if(any(resLRNA[top,]>0)&any(resLDNA[top,]>0)){
resRat <- log10(resLRNA[top,ordervec]+0.001)-log10(resLDNA[top,ordervec]+0.001)
heatmap.2a(resRat[,ordervec],keyName="log10 metaT/metaG",symbreaks=T,
trace="n",Colv="none",col=rathm,Rowv="none",density.info="n",labRow=trsl,dendrogram="none",margins=c(4,25),
ColSideColors=rbind(Mcol[visFac],c("black","white")[1+as.numeric(miMeta$DIABETESTY1[visFac]=="No")])[,ordervec])
}
if(any(resLProtein[top,]>0)){
heatmap.2a(resLProtein[top,ordervec],keyName="protein rel quant",
trace="n",Colv="none",col=Prothm,Rowv="none",density.info="n",labRow=trsl[top],dendrogram="none",margins=c(4,25),
ColSideColors=rbind(Mcol[visFac],c("black","white")[1+as.numeric(miMeta$DIABETESTY1[visFac]=="No")])[,ordervec])
}
dev.off()
save.image(paste(outDir,"/WS.Rdata",sep=""))
}
}
sample MG-RAST ID project name bps sequences
M01.1-V1 4628901.3 MUSTcombined M01.1V1combinedGenes 306874789 1032885
M01.1-V2 4628902.3 MUSTcombined M01.1V2combinedGenes 316237110 1097696
M01.1-V3 4669474.3 MUSTcombined M1.1V3.combinedAssemblyGenes 328899111 1166358
M01.2-V1 4628903.3 MUSTcombined M01.2V1combinedGenes 277654346 887632
M01.2-V2 4628904.3 MUSTcombined M01.2V2combinedGenes 304799713 963652
M01.2-V3 4628905.3 MUSTcombined M01.2V3combinedGenes 290139955 918590
M01.3-V1 4669475.3 MUSTcombined M1.3V1.combinedAssemblyGenes 218317013 703792
M01.3-V2 4628906.3 MUSTcombined M01.3V2combinedGenes 281448774 857559
M01.3-V3 4628907.3 MUSTcombined M01.3V3combinedGenes 193485507 587840
M01.4-V2 4628909.3 MUSTcombined M01.4V2combinedGenes 327075140 1134311
M01.4-V3 4628910.3 MUSTcombined M01.4V3combinedGenes 198630820 730455
M02.1-V1 4628911.3 MUSTcombined M02.1V1combinedGenes 235838262 768902
M02.1-V2 4628912.3 MUSTcombined M02.1V2combinedGenes 251632226 809249
M02.1-V3 4628913.3 MUSTcombined M02.1V3combinedGenes 227740492 729383
M02.2-V2 4628914.3 MUSTcombined M02.2V2combinedGenes 248165421 741909
M02.2-V3 4628915.3 MUSTcombined M02.2V3combinedGenes 232574763 734061
M02.3-V1 4628916.3 MUSTcombined M02.3V1combinedGenes 221482682 772724
M02.3-V2 4628917.3 MUSTcombined M02.3V2combinedGenes 253875019 813578
M02.3-V3 4669473.3 MUSTcombined M2.3V3.combinedAssemblyGenes 232795970 745497
M02.4-V1 4628918.3 MUSTcombined M02.4V1combinedGenes 307008187 1021626
M02.4-V2 4628919.3 MUSTcombined M02.4V2combinedGenes 287553535 964234
M02.4-V3 4628920.3 MUSTcombined M02.4V3combinedGenes 297349300 943224
M02.5-V1 4628921.3 MUSTcombined M02.5V1combinedGenes 231567045 701177
M02.5-V2 4628922.3 MUSTcombined M02.5V2combinedGenes 197439251 594754
M03.3-V3 4628923.3 MUSTcombined M03.3V3combinedGenes 247276950 723870
M03.4-V2 4628924.3 MUSTcombined M03.4V2combinedGenes 282810825 914964
M03.5-V1 4628925.3 MUSTcombined M03.5V1combinedGenes 232742592 749768
M04.1-V3 4628927.3 MUSTcombined M04.1V3combinedGenes 211842215 676995
M04.2-V2 4628928.3 MUSTcombined M04.2V2combinedGenes 237588004 736214
M04.2-V3 4628929.3 MUSTcombined M04.2V3combinedGenes 223289045 750410
M04.3-V1 4628930.3 MUSTcombined M04.3V1combinedGenes 229245553 709613
M04.4-V1 4628932.3 MUSTcombined M04.4V1combinedGenes 221048092 674436
M04.4-V3 4628934.3 MUSTcombined M04.4V3combinedGenes 240355458 800542
M04.5-V3 4628935.3 MUSTcombined M04.5V3combinedGenes 231803086 765414
M04.6-V2 4628936.3 MUSTcombined M04.6V2combinedGenes 267481944 907877
M04.6-V3 4628937.3 MUSTcombined M04.6V3combinedGenes 272775396 879890
\ No newline at end of file
#!/usr/bin/perl
# this script takes a fasta file with contigs and the output of one Barrnap run to produce a fasta file with rRNA sequences and a .tab file in the style of prodigal
# 4 inputs: - the fasta file of the contigs
# - the .gff file from Barrnap
# - the name of the .tab output
# - the name of the fasta output
# 2 outputs: - a fasta file with the rRNA gene sequences, naming is the contig name appended with _r and a continuous number per contig
# - a table with contig name rRNA gene name (see above), the sense, length, start position, end position, completeness and kind (16S, 5S etc)
# Anna Heintz-Buschart, November 2014
use strict;
use Bio::DB::Fasta;
my $fastaFile = shift;
my $bacFile = shift;
my $listFile = shift;
my $geneFile = shift;
my %allContigs = ();
my $db = Bio::DB::Fasta->new( $fastaFile );
open (IN, $bacFile);
open(GEN, ">", $geneFile) or die "cannot open $geneFile \n";
open(TAB, ">", $listFile) or die "cannot open $listFile \n";
print TAB "contig\tgene\tsense\tlength\tstart\tend\tcompleteness\tkind\n";
while (my $line = <IN>){
unless ($line =~ /^#/) {
chomp $line;
my($contig, undef, undef, $start, $end, undef, $sense, undef, $attribute) = split "\t", $line;
my $completeness = "";
if ($attribute =~ /partial/){
$completeness = "incomplete"
} else {
$completeness = "complete"
}
if (exists $allContigs{$contig}) {
$allContigs{$contig}++
} else {
$allContigs{$contig} = 1
}
my $gene = join("", $contig, "_r", $allContigs{$contig});
my $length = 1 + $end - $start;
my @attributes = split ";", $attribute,2;
my $kind = $attributes[0];
substr($kind, 0 , 5) = "";
print TAB join("\t",$contig,$gene,$sense,$length,$start,$end,$completeness,$kind), "\n";
my $sequence = "";
if ($sense == "+") {
$sequence = $db->seq($contig, $start, $end );
} else {
$sequence = $db->seq($contig, $end, $start);
}
if (!defined( $sequence )) {
print STDERR "Sequence $contig not found. \n";
next;
} else {
print GEN ">$gene\n", "$sequence\n";
}
}
}
close(IN);
close(GEN);
close(TAB);
#!/usr/bin/env python
# this file uses the NCBI taxonomy to return the species, genus, family, order, class, phylum and kingdom of a taxon
# this script uses the NCBI taxonomy to return the species, genus, family, order, class, phylum and kingdom of a taxon
# for multiple taxa a least common ancestor is returned
# 4 inputs: - the path to a folder containing the NCBI taxdump
# - a file with multiple annotations for the same gene
......@@ -8,7 +8,7 @@
# - the name of the output
# 1 output - a table for all genes in the same order as in the two input files with LCA, species, genus, family, order, class, phylum, kingdom (tab separated)
# the code is mostly borrowed from ???, with the exception of the assembly of the output table; Anna Heintz-Buschart, April 2014
# the code is mostly borrowed from Romain Studer (http://evosite3d.blogspot.de/2013/06/browsing-ncbi-taxonomy-with-python.html), with the exception of the assembly of the output table; Anna Heintz-Buschart, April 2014
import os
import sys
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment