150928_MUST_relatedClusterWSFromMongo.R 5.63 KB
Newer Older
AHB's avatar
AHB committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#this script gets all genes of related genomes, along with their expression levels on the different omic levels and the best annotation
#### reads are summed up by functions and DESeq-normalized
#### and creates a number of workspace with the data

# the script uses allClusterInfo.RDS, which contains statistics on all bins of all samples
# the function getExprData() defined in lines 23-68 does the interfacing with MongoDB
# the loop in line 76-109 does the gathering by same taxonomic annotation

# written by Anna Heintz-Buschart, September 2015 - this version is from January 2016, shortened to the part that was actually used in the MuSt

clusterInfo <- readRDS("allClusterInfo.RDS")

library(rmongodb)
mongo <- mongo.create()
db <- "mydb"
coll <- "mydb.must"
if(mongo.is.connected(mongo)) print(paste("found connection to mongod -",mongo.count(mongo,coll),"documents in collection"))

library(DESeq2)

#####

getExprData <- function(sampleID,clusOI){
  #matches for pipelines:
  matchFun <- mongo.bson.from.JSON(paste('{"$match": {"genes.bestAnnotation": {"$exists": "true"}} }',sep=""))
  matchCluster <- mongo.bson.from.JSON(paste('{"$match": {"cluster": "',clusOI,'"} }',sep=""))
  matchSample <- mongo.bson.from.JSON(paste('{"$match": {"sample": "',sampleID,'"} }',sep=""))
  matchProt <- mongo.bson.from.JSON(paste('{"$match": {"genes.proteinIdentification": "uniquely"} }',sep=""))
  #unwinds for pipelines:
  unwindGenes <- mongo.bson.from.JSON('{"$unwind": "$genes"}')
  unwindProteins <- mongo.bson.from.JSON('{"$unwind": "$genes.proteinIdentification"}')
  #grouping for pipelines:
  groupGeneA <- mongo.bson.from.JSON('{"$group": {"_id": "$_id", "gene": {"$push": "$genes.gene"},
                                     "aveCovDNA":{"$push":"$genes.aveCovDNA"},"aveCovRNA":{"$push":"$genes.aveCovRNAfw"},
                                     "readsRNA":{"$push":"$genes.readsRNAfw"},"fun":{"$push":"$genes.bestAnnotation"}}}')
  groupGeneProt <- mongo.bson.from.JSON('{"$group": {"_id": "$_id", "gene": {"$push": "$genes.gene"},
                                        "proteinIdentification":{"$push":"$genes.proteinIdentification"},
                                        "proteinArea":{"$push":"$genes.proteinArea"}}}')  
  #projections for pipelines:
  projectGeneA <- mongo.bson.from.JSON('{"$project": {"_id": 0, "fun":1, "gene":1, "aveCovDNA":1,"aveCovRNA":1,"readsRNA":1}}')
  projectGeneProt <- mongo.bson.from.JSON('{"$project": {"_id": 0, "gene":1,"proteinIdentification":1,"proteinArea":1}}')
  
  if(!mongo.is.connected(mongo)) {
    stop("Mongo is not connected.")
  }else{
    #aggregation pipelines:
    genesA <- mongo.bson.to.list(mongo.aggregation(mongo,coll,list(matchCluster,matchSample,unwindGenes,matchFun,groupGeneA,
                                                                   projectGeneA)))$result
    if(length(genesA)==0){ 
      warning("No genes found")
    } else {
      tFeat <- do.call(rbind,lapply(genesA,function(x) cbind(x$gene,x$fun,x$aveCovDNA,x$aveCovRNA,x$readsRNA)))
      tFeat[sapply(tFeat[,2],length)>1,2] <- sapply(tFeat[sapply(tFeat[,2],length)>1,2],function(x) paste(x,sep=";",collapse=";"))
      tFeat <- data.frame("gene"=unlist(tFeat[,1]),"bestAnno"=unlist(tFeat[,2]),"aveCovDNA"=as.numeric(unlist(tFeat[,3])),
                          "aveCovRNA"=as.numeric(unlist(tFeat[,4])),"readsRNA"=as.numeric(unlist(tFeat[,5])),stringsAsFactors=F)
      
      genesP <- mongo.bson.to.list(mongo.aggregation(mongo,coll,list(matchCluster,matchSample,unwindGenes,matchFun,matchProt,groupGeneProt,
                                                                     projectGeneProt)))$result
      pFeat <- do.call(rbind,lapply(genesP,function(x) cbind(x$gene, x$proteinArea)))
      pFeat[sapply(pFeat[,2],length)>1,2] <- sapply(pFeat[sapply(pFeat[,2],length)>1,2], mean)
      pFeat <- data.frame("gene"=unlist(pFeat[,1]),"proteinArea"=as.numeric(unlist(pFeat[,2])),stringsAsFactors=F)
      aFeat <- merge(tFeat,pFeat,by=1,all.x=T)
      aFeat[is.na(aFeat)] <- 0
      if(!"proteinArea" %in% colnames(aFeat)) aFeat$proteinArea <- 0
      return(aFeat)
    } 
  }
}


motuDup <- names(table(sapply(grep(";",clusterInfo$motuPresent,invert=T,value=T),
                              function(x)unlist(strsplit(x,"\\("))[1]))
                 [table(sapply(grep(";",clusterInfo$motuPresent,invert=T,value=T),
                               function(x)unlist(strsplit(x,"\\("))[1]))>1])

for(currMotu in motuDup){
  clOI <- clusterInfo[grepl(paste(currMotu,"\\(",sep=""),clusterInfo$motuPresent)&!grepl(";",clusterInfo$motuPresent),c("cluster","sample")]
  first <- T
  for(i in 1:nrow(clOI)){
    sI <- clOI$sample[i]
    clI <- clOI$cluster[i]
    if(!is.na(sI)&!is.na(clI)){
      cwd <- paste(sI,clI,sep="_")
      if(first){
        exprall <- data.frame("cluster"=cwd,getExprData(sI,clI),stringsAsFactors=F)
        first <- F
      }else{
        texp <- data.frame("cluster"=cwd,getExprData(sI,clI),stringsAsFactors=F)
        exprall <- rbind(exprall,texp)
      }
    }
  }
  pres <- tapply(exprall$gene,list(exprall$bestAnno,exprall$cluster),function(x)as.numeric(length(x)>0))
  pres[is.na(pres)] <-0
  if(ncol(pres)>1&nrow(pres)>1){    
    readR <- tapply(exprall$readsRNA,list(exprall$bestAnno,exprall$cluster),sum)
    readR[is.na(readR)] <-0

    gps <- c(grep("P",colnames(pres)),grep("G",colnames(pres)))
    if(length(gps)>2){
      readRA <- readR[,gps]
      readRA <- readRA[apply(dcov[,gps],1,function(x)all(x>0)),]
      readDDS <- DESeqDataSetFromMatrix(readRA,data.frame("sample"=colnames(readRA)),~sample)
      ts2 <- estimateSizeFactors(readDDS)
      readRN <- counts(ts2,normalized=T)
      save.image(paste(currMotu,"_WS.Rdata",sep=""))
    } 
  }
}

mongo.destroy(mongo)