update VCF fisher test

6153a659 · Wei Gu · 05ddf0be · 6153a659
Commit 6153a659 authored 10 years ago by Wei Gu
--- a/tool-data/Rscripts/VCF_FE_test.R
+++ b/tool-data/Rscripts/VCF_FE_test.R
-#!/usr/bin/env Rscript
-
-##############################################################
-## Name: VCF_FE_test.R                                      ##
-## Description: Performs the Fisher exact test to detect    ##
-## significant deviations in the proportions for minor      ##
-## and major alleles between the two subsets provided as    ##
-## input files.
-## Usage: VCF_FE_test.R <input_file_1> <input_file_2>       ##
-##                      <output_file>                       ##
-## Author: serge.eifes@uni.lu                               ##
-##############################################################
-
-#######################################
-#######################################
-### START     FUNCTIONS            ####
-
-get_input_data = function(file){
-  data = as.matrix(read.delim(file, header=F))
-
-  #Adding column and row names
-  data_w_names = data[-1,-1]
-  rownames = data[-1,1]
-  colnames = t(data[1,-1]) 
-
-  rownames(data_w_names) = rownames
-  colnames(data_w_names) = colnames
-  
-  return(data_w_names)
-}
-
-split_alleles = function(row){
-  row = as.vector(row)
-
-  num_patients = length(row)
-  row.split.list = strsplit(row, "\\/", perl=T)
-  all_alleles_per_marker = vector(mode="character", length=num_patients*2)
-  
-  counter=1
-  
-  for(i in 1:num_patients){
-    patient_alleles = row.split.list[[i]]
-    for(j in 1:2){
-      allele = patient_alleles[j]
-      all_alleles_per_marker[counter] = allele
-      counter=counter+1
-    }
-  }
-  return(table(all_alleles_per_marker))
-}
-
-
-create_subset_count_list = function(data){
-  
-  subset = list()
-  markers = rownames(data)
-  
-  for(i in 1:dim(data)[1]){
-    
-    datarow = data[i,]
-    marker = markers[i]
-    nt_counts = split_alleles(datarow)
-    nt_count_matrix = as.matrix(nt_counts)
-    
-    subset[[marker]] = nt_count_matrix
-  }
-  
-  return(subset)
-}
-
-
-get_sorted_count_vec = function(subset_data){
-  
-  alleles_subset = rownames(subset_data)
-  subset_vec = as.vector(subset_data[,1])
-  names(subset_vec) = alleles_subset
-  subset_vec_sorted = sort(subset_vec, method="quick")
-  return(subset_vec_sorted)
-}
-
-
-#Checks sorted_count_vec if minor allele (with count 0) needs to be added
-check_sorted_count_vec_for_minor = function(sorted_count_vec_a){
-	if(length(sorted_count_vec_a)==1){
-		
-		#Need to add minor allele with count 0 if it exists in other subset
-		sorted_count_vec = vector(mode="integer", length=2)
-		sorted_count_vec[2] = sorted_count_vec_a[1]
-		sorted_count_vec[1] = 0
-		sorted_count_vec_a = sorted_count_vec
-	}
-	return(sorted_count_vec_a)
-}
-
-### END       FUNCTIONS          ######
-#######################################
-#######################################
-
-
-
-#######################################
-### START        MAIN CODE         ####
-
-
-#Getting command line arguments
-args = commandArgs(trailingOnly = TRUE)
-
-input_file_1=as.vector(args[1])
-input_file_2=as.vector(args[2])
-output_file=as.vector(args[3])
-
-data_1 = get_input_data(input_file_1)
-data_2 = get_input_data(input_file_2)
-
-#Test case for debugging
-# print(data_1["1_3396099",])
-# print("___")
-# print(data_2["1_3396099",])
-
-
-subset_list_1 = create_subset_count_list(data_1)
-subset_list_2 = create_subset_count_list(data_2)
- 
-markers_subset_list_1 = names(subset_list_1)
-markers_subset_list_2 = names(subset_list_2)
- 
-# In this matrix the results of the FE test for each marker
-#will be stored
-results = matrix(c("Chromosome", "Position", "P-value"), ncol=3, nrow=1)
-
-#Iterating over each marker and performing FE test to compare
-#minor and major allele frequencies in both subsets
-for(i in 1:length(markers_subset_list_1)){
-
-  
-  subset_1_data = subset_list_1[[markers_subset_list_1[i]]]
-  subset_2_data = subset_list_2[[markers_subset_list_1[i]]]
-  
-  
-  sorted_count_vec_1 = get_sorted_count_vec(subset_1_data)
-  sorted_count_vec_2 = get_sorted_count_vec(subset_2_data)
-  
-  if((length(sorted_count_vec_1)!=1) || (length(sorted_count_vec_2)!=1)){
-     
-	#sorted_count_vec_1 = check_sorted_count_vec_for_minor(sorted_count_vec_1)
-	#sorted_count_vec_2 = check_sorted_count_vec_for_minor(sorted_count_vec_2)
-    if(length(sorted_count_vec_1)==1){
-      
-      #Need to add minor allele with count 0 if it exists in other subset
-      cp_sorted_count_vec_2 = sorted_count_vec_2
-      cp_sorted_count_vec_2[2] = sorted_count_vec_1[1]
-      cp_sorted_count_vec_2[1] = 0
-      sorted_count_vec_1 = cp_sorted_count_vec_2
-    }
-    
-    if(length(sorted_count_vec_2)==1){
-      
-      #Need to add minor allele with count 0 if it exists in other subset
-      cp_sorted_count_vec_1 = sorted_count_vec_1
-      cp_sorted_count_vec_1[2] = sorted_count_vec_2[1]
-      cp_sorted_count_vec_1[1] = 0
-      sorted_count_vec_2 = cp_sorted_count_vec_1
-    }
-	
-	##2x2 contingency table
-    input.matrix = matrix(nrow=2, ncol=2)
-    input.matrix[1,] = sorted_count_vec_1
-    input.matrix[2,] = sorted_count_vec_2
-    rownames(input.matrix) = c("subset_1", "subset_2")
-    colnames(input.matrix) = names(sorted_count_vec_1)
-    
-	#FE test
-    p.value = fisher.test(input.matrix)$p.value
-
-	#Reformating and storing results in a matrix
-	marker_split = unlist(strsplit(markers_subset_list_1[i], "_"))
-	results = rbind(results, c(marker_split[1], marker_split[2], p.value))
-
-#	if(p.value<0.15){
-#		print(markers_subset_list_1[i])
-#		print(input.matrix)
-#		print(p.value)
-#		print("//////////////")
-#		}
-	}
-}
-
-#Printing results to file ordered by p-value
-results_header = as.vector(results[1,])
-results = results[-1,]
-colnames(results) = results_header
-results = results[order(results[,"P-value"]), ]
-write.table(results, output_file, quote=F, sep="\t", row.names=F, col.names=T)
-
-### END          MAIN CODE         ####
-#######################################
-
+#!/usr/bin/env Rscript
+
+##############################################################
+## Name: VCF_FE_test.R                                      ##
+## Description: Performs the Fisher exact test to detect    ##
+## significant deviations in the proportions for minor      ##
+## and major alleles between the two subsets provided as    ##
+## input files. The script Create_VCF_Allele_matrix.pl      ##
+## generates the two input files for this script. Each      ##
+## input file provides the data for one cohort/subset       ##
+## Usage: VCF_FE_test.R <input_file_1> <input_file_2>       ##
+##                      <output_file>                       ##
+## Author: Serge Eifes (University of Luxembourg)           ##
+## Email: serge.eifes@uni.lu                                ##
+##                                                          ##
+## This work is licensed under the Creative Commons         ##
+## Attribution-NonCommercial-ShareAlike 4.0                 ##
+## International License. To view a copy of this license,   ##
+## visit http://creativecommons.org/licenses/by-nc-sa/4.0/. ##
+##############################################################
+
+#######################################
+#######################################
+### START     FUNCTIONS            ####
+
+get_input_data = function(file){
+  data = as.matrix(read.delim(file, header=F))
+
+  #Adding column and row names
+  data_w_names = data[-1,-1]
+  rownames = data[-1,1]
+  colnames = t(data[1,-1]) 
+
+  rownames(data_w_names) = rownames
+  colnames(data_w_names) = colnames
+  
+  return(data_w_names)
+}
+
+split_alleles = function(row){
+  row = as.vector(row)
+
+  num_patients = length(row)
+  row.split.list = strsplit(row, "\\/", perl=T)
+  all_alleles_per_marker = vector(mode="character", length=num_patients*2)
+  
+  counter=1
+  
+  for(i in 1:num_patients){
+    patient_alleles = row.split.list[[i]]
+    for(j in 1:2){
+      allele = patient_alleles[j]
+      all_alleles_per_marker[counter] = allele
+      counter=counter+1
+    }
+  }
+  return(table(all_alleles_per_marker))
+}
+
+
+create_subset_count_list = function(data){
+  
+  subset = list()
+  markers = rownames(data)
+  
+  for(i in 1:dim(data)[1]){
+    
+    datarow = data[i,]
+    marker = markers[i]
+    nt_counts = split_alleles(datarow)
+    nt_count_matrix = as.matrix(nt_counts)
+    
+    subset[[marker]] = nt_count_matrix
+  }
+  
+  return(subset)
+}
+
+
+get_sorted_count_vec = function(subset_data){
+  
+  alleles_subset = rownames(subset_data)
+  subset_vec = as.vector(subset_data[,1])
+  names(subset_vec) = alleles_subset
+  subset_vec_sorted = sort(subset_vec, method="quick")
+  
+  #When multiple minor alleles exist in one subset ... merge them:
+  if(length(subset_vec_sorted)>2){
+	  subset_vec_sorted = merge_minor_allele_counts(subset_vec_sorted)
+  }
+  
+  return(subset_vec_sorted)
+}
+
+merge_minor_allele_counts = function(allele_count_vec){
+	
+	
+	#Create new allele count vector with minor and major allele count by merging the minor ones:
+	merged_allele_count_vec = c(
+								sum(allele_count_vec[1:(length(allele_count_vec)-1)]),
+								allele_count_vec[length(allele_count_vec)]
+								)
+	
+	names.merged.allele = paste(names(allele_count_vec[1:(length(allele_count_vec)-1)]), collapse="/")
+	
+	return(merged_allele_count_vec)
+}
+
+#Checks sorted_count_vec if minor allele (with count 0) needs to be added
+check_sorted_count_vec_for_minor = function(sorted_count_vec_a){
+	if(length(sorted_count_vec_a)==1){
+		
+		#Need to add minor allele with count 0 if it exists in other subset
+		sorted_count_vec = vector(mode="integer", length=2)
+		sorted_count_vec[2] = sorted_count_vec_a[1]
+		sorted_count_vec[1] = 0
+		sorted_count_vec_a = sorted_count_vec
+	}
+	return(sorted_count_vec_a)
+}
+
+### END       FUNCTIONS          ######
+#######################################
+#######################################
+
+
+
+#######################################
+### START        MAIN CODE         ####
+
+
+#Getting command line arguments
+args = commandArgs(trailingOnly = TRUE)
+
+input_file_1=as.vector(args[1])
+input_file_2=as.vector(args[2])
+output_file=as.vector(args[3])
+
+data_1 = get_input_data(input_file_1)
+data_2 = get_input_data(input_file_2)
+
+#Test case for debugging
+# print(data_1["1_3396099",])
+# print("___")
+# print(data_2["1_3396099",])
+
+
+subset_list_1 = create_subset_count_list(data_1)
+subset_list_2 = create_subset_count_list(data_2)
+ 
+markers_subset_list_1 = names(subset_list_1)
+markers_subset_list_2 = names(subset_list_2)
+ 
+# In this matrix the results of the FE test for each marker
+#will be stored
+results = matrix(c("Chromosome", "Position", "P-value"), ncol=3, nrow=1)
+
+#Iterating over each marker and performing FE test to compare
+#minor and major allele frequencies in both subsets
+for(i in 1:length(markers_subset_list_1)){
+
+  
+  subset_1_data = subset_list_1[[markers_subset_list_1[i]]]
+  subset_2_data = subset_list_2[[markers_subset_list_1[i]]]
+  
+  
+  sorted_count_vec_1 = get_sorted_count_vec(subset_1_data)
+  sorted_count_vec_2 = get_sorted_count_vec(subset_2_data)
+  
+  if((length(sorted_count_vec_1)!=1) || (length(sorted_count_vec_2)!=1)){
+     
+	#sorted_count_vec_1 = check_sorted_count_vec_for_minor(sorted_count_vec_1)
+	#sorted_count_vec_2 = check_sorted_count_vec_for_minor(sorted_count_vec_2)
+    if(length(sorted_count_vec_1)==1){
+      
+      #Need to add minor allele with count 0 if it exists in other subset
+      cp_sorted_count_vec_2 = sorted_count_vec_2
+      cp_sorted_count_vec_2[2] = sorted_count_vec_1[1]
+      cp_sorted_count_vec_2[1] = 0
+      sorted_count_vec_1 = cp_sorted_count_vec_2
+    }
+    
+    if(length(sorted_count_vec_2)==1){
+      
+      #Need to add minor allele with count 0 if it exists in other subset
+      cp_sorted_count_vec_1 = sorted_count_vec_1
+      cp_sorted_count_vec_1[2] = sorted_count_vec_2[1]
+      cp_sorted_count_vec_1[1] = 0
+      sorted_count_vec_2 = cp_sorted_count_vec_1
+    }
+	
+	
+	print(paste(nchar(names(sorted_count_vec_1)[1]), nchar(names(sorted_count_vec_2)[1]), sep="--" ))
+	#In case one of the subsets contains no or multiple minor alleles
+	#we rename the minor allele of the other subset according to this one
+	if(nchar(names(sorted_count_vec_1)[1]) > nchar(names(sorted_count_vec_2)[1])){
+		names(sorted_count_vec_2) = names(sorted_count_vec_1)
+	}
+	if(nchar(names(sorted_count_vec_2)[1]) > nchar(names(sorted_count_vec_1)[1])){
+		names(sorted_count_vec_1) = names(sorted_count_vec_2)
+	}
+	
+	##2x2 contingency table
+    input.matrix = matrix(nrow=2, ncol=2)
+    input.matrix[1,] = sorted_count_vec_1
+    input.matrix[2,] = sorted_count_vec_2
+    rownames(input.matrix) = c("subset_1", "subset_2")
+    colnames(input.matrix) = names(sorted_count_vec_1)
+    
+	
+	#FE test
+    p.value = fisher.test(input.matrix)$p.value
+
+	#Reformating and storing results in a matrix
+	marker_split = unlist(strsplit(markers_subset_list_1[i], "_"))
+	results = rbind(results, c(marker_split[1], marker_split[2], p.value))
+
+	}
+}
+
+#Printing results to file ordered by p-value
+results_header = as.vector(results[1,])
+results = results[-1,]
+colnames(results) = results_header
+results = results[order(results[,"P-value"]), ]
+
+##Adding fdr MHC:
+#fdr = p.adjust(results[,"P-value"], method="fdr")
+#results = cbind(results, FDR=fdr)
+
+write.table(results, output_file, quote=F, sep="\t", row.names=F, col.names=T)
+
+### END          MAIN CODE         ####
+#######################################
+