add more output in VCF fisher-test

f03e2d48 · Wei Gu · 6153a659 · f03e2d48
Commit f03e2d48 authored 10 years ago by Wei Gu
--- a/tool-data/Rscripts/VCF_FE_test.R
+++ b/tool-data/Rscripts/VCF_FE_test.R
-#!/usr/bin/env Rscript
-
-##############################################################
-## Name: VCF_FE_test.R                                      ##
-## Description: Performs the Fisher exact test to detect    ##
-## significant deviations in the proportions for minor      ##
-## and major alleles between the two subsets provided as    ##
-## input files. The script Create_VCF_Allele_matrix.pl      ##
-## generates the two input files for this script. Each      ##
-## input file provides the data for one cohort/subset       ##
-## Usage: VCF_FE_test.R <input_file_1> <input_file_2>       ##
-##                      <output_file>                       ##
-## Author: Serge Eifes (University of Luxembourg)           ##
-## Email: serge.eifes@uni.lu                                ##
-##                                                          ##
-## This work is licensed under the Creative Commons         ##
-## Attribution-NonCommercial-ShareAlike 4.0                 ##
-## International License. To view a copy of this license,   ##
-## visit http://creativecommons.org/licenses/by-nc-sa/4.0/. ##
-##############################################################
-
-#######################################
-#######################################
-### START     FUNCTIONS            ####
-
-get_input_data = function(file){
-  data = as.matrix(read.delim(file, header=F))
-
-  #Adding column and row names
-  data_w_names = data[-1,-1]
-  rownames = data[-1,1]
-  colnames = t(data[1,-1]) 
-
-  rownames(data_w_names) = rownames
-  colnames(data_w_names) = colnames
-  
-  return(data_w_names)
-}
-
-split_alleles = function(row){
-  row = as.vector(row)
-
-  num_patients = length(row)
-  row.split.list = strsplit(row, "\\/", perl=T)
-  all_alleles_per_marker = vector(mode="character", length=num_patients*2)
-  
-  counter=1
-  
-  for(i in 1:num_patients){
-    patient_alleles = row.split.list[[i]]
-    for(j in 1:2){
-      allele = patient_alleles[j]
-      all_alleles_per_marker[counter] = allele
-      counter=counter+1
-    }
-  }
-  return(table(all_alleles_per_marker))
-}
-
-
-create_subset_count_list = function(data){
-  
-  subset = list()
-  markers = rownames(data)
-  
-  for(i in 1:dim(data)[1]){
-    
-    datarow = data[i,]
-    marker = markers[i]
-    nt_counts = split_alleles(datarow)
-    nt_count_matrix = as.matrix(nt_counts)
-    
-    subset[[marker]] = nt_count_matrix
-  }
-  
-  return(subset)
-}
-
-
-get_sorted_count_vec = function(subset_data){
-  
-  alleles_subset = rownames(subset_data)
-  subset_vec = as.vector(subset_data[,1])
-  names(subset_vec) = alleles_subset
-  subset_vec_sorted = sort(subset_vec, method="quick")
-  
-  #When multiple minor alleles exist in one subset ... merge them:
-  if(length(subset_vec_sorted)>2){
-	  subset_vec_sorted = merge_minor_allele_counts(subset_vec_sorted)
-  }
-  
-  return(subset_vec_sorted)
-}
-
-merge_minor_allele_counts = function(allele_count_vec){
-	
-	
-	#Create new allele count vector with minor and major allele count by merging the minor ones:
-	merged_allele_count_vec = c(
-								sum(allele_count_vec[1:(length(allele_count_vec)-1)]),
-								allele_count_vec[length(allele_count_vec)]
-								)
-	
-	names.merged.allele = paste(names(allele_count_vec[1:(length(allele_count_vec)-1)]), collapse="/")
-	
-	return(merged_allele_count_vec)
-}
-
-#Checks sorted_count_vec if minor allele (with count 0) needs to be added
-check_sorted_count_vec_for_minor = function(sorted_count_vec_a){
-	if(length(sorted_count_vec_a)==1){
-		
-		#Need to add minor allele with count 0 if it exists in other subset
-		sorted_count_vec = vector(mode="integer", length=2)
-		sorted_count_vec[2] = sorted_count_vec_a[1]
-		sorted_count_vec[1] = 0
-		sorted_count_vec_a = sorted_count_vec
-	}
-	return(sorted_count_vec_a)
-}
-
-### END       FUNCTIONS          ######
-#######################################
-#######################################
-
-
-
-#######################################
-### START        MAIN CODE         ####
-
-
-#Getting command line arguments
-args = commandArgs(trailingOnly = TRUE)
-
-input_file_1=as.vector(args[1])
-input_file_2=as.vector(args[2])
-output_file=as.vector(args[3])
-
-data_1 = get_input_data(input_file_1)
-data_2 = get_input_data(input_file_2)
-
-#Test case for debugging
-# print(data_1["1_3396099",])
-# print("___")
-# print(data_2["1_3396099",])
-
-
-subset_list_1 = create_subset_count_list(data_1)
-subset_list_2 = create_subset_count_list(data_2)
- 
-markers_subset_list_1 = names(subset_list_1)
-markers_subset_list_2 = names(subset_list_2)
- 
-# In this matrix the results of the FE test for each marker
-#will be stored
-results = matrix(c("Chromosome", "Position", "P-value"), ncol=3, nrow=1)
-
-#Iterating over each marker and performing FE test to compare
-#minor and major allele frequencies in both subsets
-for(i in 1:length(markers_subset_list_1)){
-
-  
-  subset_1_data = subset_list_1[[markers_subset_list_1[i]]]
-  subset_2_data = subset_list_2[[markers_subset_list_1[i]]]
-  
-  
-  sorted_count_vec_1 = get_sorted_count_vec(subset_1_data)
-  sorted_count_vec_2 = get_sorted_count_vec(subset_2_data)
-  
-  if((length(sorted_count_vec_1)!=1) || (length(sorted_count_vec_2)!=1)){
-     
-	#sorted_count_vec_1 = check_sorted_count_vec_for_minor(sorted_count_vec_1)
-	#sorted_count_vec_2 = check_sorted_count_vec_for_minor(sorted_count_vec_2)
-    if(length(sorted_count_vec_1)==1){
-      
-      #Need to add minor allele with count 0 if it exists in other subset
-      cp_sorted_count_vec_2 = sorted_count_vec_2
-      cp_sorted_count_vec_2[2] = sorted_count_vec_1[1]
-      cp_sorted_count_vec_2[1] = 0
-      sorted_count_vec_1 = cp_sorted_count_vec_2
-    }
-    
-    if(length(sorted_count_vec_2)==1){
-      
-      #Need to add minor allele with count 0 if it exists in other subset
-      cp_sorted_count_vec_1 = sorted_count_vec_1
-      cp_sorted_count_vec_1[2] = sorted_count_vec_2[1]
-      cp_sorted_count_vec_1[1] = 0
-      sorted_count_vec_2 = cp_sorted_count_vec_1
-    }
-	
-	
-	print(paste(nchar(names(sorted_count_vec_1)[1]), nchar(names(sorted_count_vec_2)[1]), sep="--" ))
-	#In case one of the subsets contains no or multiple minor alleles
-	#we rename the minor allele of the other subset according to this one
-	if(nchar(names(sorted_count_vec_1)[1]) > nchar(names(sorted_count_vec_2)[1])){
-		names(sorted_count_vec_2) = names(sorted_count_vec_1)
-	}
-	if(nchar(names(sorted_count_vec_2)[1]) > nchar(names(sorted_count_vec_1)[1])){
-		names(sorted_count_vec_1) = names(sorted_count_vec_2)
-	}
-	
-	##2x2 contingency table
-    input.matrix = matrix(nrow=2, ncol=2)
-    input.matrix[1,] = sorted_count_vec_1
-    input.matrix[2,] = sorted_count_vec_2
-    rownames(input.matrix) = c("subset_1", "subset_2")
-    colnames(input.matrix) = names(sorted_count_vec_1)
-    
-	
-	#FE test
-    p.value = fisher.test(input.matrix)$p.value
-
-	#Reformating and storing results in a matrix
-	marker_split = unlist(strsplit(markers_subset_list_1[i], "_"))
-	results = rbind(results, c(marker_split[1], marker_split[2], p.value))
-
-	}
-}
-
-#Printing results to file ordered by p-value
-results_header = as.vector(results[1,])
-results = results[-1,]
-colnames(results) = results_header
-results = results[order(results[,"P-value"]), ]
-
-##Adding fdr MHC:
-#fdr = p.adjust(results[,"P-value"], method="fdr")
-#results = cbind(results, FDR=fdr)
-
-write.table(results, output_file, quote=F, sep="\t", row.names=F, col.names=T)
-
-### END          MAIN CODE         ####
-#######################################
-
+#!/usr/bin/env Rscript
+
+##############################################################
+## Name: VCF_FE_test.R                                      ##
+## Description: Performs the Fisher exact test to detect    ##
+## significant deviations in the proportions for minor      ##
+## and major alleles between the two subsets provided as    ##
+## input files. The script Create_VCF_Allele_matrix.pl      ##
+## generates the two input files for this script. Each      ##
+## input file provides the data for one cohort/subset       ##
+## Usage: VCF_FE_test.R <input_file_1> <input_file_2>       ##
+##                      <output_file>                       ##
+## Author: Serge Eifes (University of Luxembourg)           ##
+## Email: serge.eifes@uni.lu                                ##
+##                                                          ##
+## This work is licensed under the Creative Commons         ##
+## Attribution-NonCommercial-ShareAlike 4.0                 ##
+## International License. To view a copy of this license,   ##
+## visit http://creativecommons.org/licenses/by-nc-sa/4.0/. ##
+##############################################################
+
+#######################################
+#######################################
+### START     FUNCTIONS            ####
+
+get_input_data = function(file){
+	data = as.matrix(read.delim(file, header=F))
+	
+	#Adding column and row names
+	data_w_names = data[-1,-1]
+	rownames = data[-1,1]
+	colnames = t(data[1,-1]) 
+	
+	rownames(data_w_names) = rownames
+	colnames(data_w_names) = colnames
+	
+	return(data_w_names)
+}
+
+split_alleles = function(row){
+	row = as.vector(row)
+	
+	num_patients = length(row)
+	row.split.list = strsplit(row, "\\/", perl=T)
+	all_alleles_per_marker = vector(mode="character", length=num_patients*2)
+	
+	counter=1
+	
+	for(i in 1:num_patients){
+		patient_alleles = row.split.list[[i]]
+		for(j in 1:2){
+			allele = patient_alleles[j]
+			all_alleles_per_marker[counter] = allele
+			counter=counter+1
+		}
+	}
+	return(table(all_alleles_per_marker))
+}
+
+
+create_subset_count_list = function(data){
+	
+	subset = list()
+	markers = rownames(data)
+	
+	for(i in 1:dim(data)[1]){
+		
+		datarow = data[i,]
+		marker = markers[i]
+		nt_counts = split_alleles(datarow)
+		nt_count_matrix = as.matrix(nt_counts)
+		
+		subset[[marker]] = nt_count_matrix
+	}
+	
+	return(subset)
+}
+
+
+get_sorted_count_vec = function(subset_data){
+	
+	alleles_subset = rownames(subset_data)
+	subset_vec = as.vector(subset_data[,1])
+	names(subset_vec) = alleles_subset
+	subset_vec_sorted = sort(subset_vec, method="quick")
+	
+	#When multiple minor alleles exist in one subset ... merge them:
+	if(length(subset_vec_sorted)>2){
+		subset_vec_sorted = merge_minor_allele_counts(subset_vec_sorted)
+	}
+	
+	return(subset_vec_sorted)
+}
+
+merge_minor_allele_counts = function(allele_count_vec){
+	
+	
+	#Create new allele count vector with minor and major allele count by merging the minor ones:
+	merged_allele_count_vec = c(
+			sum(allele_count_vec[1:(length(allele_count_vec)-1)]),
+			allele_count_vec[length(allele_count_vec)]
+	)
+	
+	names.merged.allele = paste(names(allele_count_vec[1:(length(allele_count_vec)-1)]), collapse="/")
+	
+	return(merged_allele_count_vec)
+}
+
+#Checks sorted_count_vec if minor allele (with count 0) needs to be added
+check_sorted_count_vec_for_minor = function(sorted_count_vec_a){
+	if(length(sorted_count_vec_a)==1){
+		
+		#Need to add minor allele with count 0 if it exists in other subset
+		sorted_count_vec = vector(mode="integer", length=2)
+		sorted_count_vec[2] = sorted_count_vec_a[1]
+		sorted_count_vec[1] = 0
+		sorted_count_vec_a = sorted_count_vec
+	}
+	return(sorted_count_vec_a)
+}
+
+### END       FUNCTIONS          ######
+#######################################
+#######################################
+
+
+
+#######################################
+### START        MAIN CODE         ####
+
+
+#Getting command line arguments
+args = commandArgs(trailingOnly = TRUE)
+
+input_file_1=as.vector(args[1])
+input_file_2=as.vector(args[2])
+output_file=as.vector(args[3])
+
+data_1 = get_input_data(input_file_1)
+data_2 = get_input_data(input_file_2)
+
+#Test case for debugging
+# print(data_1["1_3396099",])
+# print("___")
+# print(data_2["1_3396099",])
+
+
+subset_list_1 = create_subset_count_list(data_1)
+subset_list_2 = create_subset_count_list(data_2)
+
+markers_subset_list_1 = names(subset_list_1)
+markers_subset_list_2 = names(subset_list_2)
+
+
+# In this matrix the results of the FE test for each marker
+#will be stored
+results = matrix(c("Chromosome", "Position",
+				"Subset_1_Minor_Allele_Count", "Subset_1_Major_Allele_Count",
+				"Subset_2_Minor_Allele_Count", "Subset_2_Major_Allele_Count",
+				"Odds_ratio", "P-value"),
+		ncol=8, nrow=1)
+
+
+#Iterating over each marker and performing FE test to compare
+#minor and major allele frequencies in both subsets
+for(i in 1:length(markers_subset_list_1)){
+	
+	
+	subset_1_data = subset_list_1[[markers_subset_list_1[i]]]
+	subset_2_data = subset_list_2[[markers_subset_list_1[i]]]
+	
+	
+	sorted_count_vec_1 = get_sorted_count_vec(subset_1_data)
+	sorted_count_vec_2 = get_sorted_count_vec(subset_2_data)
+	
+	
+	#Adding minor allele count (equals 0) to sorted_count_vec_1 if not existing in vector:
+	if(length(sorted_count_vec_1)==1){
+		
+		#Need to add minor allele with count 0 if it exists in other subset
+		cp_sorted_count_vec_2 = sorted_count_vec_2
+		cp_sorted_count_vec_2[2] = sorted_count_vec_1[1]
+		cp_sorted_count_vec_2[1] = 0
+		sorted_count_vec_1 = cp_sorted_count_vec_2
+	}
+	
+	#Adding minor allele count (equals 0) to sorted_count_vec_1 if not existing:
+	if(length(sorted_count_vec_2)==1){
+		
+		#Need to add minor allele with count 0 if it exists in other subset
+		cp_sorted_count_vec_1 = sorted_count_vec_1
+		cp_sorted_count_vec_1[2] = sorted_count_vec_2[1]
+		cp_sorted_count_vec_1[1] = 0
+		sorted_count_vec_2 = cp_sorted_count_vec_1
+	}
+	
+	#In case one of the subsets contains no or multiple minor alleles
+	#we rename the minor allele of the other subset according to this one
+	if(nchar(names(sorted_count_vec_1)[1]) > nchar(names(sorted_count_vec_2)[1])){
+		names(sorted_count_vec_2) = names(sorted_count_vec_1)
+	}
+	if(nchar(names(sorted_count_vec_2)[1]) > nchar(names(sorted_count_vec_1)[1])){
+		names(sorted_count_vec_1) = names(sorted_count_vec_2)
+	}
+	
+	##2x2 contingency table
+	input.matrix = matrix(nrow=2, ncol=2)
+	input.matrix[1,] = sorted_count_vec_1
+	input.matrix[2,] = sorted_count_vec_2
+	rownames(input.matrix) = c("subset_1", "subset_2")
+	colnames(input.matrix) = names(sorted_count_vec_1)
+	
+	
+	#FE test
+	res.test = fisher.test(input.matrix)
+	p.value = res.test$p.value
+	odds_ratio = res.test$estimate
+	
+#	print(input.matrix)
+#	print(res.test)
+	#Reformating and storing results in a matrix
+	marker_split = unlist(strsplit(markers_subset_list_1[i], "_"))
+	results = rbind(results, c(marker_split[1], marker_split[2],
+					input.matrix[1,1], input.matrix[1,2],
+					input.matrix[2,1], input.matrix[2,2],
+					odds_ratio, p.value)
+	)
+}
+
+
+#Printing results to file ordered by p-value
+results_header = as.vector(results[1,])
+results = results[-1,]
+colnames(results) = results_header
+results = results[order(results[,"P-value"]), ]
+
+
+##Adding fdr MHC:
+fdr = p.adjust(results[,"P-value"], method="fdr")
+results = cbind(results, FDR=fdr)
+
+write.table(results, output_file, quote=F, sep="\t", row.names=F, col.names=T)
+
+#print(warnings())
+### END          MAIN CODE         ####
+#######################################
+