Skip to content
Snippets Groups Projects
Commit 6153a659 authored by Wei Gu's avatar Wei Gu
Browse files

update VCF fisher test

parent 05ddf0be
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env Rscript
##############################################################
## Name: VCF_FE_test.R ##
## Description: Performs the Fisher exact test to detect ##
## significant deviations in the proportions for minor ##
## and major alleles between the two subsets provided as ##
## input files.
## Usage: VCF_FE_test.R <input_file_1> <input_file_2> ##
## <output_file> ##
## Author: serge.eifes@uni.lu ##
##############################################################
#######################################
#######################################
### START FUNCTIONS ####
get_input_data = function(file){
data = as.matrix(read.delim(file, header=F))
#Adding column and row names
data_w_names = data[-1,-1]
rownames = data[-1,1]
colnames = t(data[1,-1])
rownames(data_w_names) = rownames
colnames(data_w_names) = colnames
return(data_w_names)
}
split_alleles = function(row){
row = as.vector(row)
num_patients = length(row)
row.split.list = strsplit(row, "\\/", perl=T)
all_alleles_per_marker = vector(mode="character", length=num_patients*2)
counter=1
for(i in 1:num_patients){
patient_alleles = row.split.list[[i]]
for(j in 1:2){
allele = patient_alleles[j]
all_alleles_per_marker[counter] = allele
counter=counter+1
}
}
return(table(all_alleles_per_marker))
}
create_subset_count_list = function(data){
subset = list()
markers = rownames(data)
for(i in 1:dim(data)[1]){
datarow = data[i,]
marker = markers[i]
nt_counts = split_alleles(datarow)
nt_count_matrix = as.matrix(nt_counts)
subset[[marker]] = nt_count_matrix
}
return(subset)
}
get_sorted_count_vec = function(subset_data){
alleles_subset = rownames(subset_data)
subset_vec = as.vector(subset_data[,1])
names(subset_vec) = alleles_subset
subset_vec_sorted = sort(subset_vec, method="quick")
return(subset_vec_sorted)
}
#Checks sorted_count_vec if minor allele (with count 0) needs to be added
check_sorted_count_vec_for_minor = function(sorted_count_vec_a){
if(length(sorted_count_vec_a)==1){
#Need to add minor allele with count 0 if it exists in other subset
sorted_count_vec = vector(mode="integer", length=2)
sorted_count_vec[2] = sorted_count_vec_a[1]
sorted_count_vec[1] = 0
sorted_count_vec_a = sorted_count_vec
}
return(sorted_count_vec_a)
}
### END FUNCTIONS ######
#######################################
#######################################
#######################################
### START MAIN CODE ####
#Getting command line arguments
args = commandArgs(trailingOnly = TRUE)
input_file_1=as.vector(args[1])
input_file_2=as.vector(args[2])
output_file=as.vector(args[3])
data_1 = get_input_data(input_file_1)
data_2 = get_input_data(input_file_2)
#Test case for debugging
# print(data_1["1_3396099",])
# print("___")
# print(data_2["1_3396099",])
subset_list_1 = create_subset_count_list(data_1)
subset_list_2 = create_subset_count_list(data_2)
markers_subset_list_1 = names(subset_list_1)
markers_subset_list_2 = names(subset_list_2)
# In this matrix the results of the FE test for each marker
#will be stored
results = matrix(c("Chromosome", "Position", "P-value"), ncol=3, nrow=1)
#Iterating over each marker and performing FE test to compare
#minor and major allele frequencies in both subsets
for(i in 1:length(markers_subset_list_1)){
subset_1_data = subset_list_1[[markers_subset_list_1[i]]]
subset_2_data = subset_list_2[[markers_subset_list_1[i]]]
sorted_count_vec_1 = get_sorted_count_vec(subset_1_data)
sorted_count_vec_2 = get_sorted_count_vec(subset_2_data)
if((length(sorted_count_vec_1)!=1) || (length(sorted_count_vec_2)!=1)){
#sorted_count_vec_1 = check_sorted_count_vec_for_minor(sorted_count_vec_1)
#sorted_count_vec_2 = check_sorted_count_vec_for_minor(sorted_count_vec_2)
if(length(sorted_count_vec_1)==1){
#Need to add minor allele with count 0 if it exists in other subset
cp_sorted_count_vec_2 = sorted_count_vec_2
cp_sorted_count_vec_2[2] = sorted_count_vec_1[1]
cp_sorted_count_vec_2[1] = 0
sorted_count_vec_1 = cp_sorted_count_vec_2
}
if(length(sorted_count_vec_2)==1){
#Need to add minor allele with count 0 if it exists in other subset
cp_sorted_count_vec_1 = sorted_count_vec_1
cp_sorted_count_vec_1[2] = sorted_count_vec_2[1]
cp_sorted_count_vec_1[1] = 0
sorted_count_vec_2 = cp_sorted_count_vec_1
}
##2x2 contingency table
input.matrix = matrix(nrow=2, ncol=2)
input.matrix[1,] = sorted_count_vec_1
input.matrix[2,] = sorted_count_vec_2
rownames(input.matrix) = c("subset_1", "subset_2")
colnames(input.matrix) = names(sorted_count_vec_1)
#FE test
p.value = fisher.test(input.matrix)$p.value
#Reformating and storing results in a matrix
marker_split = unlist(strsplit(markers_subset_list_1[i], "_"))
results = rbind(results, c(marker_split[1], marker_split[2], p.value))
# if(p.value<0.15){
# print(markers_subset_list_1[i])
# print(input.matrix)
# print(p.value)
# print("//////////////")
# }
}
}
#Printing results to file ordered by p-value
results_header = as.vector(results[1,])
results = results[-1,]
colnames(results) = results_header
results = results[order(results[,"P-value"]), ]
write.table(results, output_file, quote=F, sep="\t", row.names=F, col.names=T)
### END MAIN CODE ####
#######################################
#!/usr/bin/env Rscript
##############################################################
## Name: VCF_FE_test.R ##
## Description: Performs the Fisher exact test to detect ##
## significant deviations in the proportions for minor ##
## and major alleles between the two subsets provided as ##
## input files. The script Create_VCF_Allele_matrix.pl ##
## generates the two input files for this script. Each ##
## input file provides the data for one cohort/subset ##
## Usage: VCF_FE_test.R <input_file_1> <input_file_2> ##
## <output_file> ##
## Author: Serge Eifes (University of Luxembourg) ##
## Email: serge.eifes@uni.lu ##
## ##
## This work is licensed under the Creative Commons ##
## Attribution-NonCommercial-ShareAlike 4.0 ##
## International License. To view a copy of this license, ##
## visit http://creativecommons.org/licenses/by-nc-sa/4.0/. ##
##############################################################
#######################################
#######################################
### START FUNCTIONS ####
get_input_data = function(file){
data = as.matrix(read.delim(file, header=F))
#Adding column and row names
data_w_names = data[-1,-1]
rownames = data[-1,1]
colnames = t(data[1,-1])
rownames(data_w_names) = rownames
colnames(data_w_names) = colnames
return(data_w_names)
}
split_alleles = function(row){
row = as.vector(row)
num_patients = length(row)
row.split.list = strsplit(row, "\\/", perl=T)
all_alleles_per_marker = vector(mode="character", length=num_patients*2)
counter=1
for(i in 1:num_patients){
patient_alleles = row.split.list[[i]]
for(j in 1:2){
allele = patient_alleles[j]
all_alleles_per_marker[counter] = allele
counter=counter+1
}
}
return(table(all_alleles_per_marker))
}
create_subset_count_list = function(data){
subset = list()
markers = rownames(data)
for(i in 1:dim(data)[1]){
datarow = data[i,]
marker = markers[i]
nt_counts = split_alleles(datarow)
nt_count_matrix = as.matrix(nt_counts)
subset[[marker]] = nt_count_matrix
}
return(subset)
}
get_sorted_count_vec = function(subset_data){
alleles_subset = rownames(subset_data)
subset_vec = as.vector(subset_data[,1])
names(subset_vec) = alleles_subset
subset_vec_sorted = sort(subset_vec, method="quick")
#When multiple minor alleles exist in one subset ... merge them:
if(length(subset_vec_sorted)>2){
subset_vec_sorted = merge_minor_allele_counts(subset_vec_sorted)
}
return(subset_vec_sorted)
}
merge_minor_allele_counts = function(allele_count_vec){
#Create new allele count vector with minor and major allele count by merging the minor ones:
merged_allele_count_vec = c(
sum(allele_count_vec[1:(length(allele_count_vec)-1)]),
allele_count_vec[length(allele_count_vec)]
)
names.merged.allele = paste(names(allele_count_vec[1:(length(allele_count_vec)-1)]), collapse="/")
return(merged_allele_count_vec)
}
#Checks sorted_count_vec if minor allele (with count 0) needs to be added
check_sorted_count_vec_for_minor = function(sorted_count_vec_a){
if(length(sorted_count_vec_a)==1){
#Need to add minor allele with count 0 if it exists in other subset
sorted_count_vec = vector(mode="integer", length=2)
sorted_count_vec[2] = sorted_count_vec_a[1]
sorted_count_vec[1] = 0
sorted_count_vec_a = sorted_count_vec
}
return(sorted_count_vec_a)
}
### END FUNCTIONS ######
#######################################
#######################################
#######################################
### START MAIN CODE ####
#Getting command line arguments
args = commandArgs(trailingOnly = TRUE)
input_file_1=as.vector(args[1])
input_file_2=as.vector(args[2])
output_file=as.vector(args[3])
data_1 = get_input_data(input_file_1)
data_2 = get_input_data(input_file_2)
#Test case for debugging
# print(data_1["1_3396099",])
# print("___")
# print(data_2["1_3396099",])
subset_list_1 = create_subset_count_list(data_1)
subset_list_2 = create_subset_count_list(data_2)
markers_subset_list_1 = names(subset_list_1)
markers_subset_list_2 = names(subset_list_2)
# In this matrix the results of the FE test for each marker
#will be stored
results = matrix(c("Chromosome", "Position", "P-value"), ncol=3, nrow=1)
#Iterating over each marker and performing FE test to compare
#minor and major allele frequencies in both subsets
for(i in 1:length(markers_subset_list_1)){
subset_1_data = subset_list_1[[markers_subset_list_1[i]]]
subset_2_data = subset_list_2[[markers_subset_list_1[i]]]
sorted_count_vec_1 = get_sorted_count_vec(subset_1_data)
sorted_count_vec_2 = get_sorted_count_vec(subset_2_data)
if((length(sorted_count_vec_1)!=1) || (length(sorted_count_vec_2)!=1)){
#sorted_count_vec_1 = check_sorted_count_vec_for_minor(sorted_count_vec_1)
#sorted_count_vec_2 = check_sorted_count_vec_for_minor(sorted_count_vec_2)
if(length(sorted_count_vec_1)==1){
#Need to add minor allele with count 0 if it exists in other subset
cp_sorted_count_vec_2 = sorted_count_vec_2
cp_sorted_count_vec_2[2] = sorted_count_vec_1[1]
cp_sorted_count_vec_2[1] = 0
sorted_count_vec_1 = cp_sorted_count_vec_2
}
if(length(sorted_count_vec_2)==1){
#Need to add minor allele with count 0 if it exists in other subset
cp_sorted_count_vec_1 = sorted_count_vec_1
cp_sorted_count_vec_1[2] = sorted_count_vec_2[1]
cp_sorted_count_vec_1[1] = 0
sorted_count_vec_2 = cp_sorted_count_vec_1
}
print(paste(nchar(names(sorted_count_vec_1)[1]), nchar(names(sorted_count_vec_2)[1]), sep="--" ))
#In case one of the subsets contains no or multiple minor alleles
#we rename the minor allele of the other subset according to this one
if(nchar(names(sorted_count_vec_1)[1]) > nchar(names(sorted_count_vec_2)[1])){
names(sorted_count_vec_2) = names(sorted_count_vec_1)
}
if(nchar(names(sorted_count_vec_2)[1]) > nchar(names(sorted_count_vec_1)[1])){
names(sorted_count_vec_1) = names(sorted_count_vec_2)
}
##2x2 contingency table
input.matrix = matrix(nrow=2, ncol=2)
input.matrix[1,] = sorted_count_vec_1
input.matrix[2,] = sorted_count_vec_2
rownames(input.matrix) = c("subset_1", "subset_2")
colnames(input.matrix) = names(sorted_count_vec_1)
#FE test
p.value = fisher.test(input.matrix)$p.value
#Reformating and storing results in a matrix
marker_split = unlist(strsplit(markers_subset_list_1[i], "_"))
results = rbind(results, c(marker_split[1], marker_split[2], p.value))
}
}
#Printing results to file ordered by p-value
results_header = as.vector(results[1,])
results = results[-1,]
colnames(results) = results_header
results = results[order(results[,"P-value"]), ]
##Adding fdr MHC:
#fdr = p.adjust(results[,"P-value"], method="fdr")
#results = cbind(results, FDR=fdr)
write.table(results, output_file, quote=F, sep="\t", row.names=F, col.names=T)
### END MAIN CODE ####
#######################################
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment