Skip to content
Snippets Groups Projects
Commit f03e2d48 authored by Wei Gu's avatar Wei Gu
Browse files

add more output in VCF fisher-test

parent 6153a659
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env Rscript
##############################################################
## Name: VCF_FE_test.R ##
## Description: Performs the Fisher exact test to detect ##
## significant deviations in the proportions for minor ##
## and major alleles between the two subsets provided as ##
## input files. The script Create_VCF_Allele_matrix.pl ##
## generates the two input files for this script. Each ##
## input file provides the data for one cohort/subset ##
## Usage: VCF_FE_test.R <input_file_1> <input_file_2> ##
## <output_file> ##
## Author: Serge Eifes (University of Luxembourg) ##
## Email: serge.eifes@uni.lu ##
## ##
## This work is licensed under the Creative Commons ##
## Attribution-NonCommercial-ShareAlike 4.0 ##
## International License. To view a copy of this license, ##
## visit http://creativecommons.org/licenses/by-nc-sa/4.0/. ##
##############################################################
#######################################
#######################################
### START FUNCTIONS ####
get_input_data = function(file){
data = as.matrix(read.delim(file, header=F))
#Adding column and row names
data_w_names = data[-1,-1]
rownames = data[-1,1]
colnames = t(data[1,-1])
rownames(data_w_names) = rownames
colnames(data_w_names) = colnames
return(data_w_names)
}
split_alleles = function(row){
row = as.vector(row)
num_patients = length(row)
row.split.list = strsplit(row, "\\/", perl=T)
all_alleles_per_marker = vector(mode="character", length=num_patients*2)
counter=1
for(i in 1:num_patients){
patient_alleles = row.split.list[[i]]
for(j in 1:2){
allele = patient_alleles[j]
all_alleles_per_marker[counter] = allele
counter=counter+1
}
}
return(table(all_alleles_per_marker))
}
create_subset_count_list = function(data){
subset = list()
markers = rownames(data)
for(i in 1:dim(data)[1]){
datarow = data[i,]
marker = markers[i]
nt_counts = split_alleles(datarow)
nt_count_matrix = as.matrix(nt_counts)
subset[[marker]] = nt_count_matrix
}
return(subset)
}
get_sorted_count_vec = function(subset_data){
alleles_subset = rownames(subset_data)
subset_vec = as.vector(subset_data[,1])
names(subset_vec) = alleles_subset
subset_vec_sorted = sort(subset_vec, method="quick")
#When multiple minor alleles exist in one subset ... merge them:
if(length(subset_vec_sorted)>2){
subset_vec_sorted = merge_minor_allele_counts(subset_vec_sorted)
}
return(subset_vec_sorted)
}
merge_minor_allele_counts = function(allele_count_vec){
#Create new allele count vector with minor and major allele count by merging the minor ones:
merged_allele_count_vec = c(
sum(allele_count_vec[1:(length(allele_count_vec)-1)]),
allele_count_vec[length(allele_count_vec)]
)
names.merged.allele = paste(names(allele_count_vec[1:(length(allele_count_vec)-1)]), collapse="/")
return(merged_allele_count_vec)
}
#Checks sorted_count_vec if minor allele (with count 0) needs to be added
check_sorted_count_vec_for_minor = function(sorted_count_vec_a){
if(length(sorted_count_vec_a)==1){
#Need to add minor allele with count 0 if it exists in other subset
sorted_count_vec = vector(mode="integer", length=2)
sorted_count_vec[2] = sorted_count_vec_a[1]
sorted_count_vec[1] = 0
sorted_count_vec_a = sorted_count_vec
}
return(sorted_count_vec_a)
}
### END FUNCTIONS ######
#######################################
#######################################
#######################################
### START MAIN CODE ####
#Getting command line arguments
args = commandArgs(trailingOnly = TRUE)
input_file_1=as.vector(args[1])
input_file_2=as.vector(args[2])
output_file=as.vector(args[3])
data_1 = get_input_data(input_file_1)
data_2 = get_input_data(input_file_2)
#Test case for debugging
# print(data_1["1_3396099",])
# print("___")
# print(data_2["1_3396099",])
subset_list_1 = create_subset_count_list(data_1)
subset_list_2 = create_subset_count_list(data_2)
markers_subset_list_1 = names(subset_list_1)
markers_subset_list_2 = names(subset_list_2)
# In this matrix the results of the FE test for each marker
#will be stored
results = matrix(c("Chromosome", "Position", "P-value"), ncol=3, nrow=1)
#Iterating over each marker and performing FE test to compare
#minor and major allele frequencies in both subsets
for(i in 1:length(markers_subset_list_1)){
subset_1_data = subset_list_1[[markers_subset_list_1[i]]]
subset_2_data = subset_list_2[[markers_subset_list_1[i]]]
sorted_count_vec_1 = get_sorted_count_vec(subset_1_data)
sorted_count_vec_2 = get_sorted_count_vec(subset_2_data)
if((length(sorted_count_vec_1)!=1) || (length(sorted_count_vec_2)!=1)){
#sorted_count_vec_1 = check_sorted_count_vec_for_minor(sorted_count_vec_1)
#sorted_count_vec_2 = check_sorted_count_vec_for_minor(sorted_count_vec_2)
if(length(sorted_count_vec_1)==1){
#Need to add minor allele with count 0 if it exists in other subset
cp_sorted_count_vec_2 = sorted_count_vec_2
cp_sorted_count_vec_2[2] = sorted_count_vec_1[1]
cp_sorted_count_vec_2[1] = 0
sorted_count_vec_1 = cp_sorted_count_vec_2
}
if(length(sorted_count_vec_2)==1){
#Need to add minor allele with count 0 if it exists in other subset
cp_sorted_count_vec_1 = sorted_count_vec_1
cp_sorted_count_vec_1[2] = sorted_count_vec_2[1]
cp_sorted_count_vec_1[1] = 0
sorted_count_vec_2 = cp_sorted_count_vec_1
}
print(paste(nchar(names(sorted_count_vec_1)[1]), nchar(names(sorted_count_vec_2)[1]), sep="--" ))
#In case one of the subsets contains no or multiple minor alleles
#we rename the minor allele of the other subset according to this one
if(nchar(names(sorted_count_vec_1)[1]) > nchar(names(sorted_count_vec_2)[1])){
names(sorted_count_vec_2) = names(sorted_count_vec_1)
}
if(nchar(names(sorted_count_vec_2)[1]) > nchar(names(sorted_count_vec_1)[1])){
names(sorted_count_vec_1) = names(sorted_count_vec_2)
}
##2x2 contingency table
input.matrix = matrix(nrow=2, ncol=2)
input.matrix[1,] = sorted_count_vec_1
input.matrix[2,] = sorted_count_vec_2
rownames(input.matrix) = c("subset_1", "subset_2")
colnames(input.matrix) = names(sorted_count_vec_1)
#FE test
p.value = fisher.test(input.matrix)$p.value
#Reformating and storing results in a matrix
marker_split = unlist(strsplit(markers_subset_list_1[i], "_"))
results = rbind(results, c(marker_split[1], marker_split[2], p.value))
}
}
#Printing results to file ordered by p-value
results_header = as.vector(results[1,])
results = results[-1,]
colnames(results) = results_header
results = results[order(results[,"P-value"]), ]
##Adding fdr MHC:
#fdr = p.adjust(results[,"P-value"], method="fdr")
#results = cbind(results, FDR=fdr)
write.table(results, output_file, quote=F, sep="\t", row.names=F, col.names=T)
### END MAIN CODE ####
#######################################
#!/usr/bin/env Rscript
##############################################################
## Name: VCF_FE_test.R ##
## Description: Performs the Fisher exact test to detect ##
## significant deviations in the proportions for minor ##
## and major alleles between the two subsets provided as ##
## input files. The script Create_VCF_Allele_matrix.pl ##
## generates the two input files for this script. Each ##
## input file provides the data for one cohort/subset ##
## Usage: VCF_FE_test.R <input_file_1> <input_file_2> ##
## <output_file> ##
## Author: Serge Eifes (University of Luxembourg) ##
## Email: serge.eifes@uni.lu ##
## ##
## This work is licensed under the Creative Commons ##
## Attribution-NonCommercial-ShareAlike 4.0 ##
## International License. To view a copy of this license, ##
## visit http://creativecommons.org/licenses/by-nc-sa/4.0/. ##
##############################################################
#######################################
#######################################
### START FUNCTIONS ####
get_input_data = function(file){
data = as.matrix(read.delim(file, header=F))
#Adding column and row names
data_w_names = data[-1,-1]
rownames = data[-1,1]
colnames = t(data[1,-1])
rownames(data_w_names) = rownames
colnames(data_w_names) = colnames
return(data_w_names)
}
split_alleles = function(row){
row = as.vector(row)
num_patients = length(row)
row.split.list = strsplit(row, "\\/", perl=T)
all_alleles_per_marker = vector(mode="character", length=num_patients*2)
counter=1
for(i in 1:num_patients){
patient_alleles = row.split.list[[i]]
for(j in 1:2){
allele = patient_alleles[j]
all_alleles_per_marker[counter] = allele
counter=counter+1
}
}
return(table(all_alleles_per_marker))
}
create_subset_count_list = function(data){
subset = list()
markers = rownames(data)
for(i in 1:dim(data)[1]){
datarow = data[i,]
marker = markers[i]
nt_counts = split_alleles(datarow)
nt_count_matrix = as.matrix(nt_counts)
subset[[marker]] = nt_count_matrix
}
return(subset)
}
get_sorted_count_vec = function(subset_data){
alleles_subset = rownames(subset_data)
subset_vec = as.vector(subset_data[,1])
names(subset_vec) = alleles_subset
subset_vec_sorted = sort(subset_vec, method="quick")
#When multiple minor alleles exist in one subset ... merge them:
if(length(subset_vec_sorted)>2){
subset_vec_sorted = merge_minor_allele_counts(subset_vec_sorted)
}
return(subset_vec_sorted)
}
merge_minor_allele_counts = function(allele_count_vec){
#Create new allele count vector with minor and major allele count by merging the minor ones:
merged_allele_count_vec = c(
sum(allele_count_vec[1:(length(allele_count_vec)-1)]),
allele_count_vec[length(allele_count_vec)]
)
names.merged.allele = paste(names(allele_count_vec[1:(length(allele_count_vec)-1)]), collapse="/")
return(merged_allele_count_vec)
}
#Checks sorted_count_vec if minor allele (with count 0) needs to be added
check_sorted_count_vec_for_minor = function(sorted_count_vec_a){
if(length(sorted_count_vec_a)==1){
#Need to add minor allele with count 0 if it exists in other subset
sorted_count_vec = vector(mode="integer", length=2)
sorted_count_vec[2] = sorted_count_vec_a[1]
sorted_count_vec[1] = 0
sorted_count_vec_a = sorted_count_vec
}
return(sorted_count_vec_a)
}
### END FUNCTIONS ######
#######################################
#######################################
#######################################
### START MAIN CODE ####
#Getting command line arguments
args = commandArgs(trailingOnly = TRUE)
input_file_1=as.vector(args[1])
input_file_2=as.vector(args[2])
output_file=as.vector(args[3])
data_1 = get_input_data(input_file_1)
data_2 = get_input_data(input_file_2)
#Test case for debugging
# print(data_1["1_3396099",])
# print("___")
# print(data_2["1_3396099",])
subset_list_1 = create_subset_count_list(data_1)
subset_list_2 = create_subset_count_list(data_2)
markers_subset_list_1 = names(subset_list_1)
markers_subset_list_2 = names(subset_list_2)
# In this matrix the results of the FE test for each marker
#will be stored
results = matrix(c("Chromosome", "Position",
"Subset_1_Minor_Allele_Count", "Subset_1_Major_Allele_Count",
"Subset_2_Minor_Allele_Count", "Subset_2_Major_Allele_Count",
"Odds_ratio", "P-value"),
ncol=8, nrow=1)
#Iterating over each marker and performing FE test to compare
#minor and major allele frequencies in both subsets
for(i in 1:length(markers_subset_list_1)){
subset_1_data = subset_list_1[[markers_subset_list_1[i]]]
subset_2_data = subset_list_2[[markers_subset_list_1[i]]]
sorted_count_vec_1 = get_sorted_count_vec(subset_1_data)
sorted_count_vec_2 = get_sorted_count_vec(subset_2_data)
#Adding minor allele count (equals 0) to sorted_count_vec_1 if not existing in vector:
if(length(sorted_count_vec_1)==1){
#Need to add minor allele with count 0 if it exists in other subset
cp_sorted_count_vec_2 = sorted_count_vec_2
cp_sorted_count_vec_2[2] = sorted_count_vec_1[1]
cp_sorted_count_vec_2[1] = 0
sorted_count_vec_1 = cp_sorted_count_vec_2
}
#Adding minor allele count (equals 0) to sorted_count_vec_1 if not existing:
if(length(sorted_count_vec_2)==1){
#Need to add minor allele with count 0 if it exists in other subset
cp_sorted_count_vec_1 = sorted_count_vec_1
cp_sorted_count_vec_1[2] = sorted_count_vec_2[1]
cp_sorted_count_vec_1[1] = 0
sorted_count_vec_2 = cp_sorted_count_vec_1
}
#In case one of the subsets contains no or multiple minor alleles
#we rename the minor allele of the other subset according to this one
if(nchar(names(sorted_count_vec_1)[1]) > nchar(names(sorted_count_vec_2)[1])){
names(sorted_count_vec_2) = names(sorted_count_vec_1)
}
if(nchar(names(sorted_count_vec_2)[1]) > nchar(names(sorted_count_vec_1)[1])){
names(sorted_count_vec_1) = names(sorted_count_vec_2)
}
##2x2 contingency table
input.matrix = matrix(nrow=2, ncol=2)
input.matrix[1,] = sorted_count_vec_1
input.matrix[2,] = sorted_count_vec_2
rownames(input.matrix) = c("subset_1", "subset_2")
colnames(input.matrix) = names(sorted_count_vec_1)
#FE test
res.test = fisher.test(input.matrix)
p.value = res.test$p.value
odds_ratio = res.test$estimate
# print(input.matrix)
# print(res.test)
#Reformating and storing results in a matrix
marker_split = unlist(strsplit(markers_subset_list_1[i], "_"))
results = rbind(results, c(marker_split[1], marker_split[2],
input.matrix[1,1], input.matrix[1,2],
input.matrix[2,1], input.matrix[2,2],
odds_ratio, p.value)
)
}
#Printing results to file ordered by p-value
results_header = as.vector(results[1,])
results = results[-1,]
colnames(results) = results_header
results = results[order(results[,"P-value"]), ]
##Adding fdr MHC:
fdr = p.adjust(results[,"P-value"], method="fdr")
results = cbind(results, FDR=fdr)
write.table(results, output_file, quote=F, sep="\t", row.names=F, col.names=T)
#print(warnings())
### END MAIN CODE ####
#######################################
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment