Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
T
TGM_pipeline
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Requirements
External wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Test cases
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Insights
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Wei Gu
TGM_pipeline
Commits
6153a659
Commit
6153a659
authored
10 years ago
by
Wei Gu
Browse files
Options
Downloads
Patches
Plain Diff
update VCF fisher test
parent
05ddf0be
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
tool-data/Rscripts/VCF_FE_test.R
+235
-197
235 additions, 197 deletions
tool-data/Rscripts/VCF_FE_test.R
with
235 additions
and
197 deletions
tool-data/Rscripts/VCF_FE_test.R
+
235
−
197
View file @
6153a659
#!/usr/bin/env Rscript
##############################################################
## Name: VCF_FE_test.R ##
## Description: Performs the Fisher exact test to detect ##
## significant deviations in the proportions for minor ##
## and major alleles between the two subsets provided as ##
## input files.
## Usage: VCF_FE_test.R <input_file_1> <input_file_2> ##
## <output_file> ##
## Author: serge.eifes@uni.lu ##
##############################################################
#######################################
#######################################
### START FUNCTIONS ####
get_input_data
=
function
(
file
){
data
=
as.matrix
(
read.delim
(
file
,
header
=
F
))
#Adding column and row names
data_w_names
=
data
[
-1
,
-1
]
rownames
=
data
[
-1
,
1
]
colnames
=
t
(
data
[
1
,
-1
])
rownames
(
data_w_names
)
=
rownames
colnames
(
data_w_names
)
=
colnames
return
(
data_w_names
)
}
split_alleles
=
function
(
row
){
row
=
as.vector
(
row
)
num_patients
=
length
(
row
)
row.split.list
=
strsplit
(
row
,
"\\/"
,
perl
=
T
)
all_alleles_per_marker
=
vector
(
mode
=
"character"
,
length
=
num_patients
*
2
)
counter
=
1
for
(
i
in
1
:
num_patients
){
patient_alleles
=
row.split.list
[[
i
]]
for
(
j
in
1
:
2
){
allele
=
patient_alleles
[
j
]
all_alleles_per_marker
[
counter
]
=
allele
counter
=
counter
+1
}
}
return
(
table
(
all_alleles_per_marker
))
}
create_subset_count_list
=
function
(
data
){
subset
=
list
()
markers
=
rownames
(
data
)
for
(
i
in
1
:
dim
(
data
)[
1
]){
datarow
=
data
[
i
,]
marker
=
markers
[
i
]
nt_counts
=
split_alleles
(
datarow
)
nt_count_matrix
=
as.matrix
(
nt_counts
)
subset
[[
marker
]]
=
nt_count_matrix
}
return
(
subset
)
}
get_sorted_count_vec
=
function
(
subset_data
){
alleles_subset
=
rownames
(
subset_data
)
subset_vec
=
as.vector
(
subset_data
[,
1
])
names
(
subset_vec
)
=
alleles_subset
subset_vec_sorted
=
sort
(
subset_vec
,
method
=
"quick"
)
return
(
subset_vec_sorted
)
}
#Checks sorted_count_vec if minor allele (with count 0) needs to be added
check_sorted_count_vec_for_minor
=
function
(
sorted_count_vec_a
){
if
(
length
(
sorted_count_vec_a
)
==
1
){
#Need to add minor allele with count 0 if it exists in other subset
sorted_count_vec
=
vector
(
mode
=
"integer"
,
length
=
2
)
sorted_count_vec
[
2
]
=
sorted_count_vec_a
[
1
]
sorted_count_vec
[
1
]
=
0
sorted_count_vec_a
=
sorted_count_vec
}
return
(
sorted_count_vec_a
)
}
### END FUNCTIONS ######
#######################################
#######################################
#######################################
### START MAIN CODE ####
#Getting command line arguments
args
=
commandArgs
(
trailingOnly
=
TRUE
)
input_file_1
=
as.vector
(
args
[
1
])
input_file_2
=
as.vector
(
args
[
2
])
output_file
=
as.vector
(
args
[
3
])
data_1
=
get_input_data
(
input_file_1
)
data_2
=
get_input_data
(
input_file_2
)
#Test case for debugging
# print(data_1["1_3396099",])
# print("___")
# print(data_2["1_3396099",])
subset_list_1
=
create_subset_count_list
(
data_1
)
subset_list_2
=
create_subset_count_list
(
data_2
)
markers_subset_list_1
=
names
(
subset_list_1
)
markers_subset_list_2
=
names
(
subset_list_2
)
# In this matrix the results of the FE test for each marker
#will be stored
results
=
matrix
(
c
(
"Chromosome"
,
"Position"
,
"P-value"
),
ncol
=
3
,
nrow
=
1
)
#Iterating over each marker and performing FE test to compare
#minor and major allele frequencies in both subsets
for
(
i
in
1
:
length
(
markers_subset_list_1
)){
subset_1_data
=
subset_list_1
[[
markers_subset_list_1
[
i
]]]
subset_2_data
=
subset_list_2
[[
markers_subset_list_1
[
i
]]]
sorted_count_vec_1
=
get_sorted_count_vec
(
subset_1_data
)
sorted_count_vec_2
=
get_sorted_count_vec
(
subset_2_data
)
if
((
length
(
sorted_count_vec_1
)
!=
1
)
||
(
length
(
sorted_count_vec_2
)
!=
1
)){
#sorted_count_vec_1 = check_sorted_count_vec_for_minor(sorted_count_vec_1)
#sorted_count_vec_2 = check_sorted_count_vec_for_minor(sorted_count_vec_2)
if
(
length
(
sorted_count_vec_1
)
==
1
){
#Need to add minor allele with count 0 if it exists in other subset
cp_sorted_count_vec_2
=
sorted_count_vec_2
cp_sorted_count_vec_2
[
2
]
=
sorted_count_vec_1
[
1
]
cp_sorted_count_vec_2
[
1
]
=
0
sorted_count_vec_1
=
cp_sorted_count_vec_2
}
if
(
length
(
sorted_count_vec_2
)
==
1
){
#Need to add minor allele with count 0 if it exists in other subset
cp_sorted_count_vec_1
=
sorted_count_vec_1
cp_sorted_count_vec_1
[
2
]
=
sorted_count_vec_2
[
1
]
cp_sorted_count_vec_1
[
1
]
=
0
sorted_count_vec_2
=
cp_sorted_count_vec_1
}
##2x2 contingency table
input.matrix
=
matrix
(
nrow
=
2
,
ncol
=
2
)
input.matrix
[
1
,]
=
sorted_count_vec_1
input.matrix
[
2
,]
=
sorted_count_vec_2
rownames
(
input.matrix
)
=
c
(
"subset_1"
,
"subset_2"
)
colnames
(
input.matrix
)
=
names
(
sorted_count_vec_1
)
#FE test
p.value
=
fisher.test
(
input.matrix
)
$
p.value
#Reformating and storing results in a matrix
marker_split
=
unlist
(
strsplit
(
markers_subset_list_1
[
i
],
"_"
))
results
=
rbind
(
results
,
c
(
marker_split
[
1
],
marker_split
[
2
],
p.value
))
# if(p.value<0.15){
# print(markers_subset_list_1[i])
# print(input.matrix)
# print(p.value)
# print("//////////////")
# }
}
}
#Printing results to file ordered by p-value
results_header
=
as.vector
(
results
[
1
,])
results
=
results
[
-1
,]
colnames
(
results
)
=
results_header
results
=
results
[
order
(
results
[,
"P-value"
]),
]
write.table
(
results
,
output_file
,
quote
=
F
,
sep
=
"\t"
,
row.names
=
F
,
col.names
=
T
)
### END MAIN CODE ####
#######################################
#!/usr/bin/env Rscript
##############################################################
## Name: VCF_FE_test.R ##
## Description: Performs the Fisher exact test to detect ##
## significant deviations in the proportions for minor ##
## and major alleles between the two subsets provided as ##
## input files. The script Create_VCF_Allele_matrix.pl ##
## generates the two input files for this script. Each ##
## input file provides the data for one cohort/subset ##
## Usage: VCF_FE_test.R <input_file_1> <input_file_2> ##
## <output_file> ##
## Author: Serge Eifes (University of Luxembourg) ##
## Email: serge.eifes@uni.lu ##
## ##
## This work is licensed under the Creative Commons ##
## Attribution-NonCommercial-ShareAlike 4.0 ##
## International License. To view a copy of this license, ##
## visit http://creativecommons.org/licenses/by-nc-sa/4.0/. ##
##############################################################
#######################################
#######################################
### START FUNCTIONS ####
get_input_data
=
function
(
file
){
data
=
as.matrix
(
read.delim
(
file
,
header
=
F
))
#Adding column and row names
data_w_names
=
data
[
-1
,
-1
]
rownames
=
data
[
-1
,
1
]
colnames
=
t
(
data
[
1
,
-1
])
rownames
(
data_w_names
)
=
rownames
colnames
(
data_w_names
)
=
colnames
return
(
data_w_names
)
}
split_alleles
=
function
(
row
){
row
=
as.vector
(
row
)
num_patients
=
length
(
row
)
row.split.list
=
strsplit
(
row
,
"\\/"
,
perl
=
T
)
all_alleles_per_marker
=
vector
(
mode
=
"character"
,
length
=
num_patients
*
2
)
counter
=
1
for
(
i
in
1
:
num_patients
){
patient_alleles
=
row.split.list
[[
i
]]
for
(
j
in
1
:
2
){
allele
=
patient_alleles
[
j
]
all_alleles_per_marker
[
counter
]
=
allele
counter
=
counter
+1
}
}
return
(
table
(
all_alleles_per_marker
))
}
create_subset_count_list
=
function
(
data
){
subset
=
list
()
markers
=
rownames
(
data
)
for
(
i
in
1
:
dim
(
data
)[
1
]){
datarow
=
data
[
i
,]
marker
=
markers
[
i
]
nt_counts
=
split_alleles
(
datarow
)
nt_count_matrix
=
as.matrix
(
nt_counts
)
subset
[[
marker
]]
=
nt_count_matrix
}
return
(
subset
)
}
get_sorted_count_vec
=
function
(
subset_data
){
alleles_subset
=
rownames
(
subset_data
)
subset_vec
=
as.vector
(
subset_data
[,
1
])
names
(
subset_vec
)
=
alleles_subset
subset_vec_sorted
=
sort
(
subset_vec
,
method
=
"quick"
)
#When multiple minor alleles exist in one subset ... merge them:
if
(
length
(
subset_vec_sorted
)
>
2
){
subset_vec_sorted
=
merge_minor_allele_counts
(
subset_vec_sorted
)
}
return
(
subset_vec_sorted
)
}
merge_minor_allele_counts
=
function
(
allele_count_vec
){
#Create new allele count vector with minor and major allele count by merging the minor ones:
merged_allele_count_vec
=
c
(
sum
(
allele_count_vec
[
1
:
(
length
(
allele_count_vec
)
-1
)]),
allele_count_vec
[
length
(
allele_count_vec
)]
)
names.merged.allele
=
paste
(
names
(
allele_count_vec
[
1
:
(
length
(
allele_count_vec
)
-1
)]),
collapse
=
"/"
)
return
(
merged_allele_count_vec
)
}
#Checks sorted_count_vec if minor allele (with count 0) needs to be added
check_sorted_count_vec_for_minor
=
function
(
sorted_count_vec_a
){
if
(
length
(
sorted_count_vec_a
)
==
1
){
#Need to add minor allele with count 0 if it exists in other subset
sorted_count_vec
=
vector
(
mode
=
"integer"
,
length
=
2
)
sorted_count_vec
[
2
]
=
sorted_count_vec_a
[
1
]
sorted_count_vec
[
1
]
=
0
sorted_count_vec_a
=
sorted_count_vec
}
return
(
sorted_count_vec_a
)
}
### END FUNCTIONS ######
#######################################
#######################################
#######################################
### START MAIN CODE ####
#Getting command line arguments
args
=
commandArgs
(
trailingOnly
=
TRUE
)
input_file_1
=
as.vector
(
args
[
1
])
input_file_2
=
as.vector
(
args
[
2
])
output_file
=
as.vector
(
args
[
3
])
data_1
=
get_input_data
(
input_file_1
)
data_2
=
get_input_data
(
input_file_2
)
#Test case for debugging
# print(data_1["1_3396099",])
# print("___")
# print(data_2["1_3396099",])
subset_list_1
=
create_subset_count_list
(
data_1
)
subset_list_2
=
create_subset_count_list
(
data_2
)
markers_subset_list_1
=
names
(
subset_list_1
)
markers_subset_list_2
=
names
(
subset_list_2
)
# In this matrix the results of the FE test for each marker
#will be stored
results
=
matrix
(
c
(
"Chromosome"
,
"Position"
,
"P-value"
),
ncol
=
3
,
nrow
=
1
)
#Iterating over each marker and performing FE test to compare
#minor and major allele frequencies in both subsets
for
(
i
in
1
:
length
(
markers_subset_list_1
)){
subset_1_data
=
subset_list_1
[[
markers_subset_list_1
[
i
]]]
subset_2_data
=
subset_list_2
[[
markers_subset_list_1
[
i
]]]
sorted_count_vec_1
=
get_sorted_count_vec
(
subset_1_data
)
sorted_count_vec_2
=
get_sorted_count_vec
(
subset_2_data
)
if
((
length
(
sorted_count_vec_1
)
!=
1
)
||
(
length
(
sorted_count_vec_2
)
!=
1
)){
#sorted_count_vec_1 = check_sorted_count_vec_for_minor(sorted_count_vec_1)
#sorted_count_vec_2 = check_sorted_count_vec_for_minor(sorted_count_vec_2)
if
(
length
(
sorted_count_vec_1
)
==
1
){
#Need to add minor allele with count 0 if it exists in other subset
cp_sorted_count_vec_2
=
sorted_count_vec_2
cp_sorted_count_vec_2
[
2
]
=
sorted_count_vec_1
[
1
]
cp_sorted_count_vec_2
[
1
]
=
0
sorted_count_vec_1
=
cp_sorted_count_vec_2
}
if
(
length
(
sorted_count_vec_2
)
==
1
){
#Need to add minor allele with count 0 if it exists in other subset
cp_sorted_count_vec_1
=
sorted_count_vec_1
cp_sorted_count_vec_1
[
2
]
=
sorted_count_vec_2
[
1
]
cp_sorted_count_vec_1
[
1
]
=
0
sorted_count_vec_2
=
cp_sorted_count_vec_1
}
print
(
paste
(
nchar
(
names
(
sorted_count_vec_1
)[
1
]),
nchar
(
names
(
sorted_count_vec_2
)[
1
]),
sep
=
"--"
))
#In case one of the subsets contains no or multiple minor alleles
#we rename the minor allele of the other subset according to this one
if
(
nchar
(
names
(
sorted_count_vec_1
)[
1
])
>
nchar
(
names
(
sorted_count_vec_2
)[
1
])){
names
(
sorted_count_vec_2
)
=
names
(
sorted_count_vec_1
)
}
if
(
nchar
(
names
(
sorted_count_vec_2
)[
1
])
>
nchar
(
names
(
sorted_count_vec_1
)[
1
])){
names
(
sorted_count_vec_1
)
=
names
(
sorted_count_vec_2
)
}
##2x2 contingency table
input.matrix
=
matrix
(
nrow
=
2
,
ncol
=
2
)
input.matrix
[
1
,]
=
sorted_count_vec_1
input.matrix
[
2
,]
=
sorted_count_vec_2
rownames
(
input.matrix
)
=
c
(
"subset_1"
,
"subset_2"
)
colnames
(
input.matrix
)
=
names
(
sorted_count_vec_1
)
#FE test
p.value
=
fisher.test
(
input.matrix
)
$
p.value
#Reformating and storing results in a matrix
marker_split
=
unlist
(
strsplit
(
markers_subset_list_1
[
i
],
"_"
))
results
=
rbind
(
results
,
c
(
marker_split
[
1
],
marker_split
[
2
],
p.value
))
}
}
#Printing results to file ordered by p-value
results_header
=
as.vector
(
results
[
1
,])
results
=
results
[
-1
,]
colnames
(
results
)
=
results_header
results
=
results
[
order
(
results
[,
"P-value"
]),
]
##Adding fdr MHC:
#fdr = p.adjust(results[,"P-value"], method="fdr")
#results = cbind(results, FDR=fdr)
write.table
(
results
,
output_file
,
quote
=
F
,
sep
=
"\t"
,
row.names
=
F
,
col.names
=
T
)
### END MAIN CODE ####
#######################################
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment