marker selection with RNAseq data

0aa8c28e · Wei Gu · f03e2d48 · 0aa8c28e · 0aa8c28e
Commit 0aa8c28e authored 10 years ago by Wei Gu
--- a/tool-data/Rscripts/Create_MarkerSelection_inputfile.pl
+++ b/tool-data/Rscripts/Create_MarkerSelection_inputfile.pl
@@ -3,11 +3,16 @@
 ##############################################################
 ## Name: Create_GEX_outputfile.pl                           ##
 ## Description: Merges the two subset mrna.tsv subset files ##
-## and puts the data into the correct format to be used by  ##
-## MarkerSelection.R                                        ##
+## and puts the data into the correct format to be used     ##
+## as input by MarkerSelection.R                            ##
 ## Usage: Create_GEX_outputfile.pl -s1 <subset1_input_file> ##
 ##        -s2 <subset2_input_file> -dir <in/output dir>     ##
-## Author: serge.eifes@uni.lu                               ##
+## Email: serge.eifes@uni.lu                                ##
+##                                                          ##
+## This work is licensed under the Creative Commons         ##
+## Attribution-NonCommercial-ShareAlike 4.0                 ##
+## International License. To view a copy of this license,   ##
+## visit http://creativecommons.org/licenses/by-nc-sa/4.0/. ##
 ##############################################################


@@ -36,6 +41,12 @@ open(WH, ">$O_file") or die "Cannot open $O_file: $!";

 foreach my $file (@A_Subset_files){
 	
+	#Idx numbers for data columns we want to extract
+	my $Patient_ID_idx;
+	my $Value_idx;
+	my $Probe_ID_idx;
+	my $Gene_Symbol_idx;
+	
 	open(FH, "$file") or die "Cannot open $file: $!";
 	
 	
@@ -47,14 +58,30 @@ foreach my $file (@A_Subset_files){
 		my $Line = $_;
 		chomp($Line);
 	
-		next if($Line =~ /^PATIENT/);
 	
 		my (@A_Line) = split /\t/, $Line;
 		
-		my $Patient_ID = $A_Line[0];
-		my $Value = $A_Line[9];
-		my $Probe_ID = $A_Line[11];
-		my $Gene_Symbol = $A_Line[13];
+		
+		#Retrieving col indices from header for the data we need		
+		if($Line=~/PATIENT/){
+			for(my $i=0; $i<=$#A_Line; $i++){
+				if($A_Line[$i] eq "PATIENT ID"){
+					$Patient_ID_idx = $i;
+				} elsif($A_Line[$i] eq "LOG2E"){
+					$Value_idx = $i;
+				} elsif(($A_Line[$i] eq "PROBE") || ($A_Line[$i] eq "ANNOTATIONID")){
+					$Probe_ID_idx = $i;
+				} elsif($A_Line[$i] eq "GENE SYMBOL"){
+					$Gene_Symbol_idx = $i;
+				}
+			}
+			next;
+		}
+		
+		my $Patient_ID = $A_Line[$Patient_ID_idx];
+		my $Value = $A_Line[$Value_idx];
+		my $Probe_ID = $A_Line[$Probe_ID_idx];
+		my $Gene_Symbol = $A_Line[$Gene_Symbol_idx];
 		
 		$Gene_Symbol="NA" if($Gene_Symbol eq "null");
 		

--- a/tool-data/Rscripts/MarkerSelection.R
+++ b/tool-data/Rscripts/MarkerSelection.R
@@ -16,22 +16,21 @@
 # limitations under the License.
 ###########################################################################

-###########################################################################
-#### Code rewritten by Wei and Serge
-###########################################################################
+#Suppress printing of warnings
+options(warn=-1)


 ###########################################################################
 #Comparative Marker Selection
 ##########################################################################

-#Function to obtain the aligned positions all items in the vectorToAlign
-#in the vectorRef
+#Function to obtain the aligned positions of all items in the vectorToAlign
+#relative to the vectorRef
 get.reordered_index = function(vectorToAlign, vectorRef){
 	
 	vectorToAlign = as.vector(vectorToAlign)
 	vectorRef = as.vector(vectorRef)
-
+	
 	res = vector(mode="integer", length=length(vectorToAlign))
 	
 	for(i in 1:length(res)){
@@ -73,10 +72,18 @@ MS.loader <- function(

 	#Getting rid of the probesets w/o associated gene symbol:
 	idx_wo_symbol = which(is.na(mRNAData$GENE_SYMBOL))
+	#print(length(idx_wo_symbol))
 	if(length(idx_wo_symbol)>0){	
 		mRNAData=mRNAData[-idx_wo_symbol,]
 	}
 	
+	idx_wo_symbol = which(mRNAData$GENE_SYMBOL=="null")
+	#print(length(idx_wo_symbol))
+	if(length(idx_wo_symbol)>0){	
+		mRNAData=mRNAData[-idx_wo_symbol,]
+	}
+	
+	
 	#Create a data.frame with unique probe/gene ids.
 	geneStatsData <- data.frame(mRNAData$PROBE.ID,mRNAData$GENE_SYMBOL);
 	
@@ -212,9 +219,9 @@ MS.loader <- function(
 	s = strsplit(as.character(finalHeatmapData$GROUP), " ")
 	d = vector(mode="character", length=length(s))
 	for(i in 1:length(s)){
-		d[i] = s[[i]][1]
+		d[i] = s[[i]][2]
 	}
-	idx = get.reordered_index(d, topgenes$GENE_SYMBOL)
+	idx = get.reordered_index(d, topgenes$PROBE.ID)
 	finalHeatmapData = finalHeatmapData[ idx, ]

 	#WRITE TO FILE