...
 
Commits (14)
## Description of folders and files
### tool_conf.xml
Configuration of what tools to be activated in Galaxy
### tool-data
Folder to store the tool scripts (*.R *.perl... etc)
### tools
XML definitions of each tool
\ No newline at end of file
This diff is collapsed.
<?xml version='1.0' encoding='utf-8'?>
<toolbox>
<section id="getext" name="Get Data">
<tool file="data_source/upload.xml" />
<tool file="data_source/ucsc_tablebrowser.xml" />
<tool file="data_source/ucsc_tablebrowser_archaea.xml" />
<tool file="data_source/ebi_sra.xml" />
<tool file="data_source/biomart.xml" />
<tool file="data_source/gramene_mart.xml" />
<tool file="data_source/flymine.xml" />
<tool file="data_source/fly_modencode.xml" />
<tool file="data_source/modmine.xml" />
<tool file="data_source/mousemine.xml" />
<tool file="data_source/ratmine.xml" />
<tool file="data_source/yeastmine.xml" />
<tool file="data_source/worm_modencode.xml" />
<tool file="data_source/wormbase.xml" />
<tool file="data_source/eupathdb.xml" />
<tool file="genomespace/genomespace_file_browser_prod.xml" />
<tool file="genomespace/genomespace_importer.xml" />
</section>
<section id="tmscripts" name="tranSMART tools">
<tool file="tranSMART/tm_plink_input.xml" />
<tool file="tranSMART/run_plink.xml" />
<tool file="tranSMART/plot_plink.xml" />
</section>
<!--
<section id="send" name="Send Data">
<tool file="genomespace/genomespace_exporter.xml" />
</section>
<section id="liftOver" name="Lift-Over">
<tool file="extract/liftOver_wrapper.xml" />
</section>
-->
<section id="textutil" name="Text Manipulation">
<tool file="filters/fixedValueColumn.xml" />
<tool file="filters/catWrapper.xml" />
<tool file="filters/condense_characters.xml" />
<tool file="filters/convert_characters.xml" />
<tool file="filters/mergeCols.xml" />
<tool file="filters/CreateInterval.xml" />
<tool file="filters/cutWrapper.xml" />
<tool file="filters/changeCase.xml" />
<tool file="filters/pasteWrapper.xml" />
<tool file="filters/remove_beginning.xml" />
<tool file="filters/randomlines.xml" />
<tool file="filters/headWrapper.xml" />
<tool file="filters/tailWrapper.xml" />
<tool file="filters/trimmer.xml" />
<tool file="filters/wc_gnu.xml" />
<tool file="filters/secure_hash_message_digest.xml" />
</section>
<!--
<section id="convert" name="Convert Formats">
<tool file="filters/bed2gff.xml" />
<tool file="filters/gff2bed.xml" />
<tool file="maf/maf_to_bed.xml" />
<tool file="maf/maf_to_interval.xml" />
<tool file="maf/maf_to_fasta.xml" />
<tool file="filters/sff_extractor.xml" />
<tool file="filters/wig_to_bigwig.xml" />
<tool file="filters/bed_to_bigbed.xml" />
</section>
<section id="filter" name="Filter and Sort">
<tool file="stats/filtering.xml" />
<tool file="filters/sorter.xml" />
<tool file="filters/grep.xml" />
<label id="gff" text="GFF" />
<tool file="filters/gff/extract_GFF_Features.xml" />
<tool file="filters/gff/gff_filter_by_attribute.xml" />
<tool file="filters/gff/gff_filter_by_feature_count.xml" />
<tool file="filters/gff/gtf_filter_by_attribute_values_list.xml" />
</section>
<section id="group" name="Join, Subtract and Group">
<tool file="filters/joiner.xml" />
<tool file="filters/compare.xml" />
<tool file="stats/grouping.xml" />
</section>
<section id="features" name="Extract Features">
<tool file="filters/ucsc_gene_bed_to_exon_bed.xml" />
</section>
<section id="fetchSeq" name="Fetch Sequences">
<tool file="extract/extract_genomic_dna.xml" />
</section>
<section id="fetchAlign" name="Fetch Alignments">
<tool file="maf/interval2maf_pairwise.xml" />
<tool file="maf/interval2maf.xml" />
<tool file="maf/interval_maf_to_merged_fasta.xml" />
<tool file="maf/genebed_maf_to_fasta.xml" />
<tool file="maf/maf_stats.xml" />
<tool file="maf/maf_thread_for_species.xml" />
<tool file="maf/maf_limit_to_species.xml" />
<tool file="maf/maf_limit_size.xml" />
<tool file="maf/maf_by_block_number.xml" />
<tool file="maf/maf_filter.xml" />
<tool file="maf/maf_reverse_complement.xml" />
</section>
<section id="scores" name="Get Genomic Scores">
<tool file="filters/wiggle_to_simple.xml" />
<tool file="stats/aggregate_binned_scores_in_intervals.xml" />
</section>
<section id="stats" name="Statistics">
<tool file="stats/gsummary.xml" />
<tool file="filters/uniq.xml" />
</section>
<section id="plots" name="Graph/Display Data">
<tool file="plotting/boxplot.xml" />
<tool file="maf/vcf_to_maf_customtrack.xml" />
</section>
<section id="hgv" name="Phenotype Association">
<tool file="evolution/codingSnps.xml" />
<tool file="evolution/add_scores.xml" />
<tool file="phenotype_association/sift.xml" />
<tool file="phenotype_association/linkToGProfile.xml" />
<tool file="phenotype_association/linkToDavid.xml" />
<tool file="phenotype_association/ldtools.xml" />
<tool file="phenotype_association/pass.xml" />
<tool file="phenotype_association/gpass.xml" />
<tool file="phenotype_association/beam.xml" />
<tool file="phenotype_association/lps.xml" />
<tool file="phenotype_association/master2pg.xml" />
</section>
<label id="ngs" text="NGS Toolbox Beta" />
<section id="cshl_library_information" name="NGS: QC and manipulation">
<label id="illumina" text="Illumina data" />
<label id="454" text="Roche-454 data" />
<label id="solid" text="AB-SOLiD data" />
<tool file="next_gen_conversion/solid2fastq.xml" />
<tool file="solid_tools/solid_qual_stats.xml" />
<tool file="solid_tools/solid_qual_boxplot.xml" />
<label id="generic_fastq" text="Generic FASTQ manipulation" />
<label id="fastx_toolkit_fastq" text="FASTX-Toolkit for FASTQ data" />
</section>
<section id="ngs_mapping" name="NGS: Mapping">
<label id="illumina" text="Illumina" />
<label id="roche_454" text="Roche-454" />
<label id="ab_solid" text="AB-SOLiD" />
</section>
<section id="samtools" name="NGS: SAM Tools">
</section>
-->
</toolbox>
#!/usr/bin/Rscript --vanilla
##############################################################
## Name : plot_plink.R ##
## Description: plot QQ plot and manhattan plot from Plink ##
## results ##
## Usage : ##
## plot_plink.R plink_result_FILE N_top_to_highlight ##
## ##
## Author : eTRIKS WP4 UL team ##
## Contact : wei.gu@uni.lu ##
##############################################################
## ------
## ------ command line args ------ ##
## ------
args = commandArgs(trailingOnly = TRUE)
plink_result = as.vector(args[1])
## ------
## ------ deal with input files ------ ##
## ------
#
# read plink results
#
pResult <- read.table (file=plink_result, header=T)
## ------
## ------ plot the results ------ ##
## ------
suppressMessages(library(qqman))
#
# QQ plot
#
png("qqplot.png",width=800,heigh=800)
qq(pResult$P, main="Q-Q Plot", cex=0.8, cex.axis=1.2)
dev.off()
#
# manhattan plot
#
color=colors()[c(12,24,41,26,48,116,111,258,552,12,24,41,26,48,116,111,258,552,12,24,41,26)]
png("manhattan.png",width=1000,heigh=600)
manhattan(pResult, main = "Manhattan Plot", cex = 0.8, cex.axis = 1.2, col = color )
dev.off()
#!/usr/bin/Rscript --vanilla
##############################################################
## Name : prepare_plink.R ##
## Description: Creates plink input file from tranSMART ##
## cohort selection export (clinical data) ##
## Usage : ##
## prepare_plink.R jobInfo_FILE subset1_clinical_FILE ##
## subset2_clinical_FILE ##
## ##
## Author : eTRIKS WP4 UL team ##
## Contact : wei.gu@uni.lu ##
##############################################################
## ------
## ------ command line args ------ ##
## ------
args = commandArgs(trailingOnly = TRUE)
jobFile = as.vector(args[1])
subset1_file = as.vector(args[2])
subset2_file = as.vector(args[3])
## ------
## ------ deal with input files ------ ##
## ------
#
# read jobFile to get job info: studyAccessions -> study_id
#
con <- file(jobFile, open = "r")
while (length(oneLine <- readLines(con, n = 1, warn = FALSE)) > 0) {
myVector <- strsplit(oneLine, " -> ")
if (myVector[[1]][1]=="\tstudyAccessions") {
study_id <- gsub("\\]","",gsub("\\[","",myVector[[1]][2]))
}
}
close(con)
#
# read PATIENT ID in both subsets
#
cohortData <- read.table (subset1_file, header=T, sep="\t")
idS1 <- as.vector (cohortData$PATIENT.ID)
cohortData <- read.table (subset2_file, header=T, sep="\t")
idS2 <- as.vector (cohortData$PATIENT.ID)
#
# generate phenotype file
#
pheno <- cbind(idS1,idS1,"1")
pheno <- rbind(pheno,cbind(idS2,idS2,"2"))
write.table(pheno,file="transmart.pheno",quote=F,sep=" ",row.names=F,col.names=F)
## ------
## ------ now the database part ------ ##
## ------
#
# set up mongodb connection details
#
#options(warn=-1)
suppressMessages(library(rmongodb))
host <- "127.0.0.1:27017"
username <- ""
password <- ""
db <- "tm_hd_data"
#
# connect to mongo
#
# the create function has the following signature
#mongo.create(host="127.0.0.1", name="", username="", password="", db="admin", timeout=0L)
mongo <- mongo.create(host=host, db=db)
#
# query DB (GridFS) to get the bed, bim and fam file
#
getFile <- function (gridfsCon, inFile, outFile){
FileCon <- mongo.gridfs.find(gridfsCon,inFile )
if (!is.null(FileCon)) {
outFile<- file (outFile)
mongo.gridfile.pipe(FileCon,outFile)
mongo.gridfile.destroy(FileCon)
}
}
if (mongo.is.connected(mongo)){
mgrids <- mongo.gridfs.create(mongo,db)
if (!is.null(mgrids)){
getFile(mgrids,paste(study_id,".bed",sep=""),"transmart.bed")
getFile(mgrids,paste(study_id,".bim",sep=""),"transmart.bim")
getFile(mgrids,paste(study_id,".fam",sep=""),"transmart.fam")
}
mongo.gridfs.destroy (mgrids)
}
#
# disconnect from mongo
#
mongo.destroy(mongo)
<tool id="plot_plink" name="Plot the GWAS results from PLINK" version="0.1">
<command>
${GALAXY_DATA_INDEX_DIR}/tranSMART/plot_plink.R $assocFile
</command>
<inputs>
<param format="txt" name="assocFile" type="data" label="association result file"/>
</inputs>
<outputs>
<data format="png" name="qqplot" label="QQ-Plot Figure" from_work_dir="qqplot.png" ></data>
<data format="png" name="mhplot" label="Manhattan-Plot Figure" from_work_dir="manhattan.png" ></data>
</outputs>
<help>
Plot GWAS analysis results from PLINK.
</help>
</tool>
<tool id="run_plink" name="Run GWAS with PLINK" version="0.1">
<command>
/usr/local/bin/plink --noweb --bed $bedfile --bim $bimfile --fam $famfile --pheno $phenofile --assoc --adjust --pfilter $p_cutoff
</command>
<inputs>
<param format="txt" name="bedfile" type="data" label="binary genotype file"/>
<param format="txt" name="bimfile" type="data" label="SNP mapping file"/>
<param format="txt" name="famfile" type="data" label="Sample info file" />
<param format="txt" name="phenofile" type="data" label="phenotype file" />
<param name="p_cutoff" type="float" value="0.05" min="0" max="1" label="P-value cutoff for unadjusted p-values" />
</inputs>
<outputs>
<data format="txt" name="log_File" label="log file" from_work_dir="plink.assoc" ></data>
<data format="txt" name="assoc_File" label="association result file" from_work_dir="plink.assoc"></data>
<data format="txt" name="adjusted_File" label="adjusted result file" from_work_dir="plink.assoc.adjusted"></data>
</outputs>
<help>
Perform GWAS analysis using PLINK.
</help>
</tool>
<?xml version="1.0"?>
<tool id="tm_plink_input" name="Create PLINK input from TM Export files" version="0.1">
<command>
${GALAXY_DATA_INDEX_DIR}/tranSMART/prepare_plink.R $jobInfo $subset_1 $subset_2
</command>
<inputs>
<param format="txt" name="jobInfo" type="data" label="job information"/>
<param format="tabular" name="subset_1" type="data" label="Subset 1"/>
<param format="tabular" name="subset_2" type="data" label="Subset 2" />
</inputs>
<outputs>
<data format="txt" name="bed_File" label="binary genotype file" from_work_dir="transmart.bed"></data>
<data format="txt" name="bim_File" label="SNP mapping file" from_work_dir="transmart.bim" ></data>
<data format="txt" name="fam_File" label="Sample info file" from_work_dir="transmart.fam"></data>
<data format="txt" name="pheno_File" label="phenotype file" from_work_dir="transmart.pheno"></data>
</outputs>
<help>
This tool reads exported clinical data as well as job information to build inputs for PLINK analysis.
</help>
</tool>
#!/usr/bin/env perl
##############################################################
## Name: Connect_PDmap.pl ##
## Description: It parses the output file i.e generated by ##
## differential marker analysis and extracts the gene names ##
## and expression values to PD map ##
## Usage: Connect_PDmap.pl <input_file> ##
## <output_file> ##
## Author: venkata.satagopam@uni.lu ##
##############################################################
###Modules loaded
use strict;
use warnings;
use LWP::Simple;
use HTTP::Request::Common;
use LWP::UserAgent;
#Argument variables:
my $file = $ARGV[0];
my $outfile = $ARGV[1];
my $expression_value="NAME\tVALUE\n";
my @set = ('0' ..'9', 'A' .. 'F');
my $identifier = join '' => map $set[rand @set], 1 .. 16;
## START Input file parser ##
open( FH, "$file" ) or die "Cannot open $file: $!";
my @all = <FH>;
close FH;
shift @all; # it removes header
open( my $ofh,'>', "$outfile" ) or die "Cannot open $outfile: $!";
my %result_hash = ();
while (@all) {
my $Line = shift @all;
chomp($Line);
my ($gene_symbol, $probe_id, $log_fc, $t, $p_value, $adj_p_value, $b_value) = split /\t/, $Line;
my $normalized_log_fc = "";
$result_hash{$gene_symbol} = $log_fc;
} # while (<FH>) {
my @log_fc_values = ();
for my $gene_symbol (keys %result_hash) {
push @log_fc_values, $result_hash{$gene_symbol};
}
@log_fc_values = sort {$a <=> $b} @log_fc_values;
print "log_fc_values : ", join("\t", @log_fc_values), "\n";
my $array_length = scalar(@log_fc_values);
my $min_log_fc_value = $log_fc_values[0];
my $max_log_fc_value = $log_fc_values[$array_length -1];
print "min_log_fc_value : $min_log_fc_value, max_log_fc_value : $max_log_fc_value\n";
my $expression_value="NAME\tVALUE\n";
for my $gene_symbol (keys %result_hash) {
my $log_fc_value = $result_hash{$gene_symbol};
if ($log_fc_value > 0) {
$log_fc_value = $log_fc_value / $max_log_fc_value;
}
elsif ($log_fc_value < 0) {
$log_fc_value = $log_fc_value / abs($min_log_fc_value);
}
$expression_value.="$gene_symbol\t$log_fc_value\n";
} # for my $gene_symbol (keys %result_hash) {
my @set = ('0' ..'9', 'A' .. 'F');
my $identifier = join '' => map $set[rand @set], 1 .. 16;
my $pdmap_servelet = "http://pg-sandbox.uni.lu:8080/MapViewer/galaxy.xhtml";
my $pdmap_ua = LWP::UserAgent->new();
my $pdmap_req = POST $pdmap_servelet,
Content => [
identifier => $identifier,
expression_value => $expression_value,
];
my $pdmap_response = $pdmap_ua->request($pdmap_req);
if ($pdmap_response->is_success()) {
my $pdmaped_content = $pdmap_response->content;
#print "pdmaped_content : $pdmaped_content\n";
print $ofh "<html>";
print $ofh "<head></head>";
print $ofh "<body>";
#print $ofh "<br><h1>pdmaped_content : $pdmaped_content</h1>";
print $ofh "<br><h3>Overlay of biomarkers on PD map</h3>";
print $ofh "<h3> Click the <a href=\"http://pg-sandbox.uni.lu:8080/MapViewer/?id=pdmap&layout=$identifier\">PD map</a> link</h3>";
print $ofh "</body>";
print $ofh "</html>";
} else {
print $pdmap_response->as_string;
print $ofh "<html>";
print $ofh "<head></head>";
print $ofh "<body>";
#print $ofh "<br><h1>$pdmap_response->as_string</h1>";
print $ofh "<br><h3>Overlay of biomarkers on Parkinson's Disease Map</h3>";
print $ofh "<h3> Click ---> <a href=\"http://pg-sandbox.uni.lu:8080/MapViewer/?id=pdmap&layout=Galaxy\">PD Map</a></h3>";
print $ofh "</body>";
print $ofh "</html>";
}
close $ofh;
## END Input file parser ##
#!/usr/bin/env perl
##############################################################
## Name: Create_GEX_outputfile.pl ##
## Description: Merges the two subset mrna.tsv subset files ##
## and puts the data into the correct format to be used ##
## as input by MarkerSelection.R ##
## Usage: Create_GEX_outputfile.pl <subset1_input_file> ##
## <subset2_input_file> <in/output dir> <data_filter>##
## Email: serge.eifes@uni.lu ##
## ##
## This work is licensed under the Creative Commons ##
## Attribution-NonCommercial-ShareAlike 4.0 ##
## International License. To view a copy of this license, ##
## visit http://creativecommons.org/licenses/by-nc-sa/4.0/. ##
##############################################################
###Modules loaded
use strict;
use warnings;
#Argument variables:
my $file_1=$ARGV[0];
my $file_2=$ARGV[1];
my $O_file=$ARGV[2];
#my $data_filter=$ARGV[3]; #Should be either "TRUE" or "FALSE". All values corresponding to -10 will be set to NA
my $data_filter="TRUE"; # test Wei
die "Not allowed argument: $data_filter. Value provided should be 'TRUE' or 'FALSE'" if(($data_filter ne "TRUE") && ($data_filter ne "FALSE"));
my @A_Subset_files;
push(@A_Subset_files, $file_1);
push(@A_Subset_files, $file_2);
my $counter_subset=1;
open(WH, ">$O_file") or die "Cannot open $O_file: $!";
foreach my $file (@A_Subset_files){
#Idx numbers for data columns we want to extract
my $Patient_ID_idx;
my $Value_idx;
my $Probe_ID_idx;
my $Gene_Symbol_idx;
open(FH, "$file") or die "Cannot open $file: $!";
print WH "PATIENT.ID\tVALUE\tPROBE.ID\tGENE_SYMBOL\tSUBSET\n";
while(<FH>){
my $Line = $_;
chomp($Line);
my (@A_Line) = split /\t/, $Line;
#Retrieving col indices from header for the data we need
if($Line=~/PATIENT/){
for(my $i=0; $i<=$#A_Line; $i++){
if($A_Line[$i] eq "PATIENT ID"){
$Patient_ID_idx = $i;
} elsif($A_Line[$i] eq "LOG2E"){
$Value_idx = $i;
} elsif(($A_Line[$i] eq "PROBE") || ($A_Line[$i] eq "ANNOTATIONID")){
$Probe_ID_idx = $i;
} elsif($A_Line[$i] eq "GENE SYMBOL"){
$Gene_Symbol_idx = $i;
} elsif($A_Line[$i] eq "MIRNA ID"){
$Probe_ID_idx = $i;
$Gene_Symbol_idx = $i;
}
}
next;
}
my $Patient_ID = $A_Line[$Patient_ID_idx];
my $Value = $A_Line[$Value_idx];
my $Probe_ID = $A_Line[$Probe_ID_idx];
my $Gene_Symbol = $A_Line[$Gene_Symbol_idx];
if($data_filter eq "TRUE"){
$Value="NA" if$Value==-10;
}
$Gene_Symbol="NA" if($Gene_Symbol eq "null");
my $Subset;
$Subset = "S1" if($counter_subset==1);
$Subset = "S2" if($counter_subset==2);
print WH $Subset . "_" . $Patient_ID . "\t" . $Value
. "\t" . $Probe_ID . "\t" . $Gene_Symbol
. "\t" . $Subset . "\n";
}
close FH;
$counter_subset++;
}
close WH;
#!/usr/bin/env perl
##############################################################
## Name: Create_VCF_Allele_matrix.pl ##
## Description: Creates input file for subset to ##
## perform the Fisher exact test to detect significant ##
## deviations in the proportions for minor and major ##
## alleles between two subsets. To perform the analysis, ##
##two subset files need to be generated by this script ##
## Usage: Create_VCF_Allele_matrix.pl <input_file> ##
## <output_file> ##
## Author: serge.eifes@uni.lu ##
##############################################################
###Modules loaded
use strict;
use warnings;
#Argument variables:
my $file = $ARGV[0];
my $O_file = $ARGV[1];
#To store the corresponding column num for given col
my %H_col2colNum;
#Hash to store data extracted from input file
my %HoH_MarkerAndPatient2Variant;
#Names of the columsn we need:
my @A_colnames = ("PATIENT ID", "VARIANT", "CHROMOSOME", "POSITION");
## START Input file parser ##
open( FH, "$file" ) or die "Cannot open $file: $!";
while (<FH>) {
my $Line = $_;
chomp($Line);
my (@A_Line) = split /\t/, $Line;
if(($Line=~/PATIENT ID/) && ($Line=~/VARIANT/) && ($Line=~/CHROMOSOME/) && ($Line=~/POSITION/)){
foreach my $colname (@A_colnames){
$H_col2colNum{$colname} = get_col_num($colname, \@A_Line);
}
next;
}
my $Marker = $A_Line[$H_col2colNum{"CHROMOSOME"}] . "_" . $A_Line[$H_col2colNum{"POSITION"}];
my $Patient = $A_Line[$H_col2colNum{"PATIENT ID"}];
my $Variant = $A_Line[$H_col2colNum{"VARIANT"}];
$HoH_MarkerAndPatient2Variant{$Marker}{$Patient}=$Variant;
}
close FH;
## END Input file parser ##
## START Storing all Markers and patients in array ##
my @A_markers;
my @A_patients;
for my $Marker (sort keys %HoH_MarkerAndPatient2Variant ) {
for my $Patient (sort keys %{$HoH_MarkerAndPatient2Variant{$Marker}}){
push(@A_markers, $Marker);
push(@A_patients, $Patient);
# print $Patient . "\n";
}
}
my @A_uniq_markers = uniq(@A_markers);
my @A_uniq_patients = uniq(@A_patients);
#print $#A_patients . "\n";
#print $#A_uniq_patients . "\n";
#
#print $#A_markers . "\n";
#print $#A_uniq_markers . "\n";
my @AoA;
###Storing data in two dim array:
#Rownames
$AoA[0][0]="//";
my $i=1;
foreach my $Marker (@A_uniq_markers){
$AoA[$i][0]=$Marker;
$i++;
}
#Colnames
my $j=1;
foreach my $Patient (@A_uniq_patients){
$AoA[0][$j]=$Patient;
$j++;
}
##Filling the "matrix" with the variant data
my $col = 1;
foreach my $Patient (@A_uniq_patients){
my $row= 1;
foreach my $Marker (@A_uniq_markers){
if (exists($HoH_MarkerAndPatient2Variant{$Marker}{$Patient})){
$AoA[$row][$col] = $HoH_MarkerAndPatient2Variant{$Marker}{$Patient};
} else {
$AoA[$row][$col] = "NA";
}
# print $row . "--" . $col . "::" . $HoH_MarkerAndPatient2Variant{$Marker}{$Patient} . "\n";
$row++;
}
$col++;
}
## END Storing all Markers and patients in array ##
## START Printing out the results/matrix
open(WH, ">$O_file") or die "Cannot create $O_file: $!";
for $i ( 0 .. $#AoA ) {
print WH join("\t", @{$AoA[$i]}) . "\n";
}
close WH;
## END Printing out the results/matrix
### FUNCTIONS ###
sub get_col_num{
my $colname2match = $_[0];
my @A_header_line = @{$_[1]};
my $col_num=0;
foreach my $colname (@A_header_line){
if($colname2match eq $colname){
return($col_num);
} else{
$col_num++;
}
}
die "Column $colname2match not found. Please check the input file header!!!";
}
##Eliminate duplicate items in array:
sub uniq {
my %seen;
return grep { !$seen{$_}++ } @_;
}
#!/usr/bin/env Rscript
###########################################################################
# Copyright 2008-2012 Janssen Research & Development, LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
###########################################################################
#Supress printing of warnings
options(warn=-1)
###########################################################################
#Comparative Marker Selection
##########################################################################
#Function to obtain the aligned positions between all items in the vectorToAlign
#relative to the vectorRef
get.reordered_index = function(vectorToAlign, vectorRef){
vectorToAlign = as.vector(vectorToAlign)
vectorRef = as.vector(vectorRef)
res = vector(mode="integer", length=length(vectorToAlign))
for(i in 1:length(res)){
idx = which(vectorToAlign == vectorRef[i])
res[i]=idx
}
return(res)
}
## Main function to perform DEG analysis:
MS.loader <- function(
input.filename="outputfile.txt",
output.file ="CMS.TXT",
numberOfMarkers = 100,
out.heatmap ="heatmapdata",
mhcCorrection = 2
)
{
##########################################
#We need this to do the ddply below.
suppressMessages(library(plyr))
suppressMessages(library(reshape2))
suppressMessages(library(limma))
#---------------------
#PREPARE RAW DATA
#Pull the GEX data from the file.
mRNAData <- data.frame(read.delim(input.filename))
#Trim the probe.id field.
mRNAData$PROBE.ID <- gsub("^\\s+|\\s+$", "",mRNAData$PROBE.ID)
mRNAData$GENE_SYMBOL <- gsub("^\\s+|\\s+$", "",mRNAData$GENE_SYMBOL)
mRNAData$PATIENT.ID <- gsub("^\\s+|\\s+$", "",mRNAData$PATIENT.ID)
#Getting rid of the probesets w/o associated gene symbol:
idx_wo_symbol = which(is.na(mRNAData$GENE_SYMBOL))
#print(length(idx_wo_symbol))
if(length(idx_wo_symbol)>0){
mRNAData=mRNAData[-idx_wo_symbol,]
}
idx_wo_symbol = which(mRNAData$GENE_SYMBOL=="null")
#print(length(idx_wo_symbol))
if(length(idx_wo_symbol)>0){
mRNAData=mRNAData[-idx_wo_symbol,]
}
#Create a data.frame with unique probe/gene ids.
geneStatsData <- data.frame(mRNAData$PROBE.ID,mRNAData$GENE_SYMBOL);
#Add a column name to our data.frame.
colnames(geneStatsData) <- c('PROBE.ID','GENE_SYMBOL')
geneStatsData <- unique(geneStatsData[,c("PROBE.ID","GENE_SYMBOL")]);
#---------------------
#---------------------
#Prepare the casted raw data.
#Get a copy of the raw data.
coercedData <- mRNAData
#Grab only the columns we need for doing the melt/cast.
coercedData <- coercedData[c('PATIENT.ID','VALUE','PROBE.ID','GENE_SYMBOL')]
#Melt the data, leaving 2 columns as the grouping fields.
meltedData <- melt(coercedData, id=c("PROBE.ID","GENE_SYMBOL","PATIENT.ID"))
#Cast the data into a format that puts the PATIENT.ID in a column.
coercedData <- data.frame(dcast(meltedData, PROBE.ID + GENE_SYMBOL ~ PATIENT.ID))
#The PATIENT.ID column needs to be removed if exists in the matrix!!!!!!!
idx.patient_id = which(colnames(coercedData)=="PATIENT.ID")
if(length(idx.patient_id)==1){
coercedData = coercedData[,-idx.patient_id]
}
#When we convert to a data frame the numeric columns get an x in front of them. Remove them here.
colnames(coercedData) <- sub("^X","",colnames(coercedData))
#Get a gene list that we can use later to preserve the list of the genes.
geneList <- as.vector(coercedData$GENE_SYMBOL)
probeList <- as.vector(coercedData$PROBE.ID)
#---------------------
#---------------------
#Fitting linear model with limma
#Remove the gene_symbol and probe.id columns.
coercedDataWithoutGroup <- data.matrix(subset(coercedData, select=-c(GENE_SYMBOL,PROBE.ID)))
rownames(coercedDataWithoutGroup)=coercedData$PROBE.ID # Rownames have to be added
#Creating a named vector for mapping PROBE.ID to GENE.SYMBOL
gene.symbols=coercedData$GENE_SYMBOL
names(gene.symbols)=coercedData$PROBE.ID
#Get a vector representing our subsets.
classVector <- colnames(coercedDataWithoutGroup)
classVector <- gsub("^S1.*","0",classVector)
classVector <- gsub("^S2.*","1",classVector)
classVector <- as.numeric(classVector)
#Check the class vector to verify we have two subsets.
if(length(unique(classVector)) < 2) stop("||FRIENDLY||There is only one subset selected, please select two in order to run the comparative analysis.")
if(mhcCorrection==0){
mhcMethod="none"
}else if(mhcCorrection==1){
mhcMethod="BH"
}else if(mhcCorrection==2){
mhcMethod="BY"
}else if(mhcCorrection==3){
mhcMethod="Holm"
}
#LINEAR MODEL FITTING
#Creating Design matrix
S1 = integer(length(classVector))
S1[which(classVector==0)]=1
S2 = classVector
design <- cbind(S1=S1,S2=S2)
#... and contrast matrix
contrast.matrix = makeContrasts(S1-S2, levels=design)
# Linear model fitting
fit <- lmFit(coercedDataWithoutGroup, design)
fit <- contrasts.fit(fit, contrast.matrix)
fit <- eBayes(fit)
#The data structure of topTable for newer versions of limma has been modified.
#This causes errors during execution of the script. A workarround by creating a data structure identical to topTable compatible with tranSMART is created
contr=1 #The contrast in the fitting to be used for the results table
#Results data frame which provides all required statistics for output data
top.fit = data.frame(
ID=rownames(fit$coefficients), #Depending on limma version fit$genes doesn't exist anymore
logFC=fit$coefficients[,contr],
t=fit$t[,contr],
P.Value=fit$p.value[,contr],
adj.P.val=p.adjust(p=fit$p.value[,contr], method=mhcMethod),
B=fit$lods[,contr]
)
top.fit.ranked.decr = top.fit[ order(top.fit$B, decreasing=T), ] #Ordering the data according to limma B statistic
rownames(top.fit.ranked.decr) = NULL
top.fit.ranked.decr.filt = top.fit.ranked.decr[1:numberOfMarkers,] #Filtering down the results table to the number of genes selected by user
topgenes = cbind(gene.symbols[top.fit.ranked.decr.filt$ID], top.fit.ranked.decr.filt)
colnames(topgenes) = c("GENE_SYMBOL", "PROBE.ID", "logFC", "t", "P.value", "adj.P.val", "B")
rownames(topgenes) = NULL #This is the final results data object for Marker Selection workflow containing all data displayed in output table
#print(top.fit.ranked.decr.filt)
# End Linear model fitting
#---------------------
#---------------------
#HEATMAP
## Generating heatmap output data:
heatmapData = coercedData[which(coercedData[,"PROBE.ID"] %in% topgenes[,"PROBE.ID"]), ]
finalHeatmapData = heatmapData[, -c(1,2)]
GROUP = paste(as.vector(heatmapData[, "GENE_SYMBOL"]), as.vector(heatmapData[, "PROBE.ID"]), sep=" ")
finalHeatmapData = cbind(GROUP, finalHeatmapData)
#Here we align the lines in finalHeatmapData to topgenes according to
#the gene symbols in both matrices
s = strsplit(as.character(finalHeatmapData$GROUP), " ")
d = vector(mode="character", length=length(s))
for(i in 1:length(s)){
d[i] = s[[i]][2]
}
idx = get.reordered_index(d, topgenes$PROBE.ID)
finalHeatmapData = finalHeatmapData[ idx, ]
#WRITE TO FILE
#Write the file with the stats by gene. This will get read into the UI.
write.table(topgenes,output.file,sep = "\t",quote=F,row.names=F)
#Write the data file we will use for the heatmap.
write.table(finalHeatmapData,out.heatmap,sep = "\t",quote=F,row.names=F)
##########################################
}
#### Executing the main job to obtain the DEG and corresponding heatmap file: ####
#Getting command line arguments
args = commandArgs(trailingOnly = TRUE)
input_file=as.vector(args[1])
output_file=as.vector(args[2])
numberOfMarkers=as.vector(args[3])
out.heatmap=as.vector(args[4])
MS.loader(input_file, output_file,numberOfMarkers, out.heatmap)
#!/usr/bin/env Rscript
##############################################################
## Name: VCF_FE_test.R ##
## Description: Performs the Fisher exact test to detect ##
## significant deviations in the proportions for minor ##
## and major alleles between the two subsets provided as ##
## input files. The script Create_VCF_Allele_matrix.pl ##
## generates the two input files for this script. Each ##
## input file provides the data for one cohort/subset ##
## Usage: VCF_FE_test.R <input_file_1> <input_file_2> ##
## <output_file> ##
## Author: Serge Eifes (University of Luxembourg) ##
## Email: serge.eifes@uni.lu ##
## ##
## This work is licensed under the Creative Commons ##
## Attribution-NonCommercial-ShareAlike 4.0 ##
## International License. To view a copy of this license, ##
## visit http://creativecommons.org/licenses/by-nc-sa/4.0/. ##
##############################################################
#######################################
#######################################
### START FUNCTIONS ####
get_input_data = function(file){
data = as.matrix(read.delim(file, header=F))
#Adding column and row names
data_w_names = data[-1,-1]
rownames = data[-1,1]
colnames = t(data[1,-1])
rownames(data_w_names) = rownames
colnames(data_w_names) = colnames
return(data_w_names)
}
split_alleles = function(row){
row = as.vector(row)
num_patients = length(row)
row.split.list = strsplit(row, "\\/", perl=T)
all_alleles_per_marker = vector(mode="character", length=num_patients*2)
counter=1
for(i in 1:num_patients){
patient_alleles = row.split.list[[i]]
for(j in 1:2){
allele = patient_alleles[j]
all_alleles_per_marker[counter] = allele
counter=counter+1
}
}
return(table(all_alleles_per_marker))
}
create_subset_count_list = function(data){
subset = list()
markers = rownames(data)
for(i in 1:dim(data)[1]){
datarow = data[i,]
marker = markers[i]
nt_counts = split_alleles(datarow)
nt_count_matrix = as.matrix(nt_counts)
subset[[marker]] = nt_count_matrix
}
return(subset)
}
get_sorted_count_vec = function(subset_data){
alleles_subset = rownames(subset_data)
subset_vec = as.vector(subset_data[,1])
names(subset_vec) = alleles_subset
subset_vec_sorted = sort(subset_vec, method="quick")
#When multiple minor alleles exist in one subset ... merge them:
if(length(subset_vec_sorted)>2){
subset_vec_sorted = merge_minor_allele_counts(subset_vec_sorted)
}
return(subset_vec_sorted)
}
merge_minor_allele_counts = function(allele_count_vec){
#Create new allele count vector with minor and major allele count by merging the minor ones:
merged_allele_count_vec = c(
sum(allele_count_vec[1:(length(allele_count_vec)-1)]),
allele_count_vec[length(allele_count_vec)]
)
names.merged.allele = paste(names(allele_count_vec[1:(length(allele_count_vec)-1)]), collapse="/")
return(merged_allele_count_vec)
}
#Checks sorted_count_vec if minor allele (with count 0) needs to be added
check_sorted_count_vec_for_minor = function(sorted_count_vec_a){
if(length(sorted_count_vec_a)==1){
#Need to add minor allele with count 0 if it exists in other subset
sorted_count_vec = vector(mode="integer", length=2)
sorted_count_vec[2] = sorted_count_vec_a[1]
sorted_count_vec[1] = 0
sorted_count_vec_a = sorted_count_vec
}
return(sorted_count_vec_a)
}
### END FUNCTIONS ######
#######################################
#######################################
#######################################
### START MAIN CODE ####
#Getting command line arguments
args = commandArgs(trailingOnly = TRUE)
input_file_1=as.vector(args[1])
input_file_2=as.vector(args[2])
output_file=as.vector(args[3])
data_1 = get_input_data(input_file_1)
data_2 = get_input_data(input_file_2)
#Test case for debugging
# print(data_1["1_3396099",])
# print("___")
# print(data_2["1_3396099",])
subset_list_1 = create_subset_count_list(data_1)
subset_list_2 = create_subset_count_list(data_2)
markers_subset_list_1 = names(subset_list_1)
markers_subset_list_2 = names(subset_list_2)
# In this matrix the results of the FE test for each marker
#will be stored
results = matrix(c("Chr", "Position",
"S1_Minor_AC", "S1_Major_AC",
"S2_Minor_AC", "S2_Major_AC",
"OR", "P-value"),
ncol=8, nrow=1)
#Iterating over each marker and performing FE test to compare
#minor and major allele frequencies in both subsets
for(i in 1:length(markers_subset_list_1)){
subset_1_data = subset_list_1[[markers_subset_list_1[i]]]
subset_2_data = subset_list_2[[markers_subset_list_1[i]]]
sorted_count_vec_1 = get_sorted_count_vec(subset_1_data)
sorted_count_vec_2 = get_sorted_count_vec(subset_2_data)
#Adding minor allele count (equals 0) to sorted_count_vec_1 if not existing in vector:
if(length(sorted_count_vec_1)==1){
#Need to add minor allele with count 0 if it exists in other subset
cp_sorted_count_vec_2 = sorted_count_vec_2
cp_sorted_count_vec_2[2] = sorted_count_vec_1[1]
cp_sorted_count_vec_2[1] = 0
sorted_count_vec_1 = cp_sorted_count_vec_2
}
#Adding minor allele count (equals 0) to sorted_count_vec_1 if not existing:
if(length(sorted_count_vec_2)==1){
#Need to add minor allele with count 0 if it exists in other subset
cp_sorted_count_vec_1 = sorted_count_vec_1
cp_sorted_count_vec_1[2] = sorted_count_vec_2[1]
cp_sorted_count_vec_1[1] = 0
sorted_count_vec_2 = cp_sorted_count_vec_1
}
#In case one of the subsets contains no or multiple minor alleles
#we rename the minor allele of the other subset according to this one
if(nchar(names(sorted_count_vec_1)[1]) > nchar(names(sorted_count_vec_2)[1])){
names(sorted_count_vec_2) = names(sorted_count_vec_1)
}
if(nchar(names(sorted_count_vec_2)[1]) > nchar(names(sorted_count_vec_1)[1])){
names(sorted_count_vec_1) = names(sorted_count_vec_2)
}
##2x2 contingency table
input.matrix = matrix(nrow=2, ncol=2)
input.matrix[1,] = sorted_count_vec_1
input.matrix[2,] = sorted_count_vec_2
rownames(input.matrix) = c("subset_1", "subset_2")
colnames(input.matrix) = names(sorted_count_vec_1)
#FE test
res.test = fisher.test(input.matrix)
p.value = res.test$p.value
odds_ratio = res.test$estimate
# print(input.matrix)
# print(res.test)
#Reformating and storing results in a matrix
marker_split = unlist(strsplit(markers_subset_list_1[i], "_"))
results = rbind(results, c(marker_split[1], marker_split[2],
input.matrix[1,1], input.matrix[1,2],
input.matrix[2,1], input.matrix[2,2],
odds_ratio, p.value)
)
}
#Printing results to file ordered by p-value
results_header = as.vector(results[1,])
results = results[-1,]
colnames(results) = results_header
results = results[order(results[,"P-value"]), ]
##Adding fdr MHC:
fdr = p.adjust(results[,"P-value"], method="fdr")
results = cbind(results, FDR=fdr)
results = results[1:min(nrow(results),20),]
write.table(results, output_file, quote=F, sep="\t", row.names=F, col.names=T)
#print(warnings())
### END MAIN CODE ####
#######################################
#!/usr/bin/Rscript --vanilla
# user input
fileName <- "mrna.tsv"
# check command line parameters
args <- strsplit(commandArgs(TRUE), split='=')
keys <- vector ("character")
if (length(args) > 0) {
for (i in 1:length(args)) {
key <- args[[i]][1]
value <- args[[i]][2]
keys <- c(keys, key)
if (exists(key)) {
# replace default value of key with input value
assign(key, value)
}else {
cat("\n")
stop(paste("Unrecognized option [",key,"].\n\n", sep=""))
}
}
}
a<-read.table(fileName,sep="\t")
write.table(head(a),file="outputfile.txt")
This diff is collapsed.
<tool id="tm_fe_vcf_test" name="Do Fisher test on VCF files" version="0.1">
<command>
${GALAXY_DATA_INDEX_DIR}/Rscripts/VCF_FE_test.R $input_s1 $input_s2 $output
</command>
<inputs>
<param format="tabular" name="input_s1" type="data" label="vcf_subset_1"/>
<param format="tabular" name="input_s2" type="data" label="vcf_subset_2"/>
</inputs>
<outputs>
<!-- option q selected -->
<data format="tabular" name="output" ></data>
</outputs>
<help>
Do Fisher's test on VCF data
</help>
</tool>
<tool id="tm_marker_selection_mrna" name="Do Marker Selection on TM Export mRNA files" version="0.1">
<command>
${GALAXY_DATA_INDEX_DIR}/Rscripts/MarkerSelection.R $input $output $nmarker $heatmap
</command>
<inputs>
<param format="tabular" name="input" type="data" label="subset_1_and_2"/>
<param name="nmarker" type="integer" value="50" label="Number of Markers"/>
</inputs>
<outputs>
<!-- option q selected -->
<data format="tabular" name="output" ></data>
<data format="tabular" name="heatmap" ></data>
</outputs>
<help>
Do Marker Selection using Limma
</help>
</tool>
<tool id="testrscript" name="Test R script" version="0.1">
<command>
${GALAXY_DATA_INDEX_DIR}/Rscripts/test.R fileName=$input_1
</command>
<inputs>
<param format="tabular" name="input_1" type="data" label="Subset 1"/>
</inputs>
<outputs>
<!-- option q selected -->
<data format="tabular" name="output" label="label_written_to_the_user" from_work_dir="outputfile.txt"></data>
</outputs>
<help>
Some tool description
</help>
</tool>
<tool id="tm_create_input_VCF" name="Create VCF input from TM Export files" version="0.1">
<command>
${GALAXY_DATA_INDEX_DIR}/Rscripts/Create_VCF_Allele_matrix.pl $input $output
</command>
<inputs>
<param format="tabular" name="input" type="data" label="vcf_input"/>
</inputs>
<outputs>
<!-- option q selected -->
<!-- <data format="tabular" name="output" label="label_written_to_the_user" from_work_dir="outputfile.txt"></data> -->
<data format="tabular" name="output" ></data>
</outputs>
<help>
Some tool description
</help>
</tool>
<tool id="tm_create_input" name="Create input from TM Export files" version="0.1">
<command>
${GALAXY_DATA_INDEX_DIR}/Rscripts/Create_MarkerSelection_inputfile.pl $input_s1 $input_s2 $output
</command>
<inputs>
<param format="tabular" name="input_s1" type="data" label="Subset 1"/>
<param format="tabular" name="input_s2" type="data" label="Subset 2" />
</inputs>
<outputs>
<!-- option q selected -->
<!-- <data format="tabular" name="output" label="label_written_to_the_user" from_work_dir="outputfile.txt"></data> -->
<data format="tabular" name="output" ></data>
</outputs>
<help>
Some tool description
</help>
</tool>