diff --git a/rules/AMR/filter.pl b/rules/AMR/filter.pl deleted file mode 100755 index 07f81706b31575963a9633f86e904cd40cb08431..0000000000000000000000000000000000000000 --- a/rules/AMR/filter.pl +++ /dev/null @@ -1,19 +0,0 @@ -#!/usr/bin/env perl - -use strict; -use warnings; - -my $minlen = shift or die "Error: `minlen` parameter not provided\n"; -{ - local $/=">"; - while(<>) { - chomp; - next unless /\w/; - s/>$//gs; - my @chunk = split /\n/; - my $header = shift @chunk; - my $seqlen = length join "", @chunk; - print ">$_" if($seqlen >= $minlen); - } - local $/="\n"; -} diff --git a/rules/Virulence/virulence_prediction.py b/rules/Virulence/virulence_prediction.py deleted file mode 100755 index ca90232ec9191e9e327a79c0d218ab6680d4261b..0000000000000000000000000000000000000000 --- a/rules/Virulence/virulence_prediction.py +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env python - -import Bio -from Bio import SeqIO -import re -import pandas as pd -#import argparse -import numpy as np -from sklearn import model_selection -from sklearn.ensemble import RandomForestClassifier -import pickle - -#from PyBioMed.PyProtein import AAComposition - -AALetter = ["A", "R", "N", "D", "C", "E", "Q", "G", "H", "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V"] - -def CalculateAAComposition(ProteinSequence): - LengthSequence = len(ProteinSequence) - Result={} - for i in AALetter: - Result[i] = round(float(ProteinSequence.seq.count(i))/ LengthSequence * 100, 2) - return Result - -def CalculateDipeptideComposition(ProteinSequence): - LengthSequence = len(ProteinSequence) - Result = {} - for i in AALetter: - for j in AALetter: - Dipeptide = i + j - Result[Dipeptide] = round(float(ProteinSequence.seq.count(Dipeptide)) / (LengthSequence -1) * 100, 2) - - return Result - -def CalculateKmerComposition(ProteinSequence): - LengthSequence = len(ProteinSequence) - Result = {} - for i in AALetter: - for j in AALetter: - for k in AALetter: - Kmer = i + j + k - Result[Kmer] = round(float(ProteinSequence.seq.count(Kmer)) / (LengthSequence -2) * 100, 2) - return Result - -def CalculateAADipeptideComposition(ProteinSequence): - Result = {} - Result.update(CalculateAAComposition(ProteinSequence)) - Result.update(CalculateDipeptideComposition(ProteinSequence)) - Result.update(CalculateKmerComposition(ProteinSequence)) - return Result - - -#if __name__ == "__main__": -# parser = argparse.ArgumentParser(description='Process fasta input with random forest virulence prediction model') -# parser.add_argument('infile', metavar='infile', type=str, -# help='The input file (FASTA format)') -# parser.add_argument('outfile', metavar='outfile', type=str, -# help='The outpufile (TSV format)') -# args = parser.parse_args() - -df = pd.DataFrame([]) - -for Sequence in SeqIO.parse(snakemake.input[0], "fasta"): - ProteinSequence = Sequence - DIP = CalculateAADipeptideComposition(ProteinSequence) - data = DIP - data = pd.DataFrame(data.items()) - data = data.transpose() - - data.columns=data.iloc[0] - data = data.drop(data.index[[0]]) - data['Sequence'] = Sequence.id - df =df.append(data) - rfc = pickle.load(open("finalized_model_3f.sav", 'rb')) - -X_predict = df.drop('Sequence', axis = 1) -rfc_predict = rfc.predict(X_predict) -rfc_probs = rfc.predict_proba(X_predict)[:,1] - -Prediction = pd.DataFrame([]) -dfToList = df['Sequence'].tolist() -Prediction['Sequence'] = dfToList -Prediction['Prediction'] = rfc_predict -Prediction['Probability'] = rfc_probs -Prediction.to_csv(snakemake.output[0], sep='\t') - - - diff --git a/rules/AMR/AMR_MGE.R b/scripts/AMR_MGE.R similarity index 100% rename from rules/AMR/AMR_MGE.R rename to scripts/AMR_MGE.R diff --git a/rules/Universal/PathoFact.R b/scripts/PathoFact.R similarity index 100% rename from rules/Universal/PathoFact.R rename to scripts/PathoFact.R diff --git a/rules/Virulence/hmm.R b/scripts/hmm.R similarity index 100% rename from rules/Virulence/hmm.R rename to scripts/hmm.R diff --git a/rules/Toxin/ownHMM_library.R b/scripts/ownHMM_library.R similarity index 100% rename from rules/Toxin/ownHMM_library.R rename to scripts/ownHMM_library.R diff --git a/scripts/virulence_prediction.py b/scripts/virulence_prediction.py index b92a099329d9f70fbe31b6f15436149a813c06f6..ca90232ec9191e9e327a79c0d218ab6680d4261b 100755 --- a/scripts/virulence_prediction.py +++ b/scripts/virulence_prediction.py @@ -1,34 +1,87 @@ #!/usr/bin/env python +import Bio +from Bio import SeqIO import re import pandas as pd -import argparse +#import argparse import numpy as np +from sklearn import model_selection from sklearn.ensemble import RandomForestClassifier import pickle +#from PyBioMed.PyProtein import AAComposition -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Process fasta input with random forest virulence prediction model') - parser.add_argument('infile', metavar='infile', type=str, - help='The input file (FASTA format)') - parser.add_argument('outfile', metavar='outfile', type=str, - help='The outpufile (TSV format)') - args = parser.parse_args() +AALetter = ["A", "R", "N", "D", "C", "E", "Q", "G", "H", "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V"] - df = pd.read_csv(args.infile, sep='\t') - X = df.drop('#', axis=1) +def CalculateAAComposition(ProteinSequence): + LengthSequence = len(ProteinSequence) + Result={} + for i in AALetter: + Result[i] = round(float(ProteinSequence.seq.count(i))/ LengthSequence * 100, 2) + return Result + +def CalculateDipeptideComposition(ProteinSequence): + LengthSequence = len(ProteinSequence) + Result = {} + for i in AALetter: + for j in AALetter: + Dipeptide = i + j + Result[Dipeptide] = round(float(ProteinSequence.seq.count(Dipeptide)) / (LengthSequence -1) * 100, 2) + + return Result + +def CalculateKmerComposition(ProteinSequence): + LengthSequence = len(ProteinSequence) + Result = {} + for i in AALetter: + for j in AALetter: + for k in AALetter: + Kmer = i + j + k + Result[Kmer] = round(float(ProteinSequence.seq.count(Kmer)) / (LengthSequence -2) * 100, 2) + return Result + +def CalculateAADipeptideComposition(ProteinSequence): + Result = {} + Result.update(CalculateAAComposition(ProteinSequence)) + Result.update(CalculateDipeptideComposition(ProteinSequence)) + Result.update(CalculateKmerComposition(ProteinSequence)) + return Result + + +#if __name__ == "__main__": +# parser = argparse.ArgumentParser(description='Process fasta input with random forest virulence prediction model') +# parser.add_argument('infile', metavar='infile', type=str, +# help='The input file (FASTA format)') +# parser.add_argument('outfile', metavar='outfile', type=str, +# help='The outpufile (TSV format)') +# args = parser.parse_args() + +df = pd.DataFrame([]) - rfc = pickle.load(open("scripts/Virulence_factor_model.sav", 'rb')) - rfc_predict = rfc.predict(X) - rfc_probs = rfc.predict_proba(X)[:,1] +for Sequence in SeqIO.parse(snakemake.input[0], "fasta"): + ProteinSequence = Sequence + DIP = CalculateAADipeptideComposition(ProteinSequence) + data = DIP + data = pd.DataFrame(data.items()) + data = data.transpose() + + data.columns=data.iloc[0] + data = data.drop(data.index[[0]]) + data['Sequence'] = Sequence.id + df =df.append(data) + rfc = pickle.load(open("finalized_model_3f.sav", 'rb')) + +X_predict = df.drop('Sequence', axis = 1) +rfc_predict = rfc.predict(X_predict) +rfc_probs = rfc.predict_proba(X_predict)[:,1] - Prediction = pd.DataFrame([]) - dfToList = df['#'].tolist() - Prediction['Sequence'] = dfToList - Prediction['Prediction'] = rfc_predict - Prediction['Probability'] = rfc_probs - Prediction.to_csv(args.outfile, sep='\t', header=False) +Prediction = pd.DataFrame([]) +dfToList = df['Sequence'].tolist() +Prediction['Sequence'] = dfToList +Prediction['Prediction'] = rfc_predict +Prediction['Probability'] = rfc_probs +Prediction.to_csv(snakemake.output[0], sep='\t')