Commit ba3bf94b by Valentina Galata

### mv scripts in rules/ to scripts/ (issue #15)

parent f2d5c20f
 #!/usr/bin/env perl use strict; use warnings; my \$minlen = shift or die "Error: `minlen` parameter not provided\n"; { local \$/=">"; while(<>) { chomp; next unless /\w/; s/>\$//gs; my @chunk = split /\n/; my \$header = shift @chunk; my \$seqlen = length join "", @chunk; print ">\$_" if(\$seqlen >= \$minlen); } local \$/="\n"; }
 #!/usr/bin/env python import Bio from Bio import SeqIO import re import pandas as pd #import argparse import numpy as np from sklearn import model_selection from sklearn.ensemble import RandomForestClassifier import pickle #from PyBioMed.PyProtein import AAComposition AALetter = ["A", "R", "N", "D", "C", "E", "Q", "G", "H", "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V"] def CalculateAAComposition(ProteinSequence): LengthSequence = len(ProteinSequence) Result={} for i in AALetter: Result[i] = round(float(ProteinSequence.seq.count(i))/ LengthSequence * 100, 2) return Result def CalculateDipeptideComposition(ProteinSequence): LengthSequence = len(ProteinSequence) Result = {} for i in AALetter: for j in AALetter: Dipeptide = i + j Result[Dipeptide] = round(float(ProteinSequence.seq.count(Dipeptide)) / (LengthSequence -1) * 100, 2) return Result def CalculateKmerComposition(ProteinSequence): LengthSequence = len(ProteinSequence) Result = {} for i in AALetter: for j in AALetter: for k in AALetter: Kmer = i + j + k Result[Kmer] = round(float(ProteinSequence.seq.count(Kmer)) / (LengthSequence -2) * 100, 2) return Result def CalculateAADipeptideComposition(ProteinSequence): Result = {} Result.update(CalculateAAComposition(ProteinSequence)) Result.update(CalculateDipeptideComposition(ProteinSequence)) Result.update(CalculateKmerComposition(ProteinSequence)) return Result #if __name__ == "__main__": # parser = argparse.ArgumentParser(description='Process fasta input with random forest virulence prediction model') # parser.add_argument('infile', metavar='infile', type=str, # help='The input file (FASTA format)') # parser.add_argument('outfile', metavar='outfile', type=str, # help='The outpufile (TSV format)') # args = parser.parse_args() df = pd.DataFrame([]) for Sequence in SeqIO.parse(snakemake.input[0], "fasta"): ProteinSequence = Sequence DIP = CalculateAADipeptideComposition(ProteinSequence) data = DIP data = pd.DataFrame(data.items()) data = data.transpose() data.columns=data.iloc[0] data = data.drop(data.index[[0]]) data['Sequence'] = Sequence.id df =df.append(data) rfc = pickle.load(open("finalized_model_3f.sav", 'rb')) X_predict = df.drop('Sequence', axis = 1) rfc_predict = rfc.predict(X_predict) rfc_probs = rfc.predict_proba(X_predict)[:,1] Prediction = pd.DataFrame([]) dfToList = df['Sequence'].tolist() Prediction['Sequence'] = dfToList Prediction['Prediction'] = rfc_predict Prediction['Probability'] = rfc_probs Prediction.to_csv(snakemake.output[0], sep='\t')
File moved
 #!/usr/bin/env python import Bio from Bio import SeqIO import re import pandas as pd import argparse #import argparse import numpy as np from sklearn import model_selection from sklearn.ensemble import RandomForestClassifier import pickle #from PyBioMed.PyProtein import AAComposition if __name__ == "__main__": parser = argparse.ArgumentParser(description='Process fasta input with random forest virulence prediction model') parser.add_argument('infile', metavar='infile', type=str, help='The input file (FASTA format)') parser.add_argument('outfile', metavar='outfile', type=str, help='The outpufile (TSV format)') args = parser.parse_args() AALetter = ["A", "R", "N", "D", "C", "E", "Q", "G", "H", "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V"] df = pd.read_csv(args.infile, sep='\t') X = df.drop('#', axis=1) def CalculateAAComposition(ProteinSequence): LengthSequence = len(ProteinSequence) Result={} for i in AALetter: Result[i] = round(float(ProteinSequence.seq.count(i))/ LengthSequence * 100, 2) return Result def CalculateDipeptideComposition(ProteinSequence): LengthSequence = len(ProteinSequence) Result = {} for i in AALetter: for j in AALetter: Dipeptide = i + j Result[Dipeptide] = round(float(ProteinSequence.seq.count(Dipeptide)) / (LengthSequence -1) * 100, 2) return Result def CalculateKmerComposition(ProteinSequence): LengthSequence = len(ProteinSequence) Result = {} for i in AALetter: for j in AALetter: for k in AALetter: Kmer = i + j + k Result[Kmer] = round(float(ProteinSequence.seq.count(Kmer)) / (LengthSequence -2) * 100, 2) return Result def CalculateAADipeptideComposition(ProteinSequence): Result = {} Result.update(CalculateAAComposition(ProteinSequence)) Result.update(CalculateDipeptideComposition(ProteinSequence)) Result.update(CalculateKmerComposition(ProteinSequence)) return Result #if __name__ == "__main__": # parser = argparse.ArgumentParser(description='Process fasta input with random forest virulence prediction model') # parser.add_argument('infile', metavar='infile', type=str, # help='The input file (FASTA format)') # parser.add_argument('outfile', metavar='outfile', type=str, # help='The outpufile (TSV format)') # args = parser.parse_args() df = pd.DataFrame([]) rfc = pickle.load(open("scripts/Virulence_factor_model.sav", 'rb')) rfc_predict = rfc.predict(X) rfc_probs = rfc.predict_proba(X)[:,1] for Sequence in SeqIO.parse(snakemake.input[0], "fasta"): ProteinSequence = Sequence DIP = CalculateAADipeptideComposition(ProteinSequence) data = DIP data = pd.DataFrame(data.items()) data = data.transpose() data.columns=data.iloc[0] data = data.drop(data.index[[0]]) data['Sequence'] = Sequence.id df =df.append(data) rfc = pickle.load(open("finalized_model_3f.sav", 'rb')) X_predict = df.drop('Sequence', axis = 1) rfc_predict = rfc.predict(X_predict) rfc_probs = rfc.predict_proba(X_predict)[:,1] Prediction = pd.DataFrame([]) dfToList = df['#'].tolist() Prediction['Sequence'] = dfToList Prediction['Prediction'] = rfc_predict Prediction['Probability'] = rfc_probs Prediction.to_csv(args.outfile, sep='\t', header=False) Prediction = pd.DataFrame([]) dfToList = df['Sequence'].tolist() Prediction['Sequence'] = dfToList Prediction['Prediction'] = rfc_predict Prediction['Probability'] = rfc_probs Prediction.to_csv(snakemake.output[0], sep='\t')
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!