Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
Laura Denies
PathoFact
Commits
eab5f6ab
Commit
eab5f6ab
authored
Apr 23, 2020
by
Valentina Galata
Browse files
replaced scripts/virulence_prediction.py by correct version (issue
#31
, related to issue
#15
)
parent
0adaecb2
Changes
1
Hide whitespace changes
Inline
Side-by-side
scripts/virulence_prediction.py
100755 → 100644
View file @
eab5f6ab
#!/usr/bin/env python
import
Bio
from
Bio
import
SeqIO
import
re
import
pandas
as
pd
#
import argparse
import
argparse
import
numpy
as
np
from
sklearn
import
model_selection
from
sklearn.ensemble
import
RandomForestClassifier
import
pickle
#from PyBioMed.PyProtein import AAComposition
AALetter
=
[
"A"
,
"R"
,
"N"
,
"D"
,
"C"
,
"E"
,
"Q"
,
"G"
,
"H"
,
"I"
,
"L"
,
"K"
,
"M"
,
"F"
,
"P"
,
"S"
,
"T"
,
"W"
,
"Y"
,
"V"
]
if
__name__
==
"__main__"
:
parser
=
argparse
.
ArgumentParser
(
description
=
'Process fasta input with random forest virulence prediction model'
)
parser
.
add_argument
(
'infile'
,
metavar
=
'infile'
,
type
=
str
,
help
=
'The input file (FASTA format)'
)
parser
.
add_argument
(
'outfile'
,
metavar
=
'outfile'
,
type
=
str
,
help
=
'The outpufile (TSV format)'
)
args
=
parser
.
parse_args
()
def
CalculateAAComposition
(
ProteinSequence
):
LengthSequence
=
len
(
ProteinSequence
)
Result
=
{}
for
i
in
AALetter
:
Result
[
i
]
=
round
(
float
(
ProteinSequence
.
seq
.
count
(
i
))
/
LengthSequence
*
100
,
2
)
return
Result
def
CalculateDipeptideComposition
(
ProteinSequence
):
LengthSequence
=
len
(
ProteinSequence
)
Result
=
{}
for
i
in
AALetter
:
for
j
in
AALetter
:
Dipeptide
=
i
+
j
Result
[
Dipeptide
]
=
round
(
float
(
ProteinSequence
.
seq
.
count
(
Dipeptide
))
/
(
LengthSequence
-
1
)
*
100
,
2
)
return
Result
def
CalculateKmerComposition
(
ProteinSequence
):
LengthSequence
=
len
(
ProteinSequence
)
Result
=
{}
for
i
in
AALetter
:
for
j
in
AALetter
:
for
k
in
AALetter
:
Kmer
=
i
+
j
+
k
Result
[
Kmer
]
=
round
(
float
(
ProteinSequence
.
seq
.
count
(
Kmer
))
/
(
LengthSequence
-
2
)
*
100
,
2
)
return
Result
def
CalculateAADipeptideComposition
(
ProteinSequence
):
Result
=
{}
Result
.
update
(
CalculateAAComposition
(
ProteinSequence
))
Result
.
update
(
CalculateDipeptideComposition
(
ProteinSequence
))
Result
.
update
(
CalculateKmerComposition
(
ProteinSequence
))
return
Result
#if __name__ == "__main__":
# parser = argparse.ArgumentParser(description='Process fasta input with random forest virulence prediction model')
# parser.add_argument('infile', metavar='infile', type=str,
# help='The input file (FASTA format)')
# parser.add_argument('outfile', metavar='outfile', type=str,
# help='The outpufile (TSV format)')
# args = parser.parse_args()
df
=
pd
.
DataFrame
([])
df
=
pd
.
read_csv
(
args
.
infile
,
sep
=
'
\t
'
)
X
=
df
.
drop
(
'#'
,
axis
=
1
)
for
Sequence
in
SeqIO
.
parse
(
snakemake
.
input
[
0
],
"fasta"
):
ProteinSequence
=
Sequence
DIP
=
CalculateAADipeptideComposition
(
ProteinSequence
)
data
=
DIP
data
=
pd
.
DataFrame
(
data
.
items
())
data
=
data
.
transpose
()
data
.
columns
=
data
.
iloc
[
0
]
data
=
data
.
drop
(
data
.
index
[[
0
]])
data
[
'Sequence'
]
=
Sequence
.
id
df
=
df
.
append
(
data
)
rfc
=
pickle
.
load
(
open
(
"finalized_model_3f.sav"
,
'rb'
))
X_predict
=
df
.
drop
(
'Sequence'
,
axis
=
1
)
rfc_predict
=
rfc
.
predict
(
X_predict
)
rfc_probs
=
rfc
.
predict_proba
(
X_predict
)[:,
1
]
rfc
=
pickle
.
load
(
open
(
"scripts/Virulence_factor_model.sav"
,
'rb'
))
rfc_predict
=
rfc
.
predict
(
X
)
rfc_probs
=
rfc
.
predict_proba
(
X
)[:,
1
]
Prediction
=
pd
.
DataFrame
([])
dfToList
=
df
[
'
Sequence
'
].
tolist
()
Prediction
[
'Sequence'
]
=
dfToList
Prediction
[
'Prediction'
]
=
rfc_predict
Prediction
[
'Probability'
]
=
rfc_probs
Prediction
.
to_csv
(
snakemake
.
output
[
0
],
sep
=
'
\t
'
)
Prediction
=
pd
.
DataFrame
([])
dfToList
=
df
[
'
#
'
].
tolist
()
Prediction
[
'Sequence'
]
=
dfToList
Prediction
[
'Prediction'
]
=
rfc_predict
Prediction
[
'Probability'
]
=
rfc_probs
Prediction
.
to_csv
(
args
.
outfile
,
sep
=
'
\t
'
,
header
=
False
)
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment