Unverified Commit 63219c7a authored by St. Elmo's avatar St. Elmo Committed by GitHub
Browse files

Merge pull request #605 from LCSB-BioCore/mo-mk-gecko-etc

Small fixes to enzyme PR
parents 73b8f617 b027b2db
Pipeline #55316 passed with stages
in 12 minutes and 47 seconds
"""
make_gecko_model(
model::MetabolicModel;
reaction_isozymes::Union{Function,Dict{String,Isozyme}},
gene_product_limit::Union{Function,Dict{String,Tuple{Float64,Float64}}},
reaction_isozymes::Union{Function,Dict{String,Vector{Isozyme}}}
gene_product_bounds::Union{Function,Dict{String,Tuple{Float64,Float64}}},
gene_product_molar_mass::Union{Function,Dict{String,Float64}},
gene_mass_group::Union{Function,Dict{String,String}} = _ -> "uncategorized",
group_mass_limit::Union{Function,Dict{String,Float64}},
relaxed_arm_reaction_bounds = false,
gene_product_mass_group::Union{Function,Dict{String,String}} = _ -> "uncategorized",
gene_product_mass_group_bound::Union{Function,Dict{String,Float64}},
)
Wrap a model into a [`GeckoModel`](@ref), following the structure given by
......@@ -16,56 +15,49 @@ GECKO algorithm (see [`GeckoModel`](@ref) documentation for details).
- `reaction_isozymes` is a function that returns a vector of [`Isozyme`](@ref)s
for each reaction, or empty vector if the reaction is not enzymatic.
- `gene_product_limit` is a function that returns lower and upper bound for
- `gene_product_bounds` is a function that returns lower and upper bound for
concentration for a given gene product (specified by the same string gene ID as in
`reaction_isozymes`), as `Tuple{Float64,Float64}`.
- `gene_product_molar_mass` is a function that returns a numeric molar mass of
a given gene product specified by string gene ID.
- `gene_mass_group` is a function that returns a string group identifier for a
- `gene_product_mass_group` is a function that returns a string group identifier for a
given gene product, again specified by string gene ID. By default, all gene
products belong to group `"uncategorized"` which is the behavior of original
GECKO.
- `group_mass_limit` is a function that returns the maximum mass for a given
- `gene_product_mass_group_bound` is a function that returns the maximum mass for a given
mass group.
- `relaxed_arm_reaction_bounds` is a boolean flag that relaxes the constraints
on the "arm" reactions specified by GECKO. By default (value `false`), there
is a separate constraint that limits the total flux through forward-direction
reaction for all its isozymes (ensuring that the sum of forward rates is less
than "global" upper bound), and another separate constraint that limits the
total flux through reverse-direction reaction isozymes. Value `true` groups
both forward and reverse reactions in a single constraint, allowing the total
forward flux to be actually greater than the upper bound IF the reverse flux
can balance it to fit into the upper and lower bound constraints (in turn,
more enzyme can be "wasted" by a reaction that runs in both directions).
Alternatively, all function arguments may be also supplied as dictionaries that
provide the same data lookup.
"""
function make_gecko_model(
model::MetabolicModel;
reaction_isozymes::Union{Function,Dict{String,Isozyme}},
gene_product_limit::Union{Function,Dict{String,Tuple{Float64,Float64}}},
reaction_isozymes::Union{Function,Dict{String,Vector{Isozyme}}},
gene_product_bounds::Union{Function,Dict{String,Tuple{Float64,Float64}}},
gene_product_molar_mass::Union{Function,Dict{String,Float64}},
gene_mass_group::Union{Function,Dict{String,String}} = _ -> "uncategorized",
group_mass_limit::Union{Function,Dict{String,Float64}},
relaxed_arm_reaction_bounds = false,
gene_product_mass_group::Union{Function,Dict{String,String}} = _ -> "uncategorized",
gene_product_mass_group_bound::Union{Function,Dict{String,Float64}},
)
ris_ =
reaction_isozymes isa Function ? reaction_isozymes : (gid -> reaction_isozymes[gid])
gpl_ =
gene_product_limit isa Function ? gene_product_limit :
(gid -> gene_product_limit[gid])
reaction_isozymes isa Function ? reaction_isozymes :
(rid -> get(reaction_isozymes, rid, []))
gpb_ =
gene_product_bounds isa Function ? gene_product_bounds :
(gid -> gene_product_bounds[gid])
gpmm_ =
gene_product_molar_mass isa Function ? gene_product_molar_mass :
(gid -> gene_product_molar_mass[gid])
gmg_ = gene_mass_group isa Function ? gene_mass_group : (gid -> gene_mass_group[gid])
gml_ = group_mass_limit isa Function ? group_mass_limit : (grp -> group_mass_limit[grp])
gmg_ =
gene_product_mass_group isa Function ? gene_product_mass_group :
(gid -> gene_product_mass_group[gid])
gmgb_ =
gene_product_mass_group_bound isa Function ? gene_product_mass_group_bound :
(grp -> gene_product_mass_group_bound[grp])
# ...it would be nicer to have an overload for this, but kwargs can't be used for dispatch
columns = Vector{_gecko_column}()
columns = Vector{_gecko_reaction_column}()
coupling_row_reaction = Int[]
coupling_row_gene_product = Int[]
coupling_row_mass_group = String[]
gids = genes(model)
(lbs, ubs) = bounds(model)
......@@ -73,47 +65,32 @@ function make_gecko_model(
gene_name_lookup = Dict(gids .=> 1:length(gids))
gene_row_lookup = Dict{Int,Int}()
mass_group_lookup = Dict{String,Int}()
for i = 1:n_reactions(model)
isozymes = ris_(rids[i])
if isempty(isozymes)
push!(columns, _gecko_column(i, 0, 0, 0, lbs[i], ubs[i], [], []))
push!(columns, _gecko_reaction_column(i, 0, 0, 0, lbs[i], ubs[i], []))
continue
end
# if the reaction has multiple isozymes, it needs extra coupling to
# ensure that the total rate of the reaction doesn't exceed the
# "global" limit
if relaxed_arm_reaction_bounds
reaction_coupling_row =
length(isozymes) > 1 ? begin
push!(coupling_row_reaction, i)
length(coupling_row_reaction)
end : 0
end
# loop over both directions for all isozymes
for (lb, ub, kcatf, dir) in [
(-ubs[i], -lbs[i], i -> i.kcat_reverse, -1),
(lbs[i], ubs[i], i -> i.kcat_forward, 1),
]
if !relaxed_arm_reaction_bounds
# In this case, the coefficients in the coupling matrix will be
# the same as in the combined case, only categorized in
# separate rows for negative and positive ones. Surprisingly,
# we do not need to explicitly remember the bounds, because the
# ones taken from the original model are perfectly okay -- the
# "reverse" direction is unreachable because of individual
# bounds on split reactions, and the "forward" direction is
# properly negated in the reverse case to work nicely with the
# global lower bound.
reaction_coupling_row =
ub > 0 && length(isozymes) > 1 ? begin
push!(coupling_row_reaction, i)
length(coupling_row_reaction)
end : 0
end
# The coefficients in the coupling matrix will be categorized in
# separate rows for negative and positive reactions. Surprisingly,
# we do not need to explicitly remember the bounds, because the
# ones taken from the original model are perfectly okay -- the
# "reverse" direction is unreachable because of individual
# bounds on split reactions, and the "forward" direction is
# properly negated in the reverse case to work nicely with the
# global lower bound.
reaction_coupling_row =
ub > 0 && length(isozymes) > 1 ? begin
push!(coupling_row_reaction, i)
length(coupling_row_reaction)
end : 0
# all isozymes in this direction
for (iidx, isozyme) in enumerate(isozymes)
......@@ -130,40 +107,15 @@ function make_gecko_model(
gene_row_lookup[gidx] =
length(coupling_row_gene_product)
end
(row_idx, 1 / kcat)
end for (gene, count) in isozyme.gene_product_count if
(row_idx, stoich / kcat)
end for (gene, stoich) in isozyme.gene_product_count if
haskey(gene_name_lookup, gene)
)
# prepare the coupling with the mass groups
gp_groups = gmg_.(keys(isozyme.gene_product_count))
gp_mass = gpmm_.(keys(isozyme.gene_product_count))
groups = unique(filter(!isnothing, gp_groups))
group_idx = Dict(groups .=> 1:length(groups))
vals = [0.0 for _ in groups]
for (gpg, mass) in zip(gp_groups, gp_mass)
if !isnothing(gpg)
vals[group_idx[gpg]] += mass / kcat
end
end
mass_group_coupling = collect(
isnothing(group) ? 0 :
begin
if !haskey(mass_group_lookup, group)
push!(coupling_row_mass_group, group)
mass_group_lookup[group] =
length(coupling_row_mass_group)
end
(mass_group_lookup[group], val)
end for (group, val) in zip(groups, vals)
)
# make a new column
push!(
columns,
_gecko_column(
_gecko_reaction_column(
i,
iidx,
dir,
......@@ -171,7 +123,6 @@ function make_gecko_model(
max(lb, 0),
ub,
gene_product_coupling,
mass_group_coupling,
),
)
end
......@@ -179,11 +130,41 @@ function make_gecko_model(
end
end
GeckoModel(
# prepare enzyme capacity constraints
mg_gid_lookup = Dict{String,Vector{String}}()
for gid in gids[coupling_row_gene_product]
mg = gmg_(gid)
if haskey(mg_gid_lookup, mg)
push!(mg_gid_lookup[mg], gid)
else
mg_gid_lookup[mg] = [gid]
end
end
coupling_row_mass_group = Vector{_gecko_capacity}()
for (grp, gs) in mg_gid_lookup
idxs = [gene_row_lookup[x] for x in Int.(indexin(gs, gids))]
mms = gpmm_.(gs)
push!(coupling_row_mass_group, _gecko_capacity(grp, idxs, mms, gmgb_(grp)))
end
# create model with dummy objective
gm = GeckoModel(
spzeros(length(columns) + length(coupling_row_gene_product)),
columns,
coupling_row_reaction,
collect(zip(coupling_row_gene_product, gpl_.(gids[coupling_row_gene_product]))),
collect(zip(coupling_row_mass_group, gml_.(coupling_row_mass_group))),
collect(zip(coupling_row_gene_product, gpb_.(gids[coupling_row_gene_product]))),
coupling_row_mass_group,
model,
)
#=
Set objective. This is a separate field because gene products can also be objectives.
This way they can be set as objectives by the user.
=#
gm.objective .= [
_gecko_reaction_column_reactions(gm)' * objective(gm.inner)
spzeros(length(coupling_row_gene_product))
]
return gm
end
......@@ -2,7 +2,7 @@
"""
make_smoment_model(
model::MetabolicModel;
reaction_isozymes::Union{Function,Dict{String,Isozyme}},
reaction_isozyme::Union{Function,Dict{String,Isozyme}},
gene_product_molar_mass::Union{Function,Dict{String,Float64}},
total_enzyme_capacity::Float64,
)
......@@ -12,28 +12,27 @@ Construct a model with a structure given by sMOMENT algorithm; returns a
# Arguments
- `reaction_isozymes` parameter is a function that returns a single
- `reaction_isozyme` parameter is a function that returns a single
[`Isozyme`](@ref) for each reaction, or `nothing` if the reaction is not
enzymatic. If the reaction has multiple isozymes, use
[`smoment_isozyme_score`](@ref) to select the "best" one, as recommended by
the sMOMENT approach.
[`smoment_isozyme_speed`](@ref) to select the fastest one, as recommended by
the sMOMENT paper.
- `gene_product_molar_mass` parameter is a function that returns a molar mass
of each gene product (relative to `total_enzyme_capacity` and the specified
kcats), as specified by sMOMENT.
- `total_enzyme_capacity` is the maximum "enzyme capacity" consumption in the
model.
of each gene product as specified by sMOMENT.
- `total_enzyme_capacity` is the maximum "enzyme capacity" in the model.
Alternatively, all function arguments also accept dictionaries that are used to
provide the same data lookup.
"""
function make_smoment_model(
model::MetabolicModel;
reaction_isozymes::Union{Function,Dict{String,Isozyme}},
reaction_isozyme::Union{Function,Dict{String,Isozyme}},
gene_product_molar_mass::Union{Function,Dict{String,Float64}},
total_enzyme_capacity::Float64,
)
ris_ =
reaction_isozymes isa Function ? reaction_isozymes : (gid -> reaction_isozymes[gid])
reaction_isozyme isa Function ? reaction_isozyme :
(rid -> get(reaction_isozyme, rid, nothing))
gpmm_ =
gene_product_molar_mass isa Function ? gene_product_molar_mass :
(gid -> gene_product_molar_mass[gid])
......
"""
struct _gecko_column
struct _gecko_reaction_column
A helper type for describing the contents of [`GeckoModel`](@ref)s.
"""
struct _gecko_column
struct _gecko_reaction_column
reaction_idx::Int
isozyme_idx::Int
direction::Int
......@@ -12,7 +11,19 @@ struct _gecko_column
lb::Float64
ub::Float64
gene_product_coupling::Vector{Tuple{Int,Float64}}
mass_group_coupling::Vector{Tuple{Int,Float64}}
end
"""
struct _gecko_capacity
A helper struct that contains the gene product capacity terms organized by
the grouping type, e.g. metabolic or membrane groups etc.
"""
struct _gecko_capacity
group_id::String
gene_product_idxs::Vector{Int}
gene_product_molar_masses::Vector{Float64}
group_upper_bound::Float64
end
"""
......@@ -20,21 +31,20 @@ end
A model with complex enzyme concentration and capacity bounds, as described in
*Sánchez, Benjamín J., et al. "Improving the phenotype predictions of a yeast
genomescale metabolic model by incorporating enzymatic constraints." Molecular
genome-scale metabolic model by incorporating enzymatic constraints." Molecular
systems biology 13.8 (2017): 935.*
Use [`make_gecko_model`](@ref) or [`with_gecko`](@ref) to construct this kind
of models.
of model.
The model wraps another "internal" model, and adds following modifications:
- enzymatic reactions with known enzyme information are split into multiple
forward and reverse variants for each isozyme,
- reaction coupling is added to ensure the groups of isozyme reactions obey the
global reaction flux bounds from the original model,
- coupling is added to simulate available gene concentrations as "virtual
metabolites" consumed by each reaction by its gene product stoichiometry,
which can constrained by the user (to reflect realistic measurements such as
from mass spectrometry),
- gene concentrations specified by each reaction and its gene product stoichiometry,
can constrained by the user to reflect measurements, such as
from mass spectrometry,
- additional coupling is added to simulate total masses of different proteins
grouped by type (e.g., membrane-bound and free-floating proteins), which can
be again constrained by the user (this is slightly generalized from original
......@@ -45,7 +55,10 @@ The structure contains fields `columns` that describe the contents of the
coupling columns, `coupling_row_reaction`, `coupling_row_gene_product` and
`coupling_row_mass_group` that describe correspondence of the coupling rows to
original model and determine the coupling bounds, and `inner`, which is the
original wrapped model.
original wrapped model. Note, `objective` is the objective vector of the model,
special care needs to be taken to ensure that its length is `n_reactions(model)
+ n_genes(model)` when the user modifies it, where `model` is the GeckoModel in
question.
Implementation exposes the split reactions (available as `reactions(model)`),
but retains the original "simple" reactions accessible by [`fluxes`](@ref). All
......@@ -54,10 +67,11 @@ constraints are implemented using [`coupling`](@ref) and
purely virtual and do not occur in [`metabolites`](@ref).
"""
struct GeckoModel <: ModelWrapper
columns::Vector{_gecko_column}
objective::SparseVec
columns::Vector{_gecko_reaction_column}
coupling_row_reaction::Vector{Int}
coupling_row_gene_product::Vector{Tuple{Int,Tuple{Float64,Float64}}}
coupling_row_mass_group::Vector{Tuple{String,Float64}}
coupling_row_mass_group::Vector{_gecko_capacity}
inner::MetabolicModel
end
......@@ -71,16 +85,25 @@ Return a stoichiometry of the [`GeckoModel`](@ref). The enzymatic reactions are
split into unidirectional forward and reverse ones, each of which may have
multiple variants per isozyme.
"""
stoichiometry(model::GeckoModel) =
stoichiometry(model.inner) * _gecko_column_reactions(model)
function stoichiometry(model::GeckoModel)
irrevS = stoichiometry(model.inner) * COBREXA._gecko_reaction_column_reactions(model)
enzS = COBREXA._gecko_gene_product_coupling(model)
[
irrevS spzeros(size(irrevS, 1), size(enzS, 1))
-enzS I(size(enzS, 1))
]
end
"""
objective(model::GeckoModel)
Reconstruct an objective of the [`GeckoModel`](@ref), following the objective
of the inner model.
Return the objective of the [`GeckoModel`](@ref). Note, the objective is with
respect to the internal variables, i.e. [`reactions(model)`](@ref) and
[`genes(model)`](@ref). To manually set the objective, index into
`model.objective` appropriately, and remember to set the previous coefficients
to zero.
"""
objective(model::GeckoModel) = _gecko_column_reactions(model)' * objective(model.inner)
objective(model::GeckoModel) = model.objective
"""
reactions(model::GeckoModel)
......@@ -101,9 +124,10 @@ reactions(model::GeckoModel) =
end
"""
reactions(model::GeckoModel)
n_reactions(model::GeckoModel)
Returns the number of all irreversible reactions in `model`.
Returns the number of all irreversible reactions in `model` as well as the
number of gene products that take part in enzymatic reactions.
"""
n_reactions(model::GeckoModel) = length(model.columns)
......@@ -112,8 +136,17 @@ n_reactions(model::GeckoModel) = length(model.columns)
Return variable bounds for [`GeckoModel`](@ref).
"""
bounds(model::GeckoModel) =
([col.lb for col in model.columns], [col.ub for col in model.columns])
function bounds(model::GeckoModel)
lbs = [
[col.lb for col in model.columns]
[lb for (_, (lb, _)) in model.coupling_row_gene_product]
]
ubs = [
[col.ub for col in model.columns]
[ub for (_, (_, ub)) in model.coupling_row_gene_product]
]
(lbs, ubs)
end
"""
reaction_flux(model::GeckoModel)
......@@ -121,22 +154,31 @@ bounds(model::GeckoModel) =
Get the mapping of the reaction rates in [`GeckoModel`](@ref) to the original
fluxes in the wrapped model.
"""
reaction_flux(model::GeckoModel) =
_gecko_column_reactions(model)' * reaction_flux(model.inner)
function reaction_flux(model::GeckoModel)
rxnmat = _gecko_reaction_column_reactions(model)' * reaction_flux(model.inner)
[
rxnmat
spzeros(n_genes(model), size(rxnmat, 2))
]
end
"""
coupling(model::GeckoModel)
Return the coupling of [`GeckoModel`](@ref). That combines the coupling of the
wrapped model, coupling for split reactions, and the coupling for the total
wrapped model, coupling for split (arm) reactions, and the coupling for the total
enzyme capacity.
"""
coupling(model::GeckoModel) = vcat(
coupling(model.inner) * _gecko_column_reactions(model),
_gecko_reaction_coupling(model),
_gecko_gene_product_coupling(model),
_gecko_mass_group_coupling(model),
)
function coupling(model::GeckoModel)
innerC = coupling(model.inner) * _gecko_reaction_column_reactions(model)
rxnC = _gecko_reaction_coupling(model)
enzcap = _gecko_mass_group_coupling(model)
[
innerC spzeros(size(innerC, 1), n_genes(model))
rxnC spzeros(size(rxnC, 1), n_genes(model))
spzeros(length(model.coupling_row_mass_group), length(model.columns)) enzcap
]
end
"""
n_coupling_constraints(model::GeckoModel)
......@@ -147,7 +189,6 @@ Count the coupling constraints in [`GeckoModel`](@ref) (refer to
n_coupling_constraints(model::GeckoModel) =
n_coupling_constraints(model.inner) +
length(model.coupling_row_reaction) +
length(model.coupling_row_gene_product) +
length(model.coupling_row_mass_group)
"""
......@@ -163,14 +204,50 @@ function coupling_bounds(model::GeckoModel)
vcat(
iclb,
ilb[model.coupling_row_reaction],
[lb for (_, (lb, _)) in model.coupling_row_gene_product],
[0.0 for _ in model.coupling_row_mass_group],
),
vcat(
icub,
iub[model.coupling_row_reaction],
[ub for (_, (_, ub)) in model.coupling_row_gene_product],
[ub for (_, ub) in model.coupling_row_mass_group],
[grp.group_upper_bound for grp in model.coupling_row_mass_group],
),
)
end
"""
balance(model::GeckoModel)
Return the balance of the reactions in the inner model, concatenated with a vector of
zeros representing the enzyme balance of a [`GeckoModel`](@ref).
"""
balance(model::GeckoModel) =
[balance(model.inner); spzeros(length(model.coupling_row_gene_product))]
"""
n_genes(model::GeckoModel)
Return the number of genes that have enzymatic constraints associated with them.
"""
n_genes(model::GeckoModel) = length(model.coupling_row_gene_product)
"""
genes(model::GeckoModel)
Return the gene ids of genes that have enzymatic constraints associated with them.
"""
genes(model::GeckoModel) =
genes(model.inner)[[idx for (idx, _) in model.coupling_row_gene_product]]
"""
metabolites(model::GeckoModel)
Return the ids of all metabolites, both real and pseudo, for a [`GeckoModel`](@ref).
"""
metabolites(model::GeckoModel) = [metabolites(model.inner); genes(model) .* "#supply"]
"""
n_metabolites(model::GeckoModel)
Return the number of metabolites, both real and pseudo, for a [`GeckoModel`](@ref).
"""
n_metabolites(model::GeckoModel) = n_metabolites(model.inner) + n_genes(model)
......@@ -36,9 +36,9 @@ The `SMomentModel` structure contains a worked-out representation of the
optimization problem atop a wrapped [`MetabolicModel`](@ref), in particular the
separation of certain reactions into unidirectional forward and reverse parts,
an "enzyme capacity" required for each reaction, and the value of the maximum
capacity constraint. Original coupling is retained.
capacity constraint. Original coupling in the inner model is retained.
In the structure, field `columns` describes the correspondence of stoichiometry
In the structure, the field `columns` describes the correspondence of stoichiometry
columns to the stoichiometry and data of the internal wrapped model, and
`total_enzyme_capacity` is the total bound on the enzyme capacity consumption
as specified in sMOMENT algorithm.
......
"""
protein_dict(model::GeckoModel, opt_model)
gene_product_dict(model::GeckoModel, opt_model)
Return a dictionary mapping protein molar concentrations to their ids. The
argument `opt_model` is a solved optimization problem, typically returned by
[`flux_balance_analysis`](@ref).
[`flux_balance_analysis`](@ref). See [`flux_dict`](@ref) for the corresponding
function that returns a dictionary of solved fluxes.
"""
protein_dict(model::GeckoModel, opt_model) =
let gids = genes(model)
is_solved(opt_model) ?
Dict(
[gids[gidx] for (gidx, _) in model.coupling_row_gene_product] .=> _gecko_gene_product_coupling(model) * value.(opt_model[:x]),
) : nothing
end
gene_product_dict(model::GeckoModel, opt_model) =
is_solved(opt_model) ?
Dict(genes(model) .=> value.(opt_model[:x])[(n_reactions(model)+1):end]) : nothing
"""
protein_dict(model::GeckoModel)
gene_product_dict(model::GeckoModel)
A pipe-able variant of [`protein_dict`](@ref).
A pipe-able variant of [`gene_product_dict`](@ref).
"""
protein_dict(model::GeckoModel) = x -> protein_dict(model, x)
gene_product_dict(model::GeckoModel) = x -> gene_product_dict(model, x)
"""
protein_mass_group_dict(model::GeckoModel, opt_model)
gene_product_mass_group_dict(model::GeckoModel, opt_model)
Extract the mass utilization in mass groups from a solved [`GeckoModel`](@ref).
"""
protein_mass_group_dict(model::GeckoModel, opt_model) =
gene_product_mass_group_dict(model::GeckoModel, opt_model) =
is_solved(opt_model) ?
Dict(
(group for (group, _) in model.coupling_row_mass_group) .=>
_gecko_mass_group_coupling(model) * value.(opt_model[:x]),
grp.group_id => dot(
value.(opt_model[:x])[n_reactions(model).+grp.gene_product_idxs],
grp.gene_product_molar_masses,
) for grp in model.coupling_row_mass_group
) : nothing
"""
protein_mass_group_dict(model::GeckoModel)
gene_product_mass_group_dict(model::GeckoModel)
A pipe-able variant of [`mass_group_dict`](@ref).
"""
protein_mass_group_dict(model::GeckoModel) = x -> mass_group_dict(model, x)
gene_product_mass_group_dict(model::GeckoModel) = x -> mass_group_dict(model, x)
"""