Unverified commit a43ad19a authored by Miroslav Kratochvil

sampling seems to work OK now

parent 0012e659
@@ -11,7 +11,6 @@ JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
JuMP = "4076af6c-e467-56ae-b986-b466b2749572"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
MAT = "23992714-dd62-5051-b70f-ba57cb901cac"
MCMCChains = "c7f686f2-ff18-58e9-bc7b-31028e88f75d"
MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
@@ -13,7 +13,6 @@ using Random
using Serialization
using SparseArrays
using Statistics
using MCMCChains
import Base: findfirst, getindex, show
import Pkg
"""
hit_and_run(
model,
optimizer;
modifications = [],
N = 1000,
keepevery = _constants.sampling_keep_iters,
samplesize = _constants.sampling_size,
warmup_indices = collect(1:n_reactions(model)),
workerids = [myid()],
nchains = 1,
)
TODO
Perform basic hit-and-run sampling for `N` iterations on `model`, using `optimizer` to
generate the warmup points. Here, warmup points are generated by iteratively
@@ -64,119 +54,93 @@ chains = hit_and_run(
)
```
"""
function hit_and_run(
model,
optimizer;
modifications = [],
N = 150_000,
keepevery = _constants.sampling_keep_iters,
samplesize = _constants.sampling_size,
warmup_indices = collect(1:n_reactions(model)),
workerids = [myid()],
nchains = 1,
function affine_hit_and_run(
warmup_points::Matrix{Float64},
lbs::Vector{Float64},
ubs::Vector{Float64};
sample_iters = 100 .* (1:5),
workers = [myid()],
chains = length(workers),
)
# get warmup points in parallel, similar to FVA
ws, lbs, ubs = warmup(
model,
optimizer;
modifications = modifications,
workerids = workerids, # parallel
warmup_points = warmup_indices,
)
# load warmup points to workers
save_at.(workerids, :cobrexa_ws, Ref(:($ws)))
save_at.(workerids, :cobrexa_lbs, Ref(:($lbs)))
save_at.(workerids, :cobrexa_ubs, Ref(:($ubs)))
# sample in parallel!
samples = dpmap(
x -> :($COBREXA._serial_hit_and_run(
cobrexa_ws,
cobrexa_lbs,
cobrexa_ubs,
$samplesize,
$keepevery,
$N,
)),
CachingPool(workerids),
1:nchains,
# distribute starting data to workers
save_at.(workers, :cobrexa_hit_and_run_data, Ref((warmup_points, lbs, ubs)))
# sample all chains
samples = hcat(
dpmap(
chain -> :($COBREXA._affine_hit_and_run_chain(
cobrexa_hit_and_run_data...,
$sample_iters,
$chain,
)),
CachingPool(workers),
1:chains,
)...,
)
# remove warmup points from workers
map(fetch, remove_from.(workerids, :cobrexa_ws))
map(fetch, remove_from.(workerids, :cobrexa_lbs))
map(fetch, remove_from.(workerids, :cobrexa_ubs))
# not sure how to do this better - cat/vcat doesn't work, oh well
vals = zeros(samplesize, length(lbs), nchains)
for c = 1:nchains
vals[:, :, c] = samples[c]'
end
chains = Chains(vals, reactions(model))
map(fetch, remove_from.(workers, :cobrexa_hit_and_run_data))
return chains
return samples
end
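For orientation, here is a minimal sketch of how the reworked interface chains together, following the usage in the updated tests further below; `model`, `Tulip`, and the worker list `W` are assumed to be set up already:

```julia
using COBREXA, Distributed, Tulip

warmup, lbs, ubs = warmup_from_variability(model, Tulip.Optimizer; workers = W)
samples = affine_hit_and_run(
    warmup,
    lbs,
    ubs;
    sample_iters = 100 .* (1:5),  # record the point set at iterations 100, 200, ..., 500
    workers = W,                  # one chain per worker by default
)
```

Each checkpoint in `sample_iters` appends one copy of the whole point set per chain, which is why the tests below expect `size(warmup, 2) * 3 * length(W)` columns for three checkpoints.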
function _serial_hit_and_run(ws, lbs, ubs, samplesize, keepevery, N)
samples = zeros(length(lbs), samplesize) # sample storage
current_point = zeros(length(lbs))
current_point .= ws[1, 1] # just use the first warmup point, randomness is introduced later
direction = zeros(length(lbs))
sample_num = 0
samplelength = 0
use_warmup_points = true
for n = 1:N
if use_warmup_points
i = rand(1:size(ws, 1))
j = rand(1:size(ws, 2))
direction .= ws[i, j] - current_point # use warmup points to find direction in warmup phase
else
direction .= samples[:, rand(1:(samplelength))] - current_point # after warmup phase, only find directions in sampled space
end
λmax = Inf
λmin = -Inf
for i in eachindex(lbs)
δlower = lbs[i] - current_point[i]
δupper = ubs[i] - current_point[i]
# only consider the step size bound if the direction of travel is non-negligible
if direction[i] < -_constants.tolerance
lower = δupper / direction[i]
upper = δlower / direction[i]
elseif direction[i] > _constants.tolerance
lower = δlower / direction[i]
upper = δupper / direction[i]
else
lower = -Inf
upper = Inf
function _affine_hit_and_run_chain(warmup, lbs, ubs, iters, chain)
points = copy(warmup)
d, n_points = size(points)
result = Matrix{Float64}(undef, size(points, 1), 0)
iter = 0
for iter_target in iters
while iter < iter_target
iter += 1
new_points = copy(points)
for i = 1:n_points
mix = rand(n_points) .+ _constants.tolerance
dir = points * (mix ./ sum(mix)) - points[:, i]
# iteratively collect the maximum and minimum possible multiple
# of `dir` added to the current point
λmax = Inf
λmin = -Inf
for j = 1:d
dl = lbs[j] - points[j, i]
du = ubs[j] - points[j, i]
idir = 1 / dir[j]
if dir[j] < -_constants.tolerance
lower = du * idir
upper = dl * idir
elseif dir[j] > _constants.tolerance
lower = dl * idir
upper = du * idir
else
lower = -Inf
upper = Inf
end
λmin = max(λmin, lower)
λmax = min(λmax, upper)
end
λ = λmin + rand() * (λmax - λmin)
!isfinite(λ) && continue # avoid divergence
new_points[:, i] = points[:, i] .+ λ .* dir
# TODO normally we would check here that the residual of
# `S * new_point` still stays within the numerical tolerance,
# but we shall trust the computer instead.
end
lower > λmin && (λmin = lower) # max min step size that satisfies all bounds
upper < λmax && (λmax = upper) # min max step size that satisfies all bounds
end
if λmax <= λmin || λmin == -Inf || λmax == Inf # this sometimes can happen
# @warn "Infeasible direction at iteration $(n)..." # TODO: make this optional/ add to a logger... very noisy otherwise
continue
end
λ = rand() * (λmax - λmin) + λmin # random step size
current_point .= current_point .+ λ .* direction # will be feasible
if n % keepevery == 0
sample_num += 1
samples[:, sample_num] .= current_point
if sample_num >= samplesize
use_warmup_points = false # once the entire memory vector filled, stop using warm up points
sample_num = 0 # reset, start replacing the older samples
end
use_warmup_points && (samplelength += 1) # lags sample_num because the latter is a flag as well
points = new_points
end
result = hcat(result, points)
end
return samples
result
end
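The step logic above can be read in isolation: each point moves toward a random convex combination of all current points, and the step length λ is drawn uniformly from the interval that keeps the point inside the variable bounds. Below is a standalone toy sketch of a single such step; the names here are illustrative, not part of the COBREXA API:

```julia
using Random

# one hit-and-run step for point `i`, inside box bounds `lbs`/`ubs`
function one_step(points::Matrix{Float64}, i::Int, lbs, ubs; tol = 1e-9)
    n = size(points, 2)
    mix = rand(n) .+ tol                               # random positive weights
    dir = points * (mix ./ sum(mix)) .- points[:, i]   # toward a random convex combination
    λmin, λmax = -Inf, Inf
    for j in axes(points, 1)
        abs(dir[j]) <= tol && continue                 # skip negligible direction components
        l = (lbs[j] - points[j, i]) / dir[j]
        u = (ubs[j] - points[j, i]) / dir[j]
        l > u && ((l, u) = (u, l))                     # negative dir[j] flips the interval
        λmin, λmax = max(λmin, l), min(λmax, u)
    end
    λ = λmin + rand() * (λmax - λmin)
    return isfinite(λ) ? points[:, i] .+ λ .* dir : points[:, i]  # keep the point on divergence
end

pts = [0.0 1.0 0.5; 0.0 0.0 1.0]                       # three 2D points in the box [0,1]²
new_point = one_step(pts, 1, [0.0, 0.0], [1.0, 1.0])
```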
"""
warmup_from_variability(
n_points::Int,
model::MetabolicModel,
optimizer;
modifications = [],
workers = [myid()],
optimizer,
n_points::Int;
kwargs...
)
Generate FVA-like warmup points for samplers by minimizing and maximizing
randomly selected reactions. Cannot return more than twice the number of
reactions in the model.
"""
function warmup_from_variability(n_points::Int, model::MetabolicModel, optimizer; kwargs...)
function warmup_from_variability(model::MetabolicModel,
optimizer,
n_points::Int;
kwargs...)
nr = n_reactions(model)
n_points > 2 * nr && throw(
@@ -23,36 +25,36 @@ function warmup_from_variability(n_points::Int, model::MetabolicModel, optimizer
sample = shuffle(vcat(1:nr, -(1:nr)))[begin:n_points]
warmup_from_variability(
-filter(x -> x < 0, sample),
filter(x -> x > 0, sample),
model,
optimizer;
optimizer,
-filter(x -> x < 0, sample),
filter(x -> x > 0, sample);
kwargs...,
)
end
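The shuffle-based dispatch above packs both optimization senses into one index vector: `-i` stands for "minimize reaction `i`" and `+i` for "maximize reaction `i`". A small standalone illustration of the trick:

```julia
using Random

nr = 3                                            # toy model with 3 reactions
sample = shuffle(vcat(1:nr, -(1:nr)))[begin:4]    # e.g. [2, -1, 3, -3]
min_reactions = -filter(x -> x < 0, sample)       # e.g. [1, 3] -- to be minimized
max_reactions = filter(x -> x > 0, sample)        # e.g. [2, 3] -- to be maximized
```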
"""
warmup_from_variability(
min_reactions::Vector{Int},
max_reactions::Vector{Int},
function warmup_from_variability(
model::MetabolicModel,
optimizer;
modifications=[],
workers::Vector{Int}=[myid()]
)::Matrix{Float64}
optimizer,
min_reactions::Vector{Int}=1:n_reactions(model),
max_reactions::Vector{Int}=1:n_reactions(model);
modifications = [],
workers::Vector{Int} = [myid()],
)::Tuple{Matrix{Float64}, Vector{Float64}, Vector{Float64}}
Generate FVA-like warmup points for samplers by minimizing and maximizing the
specified reactions. The result is returned as a matrix; each point occupies a
single column.
"""
function warmup_from_variability(
min_reactions::Vector{Int},
max_reactions::Vector{Int},
model::MetabolicModel,
optimizer;
optimizer,
min_reactions::AbstractVector{Int} = 1:n_reactions(model),
max_reactions::AbstractVector{Int} = 1:n_reactions(model);
modifications = [],
workers::Vector{Int} = [myid()],
)::Matrix{Float64}
)::Tuple{Matrix{Float64},Vector{Float64},Vector{Float64}}
# create optimization problem at workers, apply modifications
save_model = :(
@@ -80,8 +82,14 @@ function warmup_from_variability(
)...,
)
# snatch the bounds from whatever worker is around
lbs, ubs = get_val_from(
workers[1],
:($COBREXA.get_bound_vectors(cobrexa_sampling_warmup_optmodel)),
)
# free the data on workers
map(fetch, remove_from.(workers, :cobrexa_sampling_warmup_optmodel))
return fluxes
return fluxes, lbs, ubs
end
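Given the new argument order, a hedged sketch of calling the explicit form directly (assuming a loaded `model`; the reaction indices are arbitrary examples):

```julia
warmup, lbs, ubs = warmup_from_variability(
    model,
    Tulip.Optimizer,
    [1, 2],             # reactions to minimize
    [2, 3];             # reactions to maximize
    workers = [myid()],
)
# presumably one warmup point (column) per requested optimization:
size(warmup)            # (n_reactions(model), 4)
```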
@@ -65,17 +65,8 @@ end
Returns vectors of the lower and upper bounds of `opt_model` constraints, where
`opt_model` is a JuMP model constructed by e.g.
[`make_optimization_model`](@ref) or [`flux_balance_analysis`](@ref).
"""
function get_bound_vectors(opt_model)
lbconref = opt_model[:lbs]
ubconref = opt_model[:ubs]
lbs = zeros(length(lbconref))
for i in eachindex(lbs)
lbs[i] = -normalized_rhs(lbconref[i])
end
ubs = [normalized_rhs(ubconref[i]) for i in eachindex(ubconref)]
return lbs, ubs
end
get_bound_vectors(opt_model) = (
[-normalized_rhs(lb) for lb in opt_model[:lbs]],
[normalized_rhs(ub) for ub in opt_model[:ubs]],
)
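A short usage sketch, assuming a `model` loaded as in the tests; the minus sign in the lower-bound comprehension undoes the sign convention of the normalized constraints in the JuMP model, so the caller receives the actual flux bounds:

```julia
opt_model = make_optimization_model(model, Tulip.Optimizer)
lbs, ubs = get_bound_vectors(opt_model)   # one lower/upper flux bound per reaction
```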
@@ -7,36 +7,21 @@
model = load_model(StandardModel, model_path)
# Serial test
chains = hit_and_run(
model,
Tulip.Optimizer;
N = 10_000,
nchains = 3,
samplesize = 1000,
modifications = [
change_constraint("EX_glc__D_e", -10, -10),
change_constraint("BIOMASS_Ecoli_core_w_GAM", 0.8, 0.8),
],
workerids = W,
)
# Do not test for convergence, that requires too many samples
@test isapprox(mean(chains[[:PFL]]).nt.mean[1], 0.8, atol = 1.0)
warmup, lbs, ubs = warmup_from_variability(model, Tulip.Optimizer; workers = W)
# parallel tests (still broken, not sure why)
chainsparallel = hit_and_run(
model,
Tulip.Optimizer;
N = 10_000,
nchains = 3,
samplesize = 1000,
modifications = [
change_constraint("EX_glc__D_e", -10, -10),
change_constraint("BIOMASS_Ecoli_core_w_GAM", 0.8, 0.8),
],
workerids = W,
samples = affine_hit_and_run(
warmup,
lbs,
ubs;
sample_iters = 10 * (1:3),
workers = W,
chains = length(W),
)
@test isapprox(mean(chainsparallel[[:PFL]]).nt.mean[1], 0.8, atol = 1.0)
@test size(samples, 1) == size(warmup, 1)
@test size(samples, 2) == size(warmup, 2) * 3 * length(W)
@test all(samples .>= lbs)
@test all(samples .<= ubs)
@test all(abs.(stoichiometry(model) * samples) .< TEST_TOLERANCE)
end
@@ -6,10 +6,10 @@
)
model = load_model(StandardModel, model_path)
pts = warmup_from_variability(
100,
pts, lbs, ubs = warmup_from_variability(
model,
Tulip.Optimizer;
Tulip.Optimizer,
100;
modifications = [change_constraint("EX_glc__D_e", -2, 2)],
workers = W,
)
@@ -18,4 +18,6 @@
@test size(pts) == (95, 100)
@test all(pts[idx, :] .>= -2)
@test all(pts[idx, :] .<= 2)
@test lbs[idx] == -2
@test ubs[idx] == 2
end
@@ -12,7 +12,6 @@ using SHA
using SparseArrays
using Statistics
using Tulip
using MCMCChains
# tolerance for comparing analysis results (should be a bit bigger than the
# error tolerance in computations)