Methodology
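DAVI (deep approximate value iteration) trains a value network V_theta against a slow-moving target network V_target. BWAS (batch weighted A* search) uses the trained value network as its heuristic at inference.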
DAVI training
One-step Bellman lookahead against a target network that is frozen between refreshes (it is re-copied from V_theta every K iterations).
    V_target <- copy V_theta
    for each iteration:
        sample batch of scrambles s_1, ..., s_B
        for each s in batch:
            if s is goal:
                y(s) <- 0
            else:
                children <- expand_all(s)
                y(s) <- 1 + min over children: V_target(child)
        update V_theta toward {(s_i, y(s_i))}
        every K iterations:
            V_target <- copy V_theta
BWAS inference
Batched weighted A*. Each iteration pops up to N nodes and makes a single batched heuristic call for all of their children.
    open <- priority queue keyed by f
    g[start] <- 0
    push(open, start, f = lambda * 0 + h(start))
    while open is not empty:
        batch <- pop up to N lowest-f nodes from open
        if any node in batch is goal:
            return reconstruct(parents, goal)
        children <- expand_all(batch)
        h_children <- heuristic(children)   # one batched call
        for each (parent, action) -> child in children:
            g_new <- g[parent] + 1
            if g_new < g[child]:            # g[child] is treated as +inf if child is unseen
                g[child] <- g_new
                parents[child] <- (parent, action)
                push(open, child, f = lambda * g_new + h_children[child])
    return failure