🐐
This commit is contained in:
176
backend/ai/card_pick_nn.py
Normal file
176
backend/ai/card_pick_nn.py
Normal file
@@ -0,0 +1,176 @@
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ai.nn import NeuralNet, _softmax
|
||||
|
||||
# Separate weights file so this NN trains independently from the plan NN.
|
||||
CARD_PICK_WEIGHTS_PATH = os.path.join(os.path.dirname(__file__), "card_pick_weights.json")
|
||||
|
||||
N_CARD_FEATURES = 15
|
||||
|
||||
# Normalization constants — chosen to cover the realistic stat range for generated cards.
|
||||
_MAX_ATK = 50.0
|
||||
_MAX_DEF = 100.0
|
||||
|
||||
|
||||
def _precompute_static_features(allowed: list) -> np.ndarray:
    """
    Build the (n, 7) float32 matrix of per-card static features for the whole
    pool in one vectorized pass.  Called once per choose_cards() invocation.

    Columns: attack (normalized), defense (normalized), cost (normalized),
    rarity (normalized), attack share of total stats, clipped cost-value
    delta, and type id (normalized).
    """
    pool_size = len(allowed)
    attack = np.fromiter((c.attack for c in allowed), dtype=np.float32, count=pool_size)
    defense = np.fromiter((c.defense for c in allowed), dtype=np.float32, count=pool_size)
    cost = np.fromiter((c.cost for c in allowed), dtype=np.float32, count=pool_size)
    rarity = np.fromiter((c.card_rarity.value for c in allowed), dtype=np.float32, count=pool_size)
    type_id = np.fromiter((c.card_type.value for c in allowed), dtype=np.float32, count=pool_size)

    # "Fair" cost implied by raw stats, bounded to the 1..10 cost scale.
    fair_cost = np.clip(((attack**2 + defense**2) ** 0.18) / 1.5, 1.0, 10.0)
    stat_total = attack + defense
    # Attack's share of total stats; 0.5 for degenerate all-zero cards.
    attack_share = np.where(stat_total > 0, attack / stat_total, 0.5)
    # How much the card under-prices its stats, clipped to [0, 1].
    value_delta = np.clip(fair_cost - cost, 0.0, 1.0)

    columns = (
        attack / _MAX_ATK,
        defense / _MAX_DEF,
        cost / 10.0,
        rarity / 5.0,
        attack_share,
        value_delta,
        type_id / 9.0,
    )
    features = np.empty((pool_size, 7), dtype=np.float32)
    for col, values in enumerate(columns):
        features[:, col] = values
    return features
|
||||
|
||||
|
||||
class CardPickPlayer:
    """
    Uses a NeuralNet to sequentially select cards from a pool until the cost
    budget is exhausted. API mirrors NeuralPlayer so training code stays uniform.

    In training mode: samples stochastically (softmax) and records the
    trajectory for a REINFORCE update after the game ends.
    In inference mode: picks the highest-scoring affordable card at each step.

    Performance design:
    - Static per-card features (7) are computed once via vectorized numpy.
    - Context features (8) use running totals updated by O(1) increments.
    - Picked cards are tracked with a boolean mask; no list.remove() calls.
    - Each pick step does one small forward pass over the affordable subset only.
    """

    def __init__(self, net: NeuralNet, training: bool = False, temperature: float = 1.0):
        # net: scoring network; must expose forward(), backward(), weights, biases.
        self.net = net
        # training: True -> sample via softmax and record trajectory; False -> greedy argmax.
        self.training = training
        # temperature: softmax sharpness used ONLY when sampling during training.
        self.temperature = temperature
        # One entry per pick step: (candidate feature matrix, index chosen within it).
        self.trajectory: list[tuple[np.ndarray, int]] = []  # (features_matrix, chosen_idx)

    def choose_cards(self, allowed: list, difficulty: int) -> list:
        """
        allowed: pre-filtered list of Card objects (cost ≤ max_card_cost already applied).
        Returns the selected deck as a list of Cards.
        """
        # Total cost budget for a deck; loop below stops when nothing affordable remains.
        BUDGET = 50
        n = len(allowed)

        static = _precompute_static_features(allowed)  # (n, 7) — computed once
        costs = np.array([c.cost for c in allowed], dtype=np.float32)
        # Boolean mask of already-selected cards; avoids O(n) list.remove().
        picked = np.zeros(n, dtype=bool)

        budget_remaining = BUDGET
        selected: list = []

        # Running totals for context features — incremented O(1) per pick.
        n_picked = 0
        sum_atk = 0.0
        sum_def = 0.0
        sum_cost = 0.0
        n_cheap = 0  # cost ≤ 3
        n_high = 0  # cost ≥ 6

        diff_norm = difficulty / 10.0

        while True:
            # Candidates: not yet picked AND affordable with the remaining budget.
            mask = (~picked) & (costs <= budget_remaining)
            if not mask.any():
                break

            idxs = np.where(mask)[0]

            # Context row — same for every candidate this step, broadcast via tile.
            if n_picked > 0:
                ctx = np.array([
                    n_picked / 30.0,
                    budget_remaining / 50.0,
                    sum_atk / n_picked / _MAX_ATK,
                    sum_def / n_picked / _MAX_DEF,
                    sum_cost / n_picked / 10.0,
                    n_cheap / n_picked,
                    n_high / n_picked,
                    diff_norm,
                ], dtype=np.float32)
            else:
                # First pick: averages are undefined (would divide by zero), so zero-fill.
                ctx = np.array([
                    0.0, budget_remaining / 50.0, 0.0, 0.0, 0.0, 0.0, 0.0, diff_norm,
                ], dtype=np.float32)

            # (k, 15) matrix: 7 static columns + 8 context columns per affordable card.
            features = np.concatenate(
                [static[idxs], np.tile(ctx, (len(idxs), 1))],
                axis=1,
            )
            scores = self.net.forward(features)

            if self.training:
                probs = _softmax((scores / self.temperature).astype(np.float64))
                # Guard against underflow-to-zero entries, then renormalize so the
                # distribution sums to exactly 1 as np.random.choice requires.
                probs = np.clip(probs, 1e-10, None)
                probs /= probs.sum()
                local_idx = int(np.random.choice(len(idxs), p=probs))
                self.trajectory.append((features, local_idx))
            else:
                local_idx = int(np.argmax(scores))

            # Map the within-candidates index back to the full pool index.
            global_idx = idxs[local_idx]
            card = allowed[global_idx]
            picked[global_idx] = True
            selected.append(card)

            # Incremental context update — O(1).
            budget_remaining -= card.cost
            n_picked += 1
            sum_atk += card.attack
            sum_def += card.defense
            sum_cost += card.cost
            if card.cost <= 3: n_cheap += 1
            if card.cost >= 6: n_high += 1

        return selected

    def compute_grads(self, outcome: float) -> tuple[list, list] | None:
        """
        REINFORCE gradients averaged over the pick trajectory.
        outcome: centered reward (win/loss minus baseline).
        Returns (grads_w, grads_b), or None if no picks were made.
        """
        if not self.trajectory:
            return None

        # Accumulators shaped like the net's parameters.
        acc_gw = [np.zeros_like(w) for w in self.net.weights]
        acc_gb = [np.zeros_like(b) for b in self.net.biases]

        for features, chosen_idx in self.trajectory:
            # Re-run the forward pass so backward() sees this step's activations.
            # NOTE(review): assumes net.backward consumes state cached by the
            # immediately preceding forward() — confirm against ai.nn.
            scores = self.net.forward(features)
            # NOTE(review): sampling in choose_cards used temperature-scaled
            # softmax, but gradients here use the unscaled softmax — confirm
            # this asymmetry is intentional.
            probs = _softmax(scores.astype(np.float64)).astype(np.float32)
            # ∇ log π(chosen) w.r.t. scores = onehot(chosen) - probs, scaled by reward.
            upstream = -probs.copy()
            upstream[chosen_idx] += 1.0
            upstream *= outcome
            gw, gb = self.net.backward(upstream)
            for i in range(len(acc_gw)):
                acc_gw[i] += gw[i]
                acc_gb[i] += gb[i]

        # Average over the trajectory so gradient scale is deck-length independent.
        n = len(self.trajectory)
        for i in range(len(acc_gw)):
            acc_gw[i] /= n
            acc_gb[i] /= n

        # Trajectory is consumed; clear so the player can be reused next game.
        self.trajectory.clear()
        return acc_gw, acc_gb
|
||||
Reference in New Issue
Block a user