This commit is contained in:
2026-04-01 18:31:33 +02:00
parent 6e23e32bb0
commit b5c7c5305a
95 changed files with 9609 additions and 2374 deletions

176
backend/ai/card_pick_nn.py Normal file
View File

@@ -0,0 +1,176 @@
import os
import numpy as np
from ai.nn import NeuralNet, _softmax
# Separate weights file so this NN trains independently from the plan NN.
CARD_PICK_WEIGHTS_PATH = os.path.join(os.path.dirname(__file__), "card_pick_weights.json")
# Input width of the card-pick net: 7 static per-card features
# (_precompute_static_features) + 8 per-step context features (choose_cards).
N_CARD_FEATURES = 15
# Normalization constants — chosen to cover the realistic stat range for generated cards.
_MAX_ATK = 50.0
_MAX_DEF = 100.0
def _precompute_static_features(allowed: list) -> np.ndarray:
"""
Vectorized precomputation of the 7 per-card static features for the whole pool.
Returns (n, 7) float32. Called once per choose_cards() invocation.
"""
n = len(allowed)
atk = np.array([c.attack for c in allowed], dtype=np.float32)
defn = np.array([c.defense for c in allowed], dtype=np.float32)
cost = np.array([c.cost for c in allowed], dtype=np.float32)
rar = np.array([c.card_rarity.value for c in allowed], dtype=np.float32)
typ = np.array([c.card_type.value for c in allowed], dtype=np.float32)
exact_cost = np.minimum(10.0, np.maximum(1.0, ((atk**2 + defn**2)**0.18) / 1.5))
total = atk + defn
atk_ratio = np.where(total > 0, atk / total, 0.5)
pcv_norm = np.clip(exact_cost - cost, 0.0, 1.0)
out = np.empty((n, 7), dtype=np.float32)
out[:, 0] = atk / _MAX_ATK
out[:, 1] = defn / _MAX_DEF
out[:, 2] = cost / 10.0
out[:, 3] = rar / 5.0
out[:, 4] = atk_ratio
out[:, 5] = pcv_norm
out[:, 6] = typ / 9.0
return out
class CardPickPlayer:
    """
    Uses a NeuralNet to sequentially select cards from a pool until the cost
    budget is exhausted. API mirrors NeuralPlayer so training code stays uniform.

    In training mode: samples stochastically (softmax) and records the
    trajectory for a REINFORCE update after the game ends.
    In inference mode: picks the highest-scoring affordable card at each step.

    Performance design:
    - Static per-card features (7) are computed once via vectorized numpy.
    - Context features (8) use running totals updated by O(1) increments.
    - Picked cards are tracked with a boolean mask; no list.remove() calls.
    - Each pick step does one small forward pass over the affordable subset only.
    """
    def __init__(self, net: NeuralNet, training: bool = False, temperature: float = 1.0):
        """
        net: scoring network; forward() maps a (k, 15) feature matrix to k scores.
        training: when True, sample picks stochastically and record the trajectory.
        temperature: softmax temperature for training-mode sampling (must be > 0).
        """
        self.net = net
        self.training = training
        self.temperature = temperature
        # One (features_matrix, chosen_local_idx) entry per pick step,
        # consumed (and cleared) by compute_grads().
        self.trajectory: list[tuple[np.ndarray, int]] = []
    def choose_cards(self, allowed: list, difficulty: int, budget: int = 50) -> list:
        """
        Sequentially pick cards from `allowed` until no affordable card remains.

        allowed: pre-filtered list of Card objects (cost ≤ max_card_cost already applied).
        difficulty: fed to the net as a normalized context feature (difficulty / 10).
        budget: total cost budget for the deck. Default 50 preserves the previous
            hard-coded behavior; the same value now also drives the context-feature
            normalizer so the two can no longer drift apart.
        Returns the selected deck as a list of Cards.
        """
        n = len(allowed)
        static = _precompute_static_features(allowed)  # (n, 7) — computed once
        costs = np.array([c.cost for c in allowed], dtype=np.float32)
        picked = np.zeros(n, dtype=bool)
        budget_remaining = budget
        selected: list = []
        # Running totals for context features — incremented O(1) per pick.
        n_picked = 0
        sum_atk = 0.0
        sum_def = 0.0
        sum_cost = 0.0
        n_cheap = 0  # cost ≤ 3
        n_high = 0   # cost ≥ 6
        diff_norm = difficulty / 10.0
        budget_norm = float(budget)  # was a literal 50.0, duplicated from the old BUDGET constant
        while True:
            mask = (~picked) & (costs <= budget_remaining)
            if not mask.any():
                break
            idxs = np.where(mask)[0]
            # Context row — same for every candidate this step, broadcast via tile.
            if n_picked > 0:
                ctx = np.array([
                    n_picked / 30.0,
                    budget_remaining / budget_norm,
                    sum_atk / n_picked / _MAX_ATK,
                    sum_def / n_picked / _MAX_DEF,
                    sum_cost / n_picked / 10.0,
                    n_cheap / n_picked,
                    n_high / n_picked,
                    diff_norm,
                ], dtype=np.float32)
            else:
                ctx = np.array([
                    0.0, budget_remaining / budget_norm, 0.0, 0.0, 0.0, 0.0, 0.0, diff_norm,
                ], dtype=np.float32)
            features = np.concatenate(
                [static[idxs], np.tile(ctx, (len(idxs), 1))],
                axis=1,
            )
            scores = self.net.forward(features)
            if self.training:
                probs = _softmax((scores / self.temperature).astype(np.float64))
                # Floor then renormalize so np.random.choice never sees an exact zero.
                probs = np.clip(probs, 1e-10, None)
                probs /= probs.sum()
                local_idx = int(np.random.choice(len(idxs), p=probs))
                self.trajectory.append((features, local_idx))
            else:
                local_idx = int(np.argmax(scores))
            global_idx = idxs[local_idx]
            card = allowed[global_idx]
            picked[global_idx] = True
            selected.append(card)
            # Incremental context update — O(1).
            budget_remaining -= card.cost
            n_picked += 1
            sum_atk += card.attack
            sum_def += card.defense
            sum_cost += card.cost
            if card.cost <= 3: n_cheap += 1
            if card.cost >= 6: n_high += 1
        return selected
    def compute_grads(self, outcome: float) -> tuple[list, list] | None:
        """
        REINFORCE gradients averaged over the pick trajectory.

        For each recorded step, re-runs forward() on the stored feature matrix
        before calling backward() — presumably the net caches activations from
        the forward pass for use in backward() (implied by the paired calls;
        confirm against NeuralNet). The upstream signal is the policy gradient
        (one_hot(chosen) - softmax(scores)) * outcome.

        outcome: centered reward (win/loss minus baseline).
        Returns (grads_w, grads_b) averaged over steps, or None if no picks
        were made. Clears the trajectory as a side effect.
        """
        if not self.trajectory:
            return None
        acc_gw = [np.zeros_like(w) for w in self.net.weights]
        acc_gb = [np.zeros_like(b) for b in self.net.biases]
        for features, chosen_idx in self.trajectory:
            scores = self.net.forward(features)
            probs = _softmax(scores.astype(np.float64)).astype(np.float32)
            # Gradient of log-softmax at the chosen index, scaled by the reward.
            upstream = -probs.copy()
            upstream[chosen_idx] += 1.0
            upstream *= outcome
            gw, gb = self.net.backward(upstream)
            for i in range(len(acc_gw)):
                acc_gw[i] += gw[i]
                acc_gb[i] += gb[i]
        n = len(self.trajectory)
        for i in range(len(acc_gw)):
            acc_gw[i] /= n
            acc_gb[i] /= n
        self.trajectory.clear()
        return acc_gw, acc_gb