"""Neural-network card picker: scores, selects, and REINFORCE-trains deck picks."""
import os

import numpy as np

from ai.nn import NeuralNet, _softmax
# Separate weights file so this NN trains independently from the plan NN.
CARD_PICK_WEIGHTS_PATH = os.path.join(os.path.dirname(__file__), "card_pick_weights.json")

# Input width of the card-pick net: 7 static per-card features + 8 context features.
N_CARD_FEATURES = 15

# Normalization constants — chosen to cover the realistic stat range for generated cards.
_MAX_ATK = 50.0
_MAX_DEF = 100.0
|
def _precompute_static_features(allowed: list) -> np.ndarray:
|
|
"""
|
|
Vectorized precomputation of the 7 per-card static features for the whole pool.
|
|
Returns (n, 7) float32. Called once per choose_cards() invocation.
|
|
"""
|
|
n = len(allowed)
|
|
atk = np.array([c.attack for c in allowed], dtype=np.float32)
|
|
defn = np.array([c.defense for c in allowed], dtype=np.float32)
|
|
cost = np.array([c.cost for c in allowed], dtype=np.float32)
|
|
rar = np.array([c.card_rarity.value for c in allowed], dtype=np.float32)
|
|
typ = np.array([c.card_type.value for c in allowed], dtype=np.float32)
|
|
|
|
exact_cost = np.minimum(10.0, np.maximum(1.0, ((atk**2 + defn**2)**0.18) / 1.5))
|
|
total = atk + defn
|
|
atk_ratio = np.where(total > 0, atk / total, 0.5)
|
|
pcv_norm = np.clip(exact_cost - cost, 0.0, 1.0)
|
|
|
|
out = np.empty((n, 7), dtype=np.float32)
|
|
out[:, 0] = atk / _MAX_ATK
|
|
out[:, 1] = defn / _MAX_DEF
|
|
out[:, 2] = cost / 10.0
|
|
out[:, 3] = rar / 5.0
|
|
out[:, 4] = atk_ratio
|
|
out[:, 5] = pcv_norm
|
|
out[:, 6] = typ / 9.0
|
|
return out
|
|
|
|
|
|
class CardPickPlayer:
    """
    Uses a NeuralNet to sequentially select cards from a pool until the cost
    budget is exhausted. API mirrors NeuralPlayer so training code stays uniform.

    In training mode: samples stochastically (softmax over net scores) and
    records the trajectory for a REINFORCE update after the game ends.
    In inference mode: picks the highest-scoring affordable card at each step.

    Performance design:
    - Static per-card features (7) are computed once via vectorized numpy.
    - Context features (8) use running totals updated by O(1) increments.
    - Picked cards are tracked with a boolean mask; no list.remove() calls.
    - Each pick step does one small forward pass over the affordable subset only.
    """

    def __init__(self, net: NeuralNet, training: bool = False, temperature: float = 1.0):
        """
        net: NeuralNet that scores (k, N_CARD_FEATURES) matrices, one score per row.
        training: when True, sample picks stochastically and record the trajectory.
        temperature: softmax temperature for training-time exploration (must be > 0).
        """
        self.net = net
        self.training = training
        self.temperature = temperature
        # One (features_matrix, chosen_local_idx) entry per pick step.
        self.trajectory: list[tuple[np.ndarray, int]] = []

    def choose_cards(self, allowed: list, difficulty: int, *, budget: int = 50) -> list:
        """
        Build a deck by repeatedly scoring the affordable remainder of the pool.

        allowed: pre-filtered list of Card objects (cost ≤ max_card_cost already applied).
        difficulty: game difficulty, exposed to the net as difficulty / 10.
        budget: total deck cost budget. Default 50 preserves the historical
            hard-coded value; the budget context feature is normalized by this
            same value so other budgets stay on the net's expected scale.
        Returns the selected deck as a list of Cards.
        """
        n = len(allowed)

        static = _precompute_static_features(allowed)  # (n, 7) — computed once
        costs = np.array([c.cost for c in allowed], dtype=np.float32)
        picked = np.zeros(n, dtype=bool)

        budget_f = float(budget)
        budget_remaining = budget_f
        selected: list = []

        # Running totals for context features — incremented O(1) per pick.
        n_picked = 0
        sum_atk = 0.0
        sum_def = 0.0
        sum_cost = 0.0
        n_cheap = 0  # picks with cost ≤ 3
        n_high = 0   # picks with cost ≥ 6

        diff_norm = difficulty / 10.0

        while True:
            # Candidates: not yet picked and still affordable this step.
            mask = (~picked) & (costs <= budget_remaining)
            if not mask.any():
                break

            idxs = np.where(mask)[0]

            # Context row — same for every candidate this step, broadcast via tile.
            if n_picked > 0:
                ctx = np.array([
                    n_picked / 30.0,
                    budget_remaining / budget_f,
                    sum_atk / n_picked / _MAX_ATK,
                    sum_def / n_picked / _MAX_DEF,
                    sum_cost / n_picked / 10.0,
                    n_cheap / n_picked,
                    n_high / n_picked,
                    diff_norm,
                ], dtype=np.float32)
            else:
                ctx = np.array([
                    0.0, budget_remaining / budget_f, 0.0, 0.0, 0.0, 0.0, 0.0, diff_norm,
                ], dtype=np.float32)

            features = np.concatenate(
                [static[idxs], np.tile(ctx, (len(idxs), 1))],
                axis=1,
            )
            scores = self.net.forward(features)

            if self.training:
                # Temperature-scaled softmax in float64; clip + renormalize so
                # np.random.choice accepts the probabilities as summing to 1.
                probs = _softmax((scores / self.temperature).astype(np.float64))
                probs = np.clip(probs, 1e-10, None)
                probs /= probs.sum()
                local_idx = int(np.random.choice(len(idxs), p=probs))
                self.trajectory.append((features, local_idx))
            else:
                local_idx = int(np.argmax(scores))

            global_idx = idxs[local_idx]
            card = allowed[global_idx]
            picked[global_idx] = True
            selected.append(card)

            # Incremental context update — O(1).
            budget_remaining -= card.cost
            n_picked += 1
            sum_atk += card.attack
            sum_def += card.defense
            sum_cost += card.cost
            if card.cost <= 3:
                n_cheap += 1
            if card.cost >= 6:
                n_high += 1

        return selected

    def compute_grads(self, outcome: float) -> tuple[list, list] | None:
        """
        REINFORCE gradients averaged over the pick trajectory.

        outcome: centered reward (win/loss minus baseline); scales both the
            magnitude and the sign of the policy-gradient step.
        Returns (grads_w, grads_b) shaped like net.weights / net.biases, or
        None if no picks were recorded. Clears the trajectory when gradients
        are returned.
        """
        if not self.trajectory:
            return None

        acc_gw = [np.zeros_like(w) for w in self.net.weights]
        acc_gb = [np.zeros_like(b) for b in self.net.biases]

        for features, chosen_idx in self.trajectory:
            # forward() immediately before backward() so the net's cached
            # activations correspond to this step's features matrix.
            scores = self.net.forward(features)
            probs = _softmax(scores.astype(np.float64)).astype(np.float32)
            # d(log softmax[chosen]) / d(scores) = onehot(chosen) - probs.
            upstream = -probs.copy()
            upstream[chosen_idx] += 1.0
            upstream *= outcome
            gw, gb = self.net.backward(upstream)
            for i in range(len(acc_gw)):
                acc_gw[i] += gw[i]
                acc_gb[i] += gb[i]

        # Average over steps so long pick sequences don't dominate the update.
        n_steps = len(self.trajectory)
        for i in range(len(acc_gw)):
            acc_gw[i] /= n_steps
            acc_gb[i] /= n_steps

        self.trajectory.clear()
        return acc_gw, acc_gb