import os

import numpy as np

from ai.nn import NeuralNet, _softmax

# Separate weights file so this NN trains independently from the plan NN.
CARD_PICK_WEIGHTS_PATH = os.path.join(os.path.dirname(__file__), "card_pick_weights.json")

# 7 static per-card features + 8 context features (see choose_cards).
N_CARD_FEATURES = 15

# Normalization constants — chosen to cover the realistic stat range for generated cards.
_MAX_ATK = 50.0
_MAX_DEF = 100.0


def _precompute_static_features(allowed: list) -> np.ndarray:
    """
    Vectorized precomputation of the 7 per-card static features for the whole pool.

    Returns (n, 7) float32. Called once per choose_cards() invocation.
    """
    n = len(allowed)
    atk = np.array([c.attack for c in allowed], dtype=np.float32)
    defn = np.array([c.defense for c in allowed], dtype=np.float32)
    cost = np.array([c.cost for c in allowed], dtype=np.float32)
    rar = np.array([c.card_rarity.value for c in allowed], dtype=np.float32)
    typ = np.array([c.card_type.value for c in allowed], dtype=np.float32)

    # Heuristic "fair cost" derived from raw stats, clamped to the 1..10 scale.
    exact_cost = np.minimum(10.0, np.maximum(1.0, ((atk**2 + defn**2) ** 0.18) / 1.5))
    total = atk + defn
    # Safe division: the original `np.where(total > 0, atk / total, 0.5)`
    # evaluated atk/total for ALL elements, so 0-stat cards triggered
    # RuntimeWarnings and transient NaNs. np.divide with `where=` only
    # divides where total > 0; 0.5 is the neutral ratio for stat-less cards.
    atk_ratio = np.full(n, 0.5, dtype=np.float32)
    np.divide(atk, total, out=atk_ratio, where=total > 0)
    # How much cheaper the card is than its heuristic fair cost (clipped to [0, 1]).
    pcv_norm = np.clip(exact_cost - cost, 0.0, 1.0)

    out = np.empty((n, 7), dtype=np.float32)
    out[:, 0] = atk / _MAX_ATK
    out[:, 1] = defn / _MAX_DEF
    out[:, 2] = cost / 10.0
    out[:, 3] = rar / 5.0
    out[:, 4] = atk_ratio
    out[:, 5] = pcv_norm
    out[:, 6] = typ / 9.0
    return out


class CardPickPlayer:
    """
    Uses a NeuralNet to sequentially select cards from a pool until the cost
    budget is exhausted. API mirrors NeuralPlayer so training code stays uniform.

    In training mode: samples stochastically (softmax) and records the
    trajectory for a REINFORCE update after the game ends.
    In inference mode: picks the highest-scoring affordable card at each step.

    Performance design:
    - Static per-card features (7) are computed once via vectorized numpy.
    - Context features (8) use running totals updated by O(1) increments.
    - Picked cards are tracked with a boolean mask; no list.remove() calls.
    - Each pick step does one small forward pass over the affordable subset only.
    """

    def __init__(self, net: NeuralNet, training: bool = False, temperature: float = 1.0):
        self.net = net
        self.training = training
        self.temperature = temperature
        # Recorded (features_matrix, chosen_idx) pairs, consumed by compute_grads().
        self.trajectory: list[tuple[np.ndarray, int]] = []

    def choose_cards(self, allowed: list, difficulty: int) -> list:
        """
        allowed: pre-filtered list of Card objects (cost ≤ max_card_cost already applied).
        Returns the selected deck as a list of Cards.
        """
        BUDGET = 50
        n = len(allowed)
        static = _precompute_static_features(allowed)  # (n, 7) — computed once
        costs = np.array([c.cost for c in allowed], dtype=np.float32)
        picked = np.zeros(n, dtype=bool)
        budget_remaining = BUDGET
        selected: list = []

        # Running totals for context features — incremented O(1) per pick.
        n_picked = 0
        sum_atk = 0.0
        sum_def = 0.0
        sum_cost = 0.0
        n_cheap = 0  # cost ≤ 3
        n_high = 0   # cost ≥ 6
        diff_norm = difficulty / 10.0

        while True:
            # Candidates: not yet picked and still affordable.
            mask = (~picked) & (costs <= budget_remaining)
            if not mask.any():
                break
            idxs = np.where(mask)[0]

            # Context row — same for every candidate this step, broadcast via tile.
            if n_picked > 0:
                ctx = np.array([
                    n_picked / 30.0,
                    budget_remaining / 50.0,
                    sum_atk / n_picked / _MAX_ATK,
                    sum_def / n_picked / _MAX_DEF,
                    sum_cost / n_picked / 10.0,
                    n_cheap / n_picked,
                    n_high / n_picked,
                    diff_norm,
                ], dtype=np.float32)
            else:
                ctx = np.array([
                    0.0,
                    budget_remaining / 50.0,
                    0.0, 0.0, 0.0, 0.0, 0.0,
                    diff_norm,
                ], dtype=np.float32)

            features = np.concatenate(
                [static[idxs], np.tile(ctx, (len(idxs), 1))],
                axis=1,
            )
            scores = self.net.forward(features)

            if self.training:
                # Temperature-scaled softmax sampling; clip/renormalize so
                # np.random.choice never sees an exact zero or a sum ≠ 1.
                probs = _softmax((scores / self.temperature).astype(np.float64))
                probs = np.clip(probs, 1e-10, None)
                probs /= probs.sum()
                local_idx = int(np.random.choice(len(idxs), p=probs))
                self.trajectory.append((features, local_idx))
            else:
                local_idx = int(np.argmax(scores))

            global_idx = idxs[local_idx]
            card = allowed[global_idx]
            picked[global_idx] = True
            selected.append(card)

            # Incremental context update — O(1).
            budget_remaining -= card.cost
            n_picked += 1
            sum_atk += card.attack
            sum_def += card.defense
            sum_cost += card.cost
            if card.cost <= 3:
                n_cheap += 1
            if card.cost >= 6:
                n_high += 1

        return selected

    def compute_grads(self, outcome: float) -> tuple[list, list] | None:
        """
        REINFORCE gradients averaged over the pick trajectory.

        outcome: centered reward (win/loss minus baseline).
        Returns (grads_w, grads_b), or None if no picks were made.
        """
        if not self.trajectory:
            return None

        acc_gw = [np.zeros_like(w) for w in self.net.weights]
        acc_gb = [np.zeros_like(b) for b in self.net.biases]

        for features, chosen_idx in self.trajectory:
            # Re-run forward so net.backward() sees this step's activations.
            scores = self.net.forward(features)
            probs = _softmax(scores.astype(np.float64)).astype(np.float32)
            # d(log softmax)/d(scores) at the chosen index: onehot - probs,
            # scaled by the centered reward.
            upstream = -probs.copy()
            upstream[chosen_idx] += 1.0
            upstream *= outcome
            gw, gb = self.net.backward(upstream)
            for i in range(len(acc_gw)):
                acc_gw[i] += gw[i]
                acc_gb[i] += gb[i]

        # Average over trajectory length, then reset for the next game.
        n = len(self.trajectory)
        for i in range(len(acc_gw)):
            acc_gw[i] /= n
            acc_gb[i] /= n
        self.trajectory.clear()
        return acc_gw, acc_gb