"""Neural-network card picker: scores, selects, and REINFORCE-trains deck picks."""
import os

import numpy as np

from ai.nn import NeuralNet, _softmax
# Separate weights file so this NN trains independently from the plan NN.
CARD_PICK_WEIGHTS_PATH = os.path.join(os.path.dirname(__file__), "card_pick_weights.json")

# Input width of the card-pick net: 7 static per-card features + 8 context features.
N_CARD_FEATURES = 15

# Normalization constants — chosen to cover the realistic stat range for generated cards.
_MAX_ATK = 50.0
_MAX_DEF = 100.0
|
def _precompute_static_features(allowed: list) -> np.ndarray:
|
|
"""
|
|
Vectorized precomputation of the 7 per-card static features for the whole pool.
|
|
Returns (n, 7) float32. Called once per choose_cards() invocation.
|
|
"""
|
|
n = len(allowed)
|
|
atk = np.array([c.attack for c in allowed], dtype=np.float32)
|
|
defn = np.array([c.defense for c in allowed], dtype=np.float32)
|
|
cost = np.array([c.cost for c in allowed], dtype=np.float32)
|
|
rar = np.array([c.card_rarity.value for c in allowed], dtype=np.float32)
|
|
typ = np.array([c.card_type.value for c in allowed], dtype=np.float32)
|
|
|
|
exact_cost = np.minimum(10.0, np.maximum(1.0, ((atk**2 + defn**2)**0.18) / 1.5))
|
|
total = atk + defn
|
|
atk_ratio = np.where(total > 0, atk / total, 0.5)
|
|
pcv_norm = np.clip(exact_cost - cost, 0.0, 1.0)
|
|
|
|
out = np.empty((n, 7), dtype=np.float32)
|
|
out[:, 0] = atk / _MAX_ATK
|
|
out[:, 1] = defn / _MAX_DEF
|
|
out[:, 2] = cost / 10.0
|
|
out[:, 3] = rar / 5.0
|
|
out[:, 4] = atk_ratio
|
|
out[:, 5] = pcv_norm
|
|
out[:, 6] = typ / 9.0
|
|
return out
|
|
|
|
|
|
class CardPickPlayer:
    """
    Uses a NeuralNet to sequentially select cards from a pool until the cost
    budget is exhausted. API mirrors NeuralPlayer so training code stays uniform.

    In training mode: samples stochastically (softmax over net scores) and
    records the trajectory for a REINFORCE update after the game ends.
    In inference mode: picks the highest-scoring affordable card at each step.

    Performance design:
    - Static per-card features (7) are computed once via vectorized numpy.
    - Context features (8) use running totals updated by O(1) increments.
    - Picked cards are tracked with a boolean mask; no list.remove() calls.
    - Each pick step does one small forward pass over the affordable subset only.
    """

    def __init__(self, net: NeuralNet, training: bool = False, temperature: float = 1.0):
        """
        net: NeuralNet that scores (k, N_CARD_FEATURES) matrices, one score per row.
        training: when True, sample picks stochastically and record the trajectory.
        temperature: softmax temperature for training-time exploration (must be > 0).
        """
        self.net = net
        self.training = training
        self.temperature = temperature
        # One (features_matrix, chosen_local_idx) entry per pick step.
        self.trajectory: list[tuple[np.ndarray, int]] = []

    def choose_cards(self, allowed: list, difficulty: int, *, budget: int = 50) -> list:
        """
        Build a deck by repeatedly scoring the affordable remainder of the pool.

        allowed: pre-filtered list of Card objects (cost ≤ max_card_cost already applied).
        difficulty: game difficulty, exposed to the net as difficulty / 10.
        budget: total deck cost budget. Default 50 preserves the historical
            hard-coded value; the budget context feature is normalized by this
            same value so other budgets stay on the net's expected scale.
        Returns the selected deck as a list of Cards.
        """
        n = len(allowed)

        static = _precompute_static_features(allowed)  # (n, 7) — computed once
        costs = np.array([c.cost for c in allowed], dtype=np.float32)
        picked = np.zeros(n, dtype=bool)

        budget_f = float(budget)
        budget_remaining = budget_f
        selected: list = []

        # Running totals for context features — incremented O(1) per pick.
        n_picked = 0
        sum_atk = 0.0
        sum_def = 0.0
        sum_cost = 0.0
        n_cheap = 0  # picks with cost ≤ 3
        n_high = 0   # picks with cost ≥ 6

        diff_norm = difficulty / 10.0

        while True:
            # Candidates: not yet picked and still affordable this step.
            mask = (~picked) & (costs <= budget_remaining)
            if not mask.any():
                break

            idxs = np.where(mask)[0]

            # Context row — same for every candidate this step, broadcast via tile.
            if n_picked > 0:
                ctx = np.array([
                    n_picked / 30.0,
                    budget_remaining / budget_f,
                    sum_atk / n_picked / _MAX_ATK,
                    sum_def / n_picked / _MAX_DEF,
                    sum_cost / n_picked / 10.0,
                    n_cheap / n_picked,
                    n_high / n_picked,
                    diff_norm,
                ], dtype=np.float32)
            else:
                ctx = np.array([
                    0.0, budget_remaining / budget_f, 0.0, 0.0, 0.0, 0.0, 0.0, diff_norm,
                ], dtype=np.float32)

            features = np.concatenate(
                [static[idxs], np.tile(ctx, (len(idxs), 1))],
                axis=1,
            )
            scores = self.net.forward(features)

            if self.training:
                # Temperature-scaled softmax in float64; clip + renormalize so
                # np.random.choice accepts the probabilities as summing to 1.
                probs = _softmax((scores / self.temperature).astype(np.float64))
                probs = np.clip(probs, 1e-10, None)
                probs /= probs.sum()
                local_idx = int(np.random.choice(len(idxs), p=probs))
                self.trajectory.append((features, local_idx))
            else:
                local_idx = int(np.argmax(scores))

            global_idx = idxs[local_idx]
            card = allowed[global_idx]
            picked[global_idx] = True
            selected.append(card)

            # Incremental context update — O(1).
            budget_remaining -= card.cost
            n_picked += 1
            sum_atk += card.attack
            sum_def += card.defense
            sum_cost += card.cost
            if card.cost <= 3:
                n_cheap += 1
            if card.cost >= 6:
                n_high += 1

        return selected

    def compute_grads(self, outcome: float) -> tuple[list, list] | None:
        """
        REINFORCE gradients averaged over the pick trajectory.

        outcome: centered reward (win/loss minus baseline); scales both the
            magnitude and the sign of the policy-gradient step.
        Returns (grads_w, grads_b) shaped like net.weights / net.biases, or
        None if no picks were recorded. Clears the trajectory when gradients
        are returned.
        """
        if not self.trajectory:
            return None

        acc_gw = [np.zeros_like(w) for w in self.net.weights]
        acc_gb = [np.zeros_like(b) for b in self.net.biases]

        for features, chosen_idx in self.trajectory:
            # forward() immediately before backward() so the net's cached
            # activations correspond to this step's features matrix.
            scores = self.net.forward(features)
            probs = _softmax(scores.astype(np.float64)).astype(np.float32)
            # d(log softmax[chosen]) / d(scores) = onehot(chosen) - probs.
            upstream = -probs.copy()
            upstream[chosen_idx] += 1.0
            upstream *= outcome
            gw, gb = self.net.backward(upstream)
            for i in range(len(acc_gw)):
                acc_gw[i] += gw[i]
                acc_gb[i] += gb[i]

        # Average over steps so long pick sequences don't dominate the update.
        n_steps = len(self.trajectory)
        for i in range(len(acc_gw)):
            acc_gw[i] /= n_steps
            acc_gb[i] /= n_steps

        self.trajectory.clear()
        return acc_gw, acc_gb