🐐
This commit is contained in:
205
backend/train_nn.py
Normal file
205
backend/train_nn.py
Normal file
@@ -0,0 +1,205 @@
|
||||
import os
|
||||
import random
|
||||
import uuid
|
||||
import numpy as np
|
||||
from collections import deque
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv()
|
||||
|
||||
from card import compute_deck_type
|
||||
from ai import AIPersonality, choose_cards, choose_plan
|
||||
from game import PlayerState, GameState, action_play_card, action_sacrifice, action_end_turn
|
||||
from simulate import get_simulation_cards, _make_instances, MAX_TURNS
|
||||
from nn import NeuralNet, NeuralPlayer
|
||||
|
||||
# Weights file lives in the same directory as this module.
NN_WEIGHTS_PATH = os.path.join(os.path.dirname(__file__), "nn_weights.json")

# Seat identifiers used as keys into GameState.players and as winner ids.
P1, P2 = "p1", "p2"

# Every AI personality except ARBITRARY; training opponents are drawn from this list.
FIXED_PERSONALITIES = [
    personality
    for personality in AIPersonality
    if personality != AIPersonality.ARBITRARY
]
|
||||
|
||||
|
||||
# ==================== Game runner ====================
|
||||
|
||||
def _build_player(pid: str, name: str, cards: list, difficulty: int, personality: AIPersonality) -> PlayerState:
    """Build a PlayerState with a freshly chosen, shuffled deck.

    The deck is selected by ``choose_cards`` for the given difficulty and
    personality, turned into instances via ``_make_instances`` and shuffled
    with the global ``random`` generator. The deck type falls back to
    "Balanced" when ``compute_deck_type`` returns a falsy value.
    """
    chosen_cards = choose_cards(cards, difficulty, personality)
    shuffled_deck = _make_instances(chosen_cards)
    random.shuffle(shuffled_deck)
    deck_type = compute_deck_type(chosen_cards) or "Balanced"
    return PlayerState(
        user_id=pid,
        username=name,
        deck_type=deck_type,
        deck=shuffled_deck,
    )
|
||||
|
||||
|
||||
def run_episode(
    p1_state: PlayerState,
    p2_state: PlayerState,
    p1_ctrl,  # callable: (player, opponent) -> MovePlan
    p2_ctrl,  # callable: (player, opponent) -> MovePlan
) -> str | None:
    """Play a single game between two controllers.

    Each turn the active player's controller is asked for a plan; the plan's
    sacrifices are applied first, then its plays (in random order), and the
    turn is ended. The loop stops as soon as the game has a result or after
    ``MAX_TURNS`` turns.

    Returns the result's ``winner_id`` (P1 or P2), or ``None`` if no result
    was reached within ``MAX_TURNS`` turns.
    """
    # Opening setup: both seats get their energy cap bumped, but only the
    # first player's energy and hand are filled here.
    for seat in (p1_state, p2_state):
        seat.increment_energy_cap()
    p1_state.refill_energy()
    p1_state.draw_to_full()

    controllers = {P1: p1_ctrl, P2: p2_ctrl}
    game = GameState(
        game_id=str(uuid.uuid4()),
        players={P1: p1_state, P2: p2_state},
        player_order=[P1, P2],
        active_player_id=P1,
        phase="main",
        turn=1,
    )

    for _turn in range(MAX_TURNS):
        if game.result:
            break

        mover_id = game.active_player_id
        mover = game.players[mover_id]
        rival = game.players[game.opponent_id(mover_id)]
        plan = controllers[mover_id](mover, rival)

        # Sacrifices go first; empty slots are skipped.
        for slot in plan.sacrifice_slots:
            if mover.board[slot] is None:
                continue
            action_sacrifice(game, slot)

        # Plays are applied in random order (presumably to avoid an ordering
        # bias within a plan; confirm with the plan author).
        pending_plays = list(plan.plays)
        random.shuffle(pending_plays)
        for card, slot in pending_plays:
            # Locate the card in hand by identity, not equality.
            hand_idx = None
            for idx, held in enumerate(mover.hand):
                if held is card:
                    hand_idx = idx
                    break

            # Only play when the card is still in hand, the slot is free and
            # the card is affordable; otherwise drop this play.
            if hand_idx is not None and mover.board[slot] is None and card.cost <= mover.energy:
                action_play_card(game, hand_idx, slot)

        action_end_turn(game)

    if game.result:
        return game.result.winner_id
    return None
|
||||
|
||||
|
||||
# ==================== Training loop ====================
|
||||
|
||||
def _accumulate_grads(batch_gw: list, batch_gb: list, grads) -> int:
    """Add one player's gradients into the running batch sums.

    ``grads`` is the value returned by ``NeuralPlayer.compute_grads``: either
    ``None`` (nothing to accumulate) or a ``(weight_grads, bias_grads)`` pair
    of per-layer arrays. Returns 1 when gradients were added, otherwise 0, so
    the caller can add the result to its batch counter.
    """
    if grads is None:
        return 0
    gw, gb = grads
    for layer, (layer_gw, layer_gb) in enumerate(zip(gw, gb)):
        batch_gw[layer] += layer_gw
        batch_gb[layer] += layer_gb
    return 1


def _win_rate(outcomes) -> float:
    """Return the mean of a sequence of 0/1 outcomes, or 0.0 when it is empty."""
    return sum(outcomes) / len(outcomes) if outcomes else 0.0


def train(
    n_episodes: int = 20_000,
    self_play_start: int = 5_000,
    self_play_max_frac: float = 0.4,
    lr: float = 1e-3,
    opp_difficulty: int = 10,
    temperature: float = 1.0,
    batch_size: int = 50,
    save_every: int = 5_000,
    save_path: str = NN_WEIGHTS_PATH,
) -> NeuralNet:
    """Train the network by playing games and applying batched Adam updates.

    Each episode is either NN-vs-fixed-opponent or, with a probability that
    ramps up after ``self_play_start``, NN-vs-NN self-play. Per-episode
    gradients (scaled by outcome minus a running baseline) are summed and
    applied every ``batch_size`` gradient contributions.

    Args:
        n_episodes: Total number of games to play.
        self_play_start: Episode index from which self-play may occur.
        self_play_max_frac: Self-play probability reached at the last episode.
        lr: Learning rate passed to ``net.adam_update``.
        opp_difficulty: Difficulty used for the fixed opponent's deck and plans.
        temperature: Temperature passed to each ``NeuralPlayer``.
        batch_size: Number of gradient contributions averaged per update.
        save_every: Checkpoint interval in episodes (0 disables periodic saves).
        save_path: Weights file; loaded if it exists, and written on save.

    Returns:
        The trained ``NeuralNet`` (also saved to ``save_path``).
    """
    cards = get_simulation_cards()

    if os.path.exists(save_path):
        print(f"Resuming from {save_path}")
        net = NeuralNet.load(save_path)
    else:
        print("Initializing new network")
        net = NeuralNet(seed=42)

    # Rolling window for the displayed win rate. Only games against fixed
    # opponents are recorded: self-play has no single "NN side" to score.
    recent_outcomes: deque[int] = deque(maxlen=1000)
    baseline = 0.0  # EMA of recent outcomes; subtracted before each update
    baseline_alpha = 0.99  # decay: roughly a 100-episode window

    batch_gw = [np.zeros_like(w) for w in net.weights]
    batch_gb = [np.zeros_like(b) for b in net.biases]
    batch_count = 0

    for episode in range(1, n_episodes + 1):
        # Ramp self-play fraction linearly from 0 to self_play_max_frac
        if episode >= self_play_start:
            progress = (episode - self_play_start) / max(1, n_episodes - self_play_start)
            self_play_prob = self_play_max_frac * progress
        else:
            self_play_prob = 0.0

        # Randomly decide which seat the NN takes against a fixed opponent.
        # Drawn unconditionally so the random stream does not depend on the
        # episode type.
        nn_goes_first = random.random() < 0.5

        if random.random() < self_play_prob:
            # ---- Self-play ----
            nn1 = NeuralPlayer(net, training=True, temperature=temperature)
            nn2 = NeuralPlayer(net, training=True, temperature=temperature)

            # Both seats are built identically, so no seat swap is needed;
            # swapping would only place a state created with user_id=P2 under
            # the P1 key (and vice versa).
            p1_state = _build_player(P1, "NN1", cards, 10, AIPersonality.BALANCED)
            p2_state = _build_player(P2, "NN2", cards, 10, AIPersonality.BALANCED)

            winner = run_episode(p1_state, p2_state, nn1.choose_plan, nn2.choose_plan)

            # A timeout (winner is None) counts as a loss for both seats,
            # matching how the fixed-opponent branch scores a timeout.
            p1_outcome = 1.0 if winner == P1 else -1.0
            p2_outcome = 1.0 if winner == P2 else -1.0

            # Advantages use the baseline from *before* this episode; folding
            # the current outcome in first would bias the gradient estimate.
            grads_p1 = nn1.compute_grads(p1_outcome - baseline)
            grads_p2 = nn2.compute_grads(p2_outcome - baseline)
            baseline = baseline_alpha * baseline + (1 - baseline_alpha) * p1_outcome

            batch_count += _accumulate_grads(batch_gw, batch_gb, grads_p1)
            batch_count += _accumulate_grads(batch_gw, batch_gb, grads_p2)

        else:
            # ---- NN vs fixed opponent ----
            opp_personality = random.choice(FIXED_PERSONALITIES)
            nn_player = NeuralPlayer(net, training=True, temperature=temperature)

            # Bind the per-episode settings as defaults so the controller
            # keeps the (player, opponent) call shape run_episode expects.
            def opp_ctrl(p, o, pers=opp_personality, diff=opp_difficulty):
                return choose_plan(p, o, pers, diff)

            if nn_goes_first:
                nn_id = P1
                p1_state = _build_player(P1, "NN", cards, 10, AIPersonality.BALANCED)
                p2_state = _build_player(P2, "OPP", cards, opp_difficulty, opp_personality)
                winner = run_episode(p1_state, p2_state, nn_player.choose_plan, opp_ctrl)
            else:
                nn_id = P2
                p1_state = _build_player(P1, "OPP", cards, opp_difficulty, opp_personality)
                p2_state = _build_player(P2, "NN", cards, 10, AIPersonality.BALANCED)
                winner = run_episode(p1_state, p2_state, opp_ctrl, nn_player.choose_plan)

            nn_won = winner == nn_id
            nn_outcome = 1.0 if nn_won else -1.0
            player_grads = nn_player.compute_grads(nn_outcome - baseline)
            baseline = baseline_alpha * baseline + (1 - baseline_alpha) * nn_outcome

            batch_count += _accumulate_grads(batch_gw, batch_gb, player_grads)
            recent_outcomes.append(1 if nn_won else 0)

        # Apply one averaged update per full batch. The max(1, ...) guard
        # prevents a divide-by-zero when batch_size <= 0.
        if batch_count >= max(1, batch_size):
            mean_gw = [g / batch_count for g in batch_gw]
            mean_gb = [g / batch_count for g in batch_gb]
            net.adam_update(mean_gw, mean_gb, lr=lr)
            batch_gw = [np.zeros_like(w) for w in net.weights]
            batch_gb = [np.zeros_like(b) for b in net.biases]
            batch_count = 0

        if episode % 1000 == 0 or episode == n_episodes:
            wr = _win_rate(recent_outcomes)
            print(f"[{episode:>6}/{n_episodes}] win rate (last {len(recent_outcomes)}): {wr:.1%} "
                  f"self-play frac: {self_play_prob:.0%}", flush=True)

        # save_every == 0 disables periodic checkpoints instead of raising
        # ZeroDivisionError.
        if save_every and episode % save_every == 0:
            net.save(save_path)
            print(f" → saved to {save_path}")

    net.save(save_path)
    wr = _win_rate(recent_outcomes)
    print(f"Done. Final win rate (last {len(recent_outcomes)}): {wr:.1%}")
    return net
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run training with the default hyper-parameters.
    train()
|
||||
Reference in New Issue
Block a user