Lecture 21: REINFORCE Demo Code
REINFORCE Algorithm Demo
This code demonstrates an implementation of the REINFORCE (Monte Carlo policy gradient) algorithm in PyTorch on Gymnasium's CartPole-v1 environment. REINFORCE is a foundational policy gradient method that directly optimizes the policy by following the gradient of the expected return.
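Concretely, for a single episode the algorithm follows the Monte Carlo estimate of the policy gradient

$$\nabla_\theta J(\theta) \;\approx\; \sum_{t=0}^{T} G_t \,\nabla_\theta \log \pi_\theta(a_t \mid s_t), \qquad G_t = \sum_{k=t}^{T} \gamma^{\,k-t} r_k,$$

which the code below realizes by minimizing the negated, return-weighted log-probabilities $-\frac{1}{B}\sum_t G_t \log \pi_\theta(a_t \mid s_t)$ over minibatches of the episode.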
Key concepts illustrated:
- Policy network with softmax output for action selection
- Monte Carlo return calculation (see the short example after this list)
- Policy gradient update using log-probability and returns
- Stochastic exploration by sampling actions from the softmax policy distribution
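To make the return calculation concrete, here is a small standalone sketch (the toy rewards and the discount value are illustrative only) of the backward recursion G_t = r_t + gamma * G_{t+1} that the update step below also uses:

import numpy as np

gamma = 0.99
rewards = [1.0, 1.0, 1.0]        # toy 3-step episode
returns = np.zeros(len(rewards))
g = 0.0
for t in reversed(range(len(rewards))):
    g = rewards[t] + gamma * g   # G_t = r_t + gamma * G_{t+1}
    returns[t] = g
print(returns)                   # -> approximately [2.9701 1.99 1.0]

The same recursion appears in Policy.update below, written with the accumulator updated as s = s * gamma + reward[i].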
REINFORCE Implementation
# Partially Adapted from https://towardsdatascience.com/learning-reinforcement-learning-reinforce-with-pytorch-5e8ad7fc7da0
# %%
import numpy as np
import gymnasium as gym
import torch
from torch import nn
import torch.nn.functional as F
import random
from collections import deque
import copy
from tqdm.std import tqdm
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# %%
env = gym.make('CartPole-v1')
n_state = int(np.prod(env.observation_space.shape))
n_action = env.action_space.n
print("State dimension:", n_state)
print("Number of actions:", n_action)
# %%
def run_episode(env, policy, render=False):
    """Roll out one episode with the given policy and record the transitions."""
    obs_list = []
    act_list = []
    reward_list = []
    next_obs_list = []
    done_list = []
    obs = env.reset()[0]  # gymnasium's reset() returns (observation, info)
    while True:
        if render:
            env.render()
        action = policy(obs)
        next_obs, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        obs_list.append(obs)
        act_list.append(action)
        reward_list.append(reward)
        next_obs_list.append(next_obs)
        done_list.append(done)
        if done:
            break
        obs = next_obs
    return obs_list, act_list, reward_list, next_obs_list, done_list
# %%
class Policy:
    def __init__(self, n_state, n_action):
        # Define the policy network: maps a state to action probabilities
        self.act_net = nn.Sequential(
            nn.Linear(n_state, 16),
            nn.ReLU(),
            nn.Linear(16, n_action),
            nn.Softmax(dim=-1)
        )
        self.act_net.to(device)
        self.gamma = 0.99
        self.optimizer = torch.optim.Adam(self.act_net.parameters(), lr=1e-3)

    def __call__(self, state):
        # Sample an action from the current policy distribution
        with torch.no_grad():
            state = torch.FloatTensor(state).to(device)
            action_probs = self.act_net(state).cpu().numpy()
        action = np.random.choice(len(action_probs), p=action_probs)
        return action

    def update(self, data):
        obs, act, reward, next_obs, done = data
        # Calculate cumulative discounted returns by working backwards
        # through the episode: G_t = r_t + gamma * G_{t+1}
        returns = np.zeros_like(reward)
        s = 0
        for i in reversed(range(len(returns))):
            s = s * self.gamma + reward[i]
            returns[i] = s
        obs = torch.FloatTensor(np.array(obs)).to(device)
        returns = torch.FloatTensor(returns).to(device)
        # Actions are used as indices, so they must be a LongTensor
        act = torch.LongTensor(act).to(device)
        # Policy gradient update over minibatches of the episode
        batch_size = 32
        indices = list(range(len(obs)))
        for i in range(0, len(indices), batch_size):
            index = indices[i:i + batch_size]
            logprob = torch.log(self.act_net(obs[index, :]))
            # Return-weighted log-probability of the actions actually taken
            gt_logprob = returns[index] * \
                torch.gather(logprob, 1, act[index, None]).squeeze()
            loss = -gt_logprob.mean()
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
        return loss.item()
# %%
losses_list, reward_list = [], []
policy = Policy(n_state, n_action)
loss = 0
n_step = 0
for i in tqdm(range(2000)):
    # Collect one full episode with the current policy
    data = run_episode(env, policy)
    # Take several update passes over the collected episode
    for _ in range(5):
        loss = policy.update(data)
    rew = sum(data[2])
    if i > 0 and i % 50 == 0:
        print("itr:({:>5d}) loss:{:>3.4f} reward:{:>3.1f}".format(
            i, np.mean(losses_list[-50:]), np.mean(reward_list[-50:])))
    losses_list.append(loss)
    reward_list.append(rew)
# %%
# Evaluate the trained policy over 100 fresh episodes
scores = [sum(run_episode(env, policy, False)[2]) for _ in range(100)]
print("Final score:", np.mean(scores))
# Save the training curves for later analysis
import pandas as pd
df = pd.DataFrame({'loss': losses_list, 'reward': reward_list})
df.to_csv("./ClassMaterials/Lecture_19_Actor_Critic/data/REINFORCE.csv",
          index=False, header=True)