请问为什么我这个算法会收敛到一个开局就往坑里跳的结果

explorer_yuhao 2024-07-03 10:00:50

``````import torch.nn as nn
import torch.nn.functional as F
from collections import deque
import random
import torch
import torch.optim as optim
import math
import numpy as np
import gym
import os
import argparse
import matplotlib.pyplot as plt
import seaborn as sns

# 将数字转换成独热码
def to_one_hot(num, num_classes=16):
# 创建一个全0的向量
one_hot = np.zeros(num_classes, dtype=np.float32)
num = int(num)
# 在对应位置设置为1
one_hot[num] = 1
return one_hot
# 用于输出神经网络对于所有状态的q值
test_state = []
for i in range(16):
test_state.append(to_one_hot(i,16))

# 神经网络
class Net(nn.Module):
# 初始化q网络，为全连接网络
def __init__(self, n_states, n_actions, hidden_dim=128):
# 初始化q网络，为全连接网络
super(Net, self).__init__()
# 输入层
self.fc1 = nn.Linear(n_states, hidden_dim)
# 隐藏层
self.fc2 = nn.Linear(hidden_dim, hidden_dim)
# 输出层
self.fc3 = nn.Linear(hidden_dim, n_actions)

def forward(self, x):
# 各层对应的激活函数
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
return self.fc3(x)

class ReplayBuffer(object):
def __init__(self, capacity: int) -> None:
self.capacity = capacity
self.buffer = deque(maxlen=self.capacity)
# 存储transition到经验回放中
def push(self,transitions):
self.buffer.append(transitions)
def sample(self, batch_size: int, sequential: bool = False):
# 如果批量大小大于经验回放的容量，则取经验回放的容量
if batch_size > len(self.buffer):
batch_size = len(self.buffer)
if sequential: # 顺序采样
rand = random.randint(0, len(self.buffer) - batch_size)
batch = [self.buffer[i] for i in range(rand, rand + batch_size)]
return zip(*batch)
else: # 随机采样
batch = random.sample(self.buffer, batch_size)
return zip(*batch)
# 清空经验回放
def clear(self):
self.buffer.clear()
# 返回当前存储的量
def __len__(self):
return len(self.buffer)

class DQN:
def __init__(self,model,memory,cfg):
self.n_actions = cfg['n_actions']
self.device = torch.device(cfg['device'])
# 奖励的折扣因子
self.gamma = cfg['gamma']
# 用于epsilon的衰减计数
self.sample_count = 0
# 起始epsilon
self.epsilon = cfg['epsilon_start']
self.sample_count = 0
self.epsilon_start = cfg['epsilon_start']
self.epsilon_end = cfg['epsilon_end']
self.epsilon_decay = cfg['epsilon_decay']
self.batch_size = cfg['batch_size']
self.policy_net = model.to(self.device)
self.target_net = model.to(self.device)
# 复制参数到目标网络
for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()):
target_param.data.copy_(param.data)
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg['lr']) # 优化器
self.memory = memory
# 采样动作
def choose_action(self, state):
self.sample_count += 1
# epsilon指数衰减
self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
math.exp(-1. * self.sample_count / self.epsilon_decay)
# 如果随机生成的数大于epsilon，则通过神经网络选择动作
if random.random() > self.epsilon:
state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
q_values = self.policy_net(state)
action = q_values.argmax().item()
else:
action = random.randrange(self.n_actions)
return action

# 预测动作
def predict_action(self, state):
state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
q_values = self.policy_net(state)
action = q_values.argmax().item()
return action
def update(self):
# 当经验回放中不满足一个批量时，不更新策略
if len(self.memory) < self.batch_size:
return
# 从经验回放中随机采样一个批量的转移(transition)
state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
self.batch_size)
# 将数据转换为tensor
state_batch = torch.tensor(np.array(state_batch), device=self.device, dtype=torch.float)
action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1)
reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float)
next_state_batch = torch.tensor(np.array(next_state_batch), device=self.device, dtype=torch.float)
done_batch = torch.tensor(np.float32(done_batch), device=self.device).unsqueeze(1)
# 计算当前状态(s_t,a)对应的Q(s_t, a)
q_values_table = self.policy_net(state_batch)
q_values =q_values_table.gather(dim=1, index=action_batch)
next_q_values_table = self.target_net(next_state_batch)
# 计算下一时刻的状态(s_t_,a)对应的Q值
next_q_values = next_q_values_table.max(1)[0].detach()
# 计算期望的Q值，对于终止状态，此时done_batch[0]=1, 对应的expected_q_value等于reward
expected_q_values = reward_batch + self.gamma * next_q_values * (1-done_batch)
# 计算均方根损失
loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1))
# 优化更新模型
loss.backward()
# # clip防止梯度爆炸
# for param in self.policy_net.parameters():
self.optimizer.step()
# 观测对于各个状态的q值
def observation(self):
state = test_state
state = torch.tensor(state,dtype=torch.float32)
q_table =  np.array(self.policy_net(state).detach())
print(q_table)

# 训练
def train(cfg, env, env_show, agent):
print("开始训练！")
# 记录所有回合的奖励，用于绘图
rewards = []
steps = []
for i_ep in range(cfg['train_eps']):
# 记录一回合内的奖励
ep_reward = 0
ep_step = 0
# 每20回合展示一次
if (i_ep) % 20 == 0:
# 重置环境，返回初始状态
state,info = env_show.reset(seed=1)
state = to_one_hot(state,16)
for _ in range(cfg['ep_max_steps']):
ep_step += 1
# 选择动作
action = agent.choose_action(state)
# 更新环境，返回transition
next_state, reward, done,truncated, _ = env_show.step(action)
next_state = to_one_hot(next_state,16)
# 调整奖励机制
if reward == 0 and done != 1:
reward = -1
elif reward == 1:
reward = 100
else:
reward = -10
# 保存transition
agent.memory.push((state, action, reward,next_state, done))
# 更新下一个状态
state = next_state
# 更新智能体
agent.update()
# 累加奖励
ep_reward += reward
if done:
break
else:
# 重置环境，返回初始状态
state,info = env.reset(seed=1)
state = to_one_hot(state,16)
for _ in range(cfg['ep_max_steps']):
ep_step += 1
# 选择动作
action = agent.choose_action(state)
# 更新环境，返回transition
next_state, reward, done,truncated, _ = env.step(action)
next_state = to_one_hot(next_state,16)
# 重写奖励机制
if reward == 0 and done != 1:
reward = -1
elif reward == 1:
reward = 100
else:
reward = -10
# 保存transition
agent.memory.push((state, action, reward,next_state, done))
# 更新下一个状态
state = next_state
# 更新智能体
agent.update()
# 累加奖励
ep_reward += reward
if done:
break
# 智能体目标网络更新
if (i_ep + 1) % cfg['target_update'] == 0:
steps.append(ep_step)
rewards.append(ep_reward)
if (i_ep + 1) % 1 == 0:
print(f"回合：{i_ep+1}/{cfg['train_eps']}，奖励：{ep_reward:.2f}，Epislon：{agent.epsilon:.3f}")
# 展示当前神经网络对所有状态的q值
# agent.observation()
print("完成训练！")
env.close()
return {'rewards':rewards}

def test(cfg, env, agent):
print("开始测试！")
# 记录所有回合的奖励
rewards = []
steps = []
for i_ep in range(cfg['test_eps']):
# 记录一回合内的奖励
ep_reward = 0
ep_step = 0
# 重置环境，返回初始状态
state,info = env.reset(seed=1)
state = to_one_hot(state,16)
for _ in range(cfg['ep_max_steps']):
ep_step+=1
# 选择动作
action = agent.predict_action(state)
# 更新环境，返回transition
next_state, reward, done,truncated, _ = env.step(action)
next_state = to_one_hot(next_state, 16)
# 更新下一个状态
state = next_state
# 累加奖励
ep_reward += reward
if done:
break
steps.append(ep_step)
rewards.append(ep_reward)
print(f"回合：{i_ep+1}/{cfg['test_eps']}，奖励：{ep_reward:.2f}")
print("完成测试")
env.close()
return {'rewards':rewards}

# 万能的seed函数
def all_seed(env,seed = 1):
np.random.seed(seed)
random.seed(seed)
torch.manual_seed(seed) # config for CPU
torch.cuda.manual_seed(seed) # config for GPU
os.environ['PYTHONHASHSEED'] = str(seed) # config for python scripts
# config for cudnn
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.enabled = False
def env_agent_config(cfg):
# 创建环境
env = gym.make(cfg['env_name'])
env_show = gym.make(cfg['env_name'],render_mode = 'human')
if cfg['seed'] !=0:
all_seed(env,seed=cfg['seed'])
n_states = env.observation_space.n
n_actions = env.action_space.n
print(f"状态空间维度：{n_states}，动作空间维度：{n_actions}")
# 更新n_states和n_actions到cfg参数中
cfg.update({"n_states":n_states,"n_actions":n_actions})
# 创建模型
model = Net(n_states, n_actions, hidden_dim = cfg['hidden_dim'])
memory = ReplayBuffer(cfg['memory_capacity'])
agent = DQN(model,memory,cfg)
return env,env_show, agent

# 超参数
def get_args():
parser = argparse.ArgumentParser(description="hyperparameters")
parser.add_argument('--ep_max_steps',default = 100000,type=int,help="steps per episode, much larger value can simulate infinite steps")
parser.add_argument('--epsilon_decay',default=500,type=int,help="decay rate of epsilon, the higher value, the slower decay")
args = parser.parse_args([])
# 转换成字典类型
args = {**vars(args)}
# 打印超参数
print("超参数")
print(''.join(['=']*80))
tplt = "{:^20}\t{:^20}\t{:^20}"
print(tplt.format("Name", "Value", "Type"))
for k,v in args.items():
print(tplt.format(k,v,str(type(v))))
print(''.join(['=']*80))
return args
#用于平滑曲线，类似于Tensorboard中的smooth曲线
def smooth(data, weight=0.9):
last = data[0]
smoothed = []
for point in data:
# 计算平滑值
smoothed_val = last * weight + (1 - weight) * point
smoothed.append(smoothed_val)
last = smoothed_val
return smoothed

# 画图
def plot_rewards(rewards,cfg, tag='train'):
sns.set()
# 创建一个图形实例，方便同时多画几个图
plt.figure()
plt.title(f"{tag}ing curve on {cfg['device']} of {cfg['algo_name']} for {cfg['env_name']}")
plt.xlabel('epsiodes')
plt.plot(rewards, label='rewards')
plt.plot(smooth(rewards), label='smoothed')
plt.legend()
if not os.path.exists(f'{cfg["env_name"]}'):
# 如果文件夹不存在，则创建它
os.makedirs(f'{cfg["env_name"]}')
plt.savefig(f'{cfg["env_name"]}/DQN_{tag}.png')
plt.show()

# 获取参数
cfg = get_args()
# 训练
env, env_show, agent = env_agent_config(cfg)
res_dic = train(cfg, env, env_show, agent)
# 画出结果
plot_rewards(res_dic['rewards'], cfg, tag="train")
# 测试
res_dic = test(cfg, env, agent)
# 画出结果
plot_rewards(res_dic['rewards'], cfg, tag="test")``````

...全文
53 回复 打赏 收藏 转发到动态 举报

5,867

• 近7日
• 近30日
• 至今