Purpose of this program
This program uses a reinforcement-learning method called DQN (Deep Q-Network) to train an agent on the simulation environment built in the previous section.
work_share
└06_sampling_dqn_learning
  ├Dockerfile
  ├docker-compose.yml
  └src
    ├draw_graph
    | └draw_tools.py
    ├environment
    | └stock_env.py
    ├reinforcement_learning
    | └dqn.py (create this)
    └experiment01.py
Libraries used
import pandas as pd
import numpy as np
import json
import os
import environment.stock_env as rl_env
import draw_graph.draw_tools as draw_tools
import torch
from pfrl import explorers, q_functions
import pfrl
Learning process
Basically, all parameters are packed into an args dict and passed in (see the sketch after this list).
- The Q-network is 64 units × 4 hidden layers. This really ought to be tuned.
- The current results are also saved every 1,000 training episodes (via args['ret_save_interval']).
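For reference, here is a hypothetical sketch of such an args dict. The keys match what the code below reads, but every value (paths, cost function, hyperparameters) is a placeholder to be adjusted for your setup:

# Hypothetical example only: keys follow the code below, all values are placeholders.
args = {
    'dataset_dir1': './data/train',         # placeholder path
    'dataset_dir2': './data/eval',          # placeholder path
    'init_money': 1000000,
    'trade_cost_func': lambda amount: 0.0,  # placeholder: zero trading cost
    'sampling_t': 10,
    'reward_last_only': False,
    'n_code_select': 5,
    'sampling_alpha': 1.0,
    'gamma': 0.99,
    'epsilon': 0.1,
    'memory_capacity': 10 ** 6,
    'replay_start_size': 1000,
    'update_interval': 1,
    'target_update_interval': 100,
    'minibatch_size': 32,
    'n_episodes': 10000,
    'ret_save_interval': 1000,              # save every 1,000 episodes
    'result_dir': './result',               # placeholder path
}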
def learning(args):
    env = rl_env.stock_env(
        dataset_dir1=args['dataset_dir1'],
        dataset_dir2=args['dataset_dir2'],
        init_money=args['init_money'],
        trade_cost_func=args['trade_cost_func'],
        sampling_t=args['sampling_t'],
        reward_last_only=args['reward_last_only'],
        n_code_select=args['n_code_select'],
        sampling_alpha=args['sampling_alpha'],
    )
    print(f'input num : {env.input_num}')

    # Q-network: fully connected, 64 units x 4 hidden layers (should be tuned).
    q_func = q_functions.FCStateQFunctionWithDiscreteAction(
        env.input_num,
        env.action_num,
        n_hidden_channels=64,
        n_hidden_layers=4,
    )
    optimizer = torch.optim.AdamW(q_func.parameters())
    gamma = args['gamma']

    # Constant epsilon-greedy exploration over the environment's action space.
    explorer = explorers.ConstantEpsilonGreedy(
        epsilon=args['epsilon'], random_action_func=env.action_space.sample)
    replay_buffer = pfrl.replay_buffers.ReplayBuffer(capacity=args['memory_capacity'])
    gpu = 0  # GPU device id; set to -1 to run on the CPU
    agent = pfrl.agents.DoubleDQN(
        q_func,
        optimizer,
        replay_buffer,
        gamma,
        explorer,
        replay_start_size=args['replay_start_size'],
        update_interval=args['update_interval'],
        target_update_interval=args['target_update_interval'],
        gpu=gpu,
        minibatch_size=args['minibatch_size'],
    )

    n_episodes = args['n_episodes']
    result = {
        'training_log': [],
    }
    for i in range(1, n_episodes + 1):
        obs = env.reset()
        R = 0  # sum of rewards in this episode
        for t in range(len(env.data['X'])):  # t is the time step
            action = agent.act(obs)
            obs, reward, done, info = env.step(action)
            R += reward
            reset = done  # this environment only ends episodes via done
            agent.observe(obs, reward, done, reset)
            if done:
                break
        return_rate = env.total_assets / env.start_money
        statistics = agent.get_statistics()
        if i % 100 == 0:
            print(f'episode: {i}, R: {R}, last money : {env.money}, '
                  f'total assets : {env.total_assets} ({return_rate*100}%)')
            print('statistics:', statistics)
        result['training_log'].append({
            'i': i,
            'R': R,
            'return_rate': return_rate,
        })
        # get_statistics() returns (name, value) pairs such as average_q / average_loss.
        for key, value in statistics:
            result['training_log'][-1][key] = value
        if (i % args['ret_save_interval']) == 0:
            save_temp_result(i, args, agent, env, result)
    print('Finished.')
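experiment01.py (see the directory tree above) is expected to build args and call this function. Its contents are not shown in this section; a minimal, hypothetical entry point might look like the following, assuming this file is saved as reinforcement_learning/dqn.py:

# Hypothetical sketch of experiment01.py: build the parameter dict and run training.
import reinforcement_learning.dqn as dqn

if __name__ == '__main__':
    args = {
        # same keys as the sketch after the parameter list above
    }
    dqn.learning(args)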
Saving the results
Basically, this just hands the agent to the environment's evaluation-mode function.
- The training log is saved as plots.
- The agent is passed to the environment's evaluation-mode function to obtain the trajectory of total assets. The evaluation scores computed during training and used for this evaluation are also recorded.
def save_temp_result(i, args, agent, env, result):
    print('#### save result #####')
    save_dir = f'{args["result_dir"]}/temp_result/{i}'
    os.makedirs(save_dir, exist_ok=True)

    # Plot each logged metric over episodes.
    for key in ['R', 'return_rate', 'average_q', 'average_loss']:
        save_path = f'{save_dir}/training_log_{key}.jpg'
        draw_tools.plot_log(pd.DataFrame(result['training_log']), save_path, key)

    # Run the environment's evaluation mode with the current agent.
    total_assets_df = env.eval_dataset(agent, th=0.01)
    if len(total_assets_df) > 0:
        total_assets_df.to_pickle(f'{save_dir}/total_assets_df.dfpkl')
        save_path = f'{save_dir}/total_assets.jpg'
        draw_tools.plot_win_rate_df(total_assets_df, save_path, 'total_assets')

        # Scores of only the codes actually used in the evaluation.
        with open(f'{save_dir}/use_eval_scores.json', 'w') as f:
            save_obj = {}
            for col in total_assets_df.columns:
                if col != 'is_train_data':
                    save_obj[int(col)] = [float(e) for e in env.eval_code_info[col]]
            json.dump(save_obj, f, indent=4, ensure_ascii=False)

    # Scores of all evaluated codes.
    with open(f'{save_dir}/eval_scores.json', 'w') as f:
        save_obj = {}
        for key, value in env.eval_code_info.items():
            save_obj[int(key)] = [float(e) for e in value]
        json.dump(save_obj, f, indent=4, ensure_ascii=False)
    print('#########')
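For a given episode number i, the saved artifacts end up laid out as follows (the total-assets files and use_eval_scores.json are written only when eval_dataset returns a non-empty DataFrame):

{result_dir}/temp_result/{i}
 ├training_log_R.jpg
 ├training_log_return_rate.jpg
 ├training_log_average_q.jpg
 ├training_log_average_loss.jpg
 ├total_assets_df.dfpkl
 ├total_assets.jpg
 ├use_eval_scores.json
 └eval_scores.json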