このプログラムの目的
1~n日後の株価を予測する予測器を作成することです.
- 予測対象日ごとに1つ,合計n個の予測器を生成します.
- 手法はLightGBM
- パラメータチューニングはOptuna
work_share
├05_lightGBM_predict
│ ├Dockerfile
│ ├docker-compose.yml
│ └src
│   ├result (自動生成)
│   └experiment01.py (これを作成)
使用ライブラリ
import pandas as pd
import os
import json
import pickle
#import lightgbm as lgb
import optuna.integration.lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import warnings
warnings.simplefilter('ignore')
各銘柄のスコア
全銘柄が同じ系列に混ざっているので,銘柄毎に分離してスコアを算出します.
def calc_score_with_code(X, y, model, code):
    """Compute the RMSE of ``model`` separately for every stock code.

    All stocks are mixed into one sample axis, so the predictions are
    grouped by ``code`` and each group is scored on its own.

    Args:
        X: Features, passed unchanged to ``model.predict``.
        y: Target values, one per row of ``X``.
        model: Any object exposing ``predict(X)``.
        code: Stock code of each row, aligned with ``y``.

    Returns:
        dict mapping each distinct code (as yielded by ``groupby``) to
        ``{'rmse': float, 'n_point': int}``.
    """
    frame = pd.DataFrame({'y': y, 'pred': model.predict(X), 'code': code})

    scores = {}
    for code_name, group in frame.groupby('code'):
        # RMSE is computed by hand instead of calling
        # ``mean_squared_error(..., squared=False)``: the ``squared`` keyword
        # was deprecated in scikit-learn 1.4 and removed in 1.6, and this
        # file silences warnings, so the deprecation would go unnoticed
        # until the call starts raising TypeError.
        mse = ((group['y'] - group['pred']) ** 2).mean()
        scores[code_name] = {
            'rmse': float(mse) ** 0.5,
            'n_point': len(group),
        }
    return scores
学習
- 学習器はLightGBM
- 学習データの先頭75%を学習に,末尾25%を検証データに利用(シャッフルなし)
- 検証データでEarly Stoppingを実行
- 予測器および,全データでのスコアと各銘柄でのスコアを計算して返す
def learning(X, y, args):
    """Train a LightGBM regressor with early stopping and score it.

    The rows are split without shuffling: the first 75% are used for
    training and the last 25% for validation / early stopping.

    Args:
        X: Feature DataFrame. Must contain a ``code`` column; it is used
            only for the per-stock scores and is removed before training.
        y: Target values aligned with the rows of ``X``.
        args: Experiment settings. Not read by this function; kept so the
            call signature stays unchanged.

    Returns:
        dict with the keys ``'model'`` (trained booster), ``'scores'``
        (``train_rmse`` / ``valid_rmse``), ``'train_score_with_code'`` and
        ``'valid_score_with_code'`` (per-stock results of
        ``calc_score_with_code``).
    """
    train_X, valid_X, train_y, valid_y = train_test_split(
        X, y, test_size=0.25, random_state=0, shuffle=False
    )

    # The stock code identifies the series; it is not a model feature.
    train_code, valid_code = train_X['code'], valid_X['code']
    train_X = train_X.drop('code', axis=1)
    valid_X = valid_X.drop('code', axis=1)

    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'random_state': 0,
        'boosting_type': 'gbdt',
        'verbose': -1,
    }

    # ``lgb`` is ``optuna.integration.lightgbm`` (see the imports), not plain
    # ``lightgbm``.
    # NOTE(review): ``early_stopping_rounds`` and ``verbose_eval`` were
    # removed from ``lightgbm.train`` in LightGBM 4.0 in favour of callbacks.
    # The call is left as written because it matches the versions this
    # script was developed against - verify against the pinned
    # Optuna/LightGBM versions before upgrading.
    model = lgb.train(
        params,
        lgb.Dataset(train_X, label=train_y),
        valid_sets=[lgb.Dataset(valid_X, label=valid_y)],
        num_boost_round=100000,
        early_stopping_rounds=10,
        verbose_eval=0,
    )

    # sqrt(MSE) instead of ``squared=False``: that keyword was deprecated in
    # scikit-learn 1.4 and removed in 1.6, while this form works on every
    # version.
    scores = {
        'train_rmse': float(mean_squared_error(train_y, model.predict(train_X))) ** 0.5,
        'valid_rmse': float(mean_squared_error(valid_y, model.predict(valid_X))) ** 0.5,
    }

    return {
        'model': model,
        'scores': scores,
        'train_score_with_code': calc_score_with_code(train_X, train_y, model, train_code),
        'valid_score_with_code': calc_score_with_code(valid_X, valid_y, model, valid_code),
    }
実行と保存
- データを読み込んで学習させて,スコアを算出して保存するだけです.
- 教師データは正規化されたt日後の株価です.
def run(args):
    """Train, evaluate and persist the predictor for ``args['target_t']``.

    Reads the input / nearest-input / output datasets, drops rows whose
    target is missing, splits them by the ``is_train_data`` flag, trains via
    ``learning`` and writes four files into ``args['result_dir']``:

    * ``args_-t{t}.json``       - the arguments of this run
    * ``score_-t{t}.json``      - train / valid / test RMSE
    * ``result_-t{t}.pkl``      - the dict returned by ``learning`` plus
      ``test_rmse``
    * ``code_score_-t{t}.json`` - RMSE and sample count per stock code

    Args:
        args: dict with the keys ``result_dir``, ``input_dataset_path``,
            ``nearest_input_dataset_path``, ``output_dataset_path`` and
            ``target_t``.
    """
    result_dir = args['result_dir']
    suffix = f'-t{args["target_t"]}'
    target_col = f'scaled_close_{suffix}'

    # encoding is explicit because the JSON is written with
    # ensure_ascii=False; the platform default encoding may not be UTF-8.
    with open(f'{result_dir}/args_{suffix}.json', 'w', encoding='utf-8') as f:
        json.dump(args, f, indent=4, ensure_ascii=False)

    print('load dataset')
    X = pd.read_pickle(args['input_dataset_path'])
    X_add = pd.read_pickle(args['nearest_input_dataset_path'])
    # These columns already exist in X; drop them to avoid duplicates.
    X_add = X_add.drop(['date', 'is_train_data', 'code'], axis=1)
    X = pd.concat([X, X_add], axis=1)
    Y = pd.read_pickle(args['output_dataset_path'])

    # Keep only the rows for which the target value exists.
    has_target = Y[target_col].notna()
    X = X[has_target].reset_index(drop=True)
    Y = Y[has_target].reset_index(drop=True)
    y = Y[target_col]

    X = X.drop(['date'], axis=1)

    # ``== False`` is kept (rather than ``~``) so that the selection behaves
    # exactly as before regardless of the dtype of the flag column.
    train_X = X[X['is_train_data']].reset_index(drop=True)
    test_X = X[X['is_train_data'] == False].reset_index(drop=True)  # noqa: E712

    # train_X keeps its 'code' column: ``learning`` needs it for the
    # per-stock scores and removes it itself.
    test_code = test_X['code']
    test_X = test_X.drop('code', axis=1)
    train_X = train_X.drop('is_train_data', axis=1)
    test_X = test_X.drop('is_train_data', axis=1)

    train_y = y[Y['is_train_data']].reset_index(drop=True)
    test_y = y[Y['is_train_data'] == False].reset_index(drop=True)  # noqa: E712

    ret = learning(train_X, train_y, args)
    model = ret['model']

    # sqrt(MSE) instead of ``squared=False``: that keyword was deprecated in
    # scikit-learn 1.4 and removed in 1.6.
    ret['scores']['test_rmse'] = float(mean_squared_error(test_y, model.predict(test_X))) ** 0.5

    with open(f'{result_dir}/score_{suffix}.json', 'w', encoding='utf-8') as f:
        json.dump(ret['scores'], f, indent=4, ensure_ascii=False)

    # NOTE(review): the pickle is written before 'test_score_with_code' is
    # added below, so the saved result does not contain the per-stock test
    # scores. The order is kept as in the original - confirm it is intended.
    with open(f'{result_dir}/result_{suffix}.pkl', 'wb') as f:
        pickle.dump(ret, f)

    ret['test_score_with_code'] = calc_score_with_code(test_X, test_y, model, test_code)

    # Merge the three per-stock score tables into one entry per code.
    splits = (
        ('train', ret['train_score_with_code']),
        ('valid', ret['valid_score_with_code']),
        ('test', ret['test_score_with_code']),
    )
    code_scores = {}
    for code in sorted(X['code'].unique()):
        code = int(code)  # plain int so the key is JSON serialisable
        entry = {}
        for split_name, split_scores in splits:
            if code in split_scores:
                entry[f'{split_name} rmse'] = split_scores[code]['rmse']
                entry[f'{split_name} n point'] = split_scores[code]['n_point']
        code_scores[code] = entry

    with open(f'{result_dir}/code_score_{suffix}.json', 'w', encoding='utf-8') as f:
        json.dump(code_scores, f, indent=4, ensure_ascii=False)
パラメータと実行関数の呼び出し
LightGBMはMulti-outputに対応していないので,1~30日後の株価を予測する予測器をそれぞれ作成しました.
def experiment(max_target_t=30):
    """Train one predictor per forecast horizon, from 1 to ``max_target_t`` days.

    LightGBM has no multi-output regression, so every horizon gets its own
    model; ``run`` is called once per horizon and writes its results into
    ``result/experiment01/lgbm_optuna``.

    Args:
        max_target_t: Largest horizon (in days) to train for. Defaults to
            30, the value that used to be hard-coded.

    Raises:
        ValueError: If ``max_target_t`` is smaller than 1.
    """
    if max_target_t < 1:
        raise ValueError(f'max_target_t must be >= 1, got {max_target_t}')

    result_dir = 'result/experiment01/lgbm_optuna'
    os.makedirs(result_dir, exist_ok=True)

    for target_t in range(1, max_target_t + 1):
        run({
            'result_dir': result_dir,
            'input_dataset_path': '../dataset/learning_dataset_inputs.dfpkl',
            'nearest_input_dataset_path': '../dataset/learning_dataset_nearest_inputs.dfpkl',
            'output_dataset_path': '../dataset/learning_dataset_outputs.dfpkl',
            'target_t': target_t,
        })
# Script entry point: start the experiment only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    experiment()