[機械学習・進化計算による株式取引最適化] No.05-01 予測器の作成

このプログラムの目的

1~n日後の株価を予測する予測器を作成することです.

  • 予測する日数ごとに1個,合計n個の予測器を生成します.
  • 手法はLightGBM
  • パラメータチューニングはOptuna
work_share
├05_lightGBM_predict
  ├Dockerfile
  ├docker-compose.yml
  └src
    ├result (自動生成)
    └experiment01.py (これを作成)

使用ライブラリ

import pandas as pd
import os
import json
import pickle
#import lightgbm as lgb
import optuna.integration.lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import warnings
warnings.simplefilter('ignore')

各銘柄のスコア

全銘柄が同じ系列に混ざっているので,銘柄毎に分離してスコアを算出します.

def calc_score_with_code(X, y, model, code):
    """Compute the RMSE of ``model`` separately for every stock code.

    Args:
        X: Feature table passed unchanged to ``model.predict``.
        y: Target values; expected to have one entry per row of ``X``.
        model: Any object exposing ``predict(X)``.
        code: Stock code of each row; predictions are grouped by this value.

    Returns:
        A dict mapping each distinct code to
        ``{'rmse': float, 'n_point': int}``, where ``n_point`` is the number
        of rows belonging to that code.
    """
    pred = model.predict(X)
    temp = pd.DataFrame({'y': y, 'pred': pred, 'code': code})

    scores = {}
    for code_name, grouped in temp.groupby('code'):
        # RMSE is computed directly rather than through
        # mean_squared_error(..., squared=False): the ``squared`` keyword was
        # deprecated in scikit-learn 1.4 and removed in 1.6, so that call
        # raises TypeError on current releases.
        squared_error = (grouped['y'] - grouped['pred']) ** 2
        # skipna=False: a missing value yields NaN instead of being silently
        # ignored, so broken inputs stay visible in the score.
        rmse = squared_error.mean(skipna=False) ** 0.5
        scores[code_name] = {'rmse': float(rmse), 'n_point': len(grouped)}
    return scores

学習

  • 学習器はLightGBM
  • 学習データをシャッフルせずに分割し,前半75%を学習に,後半25%を検証に利用
  • 検証データでEarly Stoppingを実行
  • 予測器および,全データでのスコアと各銘柄でのスコアを計算して返す
def learning(X, y, args):
    """Train a LightGBM regressor with early stopping and score it.

    The data is split without shuffling: the first 75% of the rows are used
    for training and the last 25% for validation / early stopping.

    Args:
        X: Feature table. Must contain a ``'code'`` column, which is removed
            before training and used only for the per-code scores.
        y: Target values, one per row of ``X``.
        args: Experiment settings. Currently unused in this function; kept so
            the call signature stays stable.

    Returns:
        A dict with the keys ``'model'`` (the trained booster), ``'scores'``
        (``train_rmse`` / ``valid_rmse``), ``'train_score_with_code'`` and
        ``'valid_score_with_code'`` (per-code RMSE tables).
    """
    train_X, valid_X, train_y, valid_y = train_test_split(
        X, y, test_size=0.25, random_state=0, shuffle=False)

    # The stock code only labels the rows; it is not fed to the model.
    train_code = train_X['code']
    train_X = train_X.drop('code', axis=1)
    valid_code = valid_X['code']
    valid_X = valid_X.drop('code', axis=1)

    train_data = lgb.Dataset(train_X, label=train_y)
    valid_data = lgb.Dataset(valid_X, label=valid_y)

    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'random_state': 0,
        'boosting_type': 'gbdt',
        'verbose': -1,
    }
    verbose_eval = 0
    # ``lgb`` is optuna.integration.lightgbm, so this call goes through
    # Optuna's LightGBM integration rather than plain lightgbm.train.
    # NOTE(review): the early_stopping_rounds / verbose_eval keywords were
    # removed from lightgbm.train in LightGBM 4.0; if the environment is
    # upgraded, pass the equivalent callbacks instead - confirm against the
    # installed versions.
    model = lgb.train(params, train_data,
                valid_sets=[valid_data],
                num_boost_round=100000,
                early_stopping_rounds=10,
                verbose_eval=verbose_eval
    )

    def rmse(y_true, features):
        # sqrt(MSE) instead of mean_squared_error(..., squared=False): the
        # ``squared`` keyword was deprecated in scikit-learn 1.4 and removed
        # in 1.6, while this form works on every release.
        return float(mean_squared_error(y_true, model.predict(features)) ** 0.5)

    scores = {
        'train_rmse': rmse(train_y, train_X),
        'valid_rmse': rmse(valid_y, valid_X),
    }

    ret = {
        'model': model,
        'scores': scores,
        'train_score_with_code': calc_score_with_code(train_X, train_y, model, train_code),
        'valid_score_with_code': calc_score_with_code(valid_X, valid_y, model, valid_code),
    }
    return ret

実行と保存

  • データを読み込んで学習させて,スコアを算出して保存するだけです.
  • 教師データは正規化されたt日後の株価です.
def run(args):
    """Train, evaluate and persist the predictor for one forecast horizon.

    Args:
        args: Experiment settings with the keys ``'result_dir'``,
            ``'input_dataset_path'``, ``'nearest_input_dataset_path'``,
            ``'output_dataset_path'`` and ``'target_t'`` (forecast horizon).

    Side effects:
        Writes four files into ``args['result_dir']``, each suffixed with
        ``-t<target_t>``: ``args_*.json`` (the settings), ``score_*.json``
        (overall train/valid/test RMSE), ``result_*.pkl`` (model and all
        scores) and ``code_score_*.json`` (per-code RMSE table).
    """
    result_dir = args['result_dir']
    suffix = f'-t{args["target_t"]}'
    target_column = f'scaled_close_-t{args["target_t"]}'

    # Make the function usable on its own, not only via experiment().
    os.makedirs(result_dir, exist_ok=True)

    # ensure_ascii=False may emit non-ASCII characters, so the encoding is
    # fixed to UTF-8 instead of depending on the platform default.
    with open(f'{result_dir}/args_{suffix}.json', 'w', encoding='utf-8') as f:
        json.dump(args, f, indent=4, ensure_ascii=False)

    print('load dataset')
    X = pd.read_pickle(args['input_dataset_path'])
    X_add = pd.read_pickle(args['nearest_input_dataset_path'])
    # These columns already exist in X; drop them to avoid duplicates.
    X_add = X_add.drop(['date', 'is_train_data', 'code'], axis=1)
    X = pd.concat([X, X_add], axis=1)

    Y = pd.read_pickle(args['output_dataset_path'])

    # Keep only the rows whose target value is available.
    has_target = Y[target_column].notna()
    X = X[has_target].reset_index(drop=True)
    Y = Y[has_target].reset_index(drop=True)
    y = Y[target_column]

    X = X.drop(['date'], axis=1)
    # train_X keeps its 'code' column because learning() needs it.
    train_X = X[X['is_train_data']].reset_index(drop=True)

    test_X = X[X['is_train_data'] == False].reset_index(drop=True)  # noqa: E712
    test_code = test_X['code']
    test_X = test_X.drop('code', axis=1)

    train_X = train_X.drop('is_train_data', axis=1)
    test_X = test_X.drop('is_train_data', axis=1)

    # NOTE(review): assumes Y['is_train_data'] marks the same rows as
    # X['is_train_data'] - confirm against the dataset builder.
    train_y = y[Y['is_train_data']].reset_index(drop=True)
    test_y = y[Y['is_train_data'] == False].reset_index(drop=True)  # noqa: E712

    ret = learning(train_X, train_y, args)

    # sqrt(MSE) instead of mean_squared_error(..., squared=False): the
    # ``squared`` keyword was deprecated in scikit-learn 1.4 and removed in
    # 1.6.
    test_rmse = mean_squared_error(test_y, ret['model'].predict(test_X)) ** 0.5
    ret['scores']['test_rmse'] = float(test_rmse)

    # Computed before pickling so the saved result also contains the
    # per-code test scores (previously they were added after the dump and
    # therefore missing from the pickle).
    ret['test_score_with_code'] = calc_score_with_code(test_X, test_y, ret['model'], test_code)

    with open(f'{result_dir}/score_{suffix}.json', 'w', encoding='utf-8') as f:
        json.dump(ret['scores'], f, indent=4, ensure_ascii=False)

    with open(f'{result_dir}/result_{suffix}.pkl', 'wb') as f:
        pickle.dump(ret, f)

    # Merge the three per-code tables into one table keyed by stock code.
    per_split_scores = (
        ('train', ret['train_score_with_code']),
        ('valid', ret['valid_score_with_code']),
        ('test', ret['test_score_with_code']),
    )
    code_scores = {}
    for code in sorted(X['code'].unique()):
        code = int(code)
        entry = code_scores[code] = {}
        for split, split_scores in per_split_scores:
            # A code may be absent from a split; then its columns are omitted.
            score = split_scores.get(code)
            if score is not None:
                entry[f'{split} rmse'] = score['rmse']
                entry[f'{split} n point'] = score['n_point']

    with open(f'{result_dir}/code_score_{suffix}.json', 'w', encoding='utf-8') as f:
        json.dump(code_scores, f, indent=4, ensure_ascii=False)

パラメータと実行関数の呼び出し

LightGBMはMulti-outputに対応していないので,1~30日後の株価を予測する予測器をそれぞれ作成しました.

def experiment():
    """Build one predictor for each forecast horizon from 1 to 30."""
    out_dir = 'result/experiment01/lgbm_optuna'
    os.makedirs(out_dir, exist_ok=True)

    # Settings shared by every horizon; only 'target_t' differs per run.
    common_args = {
        'result_dir': out_dir,
        'input_dataset_path': '../dataset/learning_dataset_inputs.dfpkl',
        'nearest_input_dataset_path': '../dataset/learning_dataset_nearest_inputs.dfpkl',
        'output_dataset_path': '../dataset/learning_dataset_outputs.dfpkl',
    }
    for horizon in range(1, 31):
        # A fresh dict per run, with 'target_t' appended as the last key.
        run({**common_args, 'target_t': horizon})

# Run the whole experiment only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    experiment()
タイトルとURLをコピーしました