[機械学習・進化計算による株式取引最適化] No.05-02 予測値データセットの作成

このプログラムの目的

前節で作成した予測器を使って，予測値データセットを作成します．
予測できない銘柄は外します．

work_share
├05_lightGBM_predict
  ├Dockerfile
  ├docker-compose.yml
  └src
    ├dataset(自動生成)
    ├result(自動生成)
    ├experiment01.py
    └make_predict_result_dataset.py (これを作成)

使用ライブラリ

import numpy as np
import pandas as pd
import tqdm

import json
import pickle
import os

予測できない銘柄の検出

各銘柄について，train / valid / test のいずれかのrmseが閾値thを超える場合は，その銘柄を使用しない銘柄として返します．

def detect_non_fitting_codes(scores, th):
    """Return the codes whose RMSE exceeds ``th`` on any data split.

    Args:
        scores: Mapping of code -> dict holding the entries
            ``'train rmse'``, ``'valid rmse'`` and ``'test rmse'``.
        th: RMSE threshold. A value strictly greater than ``th`` on at
            least one split marks the code as unusable.

    Returns:
        List of rejected codes, in the iteration order of ``scores``.

    Raises:
        KeyError: If an entry lacks one of the three RMSE keys.
    """
    rmse_keys = ('train rmse', 'valid rmse', 'test rmse')
    rejected = []
    for code in scores:
        entry = scores[code]
        # Read all three splits up front so a missing key always raises,
        # regardless of how the earlier comparisons turn out.
        errors = [entry[key] for key in rmse_keys]
        if any(err > th for err in errors):
            rejected.append(code)
    return rejected

t日後が予測できない銘柄を検出

ret = no_use_summary.any()は全期間(1~t日後)の予測すべてがth以下の誤差で予測できる銘柄のみを利用するという意味です．

ret = no_use_summary.all()にすると，全期間(1~t日後)の予測いずれかがth以下の誤差で予測できれば利用することになります．

利用できる銘柄が少なすぎると感じた場合は`ret = no_use_summary.all()`にしてください．

def calculate_not_use_codes_at_all_time(args):
    """Collect the codes to exclude, judged over every prediction horizon.

    For each horizon ``t`` in ``1..args['output_time_length']`` the score
    file ``{result_dir}/code_score_-t{t}.json`` is read and the configured
    detector flags the unusable codes. A code is excluded when it is
    flagged at *any* horizon (``no_use_summary.any()``); switching that
    call to ``.all()`` would exclude a code only when every horizon fails.

    Args:
        args: Settings dict. Reads ``'result_dir'``,
            ``'output_time_length'``, ``'not_use_detect_type'``
            (``'non_fitting'`` or ``'over_fitting'``) and, depending on the
            detector, ``'not_use_th'`` or ``'not_use_alpha'``.

    Returns:
        List of excluded codes converted to ``int``.

    Raises:
        ValueError: If ``'not_use_detect_type'`` is not a supported value.
    """
    result_dir = args['result_dir']

    # The t=1 score file defines which codes become columns of the summary.
    with open(f'{result_dir}/code_score_-t1.json', 'r') as f:
        scores = json.load(f)
    code_list = sorted(scores)

    t_list = list(range(1, args['output_time_length']+1))
    # Build the boolean table directly instead of filling an all-NaN object
    # frame and casting it, which relies on deprecated downcasting.
    no_use_summary = pd.DataFrame(False, index=t_list, columns=code_list)
    for t in t_list:
        path = f'{result_dir}/code_score_-t{t}.json'
        with open(path, 'r') as f:
            scores = json.load(f)

        detect_type = args['not_use_detect_type']
        if detect_type == 'non_fitting':
            not_use_codes = detect_non_fitting_codes(scores, th=args['not_use_th'])
        elif detect_type == 'over_fitting':
            not_use_codes = detect_over_fitting_codes(scores, alpha=args['not_use_alpha'])
        else:
            # Fail with a clear message rather than an unbound-variable error.
            raise ValueError(
                f"unknown not_use_detect_type: {detect_type!r} "
                "(expected 'non_fitting' or 'over_fitting')"
            )
        for code in not_use_codes:
            no_use_summary.at[t, code] = True

    print(no_use_summary)
    ret = no_use_summary.any()
    flagged = ret[ret]
    print(flagged)
    # JSON object keys are strings; return the codes as integers.
    return [int(code) for code in flagged.index.to_list()]

データセットの作成

データセットを読み込んで予測器に渡し，新しいデータセットを作成します．
新しいデータセットは，日付を行とし，銘柄毎に別々の列を持つものとします．

def _pivot_by_code(frame, value_column, date_index, excluded_codes):
    """Reshape long-format rows into one column per code, one row per date.

    Args:
        frame: DataFrame indexed by date with the columns ``'code'``,
            ``'is_train_data'`` and ``value_column``.
        value_column: Name of the column whose values fill the code columns.
        date_index: Sorted list of dates used as the rows of the result.
        excluded_codes: Set of codes that must not appear in the result.

    Returns:
        DataFrame with a ``'date'`` column, one column per kept code and a
        trailing ``'is_train_data'`` column.
    """
    per_code = {
        code_name: grouped[value_column]
        for code_name, grouped in frame.groupby('code')
        if code_name not in excluded_codes
    }
    # Building the frame in one go aligns every series on the date index
    # and avoids the fragmentation caused by inserting columns one by one.
    wide = pd.DataFrame(per_code, index=date_index)

    # Take the train flag per date from all rows instead of from whichever
    # code happened to be iterated last, so dates missing for that one code
    # do not end up as NaN.
    # NOTE(review): assumes is_train_data depends only on the date - confirm.
    wide['is_train_data'] = frame.groupby(level=0)['is_train_data'].first()

    return wide.reset_index().rename(columns={'index':'date'})


def make_dataset(args):
    """Create per-code prediction datasets and the matching true values.

    For every horizon ``t`` in ``1..args['output_time_length']`` the model
    stored in ``{result_dir}/result_-t{t}.pkl`` predicts on the combined
    input features, and the predictions are written to
    ``{new_dataset_dir}/pred_codes_-t{t}.dfpkl``. The column
    ``'original_close_-t0'`` of the output dataset is written in the same
    layout to ``{new_dataset_dir}/original_value.dfpkl``. Codes reported by
    ``calculate_not_use_codes_at_all_time`` are left out of both.

    Args:
        args: Settings dict. Reads ``'new_dataset_dir'``,
            ``'output_time_length'``, ``'input_dataset_path'``,
            ``'nearest_input_dataset_path'``, ``'output_dataset_path'`` and
            ``'result_dir'``, plus the keys used by
            ``calculate_not_use_codes_at_all_time``.
    """
    os.makedirs(args['new_dataset_dir'], exist_ok=True)

    # A set keeps the per-code membership checks O(1).
    not_use_codes = set(calculate_not_use_codes_at_all_time(args))
    t_list = list(range(1, args['output_time_length']+1))

    print('load dataset')
    X = pd.read_pickle(args['input_dataset_path'])
    X_add = pd.read_pickle(args['nearest_input_dataset_path'])
    # Drop the bookkeeping columns that X already carries before joining.
    X_add = X_add.drop(['date', 'is_train_data', 'code'], axis=1)
    X = pd.concat([X, X_add], axis=1)

    # Keep the bookkeeping columns aside; the model only sees features.
    date_x = X['date']
    code_x = X['code']
    is_train_x = X['is_train_data']
    X = X.drop(['date', 'code', 'is_train_data'], axis=1)

    date_uniques = list(date_x.unique())
    date_uniques.sort()

    for t in tqdm.tqdm(t_list):
        path = f'{args["result_dir"]}/result_-t{t}.pkl'
        # NOTE: pickle executes arbitrary code on load; only point
        # result_dir at files produced by your own training run.
        with open(path, 'rb') as f:
            predict_result = pickle.load(f)
        model = predict_result['model']
        pred = model.predict(X)
        temp = pd.DataFrame({'date':date_x, 'pred':pred, 'code':code_x, 'is_train_data':is_train_x})
        temp = temp.set_index('date')
        pred_codes = _pivot_by_code(temp, 'pred', date_uniques, not_use_codes)
        pred_codes.to_pickle(f'{args["new_dataset_dir"]}/pred_codes_-t{t}.dfpkl')

    Y = pd.read_pickle(args['output_dataset_path'])
    Y = Y.set_index('date')
    Y_codes = _pivot_by_code(Y, 'original_close_-t0', date_uniques, not_use_codes)
    Y_codes.to_pickle(f'{args["new_dataset_dir"]}/original_value.dfpkl')

パラメータと実行

三種類のデータセットを作成します．

rmse誤差1.0以下の銘柄の予測値データセット
rmse誤差0.5以下の銘柄の予測値データセット
rmse誤差0.3以下の銘柄の予測値データセット

誤差が小さいほど利用できる銘柄が少ないです．

if __name__ == '__main__':
    # Settings shared by every run; only the RMSE threshold and the
    # destination directory differ between the generated datasets.
    base_args = {
        'input_dataset_path':'../dataset/learning_dataset_inputs.dfpkl',
        'nearest_input_dataset_path':'../dataset/learning_dataset_nearest_inputs.dfpkl',
        'output_dataset_path':'../dataset/learning_dataset_outputs.dfpkl',
        'result_dir':'result/experiment01/lgbm_optuna',
        'output_time_length':30,
        'not_use_detect_type':'non_fitting',
    }
    # One dataset per threshold; a smaller threshold keeps fewer codes.
    for th in (1.0, 0.5, 0.3):
        args = dict(
            base_args,
            not_use_th=th,
            new_dataset_dir=f'./dataset/non_fitting_{th}',
        )
        make_dataset(args)

back No.05-01

chapters

next No.06