このプログラムの目的
この章で作成してきたデータをまとめて,学習に利用しやすい形式に直すことです.
銘柄ごとのデータを縦方向に結合します.
また,時系列を扱うにあたり,n時間前までの過去の特徴量も学習データとして追加します.
work_share
├04_get_stock_price_ver2
├Dockerfile
├docker-compose.yml
└src
├dataset
├original_data_2010-01-01_2023-03-01_1d
├time_cluster_result
├get_stock_price.py
├make_dataset.py (これを作成)
├make_original_data.py
├make_time_cluster_dataset.py
├calculate_nearest_codes.py
├make_technical_data.py
└stocks_code.xls
使用ライブラリ
import pandas as pd
import numpy as np
import tqdm
import glob
import os
import json
from sklearn.preprocessing import StandardScaler
データの読み込みと正規化
使用するデータを読み込みます.
- class_distのみ時系列データではなく,他のデータと形状が違うので別に正規化しました.
- 他の生成したデータはほとんど形状が同じなので,正規化して追加します.
- データ量削減のためにnp.float32に変換しました.データ量やメモリが潤沢な場合はnp.float64にしてください.
def make_dataset(args):
    """Load every intermediate dataset, normalize it, and emit the learning tables.

    Time-series frames (OHLC prices, class means, technical indicators) are
    z-scored with ``StandardScaler`` statistics fitted on the training rows
    only, then stored as float32 to reduce memory usage.  ``class_dist`` is
    not a time series, so it is min-max scaled over all its values instead.

    Parameters
    ----------
    args : dict
        File paths and hyper-parameters; see the ``__main__`` block for keys.

    Side effects: writes the three learning tables via
    ``save_nearest_input_dataset`` / ``save_input_dataset`` /
    ``save_output_dataset``.
    """
    data = {
        'original_close':pd.read_pickle(args['original_close_dataset_path']),
        'original_open':pd.read_pickle(args['original_open_dataset_path']),
        'original_high':pd.read_pickle(args['original_high_dataset_path']),
        'original_low':pd.read_pickle(args['original_low_dataset_path']),
        'class_means':pd.read_pickle(args['class_mean_dataset_path']),
        'class_dist':pd.read_pickle(args['class_dist_dataset_path'])
    }
    # 'eval_type' marks each row's split; rows equal to 'train' define the
    # statistics used for scaling (avoids leaking evaluation data).
    eval_type = data['original_close']['eval_type'].reset_index(drop=True)
    train_mask = eval_type == 'train'

    def standardize(df):
        # Fit on training rows only, transform the whole frame, shrink to
        # float32 (use float64 here if memory is plentiful).
        df = df.reset_index(drop=True)
        scaler = StandardScaler().fit(df[train_mask])
        scaled = pd.DataFrame(data=scaler.transform(df), columns=df.columns)
        return scaled.astype(np.float32)

    # class_dist has a different shape from the time-series frames, so it is
    # min-max scaled globally instead of standardized per column.
    _max = np.max(data['class_dist'].values)
    _min = np.min(data['class_dist'].values)
    scaler_data = {
        'class_dist':(data['class_dist'] - _min)/(_max - _min)
    }
    for key in ['class_means', 'original_close', 'original_open', 'original_high', 'original_low']:
        df = data[key]
        if key != 'class_means':
            # Price frames still carry the excluded codes and the split flag.
            df = df.drop(args['drop_codes']+['eval_type'], axis=1)
        scaler_data[key] = standardize(df)

    for path in glob.glob(f'{args["technical_data_dir"]}/*.dfpkl'):
        # os.path.basename/splitext instead of split('/') so the dataset name
        # is extracted correctly regardless of the OS path separator.
        name = os.path.splitext(os.path.basename(path))[0]
        scaler_data[name] = standardize(pd.read_pickle(path))

    save_nearest_input_dataset(data, scaler_data, args)
    save_input_dataset(data, scaler_data, args)
    save_output_dataset(data, scaler_data, args)
学習の入力データの作成
先ほど読み込んだデータを0~t時間ずらしたものを入力データとします.
また,すべての銘柄に対して処理を行い,結合(pd.concat)すると,メモリ上で再展開されてメモリーエラーで落ちてしまうので,100銘柄ごとに結合していきます.
def save_input_dataset(data, scaler_data, args):
    """Build and pickle the model-input table.

    For every stock code (minus ``args['drop_codes']``) the scaled features
    are lagged 0..``args['input_time_length']`` steps into ``<name>_t<i>``
    columns; class-mean series are lagged the same way, and the (static)
    class distances are broadcast as ``<class> dist`` columns.  Stocks are
    concatenated in batches of 100 so the intermediate frames never have to
    be re-expanded all at once in memory.

    Parameters
    ----------
    data : dict of DataFrame
        Must contain 'original_close' with a 'Date' index and 'eval_type'
        column.
    scaler_data : dict of DataFrame
        Normalized frames from ``make_dataset``.
    args : dict
        Needs 'drop_codes', 'input_time_length', 'output_time_length',
        'input_dataset_out_path'.

    Side effect: writes the result to ``args['input_dataset_out_path']``.
    """
    try:
        from tqdm import tqdm  # progress bar is optional
    except ImportError:
        def tqdm(it):
            return it
    original_df = data['original_close'].reset_index()
    date = pd.to_datetime(original_df['Date'])
    original_df = original_df.drop(['Date', 'eval_type'], axis=1)
    out_df = pd.DataFrame()
    df_list = []
    stock_list = [code for code in original_df.columns if code not in args['drop_codes']]
    for stock_code in tqdm(stock_list):
        df = pd.DataFrame({
            'date':date,
            'is_train_data':data['original_close']['eval_type'].values == 'train',
            # 'na' flags missing prices; those rows are dropped below.
            'na':original_df[stock_code].isna().values
        })
        for key, scaler_df in scaler_data.items():
            if key != 'class_means' and key != 'class_dist':
                feat = key.replace('original_', '')
                # Lagged copies: <feat>_t0 is today, <feat>_t1 yesterday, ...
                Xt = pd.DataFrame({
                    f'{feat}_t{i}':scaler_df[stock_code].shift(i).values for i in range(args['input_time_length']+1)
                })
                df = pd.concat([df, Xt], axis=1)
        for col in scaler_data['class_means'].columns:
            temp_Xt = pd.DataFrame({
                f'{col}_t{i}':scaler_data['class_means'][col].shift(i).values for i in range(args['input_time_length']+1)
            })
            df = pd.concat([df, temp_Xt], axis=1)
        # Class distances are per-stock scalars, broadcast over all rows.
        for class_name, class_dist in scaler_data['class_dist'][stock_code].to_dict().items():
            df[f'{class_name} dist'] = class_dist
        df['code'] = stock_code
        # Trim the rows whose lags/leads fall outside the data range.
        df = df.iloc[args['input_time_length']+1:-args['output_time_length'], :].reset_index(drop=True)
        df = df[~df['na']].reset_index(drop=True)
        df = df.drop('na', axis=1)
        df_list.append(df)
        if len(df_list) >= 100:  # flush every 100 stocks, as documented
            out_df = pd.concat([out_df, pd.concat(df_list)])
            df_list = []
    if df_list:  # guard: df_list may be empty when the count is a multiple of 100
        out_df = pd.concat([out_df, pd.concat(df_list)])
    out_df = out_df[out_df['date'].notna()]
    out_df = out_df.sort_values('date').reset_index(drop=True)
    out_df['code'] = out_df['code'].astype(np.uint16)
    out_df.to_pickle(args['input_dataset_out_path'])
    print(out_df)
学習の教師データの作成
教師データを作成します.
- 0~t時間先までのデータを格納します.
- 正規化済みのデータと,オリジナルのデータを格納します.
def save_output_dataset(data, scaler_data, args):
    """Build and pickle the teacher (target) table.

    For each stock code the close price is led 0..``args['output_time_length']``
    steps into ``original_close_-t<t>`` (raw, float32) and ``scaled_close_-t<t>``
    (normalized) columns, so row i holds the prices t steps *ahead* of date i.

    Parameters
    ----------
    data : dict of DataFrame
        Must contain 'original_close' with a 'Date' index and 'eval_type'
        column.
    scaler_data : dict of DataFrame
        Normalized frames; only 'original_close' is used here.
    args : dict
        Needs 'drop_codes', 'input_time_length', 'output_time_length',
        'output_dataset_out_path'.

    Side effect: writes the result to ``args['output_dataset_out_path']``.
    """
    try:
        from tqdm import tqdm  # progress bar is optional
    except ImportError:
        def tqdm(it):
            return it
    original_df = data['original_close'].reset_index()
    date = pd.to_datetime(original_df['Date'])
    original_df = original_df.drop(['Date', 'eval_type'], axis=1).astype(np.float32)
    scaler_df = scaler_data['original_close']
    df_list = []
    stock_list = [code for code in original_df.columns if code not in args['drop_codes']]
    for stock_code in tqdm(stock_list):
        df = pd.DataFrame({
            'date':date,
            'is_train_data':data['original_close']['eval_type'].values == 'train',
            # 'na' flags missing prices; those rows are dropped below.
            'na':original_df[stock_code].isna().values
        })
        for t in range(args['output_time_length']+1):
            # shift(-t) pulls the value from t steps in the future.
            df[f'original_close_-t{t}'] = original_df[stock_code].shift(-t).reset_index(drop=True)
            df[f'scaled_close_-t{t}'] = scaler_df[stock_code].shift(-t).reset_index(drop=True)
        df['code'] = stock_code
        # Trim rows whose lags/leads fall outside the data range (same
        # trimming as the input table, so rows stay aligned).
        df = df.iloc[args['input_time_length']+1:-args['output_time_length'], :].reset_index(drop=True)
        df = df[~df['na']].reset_index(drop=True)
        df = df.drop('na', axis=1)
        df_list.append(df)
    out_df = pd.concat(df_list)
    out_df = out_df[out_df['date'].notna()]
    out_df = out_df.sort_values('date').reset_index(drop=True)
    out_df['code'] = out_df['code'].astype(np.uint16)
    out_df.to_pickle(args['output_dataset_out_path'])
    print(out_df)
学習の追加入力データの作成
追加データを作成します.追加データは,No.03-04「類似データの計算」で作成したt時間後の類似銘柄を参照して生成されます.
def save_nearest_input_dataset(data, scaler_data, args):
    """Build and pickle the auxiliary input table of nearest-stock features.

    For each stock code and each horizon t in 1..``args['output_time_length']``,
    look up the most similar stock at horizon t from the precomputed JSON at
    ``args['nearest_info_path']`` and copy that stock's scaled close series
    into column ``near_code_-t<t>`` (all-None when no similar stock exists).
    NOTE(review): the copied series is not time-shifted by t — the horizon
    only selects *which* stock is copied; confirm this is intentional.

    Parameters
    ----------
    data : dict of DataFrame
        Must contain 'original_close' with a 'Date' index and 'eval_type'
        column.
    scaler_data : dict of DataFrame
        Normalized frames; only 'original_close' is used here.
    args : dict
        Needs 'drop_codes', 'input_time_length', 'output_time_length',
        'nearest_info_path', 'nearest_input_dataset_out_path'.

    Side effect: writes the result to ``args['nearest_input_dataset_out_path']``.
    """
    try:
        from tqdm import tqdm  # progress bar is optional
    except ImportError:
        def tqdm(it):
            return it
    original_df = data['original_close'].reset_index()
    date = pd.to_datetime(original_df['Date'])
    original_df = original_df.drop(['Date', 'eval_type'], axis=1)
    scaler_df = scaler_data['original_close']
    with open(args['nearest_info_path'], 'r') as f:
        # JSON keys are strings, hence str(t) below.
        nearest_info = json.load(f)
    df_list = []
    stock_list = [code for code in original_df.columns if code not in args['drop_codes']]
    for stock_code in tqdm(stock_list):
        df = pd.DataFrame({
            'date':date,
            'is_train_data':data['original_close']['eval_type'].values == 'train',
            # 'na' flags missing prices; those rows are dropped below.
            'na':original_df[stock_code].isna().values
        })
        for t in range(1, args['output_time_length']+1):
            near_code = nearest_info[stock_code][str(t)]['nearest_code']
            if near_code is None:  # identity check, not ==
                df[f'near_code_-t{t}'] = None
            else:
                df[f'near_code_-t{t}'] = scaler_df[near_code].values
        df['code'] = stock_code
        # Same trimming as the input table, so rows stay aligned.
        df = df.iloc[args['input_time_length']+1:-args['output_time_length'], :].reset_index(drop=True)
        df = df[~df['na']].reset_index(drop=True)
        df = df.drop('na', axis=1)
        df_list.append(df)
    out_df = pd.concat(df_list)
    out_df = out_df[out_df['date'].notna()]
    out_df = out_df.sort_values('date').reset_index(drop=True)
    out_df['code'] = out_df['code'].astype(np.uint16)
    out_df.to_pickle(args['nearest_input_dataset_out_path'])
    print(out_df)
実行
実行するパラメータ類です.input_time_length で何日前までのデータを入力として生成するかを指定し,output_time_length で何日後までのデータを教師データとして生成するかを指定します.
if __name__ == '__main__':
    # Directory that holds every intermediate dataset produced so far.
    dataset_dir = 'dataset'

    # Where the three learning tables will be written.
    output_paths = {
        'input_dataset_out_path':f'./{dataset_dir}/learning_dataset_inputs.dfpkl',
        'output_dataset_out_path':f'./{dataset_dir}/learning_dataset_outputs.dfpkl',
        'nearest_input_dataset_out_path':f'./{dataset_dir}/learning_dataset_nearest_inputs.dfpkl',
    }

    # Previously generated source datasets.
    input_paths = {
        'original_close_dataset_path': f'./{dataset_dir}/original_dataset_Close.dfpkl',
        'original_open_dataset_path': f'./{dataset_dir}/original_dataset_Open.dfpkl',
        'original_high_dataset_path': f'./{dataset_dir}/original_dataset_High.dfpkl',
        'original_low_dataset_path': f'./{dataset_dir}/original_dataset_Low.dfpkl',
        'class_mean_dataset_path': f'./{dataset_dir}/clusterling_dataset_mean.dfpkl',
        'class_dist_dataset_path': f'./{dataset_dir}/clusterling_dataset_dist.dfpkl',
        'technical_data_dir':f'./{dataset_dir}/technical_data',
        'nearest_info_path':f'./{dataset_dir}/nearest_calculate_result.json',
    }

    # Hyper-parameters: how many past steps to feed in, how many future
    # steps to predict, and which stock codes to exclude.
    params = {
        'input_time_length':7,
        'output_time_length':30,
        'drop_codes':['2747', '3858', '6775', '9468'],
    }

    make_dataset({**output_paths, **input_paths, **params})