Skip to content

Commit

Permalink
code update
Browse files Browse the repository at this point in the history
  • Loading branch information
STHSF committed May 27, 2019
1 parent e09d1a1 commit f36f987
Show file tree
Hide file tree
Showing 8 changed files with 237 additions and 37 deletions.
29 changes: 26 additions & 3 deletions src/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,13 @@
每天开盘前更新训练样本, 重新训练模型。(可优化)

# 模型训练细节
## 特征工程
数据预处理, 对样本进行标准化处理

1、首先对样本数据做常见的标准化处理

2、其次,在此基础上,对一条样本内的数据,针对价格数据做如下处理, 以样本内最后一个close为基准, 其他的所有价格进行比例换算。

## 1、label准备
两种表现方式, 收益增长和收益增长率。

Expand Down Expand Up @@ -46,8 +53,8 @@

### 错误率分析
分析模型判断错误的数据, 统计每条错误的样本中, 当前时段的最大亏损
- 本模型使用了30000条样本数据, 其中随机抽取24000条数据作为训练样本, 剩余6000条样本作为测试样本,通过优化之后模型的测试精度达到85%, 即6000条测试样本中有大约有900条数据方向判断错误,
所以我们将这900条数据提取出来, 分析当前时段,相对于初始收盘价,最大的亏损有

- 本模型使用了30000条样本数据, 其中随机抽取24000条数据作为训练样本, 剩余6000条样本作为测试样本,通过优化之后模型的测试精度达到85%, 即6000条测试样本中有大约有900条数据方向判断错误, 所以我们将这900条数据提取出来, 分析当前时段,相对于初始收盘价,最大的亏损有

- 另外, 整个样本数据中,

Expand All @@ -57,6 +64,23 @@
|close_max - Tclose|-156.48|144.64| 相对于前一时间点的close, 跳到最高点的值|
|close_max - close_min|0|267.4| 最高点跳到最低点的值|



# 分类模型进展

1、与东兴沟通分类模型的具体细节, 交代代码的具体实现情况, 然后他们那边给出了相关的建议, 包括训练数据的预处理, 训练和测试数据的预处理.

2、主要是根据股票时间序列的特性, 将价格数据转换为回报率的格式,
具体的修改方式为,对于当个样本内, 计算每个价格数据[open, high, low, close]相对于该样本内的最后一个close的回报率.

VO和VA暂时不变, 目前本机训练的过程中效果没有不做预处理的情况好, 下周在服务器上测试, 并且尝试调整一些参数.

3、第二部分就是把VO/VA按照前面回报率计算的方式进行预处理, 然后进行测试.

4、模型的下采样

5、训练集和测试集

### 实证分析
#### 1、判错结果中最大的涨跌幅
统计了判断对的最大涨跌幅
Expand All @@ -67,7 +91,6 @@




## 策略实施
1、根据账户总资金,计算可以开仓的总手数, 合理分配开仓手数。
2、风险控制, 如果持仓亏损达到一定程度则进行平仓或者锁仓操作。
Expand Down
19 changes: 19 additions & 0 deletions src/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,22 @@
#
# print(b)

import time
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

# Shift a YYYYMMDD integer date back by n "years".
# NOTE(review): a year is approximated as a 365-day block, so leap days make
# the result drift (here 20160229 -> 20140301); `relativedelta` is imported
# above and would handle calendar years exactly, but is not used here.
n = 2

base_date = datetime.strptime(str(20160229), "%Y%m%d")
shifted = base_date - n * timedelta(days=365)
date_time = int(shifted.strftime("%Y%m%d"))
print(date_time)
133 changes: 133 additions & 0 deletions src/data_prepare/data_process.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
#! /usr/bin/env python
# -*- coding: utf-8 -*-

"""
@version: ??
@author: li
@file: data_process.py
@time: 2019-05-24 08:59
"""
import pandas as pd
import numpy as np
import pickle
from sklearn import preprocessing


class DataProcess:
    """Prepare training samples and labels from 1-minute index bars.

    Each sample is ``series_len`` consecutive rows of the factor columns
    flattened into a single row vector and converted to returns relative to
    the sample's last bar; the label is derived from how TCLOSE moves over
    the remaining ``total_len - series_len`` bars of the window.
    """

    def __init__(self):
        # Column order matters: cal_return() relies on it (groups of 6 per
        # bar, with TCLOSE / VOTURNOVER / VATURNOVER at offsets 3 / 4 / 5).
        self.factor_list = ['TOPEN', 'HIGH', 'LOW', 'TCLOSE', 'VOTURNOVER', 'VATURNOVER']
        self.training_data_set = []  # one flattened feature row per sample
        self.label_set = []          # [label, t_min, t_max, max_min, ret_bps, date] per sample

    def cal_return(self, data):
        """Convert a flattened sample to returns, in place.

        ``data`` is a flat list laid out as consecutive bars of
        ``factor_list`` (6 values per bar).  Price fields are rebased on the
        last bar's TCLOSE, VOTURNOVER on the last bar's VOTURNOVER, and
        VATURNOVER on the last bar's VATURNOVER, each via ``base / x - 1``.
        Zero entries are left as 0 to avoid division by zero.  Returns the
        same (mutated) list.
        """
        # Capture the reference values BEFORE mutating.  The original code
        # read data[-1] / data[-2] / data[-3] inside the loop while mutating
        # the list in place, which only yielded correct results because those
        # cells happened to be processed last.
        base_close, base_vo, base_va = data[-3], data[-2], data[-1]
        for pos in range(len(data)):
            if data[pos] == 0:
                continue  # leave zeros untouched (avoids ZeroDivisionError)
            col = pos % 6
            if col == 5:
                # VATURNOVER column
                data[pos] = base_va / data[pos] - 1
            elif col == 4:
                # VOTURNOVER column
                data[pos] = base_vo / data[pos] - 1
            else:
                # price columns (TOPEN/HIGH/LOW/TCLOSE) rebased on last TCLOSE
                data[pos] = base_close / data[pos] - 1
        return data

    def training_data_prepare(self, source_df, series_len=30, total_len=50):
        """Build samples and labels from *source_df* with a sliding window.

        For every start index a window of ``total_len`` rows is taken; the
        first ``series_len`` rows become the (return-converted) feature
        vector, and the full window's TCLOSE drives the label.  The window's
        last TRADEDATE is appended to both the sample and the label row.
        Results accumulate in ``self.training_data_set`` / ``self.label_set``.

        NOTE(review): assumes ``source_df.index`` is the default 0-based
        RangeIndex — the ``.loc`` slice below is positional only under that
        assumption; verify upstream.
        """
        print('len_source_df: %s' % len(source_df))

        for index in source_df.index:
            print('index %s' % index)
            # Stop once a full total_len window no longer fits.
            if index > len(source_df) - total_len:
                break
            # .loc slicing is inclusive at both ends -> exactly total_len rows.
            window = source_df.loc[index: index + total_len - 1, :]
            print("len_tmp %s" % len(window))
            date = window['TRADEDATE'].iloc[-1]  # window's last date stamps the sample
            print('trade_date %s' % date)
            # First series_len rows of the factor columns, flattened to one row.
            training_data = window[self.factor_list].iloc[0: series_len, :].values.reshape(1, -1).tolist()
            print("training_data:\n %s" % training_data)
            # Convert to returns in place, then append the date as the final column.
            self.cal_return(training_data[0]).append(date)
            print("training_data:\n %s" % training_data)
            self.training_data_set += training_data
            # Label from the full window's TCLOSE (the original shadowed
            # `tmp` here; renamed for clarity).
            label, t_min, t_max, max_min, ret_bps = self.label_created(window['TCLOSE'])
            label_tmp = [label, t_min, t_max, max_min, ret_bps, date]
            print('a %s' % label_tmp)
            self.label_set.append(label_tmp)
            print('label: %s' % label_tmp[0])
            print('len_train_data %s' % len(training_data))
            print('------------------------>')

    def label_created(self, df):
        """Label a ``total_len``-bar TCLOSE series into 3 classes.

        Classes (based on the basis-point return from bar 30 to bar 50):
        0 = flat (|return| <= 0.23 bps), 1 = up, 2 = down.

        Returns ``[label, t_min, t_max, max_min, ret_bps]`` where the middle
        three are absolute excursions of bars 31..50 relative to bar 30's
        close.
        """
        tclose_30 = df.values[29]  # close of the 30th bar (end of the feature window)
        tclose_50 = df.values[49]  # close of the 50th (last) bar

        # NOTE(review): values[30:] starts at the 31st bar, so bar 30 itself
        # is NOT included, despite the original comment claiming otherwise.
        tclose_min = min(df.values[30:])
        tclose_max = max(df.values[30:])
        # worst drop relative to bar 30's close
        t_min = tclose_min - tclose_30
        # best jump relative to bar 30's close
        t_max = tclose_max - tclose_30
        # high-to-low span
        max_min = t_max - t_min
        # absolute 20-bar change (debug only; the original printed this value
        # under the misleading name "tmp")
        tt = tclose_50 - tclose_30

        ret_bps = (tclose_50 / tclose_30 - 1.0) * 10000
        print('t_min: %s, t_max: %s, max_min: %s, tt: %s' % (t_min, t_max, max_min, tt))

        if abs(ret_bps) <= 0.23:
            label_retrns = 0
        elif ret_bps > 0:
            label_retrns = 1
        else:
            label_retrns = 2
        return [label_retrns, t_min, t_max, max_min, ret_bps]

    def data_saver(self):
        """Pickle the accumulated samples and labels to the working directory."""
        training_data_df = pd.DataFrame(self.training_data_set)
        label_df = pd.DataFrame(self.label_set, columns=['label_retrns', 't_min', 't_max', 'max_min', 'tmp', 'date'])

        with open("train_df_50_pro_all.pkl", 'wb') as pk:
            pickle.dump(training_data_df, pk)

        with open("label_df_50_pro_all.pkl", 'wb') as pk:
            pickle.dump(label_df, pk)


if __name__ == '__main__':
    # Widen pandas display limits so debug prints of whole rows are not truncated.
    pd.set_option('display.max_rows', None, 'display.max_columns', None, "display.max_colwidth", 1000, 'display.width', 1000)

    # Source data: 1-minute bars pickle (hard-coded local path).
    path = "/Users/li/PycharmProjects/stockindex500/src/data/LCY_INDEX_01MS_SH_000905.pkl"
    index_500_01ms = pd.read_pickle(path=path)

    # Keep only the trade date plus the six factor columns.
    index_500_01ms = index_500_01ms[['TRADEDATE', 'TOPEN', 'HIGH', 'LOW', 'TCLOSE', 'VOTURNOVER', 'VATURNOVER']]
    print('shape_of_index_500_01ms: {}'.format(np.shape(index_500_01ms)))

    # Build samples/labels over the full data set, then persist them.
    processor = DataProcess()
    processor.training_data_prepare(index_500_01ms)
    processor.data_saver()
17 changes: 10 additions & 7 deletions src/data_prepare/data_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,10 @@ def label_created(df):
# aa = df[['TOPEN', 'HIGH', 'LOW', 'TCLOSE', 'VOTURNOVER', 'VATURNOVER']].values.reshape(-1)

# df = df['TCLOSE']
tclose_30 = df.values[29]
tclose_50 = df.values[49]
tclose_30 = df.values[29] # 第30分钟的数据值
tclose_50 = df.values[49] # 第50分钟的数据值

tclose_min = min(df.values[30:])
tclose_min = min(df.values[30:]) # 30分钟之后的数据,包括第三十分钟
tclose_max = max(df.values[30:])
# 相对于前一时间点的close, 跳到最低点的值
t_min = tclose_min - tclose_30
Expand Down Expand Up @@ -92,8 +92,8 @@ def label_created(df):
print("len_tmp %s" % len(tmp))
date = tmp['TRADEDATE'].iloc[-1]
print('date %s' % date)
training_data = tmp[['TOPEN', 'HIGH', 'LOW', 'TCLOSE', 'VOTURNOVER', 'VATURNOVER']].iloc[0:29, :].values.reshape(1, -1).tolist()
print("training_data %s" % training_data)
training_data = tmp[['TOPEN', 'HIGH', 'LOW', 'TCLOSE', 'VOTURNOVER', 'VATURNOVER']].iloc[0:30, :].values.reshape(1, -1).tolist()
print("training_data %s" % (training_data))
training_data[0].append(date)
training_data_set += training_data

Expand Down Expand Up @@ -123,12 +123,15 @@ def label_created(df):
# print(np.array(training_data_set))

# 数据保存
with open("train_df.pkl", 'wb') as pk:
with open("train_df_50.pkl", 'wb') as pk:
# pickle.dump(training_data_set, pk)
# pickle.dump(training_data_df, pk)
pickle.dump(training_data_df, pk)

with open("label_df.pkl", 'wb') as pk:

with open("label_df_50.pkl", 'wb') as pk:
# pickle.dump(label_set, pk)
# pickle.dump(label_df, pk)
pickle.dump(label_df, pk)


Expand Down
36 changes: 20 additions & 16 deletions src/data_prepare/read_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,23 +22,24 @@
with open('label.pkl', 'rb') as pk:
label_dataset_list = pickle.load(pk)

with open('train_df.pkl', 'rb') as pk:
with open('train_df_50.pkl', 'rb') as pk:
train_dataset = pickle.load(pk)

with open('label_df.pkl', 'rb') as pk:
with open('label_df_50.pkl', 'rb') as pk:
label_dataset = pickle.load(pk)

# print(train_dataset_list[:5])
label = []
for i in label_dataset_list:
if i == -1:
label.append(2)
else:
label.append(i)
print(label[:300])

# print(train_dataset.head(30))
print(label_dataset.head(300)['label_retrns'].values.tolist())
# label = []
# for i in label_dataset_list:
# if i == -1:
# label.append(2)
# else:
# label.append(i)
# print(label[:300])

print(train_dataset.head(30))
print(label_dataset.head(30))
print(label_dataset.head(30)['label_retrns'].values.tolist())

print("训练集类型: {}".format(type(train_dataset)))
print("训练集结构: {}".format(np.shape(train_dataset)))
Expand All @@ -51,18 +52,21 @@
a3 = 0
a4 = 0

for i in label_dataset_list:
label_list = label_dataset['label_retrns'].values.tolist()
print(len(label_list))

for i in label_list:
if i == 0:
a1 += 1
elif i == 1:
a2 += 1
elif i == -1:
elif i == 2:
a3 += 1
else:
a4 += 1

print("'0'的个数{}, '1'的个数{}, '-1'的个数{}".format(a1, a2, a3, a4))
print("'0'的比例{}, '1'的比例{}, '-1'的比例{}".format(a1/len(label_dataset), a2/len(label_dataset), a3/len(label_dataset), a4/len(label_dataset)))
print("'0'的个数{}, '1'的个数{}, '2'的个数{}, 其他:{}".format(a1, a2, a3, a4))
print("'0'的比例{}, '1'的比例{}, '2'的比例{}".format(a1/len(label_dataset), a2/len(label_dataset), a3/len(label_dataset), a4/len(label_dataset)))

# 训练集提取

Expand Down
1 change: 0 additions & 1 deletion src/grid_search/xgb_grid_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@
model = xgb.train(**other_params)
optimized_GBM = GridSearchCV(estimator=model, param_grid=cv_params, scoring='r2', cv=5, verbose=1, n_jobs=4)


optimized_GBM.fit(x_train, y_train)
evalute_result = optimized_GBM.grid_scores_
print('每轮迭代运行结果:{0}'.format(evalute_result))
Expand Down
33 changes: 26 additions & 7 deletions src/stacking/m1_xgb.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import pickle
import argparse
import numpy as np
from math import *
import xgboost as xgb
from src.stacking.configuration import conf
import pandas as pd
Expand Down Expand Up @@ -154,6 +155,21 @@ def run_cv(x_train, x_test, y_train, y_test):
xgb_predict(best_model, x_test, y_test, save_result_path=result_saved_path)


def train_test_sp(train_dataset_df, label_dataset_df, test_size=0.2, random=None):
    """Split features and labels into train and test sets.

    Parameters
    ----------
    train_dataset_df, label_dataset_df : sliceable (DataFrame or sequence)
        Features and labels; assumed to be the same length.
    test_size : float
        Fraction of the data reserved for the test set.
    random : any
        If truthy, delegate to sklearn's shuffled ``train_test_split``;
        otherwise split chronologically (head = train, tail = test), which
        avoids look-ahead leakage for time-series data.

    Returns
    -------
    x_train, x_test, y_train, y_test
    """
    if random:
        # Random (shuffled) split.
        # BUG FIX: the original hard-coded test_size=0.02 here, silently
        # ignoring the test_size argument.
        x_train, x_test, y_train, y_test = train_test_split(
            train_dataset_df, label_dataset_df,
            test_size=test_size, random_state=10000, shuffle=True)
    else:
        # Chronological split: first (1 - test_size) share goes to training.
        cut = ceil(len(train_dataset_df) * (1 - test_size))
        x_train, x_test = train_dataset_df[:cut], train_dataset_df[cut:]
        y_train, y_test = label_dataset_df[:cut], label_dataset_df[cut:]

    return x_train, x_test, y_train, y_test


now = time.strftime('%Y-%m-%d %H:%M')

if __name__ == '__main__':
Expand Down Expand Up @@ -185,25 +201,28 @@ def run_cv(x_train, x_test, y_train, y_test):

# 输入数据为dataframe格式

with open('../data_prepare/train_df.pkl', 'rb') as pk:
with open('../data_prepare/train_df_50_pro.pkl', 'rb') as pk:
train_dataset_df = pickle.load(pk)

with open('../data_prepare/label_df.pkl', 'rb') as pk:
with open('../data_prepare/label_df_50_pro.pkl', 'rb') as pk:
label_dataset_df = pickle.load(pk)

x_train, x_test, y_train, y_test = train_test_split(train_dataset_df[:30000], label_dataset_df[:30000], test_size=0.2, random_state=10000, shuffle=True)
# x_train, x_test, y_train, y_test = train_test_split(train_dataset_df[:30000], label_dataset_df[:30000], test_size=0.2, random_state=10000, shuffle=True)
x_train, x_test, y_train, y_test = train_test_sp(train_dataset_df[:60000], label_dataset_df[:60000])
print('x_train_pre: %s' % x_train.head())
print('y_train_pre: %s' % y_train.head())
print('x_test_pre: %s' % x_test.head())
print('y_test_pre: %s' % y_test.head())

# 数据统计用
x_test.to_csv('../result/x_test_{}.csv'.format(now), index=0)
y_test.to_csv('../result/y_test_{}.csv'.format(now), index=0)
# x_test.to_csv('../result/x_test_{}.csv'.format(now), index=0)
# y_test.to_csv('../result/y_test_{}.csv'.format(now), index=0)

# 样本预处理
x_train = x_train.drop([174], axis=1)
x_test = x_test.drop([174], axis=1)
# 剔除training中的date列
x_train = x_train.drop([180], axis=1)
x_test = x_test.drop([180], axis=1)
# 剔除lebel中的其他所有列, 只保留label列
y_train = y_train.drop(['t_min', 't_max', 'max_min', 'tmp', 'date'], axis=1)
y_test = y_test.drop(['t_min', 't_max', 'max_min', 'tmp', 'date'], axis=1)

Expand Down
Loading

0 comments on commit f36f987

Please sign in to comment.