In [1]:
Copied!
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import yfinance as yf
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')
try:
import xgboost as xgb
print('XGBoost 加载成功 ✅')
except ImportError:
print('⚠️ 请安装 xgboost: pip install xgboost')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import yfinance as yf
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')
try:
import xgboost as xgb
print('XGBoost 加载成功 ✅')
except ImportError:
print('⚠️ 请安装 xgboost: pip install xgboost')
XGBoost 加载成功 ✅
In [2]:
Copied!
import matplotlib.pyplot as plt
# 1. 设置系统自带的中文字体(这里使用黑体 SimHei)
plt.rcParams['font.sans-serif'] = ['SimHei'] # 如果你想用微软雅黑,可以改成 ['Microsoft YaHei']
# 2. 解决更换字体后,负号(-)显示为方块的问题
plt.rcParams['axes.unicode_minus'] = False
import matplotlib.pyplot as plt
# 1. 设置系统自带的中文字体(这里使用黑体 SimHei)
plt.rcParams['font.sans-serif'] = ['SimHei'] # 如果你想用微软雅黑,可以改成 ['Microsoft YaHei']
# 2. 解决更换字体后,负号(-)显示为方块的问题
plt.rcParams['axes.unicode_minus'] = False
1. 准备特征矩阵¶
In [3]:
Copied!
# 下载数据
raw = yf.download('SPY', start='2015-01-01', end='2024-01-01', progress=False)
df = pd.DataFrame({
'Open': raw['Open'].squeeze(),
'High': raw['High'].squeeze(),
'Low': raw['Low'].squeeze(),
'Close': raw['Close'].squeeze(),
'Volume': raw['Volume'].squeeze(),
})
# 构建特征(全部使用历史数据, 无未来泄漏)
for w in [5, 10, 20, 60]:
df[f'ret_{w}d'] = df['Close'].pct_change(w)
df[f'vol_{w}d'] = df['Close'].pct_change().rolling(w).std()
df[f'ma_ratio_{w}'] = df['Close'] / df['Close'].rolling(w).mean() - 1
# RSI
delta = df['Close'].diff()
gain = delta.clip(lower=0).ewm(com=13).mean()
loss = (-delta.clip(upper=0)).ewm(com=13).mean()
df['rsi_14'] = 100 - 100 / (1 + gain / loss)
# Volume ratio
df['vol_ratio'] = df['Volume'] / df['Volume'].rolling(20).mean()
# 日内特征
df['daily_range'] = (df['High'] - df['Low']) / df['Close']
# 目标变量:明天是否上涨(1=涨,0=跌)
df['target'] = (df['Close'].shift(-1) > df['Close']).astype(int)
df = df.dropna()
feature_cols = [c for c in df.columns if c not in ['Open','High','Low','Close','Volume','target']]
print(f'特征数量: {len(feature_cols)}')
print(f'样本数量: {len(df)}')
print(f'正样本比例(明日涨): {df["target"].mean():.2%}')
# 下载数据
raw = yf.download('SPY', start='2015-01-01', end='2024-01-01', progress=False)
df = pd.DataFrame({
'Open': raw['Open'].squeeze(),
'High': raw['High'].squeeze(),
'Low': raw['Low'].squeeze(),
'Close': raw['Close'].squeeze(),
'Volume': raw['Volume'].squeeze(),
})
# 构建特征(全部使用历史数据, 无未来泄漏)
for w in [5, 10, 20, 60]:
df[f'ret_{w}d'] = df['Close'].pct_change(w)
df[f'vol_{w}d'] = df['Close'].pct_change().rolling(w).std()
df[f'ma_ratio_{w}'] = df['Close'] / df['Close'].rolling(w).mean() - 1
# RSI
delta = df['Close'].diff()
gain = delta.clip(lower=0).ewm(com=13).mean()
loss = (-delta.clip(upper=0)).ewm(com=13).mean()
df['rsi_14'] = 100 - 100 / (1 + gain / loss)
# Volume ratio
df['vol_ratio'] = df['Volume'] / df['Volume'].rolling(20).mean()
# 日内特征
df['daily_range'] = (df['High'] - df['Low']) / df['Close']
# 目标变量:明天是否上涨(1=涨,0=跌)
df['target'] = (df['Close'].shift(-1) > df['Close']).astype(int)
df = df.dropna()
feature_cols = [c for c in df.columns if c not in ['Open','High','Low','Close','Volume','target']]
print(f'特征数量: {len(feature_cols)}')
print(f'样本数量: {len(df)}')
print(f'正样本比例(明日涨): {df["target"].mean():.2%}')
特征数量: 15 样本数量: 2204 正样本比例(明日涨): 54.31%
2. 时序交叉验证¶
⚠️ 金融时序数据不能随机打乱!必须用 TimeSeriesSplit 保证训练集在测试集之前。
In [4]:
Copied!
X = df[feature_cols].values
y = df['target'].values
# 简单时序分割: 80% train, 20% test
split = int(len(X) * 0.8)
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]
# 标准化
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print(f'训练集: {len(X_train)} 天 ({df.index[0].date()} ~ {df.index[split-1].date()})')
print(f'测试集: {len(X_test)} 天 ({df.index[split].date()} ~ {df.index[-1].date()})')
X = df[feature_cols].values
y = df['target'].values
# 简单时序分割: 80% train, 20% test
split = int(len(X) * 0.8)
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]
# 标准化
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print(f'训练集: {len(X_train)} 天 ({df.index[0].date()} ~ {df.index[split-1].date()})')
print(f'测试集: {len(X_test)} 天 ({df.index[split].date()} ~ {df.index[-1].date()})')
训练集: 1763 天 (2015-03-31 ~ 2022-03-29) 测试集: 441 天 (2022-03-30 ~ 2023-12-29)
3. 训练 XGBoost¶
In [5]:
Copied!
model = xgb.XGBClassifier(
n_estimators=200,
max_depth=4,
learning_rate=0.05,
subsample=0.8,
colsample_bytree=0.8,
use_label_encoder=False,
eval_metric='logloss',
random_state=42
)
model.fit(X_train_scaled, y_train,
eval_set=[(X_test_scaled, y_test)],
verbose=False)
y_pred = model.predict(X_test_scaled)
y_prob = model.predict_proba(X_test_scaled)[:, 1]
print('\n========== 测试集分类报告 ==========')
print(classification_report(y_test, y_pred, target_names=['跌', '涨']))
print(f'Accuracy: {accuracy_score(y_test, y_pred):.2%}')
print('⚠️ 注意:金融预测的准确率通常只比50%略高,这是正常的!')
model = xgb.XGBClassifier(
n_estimators=200,
max_depth=4,
learning_rate=0.05,
subsample=0.8,
colsample_bytree=0.8,
use_label_encoder=False,
eval_metric='logloss',
random_state=42
)
model.fit(X_train_scaled, y_train,
eval_set=[(X_test_scaled, y_test)],
verbose=False)
y_pred = model.predict(X_test_scaled)
y_prob = model.predict_proba(X_test_scaled)[:, 1]
print('\n========== 测试集分类报告 ==========')
print(classification_report(y_test, y_pred, target_names=['跌', '涨']))
print(f'Accuracy: {accuracy_score(y_test, y_pred):.2%}')
print('⚠️ 注意:金融预测的准确率通常只比50%略高,这是正常的!')
========== 测试集分类报告 ==========
precision recall f1-score support
跌 0.49 0.21 0.29 219
涨 0.50 0.79 0.61 222
accuracy 0.50 441
macro avg 0.50 0.50 0.45 441
weighted avg 0.50 0.50 0.46 441
Accuracy: 50.11%
⚠️ 注意:金融预测的准确率通常只比50%略高,这是正常的!
In [6]:
Copied!
# 特征重要性
importance = pd.Series(model.feature_importances_, index=feature_cols)
importance = importance.sort_values(ascending=True)
fig, ax = plt.subplots(figsize=(8, 6))
importance.tail(15).plot(kind='barh', ax=ax, color='steelblue')
ax.set_title('XGBoost 特征重要性 (Top 15)', fontsize=13)
ax.set_xlabel('Importance Score')
plt.tight_layout()
plt.show()
# 特征重要性
importance = pd.Series(model.feature_importances_, index=feature_cols)
importance = importance.sort_values(ascending=True)
fig, ax = plt.subplots(figsize=(8, 6))
importance.tail(15).plot(kind='barh', ax=ax, color='steelblue')
ax.set_title('XGBoost 特征重要性 (Top 15)', fontsize=13)
ax.set_xlabel('Importance Score')
plt.tight_layout()
plt.show()
4. 将 ML 信号接入回测¶
In [7]:
Copied!
# 测试集期间的回测
test_df = df.iloc[split:].copy()
test_df['ml_signal'] = y_pred # 1=预测明天涨(持有),0=跌(空仓)
test_df['daily_ret'] = test_df['Close'].pct_change()
test_df['strategy_ret'] = test_df['ml_signal'].shift(1) * test_df['daily_ret']
# 累积收益对比
cum_strategy = (1 + test_df['strategy_ret'].dropna()).cumprod()
cum_bh = (1 + test_df['daily_ret'].dropna()).cumprod()
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(cum_strategy.index, cum_strategy.values, label='ML 策略', linewidth=1.8, color='steelblue')
ax.plot(cum_bh.index, cum_bh.values, label='买入持有', linewidth=1.8,
color='orange', linestyle='--')
ax.set_title('ML 策略 vs 买入持有(样本外)', fontsize=13)
ax.set_ylabel('累积净值')
ax.legend()
ax.grid(alpha=0.3)
plt.tight_layout()
plt.show()
ml_total = cum_strategy.iloc[-1] - 1
bh_total = cum_bh.iloc[-1] - 1
print(f'ML 策略总收益: {ml_total:.2%}')
print(f'买入持有总收益: {bh_total:.2%}')
# 测试集期间的回测
test_df = df.iloc[split:].copy()
test_df['ml_signal'] = y_pred # 1=预测明天涨(持有),0=跌(空仓)
test_df['daily_ret'] = test_df['Close'].pct_change()
test_df['strategy_ret'] = test_df['ml_signal'].shift(1) * test_df['daily_ret']
# 累积收益对比
cum_strategy = (1 + test_df['strategy_ret'].dropna()).cumprod()
cum_bh = (1 + test_df['daily_ret'].dropna()).cumprod()
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(cum_strategy.index, cum_strategy.values, label='ML 策略', linewidth=1.8, color='steelblue')
ax.plot(cum_bh.index, cum_bh.values, label='买入持有', linewidth=1.8,
color='orange', linestyle='--')
ax.set_title('ML 策略 vs 买入持有(样本外)', fontsize=13)
ax.set_ylabel('累积净值')
ax.legend()
ax.grid(alpha=0.3)
plt.tight_layout()
plt.show()
ml_total = cum_strategy.iloc[-1] - 1
bh_total = cum_bh.iloc[-1] - 1
print(f'ML 策略总收益: {ml_total:.2%}')
print(f'买入持有总收益: {bh_total:.2%}')
ML 策略总收益: 4.09% 买入持有总收益: 6.57%
🎯 练习¶
- 改变预测目标:不预测方向,而是预测「明天收益率是否超过 0.5%」。
- 尝试用 LightGBM 替换 XGBoost,训练速度有何变化?
- 增加
n_estimators=500,观察是否过拟合(对比训练集和测试集准确率)。
下一节 → 03_deep_learning.ipynb
In [ ]:
Copied!