6.2 机器学习预测市场涨跌¶

学习目标¶

将因子特征用于监督学习分类任务
使用 XGBoost 预测次日涨跌方向
正确划分训练/测试集，避免数据泄露
将 ML 信号接入简单回测框架

In [1]:

Copied!





import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import yfinance as yf
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

try:
    import xgboost as xgb
    print('XGBoost 加载成功 ✅')
except ImportError:
    print('⚠️  请安装 xgboost: pip install xgboost')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import yfinance as yf
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

try:
    import xgboost as xgb
    print('XGBoost 加载成功 ✅')
except ImportError:
    print('⚠️  请安装 xgboost: pip install xgboost')

XGBoost 加载成功 ✅

In [2]:

Copied!

import matplotlib.pyplot as plt

# 1. 设置系统自带的中文字体（这里使用黑体 SimHei）
plt.rcParams['font.sans-serif'] = ['SimHei']  # 如果你想用微软雅黑，可以改成 ['Microsoft YaHei']

# 2. 解决更换字体后，负号（-）显示为方块的问题
plt.rcParams['axes.unicode_minus'] = False
import matplotlib.pyplot as plt

# 1. 设置系统自带的中文字体（这里使用黑体 SimHei）
plt.rcParams['font.sans-serif'] = ['SimHei']  # 如果你想用微软雅黑，可以改成 ['Microsoft YaHei']

# 2. 解决更换字体后，负号（-）显示为方块的问题
plt.rcParams['axes.unicode_minus'] = False

1. 准备特征矩阵¶

In [3]:

Copied!





# 下载数据
raw = yf.download('SPY', start='2015-01-01', end='2024-01-01', progress=False)
df = pd.DataFrame({
    'Open': raw['Open'].squeeze(),
    'High': raw['High'].squeeze(),
    'Low': raw['Low'].squeeze(),
    'Close': raw['Close'].squeeze(),
    'Volume': raw['Volume'].squeeze(),
})

# 构建特征（全部使用历史数据, 无未来泄漏）
for w in [5, 10, 20, 60]:
    df[f'ret_{w}d'] = df['Close'].pct_change(w)
    df[f'vol_{w}d'] = df['Close'].pct_change().rolling(w).std()
    df[f'ma_ratio_{w}'] = df['Close'] / df['Close'].rolling(w).mean() - 1

# RSI
delta = df['Close'].diff()
gain = delta.clip(lower=0).ewm(com=13).mean()
loss = (-delta.clip(upper=0)).ewm(com=13).mean()
df['rsi_14'] = 100 - 100 / (1 + gain / loss)

# Volume ratio
df['vol_ratio'] = df['Volume'] / df['Volume'].rolling(20).mean()

# 日内特征
df['daily_range'] = (df['High'] - df['Low']) / df['Close']

# 目标变量：明天是否上涨（1=涨，0=跌）
df['target'] = (df['Close'].shift(-1) > df['Close']).astype(int)

df = df.dropna()
feature_cols = [c for c in df.columns if c not in ['Open','High','Low','Close','Volume','target']]
print(f'特征数量: {len(feature_cols)}')
print(f'样本数量: {len(df)}')
print(f'正样本比例（明日涨）: {df["target"].mean():.2%}')
# 下载数据
raw = yf.download('SPY', start='2015-01-01', end='2024-01-01', progress=False)
df = pd.DataFrame({
    'Open': raw['Open'].squeeze(),
    'High': raw['High'].squeeze(),
    'Low': raw['Low'].squeeze(),
    'Close': raw['Close'].squeeze(),
    'Volume': raw['Volume'].squeeze(),
})

# 构建特征（全部使用历史数据, 无未来泄漏）
for w in [5, 10, 20, 60]:
    df[f'ret_{w}d'] = df['Close'].pct_change(w)
    df[f'vol_{w}d'] = df['Close'].pct_change().rolling(w).std()
    df[f'ma_ratio_{w}'] = df['Close'] / df['Close'].rolling(w).mean() - 1

# RSI
delta = df['Close'].diff()
gain = delta.clip(lower=0).ewm(com=13).mean()
loss = (-delta.clip(upper=0)).ewm(com=13).mean()
df['rsi_14'] = 100 - 100 / (1 + gain / loss)

# Volume ratio
df['vol_ratio'] = df['Volume'] / df['Volume'].rolling(20).mean()

# 日内特征
df['daily_range'] = (df['High'] - df['Low']) / df['Close']

# 目标变量：明天是否上涨（1=涨，0=跌）
df['target'] = (df['Close'].shift(-1) > df['Close']).astype(int)

df = df.dropna()
feature_cols = [c for c in df.columns if c not in ['Open','High','Low','Close','Volume','target']]
print(f'特征数量: {len(feature_cols)}')
print(f'样本数量: {len(df)}')
print(f'正样本比例（明日涨）: {df["target"].mean():.2%}')

特征数量: 15
样本数量: 2204
正样本比例（明日涨）: 54.31%

2. 时序交叉验证¶

⚠️ 金融时序数据不能随机打乱！必须用 TimeSeriesSplit 保证训练集在测试集之前。

In [4]:

Copied!





X = df[feature_cols].values
y = df['target'].values

# 简单时序分割: 80% train, 20% test
split = int(len(X) * 0.8)
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

# 标准化
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f'训练集: {len(X_train)} 天 ({df.index[0].date()} ~ {df.index[split-1].date()})')
print(f'测试集: {len(X_test)} 天  ({df.index[split].date()} ~ {df.index[-1].date()})')
X = df[feature_cols].values
y = df['target'].values

# 简单时序分割: 80% train, 20% test
split = int(len(X) * 0.8)
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

# 标准化
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f'训练集: {len(X_train)} 天 ({df.index[0].date()} ~ {df.index[split-1].date()})')
print(f'测试集: {len(X_test)} 天  ({df.index[split].date()} ~ {df.index[-1].date()})')

训练集: 1763 天 (2015-03-31 ~ 2022-03-29)
测试集: 441 天  (2022-03-30 ~ 2023-12-29)

3. 训练 XGBoost¶

In [5]:

Copied!





model = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)
model.fit(X_train_scaled, y_train,
          eval_set=[(X_test_scaled, y_test)],
          verbose=False)

y_pred = model.predict(X_test_scaled)
y_prob = model.predict_proba(X_test_scaled)[:, 1]

print('\n========== 测试集分类报告 ==========')
print(classification_report(y_test, y_pred, target_names=['跌', '涨']))
print(f'Accuracy: {accuracy_score(y_test, y_pred):.2%}')
print('⚠️  注意：金融预测的准确率通常只比50%略高，这是正常的！')
model = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)
model.fit(X_train_scaled, y_train,
          eval_set=[(X_test_scaled, y_test)],
          verbose=False)

y_pred = model.predict(X_test_scaled)
y_prob = model.predict_proba(X_test_scaled)[:, 1]

print('\n========== 测试集分类报告 ==========')
print(classification_report(y_test, y_pred, target_names=['跌', '涨']))
print(f'Accuracy: {accuracy_score(y_test, y_pred):.2%}')
print('⚠️  注意：金融预测的准确率通常只比50%略高，这是正常的！')

========== 测试集分类报告 ==========
              precision    recall  f1-score   support

           跌       0.49      0.21      0.29       219
           涨       0.50      0.79      0.61       222

    accuracy                           0.50       441
   macro avg       0.50      0.50      0.45       441
weighted avg       0.50      0.50      0.46       441

Accuracy: 50.11%
⚠️  注意：金融预测的准确率通常只比50%略高，这是正常的！

In [6]:

Copied!





# 特征重要性
importance = pd.Series(model.feature_importances_, index=feature_cols)
importance = importance.sort_values(ascending=True)

fig, ax = plt.subplots(figsize=(8, 6))
importance.tail(15).plot(kind='barh', ax=ax, color='steelblue')
ax.set_title('XGBoost 特征重要性 (Top 15)', fontsize=13)
ax.set_xlabel('Importance Score')
plt.tight_layout()
plt.show()
# 特征重要性
importance = pd.Series(model.feature_importances_, index=feature_cols)
importance = importance.sort_values(ascending=True)

fig, ax = plt.subplots(figsize=(8, 6))
importance.tail(15).plot(kind='barh', ax=ax, color='steelblue')
ax.set_title('XGBoost 特征重要性 (Top 15)', fontsize=13)
ax.set_xlabel('Importance Score')
plt.tight_layout()
plt.show()

No description has been provided for this image

4. 将 ML 信号接入回测¶

In [7]:

Copied!





# 测试集期间的回测
test_df = df.iloc[split:].copy()
test_df['ml_signal'] = y_pred  # 1=预测明天涨（持有），0=跌（空仓）
test_df['daily_ret'] = test_df['Close'].pct_change()
test_df['strategy_ret'] = test_df['ml_signal'].shift(1) * test_df['daily_ret']

# 累积收益对比
cum_strategy = (1 + test_df['strategy_ret'].dropna()).cumprod()
cum_bh = (1 + test_df['daily_ret'].dropna()).cumprod()

fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(cum_strategy.index, cum_strategy.values, label='ML 策略', linewidth=1.8, color='steelblue')
ax.plot(cum_bh.index, cum_bh.values, label='买入持有', linewidth=1.8,
         color='orange', linestyle='--')
ax.set_title('ML 策略 vs 买入持有（样本外）', fontsize=13)
ax.set_ylabel('累积净值')
ax.legend()
ax.grid(alpha=0.3)
plt.tight_layout()
plt.show()

ml_total = cum_strategy.iloc[-1] - 1
bh_total = cum_bh.iloc[-1] - 1
print(f'ML 策略总收益: {ml_total:.2%}')
print(f'买入持有总收益: {bh_total:.2%}')
# 测试集期间的回测
test_df = df.iloc[split:].copy()
test_df['ml_signal'] = y_pred  # 1=预测明天涨（持有），0=跌（空仓）
test_df['daily_ret'] = test_df['Close'].pct_change()
test_df['strategy_ret'] = test_df['ml_signal'].shift(1) * test_df['daily_ret']

# 累积收益对比
cum_strategy = (1 + test_df['strategy_ret'].dropna()).cumprod()
cum_bh = (1 + test_df['daily_ret'].dropna()).cumprod()

fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(cum_strategy.index, cum_strategy.values, label='ML 策略', linewidth=1.8, color='steelblue')
ax.plot(cum_bh.index, cum_bh.values, label='买入持有', linewidth=1.8,
         color='orange', linestyle='--')
ax.set_title('ML 策略 vs 买入持有（样本外）', fontsize=13)
ax.set_ylabel('累积净值')
ax.legend()
ax.grid(alpha=0.3)
plt.tight_layout()
plt.show()

ml_total = cum_strategy.iloc[-1] - 1
bh_total = cum_bh.iloc[-1] - 1
print(f'ML 策略总收益: {ml_total:.2%}')
print(f'买入持有总收益: {bh_total:.2%}')

ML 策略总收益: 4.09%
买入持有总收益: 6.57%

🎯 练习¶

改变预测目标：不预测方向，而是预测「明天收益率是否超过 0.5%」。
尝试用 LightGBM 替换 XGBoost，训练速度有何变化？
增加 n_estimators=500，观察是否过拟合（对比训练集和测试集准确率）。

下一节 → 03_deep_learning.ipynb

In [ ]: