1.9 大数定律与中心极限定理（金融视角）¶

这一节讲什么？

为什么大量交易能降低单笔押注的风险？中心极限定理告诉我们：无论单个资产的收益率分布如何奇怪（只要方差有限、样本之间相互独立或弱相关），足够多样本的平均值都近似服从正态分布。这是组合管理和统计推断的理论基础。

学习目标¶

从金融角度理解大数定律（LLN）的实际含义
用 CLT 解释投资组合分散化的数学基础
理解 CLT 的局限性：单个资产非正态并不妨碍 CLT，但资产之间高度相关（非独立）时分散化会失效
学习 Bootstrap 重采样用于有限样本推断

In [1]:

Copied!





import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
# Seed NumPy's global RNG so the np.random draws that follow are repeatable.
np.random.seed(42)
print('Libraries loaded')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
# Seed NumPy's global RNG so the np.random draws that follow are repeatable.
np.random.seed(42)
print('Libraries loaded')

Libraries loaded

In [2]:

Copied!

import matplotlib.pyplot as plt

# 1. Pick a CJK-capable sans-serif font. SimHei stays first, so machines that
#    have it render exactly as before; the remaining names are fallbacks for
#    systems without SimHei, which would otherwise draw Chinese labels as
#    empty boxes. matplotlib uses the first family in the list it can find.
plt.rcParams['font.sans-serif'] = [
    'SimHei',               # typically available on Windows
    'Microsoft YaHei',      # typically available on Windows
    'PingFang SC',          # typically available on macOS
    'Heiti TC',             # typically available on macOS
    'Noto Sans CJK SC',     # typically available on Linux
    'WenQuanYi Micro Hei',  # typically available on Linux
]

# 2. Draw the minus sign as ASCII '-' instead of U+2212; after switching to a
#    CJK font the Unicode minus may be missing and show up as a box.
plt.rcParams['axes.unicode_minus'] = False
import matplotlib.pyplot as plt

# 1. Pick a CJK-capable sans-serif font. SimHei stays first, so machines that
#    have it render exactly as before; the remaining names are fallbacks for
#    systems without SimHei, which would otherwise draw Chinese labels as
#    empty boxes. matplotlib uses the first family in the list it can find.
plt.rcParams['font.sans-serif'] = [
    'SimHei',               # typically available on Windows
    'Microsoft YaHei',      # typically available on Windows
    'PingFang SC',          # typically available on macOS
    'Heiti TC',             # typically available on macOS
    'Noto Sans CJK SC',     # typically available on Linux
    'WenQuanYi Micro Hei',  # typically available on Linux
]

# 2. Draw the minus sign as ASCII '-' instead of U+2212; after switching to a
#    CJK font the Unicode minus may be missing and show up as a box.
plt.rcParams['axes.unicode_minus'] = False

1. 大数定律（LLN）：为什么回测样本越大越可靠？¶

大数定律：随着观测次数增加，样本均值收敛到总体均值。

在量化交易中的含义：

策略的夏普比率估计需要足够的样本才能可靠
回测使用的历史数据越长，估计越稳定
但要小心：市场是非平稳的，早期数据可能已经失效

In [3]:

Copied!





# LLN demo: the sample mean converges to the true mean.
true_mean = 0.0005  # true mean daily return
true_std  = 0.015   # true daily volatility (std of daily returns)

# One independent normal sample per size n = 10, 20, ..., 2000. Each point on
# the curve is a fresh draw of n returns, not a running mean along one path.
trial_sizes = np.arange(10, 2001, 10)
sample_means = [np.random.normal(true_mean, true_std, n).mean() for n in trial_sizes]

plt.figure(figsize=(12, 5))
# Everything plotted is multiplied by 100 so the y-axis reads in percent/day.
plt.plot(trial_sizes, np.array(sample_means)*100, color='steelblue', lw=1.2, label='样本均值')
plt.axhline(true_mean*100, color='red', linestyle='--', lw=2, label=f'真实均值 = {true_mean*100:.3f}%')
# Shaded band: true mean +/- 2 standard errors, i.e. 2 * sigma / sqrt(n).
plt.fill_between(trial_sizes,
    (true_mean - 2*true_std/np.sqrt(trial_sizes))*100,
    (true_mean + 2*true_std/np.sqrt(trial_sizes))*100,
    alpha=0.2, color='red', label='±2σ/√n 置信带')
plt.title('大数定律：样本均值收敛到真实均值')
plt.xlabel('样本量（交易日数）'); plt.ylabel('样本均值 (%/日)')
plt.legend(); plt.grid(alpha=0.3); plt.show()
# The printed figures are ONE standard error (sigma / sqrt(n)), i.e. half the
# half-width of the shaded band drawn above.
print(f'  10天样本: 均值误差约 ±{true_std/np.sqrt(10)*100:.3f}%')
print(f'252天样本: 均值误差约 ±{true_std/np.sqrt(252)*100:.3f}%')
print(f'  2520天（10年）: 均值误差约 ±{true_std/np.sqrt(2520)*100:.3f}%')
# LLN demo: the sample mean converges to the true mean.
true_mean = 0.0005  # true mean daily return
true_std  = 0.015   # true daily volatility (std of daily returns)

# One independent normal sample per size n = 10, 20, ..., 2000. Each point on
# the curve is a fresh draw of n returns, not a running mean along one path.
trial_sizes = np.arange(10, 2001, 10)
sample_means = [np.random.normal(true_mean, true_std, n).mean() for n in trial_sizes]

plt.figure(figsize=(12, 5))
# Everything plotted is multiplied by 100 so the y-axis reads in percent/day.
plt.plot(trial_sizes, np.array(sample_means)*100, color='steelblue', lw=1.2, label='样本均值')
plt.axhline(true_mean*100, color='red', linestyle='--', lw=2, label=f'真实均值 = {true_mean*100:.3f}%')
# Shaded band: true mean +/- 2 standard errors, i.e. 2 * sigma / sqrt(n).
plt.fill_between(trial_sizes,
    (true_mean - 2*true_std/np.sqrt(trial_sizes))*100,
    (true_mean + 2*true_std/np.sqrt(trial_sizes))*100,
    alpha=0.2, color='red', label='±2σ/√n 置信带')
plt.title('大数定律：样本均值收敛到真实均值')
plt.xlabel('样本量（交易日数）'); plt.ylabel('样本均值 (%/日)')
plt.legend(); plt.grid(alpha=0.3); plt.show()
# The printed figures are ONE standard error (sigma / sqrt(n)), i.e. half the
# half-width of the shaded band drawn above.
print(f'  10天样本: 均值误差约 ±{true_std/np.sqrt(10)*100:.3f}%')
print(f'252天样本: 均值误差约 ±{true_std/np.sqrt(252)*100:.3f}%')
print(f'  2520天（10年）: 均值误差约 ±{true_std/np.sqrt(2520)*100:.3f}%')

No description has been provided for this image

  10天样本: 均值误差约 ±0.474%
252天样本: 均值误差约 ±0.094%
  2520天（10年）: 均值误差约 ±0.030%

2. 中心极限定理（CLT）：分散化的数学基础¶

CLT：$n$ 个独立同分布、方差有限的随机变量之和（或均值），当 $n$ 足够大时，近似服从正态分布：

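$$\sqrt{n}\,\left(\bar{X}_n - \mu\right) \xrightarrow{d} N\left(0, \sigma^2\right), \qquad \text{即}\quad \bar{X}_n = \frac{1}{n}\sum_{i=1}^n X_i \overset{\text{approx}}{\sim} N\left(\mu, \frac{\sigma^2}{n}\right)$$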

投资组合含义：持有 $n$ 只独立同波动率股票的等权组合，其波动率约为单股的 $\frac{1}{\sqrt{n}}$。

这就是分散化的数学基础——但前提是资产相互独立（或低相关）。

In [4]:

Copied!





# CLT demo: build portfolios out of non-normal single-stock returns.
# Each stock's return is Student-t with 4 degrees of freedom: symmetric
# (not skewed) but fat-tailed.
def generate_stock_returns(n_stocks, n_days, corr=0.0):
    """Simulate daily returns for ``n_stocks`` fat-tailed stocks.

    Draws come from NumPy's global RNG (``np.random``), so the result depends
    on the current seed.

    Args:
        n_stocks: Number of stocks (columns of the result).
        n_days: Number of trading days (rows of the result).
        corr: Target pairwise correlation between any two stocks, in
            ``[0, 1]``. The default ``0.0`` gives independent stocks.

    Returns:
        Array of shape ``(n_days, n_stocks)``. With ``corr == 0`` every entry
        is a Student-t (df=4) draw times 0.01; with ``corr > 0`` every entry
        is a weighted mix of a shared and a stock-specific t draw, times 0.01.

    Raises:
        ValueError: If ``corr`` is outside ``[0, 1]`` (including NaN).
    """
    if not 0.0 <= corr <= 1.0:
        raise ValueError(f'corr must be in [0, 1], got {corr!r}')

    # Stock-specific shocks. Drawn first and with the original call, so the
    # default corr=0 path consumes the RNG stream exactly as it did before.
    idio = np.random.standard_t(df=4, size=(n_days, n_stocks))
    if corr == 0:
        return idio * 0.01

    # One-factor mix: each stock loads sqrt(corr) on one shared shock per day
    # and sqrt(1 - corr) on its own shock. Per-stock variance is unchanged and
    # any two stocks have correlation corr.
    market = np.random.standard_t(df=4, size=(n_days, 1))
    returns = (np.sqrt(corr) * market + np.sqrt(1.0 - corr) * idio) * 0.01
    return returns

n_days = 252  # number of simulated trading days per portfolio

fig, axes = plt.subplots(2, 3, figsize=(15, 8))
# One panel per portfolio size; each panel simulates a fresh return matrix.
for ax, n_stocks in zip(axes.flat, [1, 5, 20, 50, 100, 300]):
    ret = generate_stock_returns(n_stocks, n_days)
    portfolio_ret = ret.mean(axis=1)  # equal weight: average across stocks for each day
    ax.hist(portfolio_ret, bins=40, density=True, alpha=0.7,
            color='steelblue', edgecolor='none')
    # Overlay a normal density that uses the portfolio's own sample mean/std.
    x = np.linspace(portfolio_ret.min(), portfolio_ret.max(), 200)
    ax.plot(x, stats.norm.pdf(x, portfolio_ret.mean(), portfolio_ret.std()),
            'r-', lw=2, label=f'正态拟合')
    # stats.kurtosis defaults to Fisher's definition (excess kurtosis), so a
    # normal distribution scores 0 in the panel title.
    ax.set_title(f'N={n_stocks} 只股票\n峰度={stats.kurtosis(portfolio_ret):.2f}')
    ax.legend(fontsize=8)
plt.suptitle('CLT 分散化：股票数量越多，组合收益率越接近正态分布', fontsize=12)
plt.tight_layout(); plt.show()
# CLT demo: build portfolios out of non-normal single-stock returns.
# Each stock's return is Student-t with 4 degrees of freedom: symmetric
# (not skewed) but fat-tailed.
def generate_stock_returns(n_stocks, n_days, corr=0.0):
    """Simulate daily returns for ``n_stocks`` fat-tailed stocks.

    Draws come from NumPy's global RNG (``np.random``), so the result depends
    on the current seed.

    Args:
        n_stocks: Number of stocks (columns of the result).
        n_days: Number of trading days (rows of the result).
        corr: Target pairwise correlation between any two stocks, in
            ``[0, 1]``. The default ``0.0`` gives independent stocks.

    Returns:
        Array of shape ``(n_days, n_stocks)``. With ``corr == 0`` every entry
        is a Student-t (df=4) draw times 0.01; with ``corr > 0`` every entry
        is a weighted mix of a shared and a stock-specific t draw, times 0.01.

    Raises:
        ValueError: If ``corr`` is outside ``[0, 1]`` (including NaN).
    """
    if not 0.0 <= corr <= 1.0:
        raise ValueError(f'corr must be in [0, 1], got {corr!r}')

    # Stock-specific shocks. Drawn first and with the original call, so the
    # default corr=0 path consumes the RNG stream exactly as it did before.
    idio = np.random.standard_t(df=4, size=(n_days, n_stocks))
    if corr == 0:
        return idio * 0.01

    # One-factor mix: each stock loads sqrt(corr) on one shared shock per day
    # and sqrt(1 - corr) on its own shock. Per-stock variance is unchanged and
    # any two stocks have correlation corr.
    market = np.random.standard_t(df=4, size=(n_days, 1))
    returns = (np.sqrt(corr) * market + np.sqrt(1.0 - corr) * idio) * 0.01
    return returns

n_days = 252  # number of simulated trading days per portfolio

fig, axes = plt.subplots(2, 3, figsize=(15, 8))
# One panel per portfolio size; each panel simulates a fresh return matrix.
for ax, n_stocks in zip(axes.flat, [1, 5, 20, 50, 100, 300]):
    ret = generate_stock_returns(n_stocks, n_days)
    portfolio_ret = ret.mean(axis=1)  # equal weight: average across stocks for each day
    ax.hist(portfolio_ret, bins=40, density=True, alpha=0.7,
            color='steelblue', edgecolor='none')
    # Overlay a normal density that uses the portfolio's own sample mean/std.
    x = np.linspace(portfolio_ret.min(), portfolio_ret.max(), 200)
    ax.plot(x, stats.norm.pdf(x, portfolio_ret.mean(), portfolio_ret.std()),
            'r-', lw=2, label=f'正态拟合')
    # stats.kurtosis defaults to Fisher's definition (excess kurtosis), so a
    # normal distribution scores 0 in the panel title.
    ax.set_title(f'N={n_stocks} 只股票\n峰度={stats.kurtosis(portfolio_ret):.2f}')
    ax.legend(fontsize=8)
plt.suptitle('CLT 分散化：股票数量越多，组合收益率越接近正态分布', fontsize=12)
plt.tight_layout(); plt.show()

3. CLT 的局限：相关性破坏分散化¶

In [5]:

Copied!





# Demo: high correlation undermines diversification (the independence that
# the CLT argument relies on no longer holds).
def portfolio_std_vs_correlation(corr, n_stocks=50, sigma=0.01):
    """Return the volatility of an equal-weight portfolio of correlated stocks.

    Every stock is taken to have the same volatility ``sigma`` and every pair
    of stocks the same correlation ``corr``.
    """
    # Portfolio variance = sigma^2 * (1/n + corr*(n-1)/n). The first term
    # shrinks as n grows; the second approaches corr and cannot be
    # diversified away.
    diversifiable = 1/n_stocks
    systematic = corr * (n_stocks-1)/n_stocks
    variance_ratio = diversifiable + systematic
    return sigma * np.sqrt(variance_ratio)

# Scenario parameters, stated once so the curve, the two reference lines and
# the title cannot drift apart (they were previously repeated as literals).
demo_n_stocks = 50
demo_sigma = 0.01  # single-stock daily volatility (1%)

corr_values = np.linspace(0, 1, 50)
port_stds = [
    portfolio_std_vs_correlation(c, n_stocks=demo_n_stocks, sigma=demo_sigma)
    for c in corr_values
]

# Closed-form endpoints of the curve: sigma/sqrt(n) at corr=0, sigma at corr=1.
independent_std = demo_sigma / np.sqrt(demo_n_stocks)
fully_correlated_std = demo_sigma

plt.figure(figsize=(9, 5))
# Volatilities are multiplied by 100 so the y-axis reads in percent per day.
plt.plot(corr_values, np.array(port_stds)*100, 'b-', lw=2)
plt.axhline(independent_std*100, color='green', linestyle='--',
            label=f'独立情形（corr=0）: {independent_std*100:.3f}%')
plt.axhline(fully_correlated_std*100, color='red', linestyle='--',
            label=f'完全相关（corr=1）: {fully_correlated_std*100:.2f}%')
plt.xlabel('资产间相关系数 ρ'); plt.ylabel('等权组合波动率 (%/日)')
plt.title(f'相关性对分散化的影响（{demo_n_stocks} 只股票，单股波动率={demo_sigma:.0%}）')
plt.legend(); plt.grid(alpha=0.3); plt.show()
print('结论：相关性越高（如金融危机时），分散化效果越差。这就是为什么危机时所有资产同跌。')
# Demo: high correlation undermines diversification (the independence that
# the CLT argument relies on no longer holds).
def portfolio_std_vs_correlation(corr, n_stocks=50, sigma=0.01):
    """Return the volatility of an equal-weight portfolio of correlated stocks.

    Every stock is taken to have the same volatility ``sigma`` and every pair
    of stocks the same correlation ``corr``.
    """
    # Portfolio variance = sigma^2 * (1/n + corr*(n-1)/n). The first term
    # shrinks as n grows; the second approaches corr and cannot be
    # diversified away.
    diversifiable = 1/n_stocks
    systematic = corr * (n_stocks-1)/n_stocks
    variance_ratio = diversifiable + systematic
    return sigma * np.sqrt(variance_ratio)

# Scenario parameters, stated once so the curve, the two reference lines and
# the title cannot drift apart (they were previously repeated as literals).
demo_n_stocks = 50
demo_sigma = 0.01  # single-stock daily volatility (1%)

corr_values = np.linspace(0, 1, 50)
port_stds = [
    portfolio_std_vs_correlation(c, n_stocks=demo_n_stocks, sigma=demo_sigma)
    for c in corr_values
]

# Closed-form endpoints of the curve: sigma/sqrt(n) at corr=0, sigma at corr=1.
independent_std = demo_sigma / np.sqrt(demo_n_stocks)
fully_correlated_std = demo_sigma

plt.figure(figsize=(9, 5))
# Volatilities are multiplied by 100 so the y-axis reads in percent per day.
plt.plot(corr_values, np.array(port_stds)*100, 'b-', lw=2)
plt.axhline(independent_std*100, color='green', linestyle='--',
            label=f'独立情形（corr=0）: {independent_std*100:.3f}%')
plt.axhline(fully_correlated_std*100, color='red', linestyle='--',
            label=f'完全相关（corr=1）: {fully_correlated_std*100:.2f}%')
plt.xlabel('资产间相关系数 ρ'); plt.ylabel('等权组合波动率 (%/日)')
plt.title(f'相关性对分散化的影响（{demo_n_stocks} 只股票，单股波动率={demo_sigma:.0%}）')
plt.legend(); plt.grid(alpha=0.3); plt.show()
print('结论：相关性越高（如金融危机时），分散化效果越差。这就是为什么危机时所有资产同跌。')

结论：相关性越高（如金融危机时），分散化效果越差。这就是为什么危机时所有资产同跌。

4. Bootstrap 重采样：小样本下的推断工具¶

当历史数据不足时（如只有 2 年日频数据），如何估计夏普比率的置信区间？Bootstrap 的做法是：对原始样本有放回地重复抽样，对每个重采样样本计算统计量，从而构建该统计量的经验分布。

In [6]:

Copied!





np.random.seed(42)  # re-seed so this cell's numbers do not depend on earlier cells
returns_true = np.random.normal(0.0005, 0.012, 252*2)  # 2 years of simulated daily returns

# Bootstrap confidence interval for the Sharpe ratio
n_boot = 2000
boot_sharpes = []
for _ in range(n_boot):
    # Resample individual days with replacement, keeping the original sample
    # size. Days are drawn independently, so their time order is not kept.
    sample = np.random.choice(returns_true, size=len(returns_true), replace=True)
    # Annualised Sharpe: daily mean / daily std * sqrt(252). No risk-free rate
    # is subtracted, and .std() uses NumPy's default ddof=0.
    sharpe = sample.mean() / sample.std() * np.sqrt(252)
    boot_sharpes.append(sharpe)

boot_sharpes = np.array(boot_sharpes)
# Same formula applied once to the original (un-resampled) series.
point_estimate = returns_true.mean() / returns_true.std() * np.sqrt(252)
# Percentile interval: 2.5th and 97.5th percentiles of the bootstrap values.
ci_low, ci_high = np.percentile(boot_sharpes, [2.5, 97.5])

plt.figure(figsize=(9, 4))
plt.hist(boot_sharpes, bins=60, density=True, color='steelblue', alpha=0.7, edgecolor='none')
plt.axvline(point_estimate, color='red', lw=2, label=f'点估计 SR={point_estimate:.3f}')
# Only the lower bound carries a label, so the legend shows the interval once.
plt.axvline(ci_low,  color='orange', lw=2, linestyle='--', label=f'95% CI: [{ci_low:.3f}, {ci_high:.3f}]')
plt.axvline(ci_high, color='orange', lw=2, linestyle='--')
plt.title('Bootstrap 估计夏普比率的 95% 置信区间（2年样本）')
plt.xlabel('Sharpe Ratio'); plt.legend(); plt.show()
print(f'夏普比率: {point_estimate:.3f}')
print(f'95% CI: [{ci_low:.3f}, {ci_high:.3f}]')
np.random.seed(42)  # re-seed so this cell's numbers do not depend on earlier cells
returns_true = np.random.normal(0.0005, 0.012, 252*2)  # 2 years of simulated daily returns

# Bootstrap confidence interval for the Sharpe ratio
n_boot = 2000
boot_sharpes = []
for _ in range(n_boot):
    # Resample individual days with replacement, keeping the original sample
    # size. Days are drawn independently, so their time order is not kept.
    sample = np.random.choice(returns_true, size=len(returns_true), replace=True)
    # Annualised Sharpe: daily mean / daily std * sqrt(252). No risk-free rate
    # is subtracted, and .std() uses NumPy's default ddof=0.
    sharpe = sample.mean() / sample.std() * np.sqrt(252)
    boot_sharpes.append(sharpe)

boot_sharpes = np.array(boot_sharpes)
# Same formula applied once to the original (un-resampled) series.
point_estimate = returns_true.mean() / returns_true.std() * np.sqrt(252)
# Percentile interval: 2.5th and 97.5th percentiles of the bootstrap values.
ci_low, ci_high = np.percentile(boot_sharpes, [2.5, 97.5])

plt.figure(figsize=(9, 4))
plt.hist(boot_sharpes, bins=60, density=True, color='steelblue', alpha=0.7, edgecolor='none')
plt.axvline(point_estimate, color='red', lw=2, label=f'点估计 SR={point_estimate:.3f}')
# Only the lower bound carries a label, so the legend shows the interval once.
plt.axvline(ci_low,  color='orange', lw=2, linestyle='--', label=f'95% CI: [{ci_low:.3f}, {ci_high:.3f}]')
plt.axvline(ci_high, color='orange', lw=2, linestyle='--')
plt.title('Bootstrap 估计夏普比率的 95% 置信区间（2年样本）')
plt.xlabel('Sharpe Ratio'); plt.legend(); plt.show()
print(f'夏普比率: {point_estimate:.3f}')
print(f'95% CI: [{ci_low:.3f}, {ci_high:.3f}]')

夏普比率: 0.846
95% CI: [-0.492, 2.281]

🎯 练习¶

将股票数量从 50 改为 500，但保持相关系数 ρ=0.4，验证组合波动率是否继续下降。
用 Bootstrap 对比 1 年、5 年、10 年三个样本长度的夏普比率估计区间宽度。
研究「Block Bootstrap」（保留时间序列自相关的重采样），相比简单 Bootstrap 有什么优势？

下一节 → 10_present_value.ipynb

In [ ]: