Data Generating Process

Published

March 22, 2023

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from matplotlib import pyplot as plt
import seaborn as sns
sns.set()
import scipy
# Draw 100 x values uniformly from [0, 100] and sort them so the plots read left to right.
data = pd.DataFrame(np.random.uniform(0, 100, 100), columns=["x"])
data.sort_values(by=["x"], ascending=True, inplace=True)
data.reset_index(drop=True, inplace=True)
def classical(data):
    """Draw one sample from the DGP y_i = 0 + 0 * x_i + e_i, with e_i ~ N(0, 10)."""
    data = data.copy()
    data["residual"] = np.random.normal(loc=0, scale=10, size=len(data))
    data["y"] = 0 * data["x"] + data["residual"]
    return data
data_sample_classical = []
for i in range(0,4):
    print(i)
    data_sample_classical.append(classical(data))
fig1 = plt.figure(figsize=(8, 5))
for i in range(4):
    plt.subplot(2, 2, i + 1)
    sns.regplot(x='x', y='y', data=data_sample_classical[i],
                scatter_kws={'s': 5}, color="blue")
plt.show()

The plots above are draws from the specified DGP, y_i = 0 + 0 × x_i + ε_i.
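Spelled out with the error distribution used in `classical` (this is just a restatement of the same DGP; the standard deviation of 10 comes from the `scale` argument above):

$$
y_i = \beta_0 + \beta_1 x_i + \varepsilon_i, \qquad \beta_0 = 0,\quad \beta_1 = 0,\quad \varepsilon_i \overset{\text{i.i.d.}}{\sim} N(0,\, 10^2), \qquad i = 1, \dots, 100.
$$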

Hypothesis Testing

# Monte Carlo: re-draw the sample 2,000 times and record the OLS estimates
# and t-statistics for the intercept and the slope.
classical_intercept = []
classical_intercept_t = []
classical_slope = []
classical_slope_t = []
for i in range(2000):
    data_sample = classical(data)
    est = smf.ols('y ~ 1 + x', data=data_sample).fit()
    classical_intercept.append(est.params["Intercept"])
    classical_intercept_t.append(est.tvalues["Intercept"])
    classical_slope.append(est.params["x"])
    classical_slope_t.append(est.tvalues["x"])
# Sampling distributions of the intercept estimate and its t-statistic
# across the 2,000 simulated samples (histplot is the axes-level function,
# so it works inside subplots, unlike displot).
fig2 = plt.figure(figsize=(20, 6.5))
plt.subplot(1, 2, 1)
sns.histplot(classical_intercept, color="blue")
plt.xlabel("intercept estimate")
plt.subplot(1, 2, 2)
sns.histplot(classical_intercept_t, color="blue")
plt.xlabel("intercept t-statistic")
plt.show()

Across the 2,000 simulated samples, most yield OLS estimates that are very close to the true intercept and true slope. Some samples, however, are simply not representative of the DGP, and running OLS on them produces a very large intercept estimate.
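To make this concrete, the 2,000 stored estimates can be summarized directly. This is a minimal sketch reusing the `classical_intercept` and `classical_slope` lists from the simulation above; the two-standard-deviation threshold is only an illustrative cutoff for "far from the truth", not part of the original analysis.

intercepts = np.array(classical_intercept)
slopes = np.array(classical_slope)

# The Monte Carlo averages should sit close to the true values (both 0).
print("mean intercept estimate:", intercepts.mean())
print("mean slope estimate:    ", slopes.mean())

# Spread of the estimates across repeated samples.
print("sd of intercept estimates:", intercepts.std(ddof=1))
print("sd of slope estimates:    ", slopes.std(ddof=1))

# Share of samples whose intercept lands more than two Monte Carlo
# standard deviations away from the true value 0 (illustrative cutoff).
far_off = np.abs(intercepts) > 2 * intercepts.std(ddof=1)
print("share of samples with a large intercept:", far_off.mean())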

data_sample = classical(data)
sns.regplot(x='x', y='y', data=data_sample, scatter_kws={'s': 5}, color="blue")
est = smf.ols('y ~ 1 + x', data=data_sample).fit()
est.summary()
OLS Regression Results
======================================================================
Dep. Variable:                 y     R-squared:                  0.000
Model:                       OLS     Adj. R-squared:            -0.010
Method:            Least Squares     F-statistic:              0.01174
Date:           Fri, 31 Mar 2023     Prob (F-statistic):         0.914
Time:                   20:38:49     Log-Likelihood:           -374.20
No. Observations:            100     AIC:                        752.4
Df Residuals:                 98     BIC:                        757.6
Df Model:                      1
Covariance Type:       nonrobust
======================================================================
                coef   std err        t    P>|t|    [0.025    0.975]
----------------------------------------------------------------------
Intercept     0.9114     1.961    0.465    0.643    -2.980     4.803
x            -0.0036     0.033   -0.108    0.914    -0.070     0.063
======================================================================
Omnibus:                   5.383     Durbin-Watson:              1.814
Prob(Omnibus):             0.068     Jarque-Bera (JB):           2.576
Skew:                     -0.046     Prob(JB):                   0.276
Kurtosis:                  2.219     Cond. No.                    112.
======================================================================
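The numbers in this table can also be pulled out of the fitted results object directly instead of being read off the printed summary; a small sketch using the `est` object from the cell above:

# Coefficients, t-statistics and p-values as pandas Series indexed by term.
print(est.params)
print(est.tvalues)
print(est.pvalues)

# 95% confidence intervals for the intercept and the slope.
print(est.conf_int(alpha=0.05))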

Keep in mind that the significance level is the probability of making an error (rejecting a null hypothesis that is actually true); choosing a significance level amounts to choosing how much of that risk we are willing to accept.
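Because the true intercept and slope are both zero in this DGP, every rejection in the simulation above is a false positive, so the rejection rate across the 2,000 replications should land close to the chosen significance level. Here is a minimal sketch using the stored t-statistics; the 5% level and the t critical value with 98 degrees of freedom are illustrative choices, not part of the original code:

from scipy import stats

alpha = 0.05                                    # chosen significance level (illustrative)
critical = stats.t.ppf(1 - alpha / 2, df=98)    # two-sided critical value, df = 100 - 2

t_intercept = np.abs(np.array(classical_intercept_t))
t_slope = np.abs(np.array(classical_slope_t))

# Both nulls (intercept = 0, slope = 0) are true by construction,
# so any rejection is a Type I error.
print("empirical Type I error rate, intercept:", (t_intercept > critical).mean())
print("empirical Type I error rate, slope:    ", (t_slope > critical).mean())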