import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from matplotlib import pyplot as plt
import seaborn as sns
sns.set()
import scipy

# Base design matrix: 100 uniform draws on [0, 100], sorted ascending so the
# fitted-line plots read left to right.
data = pd.DataFrame(np.random.uniform(0, 100, 100), columns=["x"])
data.sort_values(by=["x"], ascending=True, inplace=True)
data.reset_index(drop=True, inplace=True)


def classical(data):
    """Simulate one sample from the classical DGP y = 0*x + e, e ~ N(0, 10).

    The true slope and intercept are both zero, so any fitted coefficient
    reflects pure sampling noise.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain an "x" column; it is not modified (a copy is returned).

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with added "residual" and "y" columns.
    """
    # Copy once (the original copied inside every loop iteration) and draw
    # all residuals in one vectorized call sized to the actual row count
    # (the original hard-coded range(0, 100)).
    data = data.copy()
    data["residual"] = np.random.normal(loc=0, scale=10, size=len(data))
    data["y"] = 0 * data["x"] + data["residual"]
    return data
# Draw four independent samples from the classical DGP; the printed counter
# just shows progress.
data_sample_classical = []
for draw in range(4):
    print(draw)
    data_sample_classical.append(classical(data))
# Figure 1: scatter + fitted OLS line for each of the four simulated samples.
# The original repeated the same subplot/regplot stanza four times; a single
# loop over the sample list produces the identical 2x2 grid.
fig1 = plt.figure(figsize=(8, 5))
for panel, sample in enumerate(data_sample_classical, start=1):
    plt.subplot(2, 2, panel)
    sns.regplot(x="x", y="y", data=sample, scatter_kws={"s": 5}, color="blue")
plt.show()
1
2
3
上述即为设定的数据生成过程(DGP)。
假设检验
# Monte Carlo: refit OLS on 2000 fresh samples from the DGP and collect the
# estimated intercept/slope and their t statistics.
classical_intercept = []
classical_intercept_t = []
classical_slope = []
classical_slope_t = []
for _ in range(2000):
    data_sample = classical(data)
    est = smf.ols("y ~ 1 + x", data=data_sample).fit()
    # Label-based access: positional indexing like est.params[0] on a
    # labeled pandas Series is deprecated and will raise in future pandas.
    classical_intercept.append(est.params["Intercept"])
    classical_intercept_t.append(est.tvalues["Intercept"])
    classical_slope.append(est.params["x"])
    classical_slope_t.append(est.tvalues["x"])
fig2 = plt.figure(figsize=(20, 6.5))
# Figure 2: sampling distributions from the Monte Carlo run.
plt.subplot(1, 2, 1)
# histplot is axes-level and respects plt.subplot; the original used
# sns.displot, which is figure-level and ignores the subplot, triggering the
# MatplotlibDeprecationWarning about auto-removal of overlapping axes.
sns.histplot(classical_intercept, color="blue")
plt.subplot(1, 2, 2)
# NOTE(review): the original plotted classical_intercept here a second time —
# likely a copy-paste slip (classical_slope is never plotted). Data kept
# as-is to preserve output; confirm intent with the author.
sns.histplot(classical_intercept, color="blue")
plt.plot(classical_intercept_t, color="blue")
plt.show()
在2000次随机样本生成的过程中,大部分样本估计得到的截距和斜率都很接近真实值;但确实有一部分样本无法代表DGP,用这些样本进行OLS会得到偏差很大的截距估计。
# Draw one fresh sample from the classical DGP, visualize the fitted line,
# and fit the corresponding OLS model for inspection.
data_sample = classical(data)
sns.regplot(
    x="x",
    y="y",
    data=data_sample,
    scatter_kws={"s": 5},
    color="blue",
)
est = smf.ols("y ~ 1 + x", data=data_sample).fit()
est.summary()| Dep. Variable: | y | R-squared: | 0.000 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | -0.010 |
| Method: | Least Squares | F-statistic: | 0.01174 |
| Date: | Fri, 31 Mar 2023 | Prob (F-statistic): | 0.914 |
| Time: | 20:38:49 | Log-Likelihood: | -374.20 |
| No. Observations: | 100 | AIC: | 752.4 |
| Df Residuals: | 98 | BIC: | 757.6 |
| Df Model: | 1 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
| Intercept | 0.9114 | 1.961 | 0.465 | 0.643 | -2.980 | 4.803 |
| x | -0.0036 | 0.033 | -0.108 | 0.914 | -0.070 | 0.063 |
| Omnibus: | 5.383 | Durbin-Watson: | 1.814 |
|---|---|---|---|
| Prob(Omnibus): | 0.068 | Jarque-Bera (JB): | 2.576 |
| Skew: | -0.046 | Prob(JB): | 0.276 |
| Kurtosis: | 2.219 | Cond. No. | 112. |
需要记住的是,显著性水平就是我们愿意接受的犯错(弃真)概率;显著性水平的选择取决于我们能够容忍多大的犯错可能性。