March 5, 2019
# the standard, cross-section dataset # qog_path <- "http://www.qogdata.pol.gu.se/data/qog_std_cs_jan19.dta" qog <- rio::import(qog_path)
Summarise the data
qog_summary <- sjmisc::descr(qog)
m1 <- qog %>%
lm(log(unna_gdppc) ~ fi_legprop_pd,
data = .)
plot(log(qog$unna_gdppc) ~ qog$fi_legprop_pd,
xlab = "Property Rights",
ylab = "Log of GDP per capita")
abline(m1)
# make plot
fig1 <- qog %>% ggplot() +
aes(x = fi_legprop_pd,
y = unna_gdppc,
label = cname) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm") +
scale_y_log10(labels=comma) +
labs(x = "Property Rights",
y = "GDP per capita")
#show plot
fig1
ggplotly(fig1, tooltip = c("label", "unna_gdppc"))
x = runif(1000) y = runif(1000) plot(x, y)
x = rnorm(1000, 10, 1) y = rnorm(1000, 10, 1) plot(x, y)
data.frame(
x = rnorm(1000, 10, 1),
y = rnorm(1000, 10, 1)
) %>%
ggplot() + aes(x = x, y = y) +
geom_point(alpha = 0.5) +
geom_smooth(method="lm")
qog %>%
select(unna_gdppc, fi_legprop_pd) %>%
gather(key = "variable",
value = "v") %>%
group_by(variable) %>%
summarize(obs = sum(!is.na(v)),
avg = mean(v, na.rm = TRUE),
sd = sd(v, na.rm = TRUE)) %>%
knitr::kable()
| variable | obs | avg | sd |
|---|---|---|---|
| fi_legprop_pd | 158 | 5.193346 | 1.476081 |
| unna_gdppc | 193 | 15827.621502 | 26327.063311 |
data.frame(
prop = rnorm(158, 5.19, 1.48),
gdppc = rnorm(158, 15828, 26327)
) %>%
ggplot() + aes(x = prop, y = gdppc) +
geom_point(alpha = 0.5) +
geom_smooth(method="lm") +
scale_y_log10(labels=comma)
qog %>%
select(unna_gdppc, fi_legprop_pd) %>%
gather(key = "variable",
value = "v") %>%
ggplot() +
aes(x = v) +
geom_density() +
facet_wrap(~variable,
scales = "free")
infer packageqog %>% specify(unna_gdppc ~ fi_legprop_pd) %>%
hypothesize(null = "independence") %>%
generate(1) %>%
ungroup() %>%
# the simulated distributions
select(unna_gdppc, fi_legprop_pd) %>%
gather(key = "variable",
value = "v") %>%
ggplot() +
aes(x = v) +
geom_density() +
facet_wrap(~variable,
scales = "free")
infer packageqog %>% specify(unna_gdppc ~ fi_legprop_pd) %>%
hypothesize(null = "independence") %>%
generate(4) %>%
ungroup() %>%
# scatter plots
ggplot() +
aes(x = fi_legprop_pd, y = unna_gdppc) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm") +
scale_y_log10(labels = comma) +
facet_wrap(~replicate) +
labs(x = "Property Rights",
y = "GDP per capita")
Simulate the data many, many times.
What percentage of these simulations lead to the same regression line as the one from the real data?
Answer to question 2 is the p-value.
Common misinterpretation:
\[ p(h|I, D) = p(h|I) \frac{p(D|h,I)}{p(D|I)} \]
\[ p(h|I, D) = p(h|I) \frac{p(D|h,I)}{p(D|I)} \]
\[ p(h|I, D) = p(h|I) \frac{\color{red}{p(D|h,I)}}{p(D|I)} \]
\[ p(h|I, D) = p(h|I) \frac{p(D|h,I)}{p(D|I)} \] Posterior prob = p-value, only when we don’t have an informative prior
reg1 <- qog %>% lm(unna_gdppc ~ fi_legprop_pd, data = .) tab_model(reg1, show.se = TRUE)
| Â |
GDP per Capita(Current Prices in US$) |
|||
|---|---|---|---|---|
| Predictors | Estimates | std. Error | CI | p |
| (Intercept) | -39304.54 | 4229.71 | -47594.63 – -31014.46 | <0.001 |
|
Legal Structure and Security of Property Rights(panel data) |
10523.63 | 784.98 | 8985.09 – 12062.16 | <0.001 |
| Observations | 157 | |||
| R2 / adjusted R2 | 0.537 / 0.534 | |||
v is the omitted variable;y and xv <- rnorm(1000) x <- v + rnorm(1000) y <- 1.5 * x + v + rnorm(1000)
Function rnorm generates random numbers from a normal distribution
r: randomnorm: normal distributionplot(x, y)
plot(x, v)
plot(v, y)
Notice the coefficient of x is not 1.5
m1 <- lm(y ~ x) m1
## ## Call: ## lm(formula = y ~ x) ## ## Coefficients: ## (Intercept) x ## -0.05064 1.99526
Now the coefficient of x is 1.5
m2 <- lm(y ~ x + v) m2
## ## Call: ## lm(formula = y ~ x + v) ## ## Coefficients: ## (Intercept) x v ## -0.06678 1.51609 0.96514
tab_model(m1, m2)
| Â | y | y | ||||
|---|---|---|---|---|---|---|
| Predictors | Estimates | CI | p | Estimates | CI | p |
| (Intercept) | -0.05 | -0.12 – 0.02 | 0.179 | -0.07 | -0.13 – -0.01 | 0.033 |
| x | 2.00 | 1.94 – 2.05 | <0.001 | 1.52 | 1.45 – 1.58 | <0.001 |
| v | 0.97 | 0.88 – 1.05 | <0.001 | |||
| Observations | 1000 | 1000 | ||||
| R2 / adjusted R2 | 0.854 / 0.854 | 0.900 / 0.900 | ||||
tab_model(m1, m2, collapse.ci = TRUE)
| Â | y | y | ||
|---|---|---|---|---|
| Predictors | Estimates | p | Estimates | p |
| (Intercept) |
-0.05 (-0.12 – 0.02) |
0.179 |
-0.07 (-0.13 – -0.01) |
0.033 |
| x |
2.00 (1.94 – 2.05) |
<0.001 |
1.52 (1.45 – 1.58) |
<0.001 |
| v |
0.97 (0.88 – 1.05) |
<0.001 | ||
| Observations | 1000 | 1000 | ||
| R2 / adjusted R2 | 0.854 / 0.854 | 0.900 / 0.900 | ||
tab_model(m1, m2, p.style = "asterisk", collapse.ci = TRUE)
| Â | y | y |
|---|---|---|
| Predictors | Estimates | Estimates |
| (Intercept) |
-0.05 (-0.12 – 0.02) |
-0.07 * (-0.13 – -0.01) |
| x |
2.00 *** (1.94 – 2.05) |
1.52 *** (1.45 – 1.58) |
| v |
0.97 *** (0.88 – 1.05) |
|
| Observations | 1000 | 1000 |
| R2 / adjusted R2 | 0.854 / 0.854 | 0.900 / 0.900 |
|
||
tab_model(m1, m2, p.style = "asterisk",
show.se = TRUE, show.ci = FALSE, collapse.se = TRUE)
| Â | y | y |
|---|---|---|
| Predictors | Estimates | Estimates |
| (Intercept) |
-0.05 (0.04) |
-0.07 * (0.03) |
| x |
2.00 *** (0.03) |
1.52 *** (0.03) |
| v |
0.97 *** (0.05) |
|
| Observations | 1000 | 1000 |
| R2 / adjusted R2 | 0.854 / 0.854 | 0.900 / 0.900 |
|
||
tab_model(m1, m2, p.style = "asterisk",
show.se = TRUE, collapse.se = TRUE)
| Â | y | y | ||
|---|---|---|---|---|
| Predictors | Estimates | CI | Estimates | CI |
| (Intercept) |
-0.05 (0.04) |
-0.12 – 0.02 |
-0.07 * (0.03) |
-0.13 – -0.01 |
| x |
2.00 *** (0.03) |
1.94 – 2.05 |
1.52 *** (0.03) |
1.45 – 1.58 |
| v |
0.97 *** (0.05) |
0.88 – 1.05 | ||
| Observations | 1000 | 1000 | ||
| R2 / adjusted R2 | 0.854 / 0.854 | 0.900 / 0.900 | ||
|
||||
plot_models(m1, m2, show.values = TRUE)
Repeat the analysis a 1000 times to prove that the above omitted variable bias was not an accident.
B <- map_dbl(1:1000, function(i) {
v <- rnorm(1000)
x <- v + rnorm(1000)
y <- 1.5 * x + v + rnorm(1000)
lm(y ~ x)$coefficients[[2]]
}) %>% data.frame(beta_x = .)
lm objectm1 <- lm(y ~ x)
ggplot(B, aes(x = beta_x)) + geom_density() + geom_vline(xintercept = mean(B$beta_x))
# case 1: Omitted variable bias v <- rnorm(1000) x <- v + rnorm(1000) y <- 1.5 * x + v + rnorm(1000) # case 2: No bias v <- rnorm(1000) x <- rnorm(1000) y <- 1.5 * x + v + rnorm(1000) # case 3: No bias v <- rnorm(1000) x <- v + rnorm(1000) y <- 1.5 * x + rnorm(1000)