March 5, 2019
```r
# the standard, cross-section dataset
qog_path <- "http://www.qogdata.pol.gu.se/data/qog_std_cs_jan19.dta"
qog <- rio::import(qog_path)
```
Summarise the data:

```r
qog_summary <- sjmisc::descr(qog)
```
```r
m1 <- qog %>% lm(log(unna_gdppc) ~ fi_legprop_pd, data = .)
plot(log(qog$unna_gdppc) ~ qog$fi_legprop_pd,
     xlab = "Property Rights", ylab = "Log of GDP per capita")
abline(m1)
```
```r
# make plot
fig1 <- qog %>%
  ggplot() +
  aes(x = fi_legprop_pd, y = unna_gdppc, label = cname) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm") +
  scale_y_log10(labels = comma) +
  labs(x = "Property Rights", y = "GDP per capita")

# show plot
fig1
```
```r
ggplotly(fig1, tooltip = c("label", "unna_gdppc"))
```
```r
x <- runif(1000)
y <- runif(1000)
plot(x, y)
```
```r
x <- rnorm(1000, 10, 1)
y <- rnorm(1000, 10, 1)
plot(x, y)
```
```r
data.frame(
  x = rnorm(1000, 10, 1),
  y = rnorm(1000, 10, 1)
) %>%
  ggplot() +
  aes(x = x, y = y) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm")
```
```r
qog %>%
  select(unna_gdppc, fi_legprop_pd) %>%
  gather(key = "variable", value = "v") %>%
  group_by(variable) %>%
  summarize(obs = sum(!is.na(v)),
            avg = mean(v, na.rm = TRUE),
            sd  = sd(v, na.rm = TRUE)) %>%
  knitr::kable()
```
| variable | obs | avg | sd |
|---|---|---|---|
| fi_legprop_pd | 158 | 5.193346 | 1.476081 |
| unna_gdppc | 193 | 15827.621502 | 26327.063311 |
```r
data.frame(
  prop  = rnorm(158, 5.19, 1.48),
  gdppc = rnorm(158, 15828, 26327)
) %>%
  ggplot() +
  aes(x = prop, y = gdppc) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm") +
  scale_y_log10(labels = comma)
```
```r
qog %>%
  select(unna_gdppc, fi_legprop_pd) %>%
  gather(key = "variable", value = "v") %>%
  ggplot() +
  aes(x = v) +
  geom_density() +
  facet_wrap(~variable, scales = "free")
```
Simulate one dataset under the null hypothesis of independence with the `infer` package:

```r
qog %>%
  specify(unna_gdppc ~ fi_legprop_pd) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1) %>%
  ungroup() %>%
  # the simulated distributions
  select(unna_gdppc, fi_legprop_pd) %>%
  gather(key = "variable", value = "v") %>%
  ggplot() +
  aes(x = v) +
  geom_density() +
  facet_wrap(~variable, scales = "free")
```
Generate four simulated datasets with `infer` and look at their scatter plots:

```r
qog %>%
  specify(unna_gdppc ~ fi_legprop_pd) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 4) %>%
  ungroup() %>%
  # scatter plots
  ggplot() +
  aes(x = fi_legprop_pd, y = unna_gdppc) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm") +
  scale_y_log10(labels = comma) +
  facet_wrap(~replicate) +
  labs(x = "Property Rights", y = "GDP per capita")
```
Simulate the data many, many times. What percentage of these simulations produce a regression slope at least as extreme as the one estimated from the real data? The answer to that second question is the p-value.
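A sketch of that calculation with `infer`, using the regression slope as the test statistic; `reps = 1000`, the two-sided direction, and the `qog_cc` name for the complete-case data are my choices, not from the original:

```r
library(infer)

# complete cases only, so the permutation keeps the same observations
qog_cc <- qog %>%
  filter(!is.na(unna_gdppc), !is.na(fi_legprop_pd))

# slope from the real data
observed_slope <- qog_cc %>%
  specify(unna_gdppc ~ fi_legprop_pd) %>%
  calculate(stat = "slope")

# slopes from 1,000 datasets simulated under independence
null_slopes <- qog_cc %>%
  specify(unna_gdppc ~ fi_legprop_pd) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "slope")

# share of simulated slopes at least as extreme as the observed one
null_slopes %>%
  get_p_value(obs_stat = observed_slope, direction = "two-sided")
```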
A common misinterpretation is to treat the p-value as the posterior probability of the hypothesis. Bayes' rule:
\[ p(h|I, D) = p(h|I) \frac{p(D|h,I)}{p(D|I)} \]
The p-value corresponds only to the likelihood term, highlighted in red:

\[ p(h|I, D) = p(h|I) \frac{\color{red}{p(D|h,I)}}{p(D|I)} \]
Posterior probability = p-value only when we do not have an informative prior.
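To see the gap concretely, here is a minimal simulation sketch (the 50/50 prior, the effect size of 0.5, and the sample size of 30 are all assumptions for illustration): among studies that reach p < 0.05, the share in which the null is actually true is a posterior quantity, and nothing forces it to equal 0.05.

```r
set.seed(1)

# 10,000 hypothetical studies; the null (zero effect) is true in half of them
null_true <- runif(10000) < 0.5

# one-sample t-test with 30 observations per study
p_values <- sapply(null_true, function(h0) {
  effect <- if (h0) 0 else 0.5   # assumed effect size under the alternative
  t.test(rnorm(30, mean = effect))$p.value
})

# posterior share of true nulls among the "significant" results
mean(null_true[p_values < 0.05])
```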
```r
reg1 <- qog %>% lm(unna_gdppc ~ fi_legprop_pd, data = .)
tab_model(reg1, show.se = TRUE)
```
 |
GDP per Capita(Current Prices in US$) |
|||
---|---|---|---|---|
Predictors | Estimates | std. Error | CI | p |
(Intercept) | -39304.54 | 4229.71 | -47594.63 – -31014.46 | <0.001 |
Legal Structure and Security of Property Rights(panel data) |
10523.63 | 784.98 | 8985.09 – 12062.16 | <0.001 |
Observations | 157 | |||
R2 / adjusted R2 | 0.537 / 0.534 |
`v` is the omitted variable; `y` and `x` are the variables we observe.

```r
v <- rnorm(1000)
x <- v + rnorm(1000)
y <- 1.5 * x + v + rnorm(1000)
```
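Before running the regression, we can predict the damage with the textbook omitted-variable-bias formula:

\[ \text{plim}\, \hat\beta_x = \beta_x + \beta_v \frac{Cov(x, v)}{Var(x)} \]

A quick check with the simulated vectors (this check is an addition, not part of the original code):

```r
# beta_x = 1.5 and beta_v = 1 in the data-generating process above;
# cov(x, v) is about 1 and var(x) about 2, so the biased slope should be near 2
1.5 + 1 * cov(x, v) / var(x)
```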
The function `rnorm` generates random numbers from a normal distribution (`r`: random; `norm`: normal distribution).
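For example (the seed is arbitrary):

```r
set.seed(42)
rnorm(5, mean = 10, sd = 1)  # five draws from a normal with mean 10, sd 1
```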
Plot each pair of variables:

```r
plot(x, y)
plot(x, v)
plot(v, y)
```
Notice that the estimated coefficient on `x` is not 1.5:
```r
m1 <- lm(y ~ x)
m1
```
```
## 
## Call:
## lm(formula = y ~ x)
## 
## Coefficients:
## (Intercept)            x  
##    -0.05064      1.99526
```
Once we control for `v`, the coefficient on `x` is close to 1.5:
```r
m2 <- lm(y ~ x + v)
m2
```
```
## 
## Call:
## lm(formula = y ~ x + v)
## 
## Coefficients:
## (Intercept)            x            v  
##    -0.06678      1.51609      0.96514
```
```r
tab_model(m1, m2)
```
 | y | y | ||||
---|---|---|---|---|---|---|
Predictors | Estimates | CI | p | Estimates | CI | p |
(Intercept) | -0.05 | -0.12 – 0.02 | 0.179 | -0.07 | -0.13 – -0.01 | 0.033 |
x | 2.00 | 1.94 – 2.05 | <0.001 | 1.52 | 1.45 – 1.58 | <0.001 |
v | 0.97 | 0.88 – 1.05 | <0.001 | |||
Observations | 1000 | 1000 | ||||
R2 / adjusted R2 | 0.854 / 0.854 | 0.900 / 0.900 |
```r
tab_model(m1, m2, collapse.ci = TRUE)
```
 | y | y | ||
---|---|---|---|---|
Predictors | Estimates | p | Estimates | p |
(Intercept) |
-0.05 (-0.12 – 0.02) |
0.179 |
-0.07 (-0.13 – -0.01) |
0.033 |
x |
2.00 (1.94 – 2.05) |
<0.001 |
1.52 (1.45 – 1.58) |
<0.001 |
v |
0.97 (0.88 – 1.05) |
<0.001 | ||
Observations | 1000 | 1000 | ||
R2 / adjusted R2 | 0.854 / 0.854 | 0.900 / 0.900 |
```r
tab_model(m1, m2, p.style = "asterisk", collapse.ci = TRUE)
```
 | y | y |
---|---|---|
Predictors | Estimates | Estimates |
(Intercept) |
-0.05 (-0.12 – 0.02) |
-0.07 * (-0.13 – -0.01) |
x |
2.00 *** (1.94 – 2.05) |
1.52 *** (1.45 – 1.58) |
v |
0.97 *** (0.88 – 1.05) |
|
Observations | 1000 | 1000 |
R2 / adjusted R2 | 0.854 / 0.854 | 0.900 / 0.900 |
|
```r
tab_model(m1, m2, p.style = "asterisk", show.se = TRUE,
          show.ci = FALSE, collapse.se = TRUE)
```
 | y | y |
---|---|---|
Predictors | Estimates | Estimates |
(Intercept) |
-0.05 (0.04) |
-0.07 * (0.03) |
x |
2.00 *** (0.03) |
1.52 *** (0.03) |
v |
0.97 *** (0.05) |
|
Observations | 1000 | 1000 |
R2 / adjusted R2 | 0.854 / 0.854 | 0.900 / 0.900 |
|
```r
tab_model(m1, m2, p.style = "asterisk", show.se = TRUE, collapse.se = TRUE)
```
 | y | y | ||
---|---|---|---|---|
Predictors | Estimates | CI | Estimates | CI |
(Intercept) |
-0.05 (0.04) |
-0.12 – 0.02 |
-0.07 * (0.03) |
-0.13 – -0.01 |
x |
2.00 *** (0.03) |
1.94 – 2.05 |
1.52 *** (0.03) |
1.45 – 1.58 |
v |
0.97 *** (0.05) |
0.88 – 1.05 | ||
Observations | 1000 | 1000 | ||
R2 / adjusted R2 | 0.854 / 0.854 | 0.900 / 0.900 | ||
|
```r
plot_models(m1, m2, show.values = TRUE)
```
Repeat the analysis 1,000 times to show that the omitted variable bias above was not an accident:
```r
B <- map_dbl(1:1000, function(i) {
  v <- rnorm(1000)
  x <- v + rnorm(1000)
  y <- 1.5 * x + v + rnorm(1000)
  lm(y ~ x)$coefficients[[2]]
}) %>%
  data.frame(beta_x = .)
```
`lm(y ~ x)` returns an `lm` object (like `m1 <- lm(y ~ x)` above); `$coefficients[[2]]` extracts the slope on `x` from it.
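For example, with the `m1` object from above:

```r
coef(m1)               # named vector with the intercept and the slope on x
m1$coefficients[[2]]   # just the slope, as used inside map_dbl() above
```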
```r
ggplot(B, aes(x = beta_x)) +
  geom_density() +
  geom_vline(xintercept = mean(B$beta_x))
```
```r
# case 1: omitted variable bias (v affects both x and y)
v <- rnorm(1000)
x <- v + rnorm(1000)
y <- 1.5 * x + v + rnorm(1000)

# case 2: no bias (v affects y but is unrelated to x)
v <- rnorm(1000)
x <- rnorm(1000)
y <- 1.5 * x + v + rnorm(1000)

# case 3: no bias (v affects x but not y)
v <- rnorm(1000)
x <- v + rnorm(1000)
y <- 1.5 * x + rnorm(1000)
```
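To confirm the three claims, here is a small sketch that re-estimates `lm(y ~ x)` under each case; the helper `sim_case()` is mine, not from the original:

```r
# simulate one dataset and return the slope on x from the short regression
sim_case <- function(x_on_v, v_on_y) {
  v <- rnorm(1000)
  x <- x_on_v * v + rnorm(1000)
  y <- 1.5 * x + v_on_y * v + rnorm(1000)
  coef(lm(y ~ x))[["x"]]
}

c(
  case1 = sim_case(x_on_v = 1, v_on_y = 1),  # biased: roughly 2.0
  case2 = sim_case(x_on_v = 0, v_on_y = 1),  # unbiased: roughly 1.5
  case3 = sim_case(x_on_v = 1, v_on_y = 0)   # unbiased: roughly 1.5
)
```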