Section 7.1: Estimation
Section 7.1.1: Unbiasedness and Consistency
## simulation parameters
n <- 100 # sample size
mu0 <- 0 # mean of Y_i(0)
sd0 <- 1 # standard deviation of Y_i(0)
mu1 <- 1 # mean of Y_i(1)
sd1 <- 1 # standard deviation of Y_i(1)
## generate a sample
Y0 <- rnorm(n, mean = mu0, sd = sd0)
Y1 <- rnorm(n, mean = mu1, sd = sd1)
tau <- Y1 - Y0 # individual treatment effect
## true value of the sample average treatment effect
SATE <- mean(tau)
SATE
## [1] 0.7910608
## repeatedly conduct randomized controlled trials
sims <- 5000 # repeat 5,000 times, we could do more
diff.means <- rep(NA, sims) # container
for (i in 1:sims) {
## randomize the treatment by sampling of a vector of 0's and 1's
treat <- sample(c(rep(1, n / 2), rep(0, n / 2)), size = n, replace = FALSE)
## difference-in-means
diff.means[i] <- mean(Y1[treat == 1]) - mean(Y0[treat == 0])
}
## estimation error for SATE
est.error <- diff.means - SATE
summary(est.error)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.542364 -0.104843 0.005036 0.002633 0.114995 0.569677
## PATE simulation
PATE <- mu1 - mu0
diff.means <- rep(NA, sims)
for (i in 1:sims) {
## generate a sample for each simulation: this used to be outside of loop
Y0 <- rnorm(n, mean = mu0, sd = sd0)
Y1 <- rnorm(n, mean = mu1, sd = sd1)
treat <- sample(c(rep(1, n / 2), rep(0, n / 2)), size = n, replace = FALSE)
diff.means[i] <- mean(Y1[treat == 1]) - mean(Y0[treat == 0])
}
## estimation error for PATE
est.error <- diff.means - PATE
## unbiased
summary(est.error)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.6947626 -0.1348674 -0.0024725 0.0000347 0.1324152 0.7546195
Section 7.1.2: Standard Error
par(cex = 1.5)
hist(diff.means, freq = FALSE, xlab = "Difference-in-means estimator",
main = "Sampling distribution")
abline(v = SATE, col = "red") # true value of SATE
text(0.6, 2.4, "true SATE", col = "red")
sd(diff.means)
## [1] 0.1973403
sqrt(mean((diff.means - SATE)^2))
## [1] 0.2874117
## PATE simulation with standard error
sims <- 5000
diff.means <- se <- rep(NA, sims) # container for standard error added
for (i in 1:sims) {
## generate a sample
Y0 <- rnorm(n, mean = mu0, sd = sd0)
Y1 <- rnorm(n, mean = mu1, sd = sd1)
## randomize the treatment by sampling of a vector of 0's and 1's
treat <- sample(c(rep(1, n / 2), rep(0, n / 2)), size = n, replace = FALSE)
diff.means[i] <- mean(Y1[treat == 1]) - mean(Y0[treat == 0])
## standard error
se[i] <- sqrt(var(Y1[treat == 1]) / (n / 2) + var(Y0[treat == 0]) / (n / 2))
}
## standard deviation of difference-in-means
sd(diff.means)
## [1] 0.2006359
## mean of standard errors
mean(se)
## [1] 0.199848
Section 7.1.3: Confidence Intervals
n <- 1000 # sample size
x.bar <- 0.6 # point estimate
s.e. <- sqrt(x.bar * (1 - x.bar) / n) # standard error
## 99% confidence intervals
c(x.bar - qnorm(0.995) * s.e., x.bar + qnorm(0.995) * s.e.)
## [1] 0.5600954 0.6399046
## 95% confidence intervals
c(x.bar - qnorm(0.975) * s.e., x.bar + qnorm(0.975) * s.e.)
## [1] 0.5696364 0.6303636
## 90% confidence intervals
c(x.bar - qnorm(0.95) * s.e., x.bar + qnorm(0.95) * s.e.)
## [1] 0.574518 0.625482
## empty container matrices for 2 sets of confidence intervals
ci95 <- ci90 <- matrix(NA, ncol = 2, nrow = sims)
## 95 percent confidence intervals
ci95[, 1] <- diff.means - qnorm(0.975) * se # lower limit
ci95[, 2] <- diff.means + qnorm(0.975) * se # upper limit
## 90 percent confidence intervals
ci90[, 1] <- diff.means - qnorm(0.95) * se # lower limit
ci90[, 2] <- diff.means + qnorm(0.95) * se # upper limit
## coverage rate for 95% confidence interval
mean(ci95[, 1] <= 1 & ci95[, 2] >= 1)
## [1] 0.9452
## coverage rate for 90% confidence interval
mean(ci90[, 1] <= 1 & ci90[, 2] >= 1)
## [1] 0.895
p <- 0.6 # true parameter value
n <- c(50, 100, 1000) # 3 sample sizes to be examined
alpha <- 0.05
sims <- 5000 # number of simulations
results <- rep(NA, length(n)) # a container for results
## loop for different sample sizes
for (i in 1:length(n)) {
ci.results <- rep(NA, sims) # a container for whether CI includes truth
## loop for repeated hypothetical survey sampling
for (j in 1:sims) {
data <- rbinom(n[i], size = 1, prob = p) # simple random sampling
x.bar <- mean(data) # sample proportion as an estimate
s.e. <- sqrt(x.bar * (1 - x.bar) / n[i]) # standard errors
ci.lower <- x.bar - qnorm(1 - alpha/2) * s.e.
ci.upper <- x.bar + qnorm(1 - alpha/2) * s.e.
ci.results[j] <- (p >= ci.lower) & (p <= ci.upper)
}
## proportion of CIs that contain the true value
results[i] <- mean(ci.results)
}
results
## [1] 0.9398 0.9516 0.9464
Scetion 7.1.4: Margin of Error and Sample Size Calculation in Polls
par(cex = 1.5, lwd = 2)
MoE <- c(0.01, 0.03, 0.05) # the desired margin of error
p <- seq(from = 0.01, to = 0.99, by = 0.01)
n <- 1.96^2 * p * (1 - p) / MoE[1]^2
plot(p, n, ylim = c(-1000, 11000), xlab = "Population proportion",
ylab = "Sample size", type = "l")
lines(p, 1.96^2 * p * (1 - p) / MoE[2]^2, lty = "dashed")
lines(p, 1.96^2 * p * (1 - p) / MoE[3]^2, lty = "dotted")
text(0.4, 10000, "margin of error = 0.01")
text(0.4, 1800, "margin of error = 0.03")
text(0.6, -200, "margin of error = 0.05")
## election and polling results, by state
data("pres08", package = "qss")
data("polls08", package = "qss")
## convert to a Date object
polls08$middate <- as.Date(polls08$middate)
## number of days to the election day
polls08$DaysToElection <- as.Date("2008-11-04") - polls08$middate
## Warning in strptime(xx, f <- "%Y-%m-%d", tz = "GMT"): unknown timezone
## 'zone/tz/2017c.1.0/zoneinfo/America/New_York'
## create a matrix place holder
poll.pred <- matrix(NA, nrow = 51, ncol = 3)
## state names which the loop will iterate through
st.names <- unique(pres08$state)
## add labels for easy interpretation later on
row.names(poll.pred) <- as.character(st.names)
## loop across 50 states plus DC
for (i in 1:51){
## subset the ith state
state.data <- subset(polls08, subset = (state == st.names[i]))
## subset the latest polls within the state
latest <- state.data$DaysToElection == min(state.data$DaysToElection)
## compute the mean of latest polls and store it
poll.pred[i, 1] <- mean(state.data$Obama[latest]) / 100
}
## upper and lower confidence limits
n <- 1000 # sample size
alpha <- 0.05
s.e. <- sqrt(poll.pred[, 1] * (1 - poll.pred[, 1]) / n) # standard error
poll.pred[, 2] <- poll.pred[, 1] - qnorm(1 - alpha/2) * s.e.
poll.pred[, 3] <- poll.pred[, 1] + qnorm(1 - alpha/2) * s.e.
par(cex = 1.5)
alpha <- 0.05
plot(pres08$Obama / 100, poll.pred[, 1], xlim = c(0, 1), ylim = c(0, 1),
xlab = "Obama's vote share", ylab = "Poll prediction")
abline(0, 1)
## adding 95% confidence intervals for each state
for (i in 1:51) {
lines(rep(pres08$Obama[i] / 100, 2), c(poll.pred[i, 2], poll.pred[i, 3]))
}
## proportion of confidence intervals that contain the election day outcome
mean((poll.pred[, 2] <= pres08$Obama / 100) &
(poll.pred[, 3] >= pres08$Obama / 100))
## [1] 0.5882353
## bias
bias <- mean(poll.pred[, 1] - pres08$Obama / 100)
bias
## [1] -0.02679739
## bias corrected estimate
poll.bias <- poll.pred[, 1] - bias
## bias-corrected standard error
s.e.bias <- sqrt(poll.bias * (1 - poll.bias) / n)
## bias-corrected 95% confidence interval
ci.bias.lower <- poll.bias - qnorm(1 - alpha / 2) * s.e.bias
ci.bias.upper <- poll.bias + qnorm(1 - alpha / 2) * s.e.bias
## proportion of bias-corrected CIs that contain the election day outcome
mean((ci.bias.lower <= pres08$Obama / 100) &
(ci.bias.upper >= pres08$Obama / 100))
## [1] 0.7647059
Section 7.1.5: Analysis of Randomized Controlled Trials
par(cex = 1.5)
## read in data
data("STAR", package = "qss")
hist(STAR$g4reading[STAR$classtype == 1], freq = FALSE, xlim = c(500, 900),
ylim = c(0, 0.01), main = "Small class",
xlab = "Fourth-grade reading test score")
abline(v = mean(STAR$g4reading[STAR$classtype == 1], na.rm = TRUE),
col = "red")
hist(STAR$g4reading[STAR$classtype == 2], freq = FALSE, xlim = c(500, 900),
ylim = c(0, 0.01), main = "Regular class",
xlab = "Fourth-grade reading test score")
abline(v = mean(STAR$g4reading[STAR$classtype == 2], na.rm = TRUE),
col = "red")
## estimate and standard error for small class
n.small <- sum(STAR$classtype == 1 & !is.na(STAR$g4reading))
est.small <- mean(STAR$g4reading[STAR$classtype == 1], na.rm = TRUE)
se.small <- sd(STAR$g4reading[STAR$classtype == 1], na.rm = TRUE) /
sqrt(n.small)
est.small
## [1] 723.3912
se.small
## [1] 1.913012
## estimate and standard error for regular class
n.regular <- sum(STAR$classtype == 2 & !is.na(STAR$classtype) &
!is.na(STAR$g4reading))
est.regular <- mean(STAR$g4reading[STAR$classtype == 2], na.rm = TRUE)
se.regular <- sd(STAR$g4reading[STAR$classtype == 2], na.rm = TRUE) /
sqrt(n.regular)
est.regular
## [1] 719.89
se.regular
## [1] 1.83885
alpha <- 0.05
## 95% confidence intervals for small class
ci.small <- c(est.small - qnorm(1 - alpha / 2) * se.small,
est.small + qnorm(1 - alpha / 2) * se.small)
ci.small
## [1] 719.6417 727.1406
## 95% confidence intervals for regular class
ci.regular <- c(est.regular - qnorm(1 - alpha / 2) * se.regular,
est.regular + qnorm(1 - alpha / 2) * se.regular)
ci.regular
## [1] 716.2859 723.4940
## difference-in-means estimator
ate.est <- est.small - est.regular
ate.est
## [1] 3.501232
## standard error and 95% confidence interval
ate.se <- sqrt(se.small^2 + se.regular^2)
ate.se
## [1] 2.653485
ate.ci <- c(ate.est - qnorm(1 - alpha / 2) * ate.se,
ate.est + qnorm(1 - alpha / 2) * ate.se)
ate.ci
## [1] -1.699503 8.701968
Section 7.1.6: Analysis Based on Student’s t-Distribution
## 95% CI for small class
c(est.small - qt(0.975, df = n.small - 1) * se.small,
est.small + qt(0.975, df = n.small - 1) * se.small)
## [1] 719.6355 727.1469
## 95% CI based on the central limit theorem
ci.small
## [1] 719.6417 727.1406
## 95% CI for regular class
c(est.regular - qt(0.975, df = n.regular - 1) * se.regular,
est.regular + qt(0.975, df = n.regular - 1) * se.regular)
## [1] 716.2806 723.4993
## 95% CI based on the central limit theorem
ci.regular
## [1] 716.2859 723.4940
t.ci <- t.test(STAR$g4reading[STAR$classtype == 1],
STAR$g4reading[STAR$classtype == 2])
t.ci
##
## Welch Two Sample t-test
##
## data: STAR$g4reading[STAR$classtype == 1] and STAR$g4reading[STAR$classtype == 2]
## t = 1.3195, df = 1541.2, p-value = 0.1872
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.703591 8.706055
## sample estimates:
## mean of x mean of y
## 723.3912 719.8900
Section 7.2: Hypothesis Testing
Section 7.2.1: Tea-Testing Experiment
## truth: enumerate the number of assignment combinations
true <- c(choose(4, 0) * choose(4, 4),
choose(4, 1) * choose(4, 3),
choose(4, 2) * choose(4, 2),
choose(4, 3) * choose(4, 1),
choose(4, 4) * choose(4, 0))
true
## [1] 1 16 36 16 1
## compute probability: divide it by the total number of events
true <- true / sum(true)
## number of correctly classified cups as labels
names(true) <- c(0, 2, 4, 6, 8)
true
## 0 2 4 6 8
## 0.01428571 0.22857143 0.51428571 0.22857143 0.01428571
## simulations
sims <- 1000
## lady's guess: M stands for `Milk first,' T stands for `Tea first'
guess <- c("M", "T", "T", "M", "M", "T", "T", "M")
correct <- rep(NA, sims) # place holder for number of correct guesses
for (i in 1:sims) {
## randomize which cups get Milk/Tea first
cups <- sample(c(rep("T", 4), rep("M", 4)), replace = FALSE)
correct[i] <- sum(guess == cups) # number of correct guesses
}
## estimated probability for each number of correct guesses
prop.table(table(correct))
## correct
## 0 2 4 6 8
## 0.017 0.227 0.506 0.238 0.012
## comparison with analytical answers; the differences are small
prop.table(table(correct)) - true
## correct
## 0 2 4 6 8
## 0.002714286 -0.001571429 -0.008285714 0.009428571 -0.002285714
Section 7.2.2: The General Framework
## all correct
x <- matrix(c(4, 0, 0, 4), byrow = TRUE, ncol = 2, nrow = 2)
## six correct
y <- matrix(c(3, 1, 1, 3), byrow = TRUE, ncol = 2, nrow = 2)
## `M' milk first, `T' tea first
rownames(x) <- colnames(x) <- rownames(y) <- colnames(y) <- c("M", "T")
x
## M T
## M 4 0
## T 0 4
y
## M T
## M 3 1
## T 1 3
## one-sided test for 8 correct guesses
fisher.test(x, alternative = "greater")
##
## Fisher's Exact Test for Count Data
##
## data: x
## p-value = 0.01429
## alternative hypothesis: true odds ratio is greater than 1
## 95 percent confidence interval:
## 2.003768 Inf
## sample estimates:
## odds ratio
## Inf
## two-sided test for 6 correct guesses
fisher.test(y)
##
## Fisher's Exact Test for Count Data
##
## data: y
## p-value = 0.4857
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.2117329 621.9337505
## sample estimates:
## odds ratio
## 6.408309
Section 7.2.3: One-Sample Tests
n <- 1018
x.bar <- 550 / n
se <- sqrt(0.5 * 0.5 / n) # standard deviation of sampling distribution
## upper red area in the figure
upper <- pnorm(x.bar, mean = 0.5, sd = se, lower.tail = FALSE)
## lower red area in the figure; identical to the upper area
lower <- pnorm(0.5 - (x.bar - 0.5), mean = 0.5, sd = se)
## two-side p-value
upper + lower
## [1] 0.01016866
2 * upper
## [1] 0.01016866
## one-sided p-value
upper
## [1] 0.005084332
z.score <- (x.bar - 0.5) / se
z.score
## [1] 2.57004
pnorm(z.score, lower.tail = FALSE) # one-sided p-value
## [1] 0.005084332
2 * pnorm(z.score, lower.tail = FALSE) # two-sided p-value
## [1] 0.01016866
## 99% confidence interval contains 0.5
c(x.bar - qnorm(0.995) * se, x.bar + qnorm(0.995) * se)
## [1] 0.4999093 0.5806408
## 95% confidence interval does not contain 0.5
c(x.bar - qnorm(0.975) * se, x.bar + qnorm(0.975) * se)
## [1] 0.5095605 0.5709896
## no continuity correction to get the same p-value as above
prop.test(550, n = n, p = 0.5, correct = FALSE)
##
## 1-sample proportions test without continuity correction
##
## data: 550 out of n, null probability 0.5
## X-squared = 6.6051, df = 1, p-value = 0.01017
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.5095661 0.5706812
## sample estimates:
## p
## 0.540275
## with continuity correction
prop.test(550, n = n, p = 0.5)
##
## 1-sample proportions test with continuity correction
##
## data: 550 out of n, null probability 0.5
## X-squared = 6.445, df = 1, p-value = 0.01113
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.5090744 0.5711680
## sample estimates:
## p
## 0.540275
prop.test(550, n = n, p = 0.5, conf.level = 0.99)
##
## 1-sample proportions test with continuity correction
##
## data: 550 out of n, null probability 0.5
## X-squared = 6.445, df = 1, p-value = 0.01113
## alternative hypothesis: true p is not equal to 0.5
## 99 percent confidence interval:
## 0.4994182 0.5806040
## sample estimates:
## p
## 0.540275
## two-sided one-sample t-test
t.test(STAR$g4reading, mu = 710)
##
## One Sample t-test
##
## data: STAR$g4reading
## t = 10.407, df = 2352, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 710
## 95 percent confidence interval:
## 719.1284 723.3671
## sample estimates:
## mean of x
## 721.2478
Section 7.2.4: Two-Sample Tests
## one-sided p-value
pnorm(-abs(ate.est), mean = 0, sd = ate.se)
## [1] 0.09350361
## two-sided p-value
2 * pnorm(-abs(ate.est), mean = 0, sd = ate.se)
## [1] 0.1870072
## testing the null of zero average treatment effect
t.test(STAR$g4reading[STAR$classtype == 1],
STAR$g4reading[STAR$classtype == 2])
##
## Welch Two Sample t-test
##
## data: STAR$g4reading[STAR$classtype == 1] and STAR$g4reading[STAR$classtype == 2]
## t = 1.3195, df = 1541.2, p-value = 0.1872
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.703591 8.706055
## sample estimates:
## mean of x mean of y
## 723.3912 719.8900
data("resume", package = "qss")
## organize the data in tables
x <- table(resume$race, resume$call)
x
##
## 0 1
## black 2278 157
## white 2200 235
## one-sided test
prop.test(x, alternative = "greater")
##
## 2-sample test for equality of proportions with continuity
## correction
##
## data: x
## X-squared = 16.449, df = 1, p-value = 2.499e-05
## alternative hypothesis: greater
## 95 percent confidence interval:
## 0.01881967 1.00000000
## sample estimates:
## prop 1 prop 2
## 0.9355236 0.9034908
## sample size
n0 <- sum(resume$race == "black")
n1 <- sum(resume$race == "white")
## sample proportions
p <- mean(resume$call) # overall
p0 <- mean(resume$call[resume$race == "black"]) # black
p1 <- mean(resume$call[resume$race == "white"]) # white
## point estimate
est <- p1 - p0
est
## [1] 0.03203285
## standard error
se <- sqrt(p * (1 - p) * (1 / n0 + 1 / n1))
se
## [1] 0.007796894
## z-statistic
zstat <- est / se
zstat
## [1] 4.108412
## one-sided p-value
pnorm(-abs(zstat))
## [1] 1.991943e-05
prop.test(x, alternative = "greater", correct = FALSE)
##
## 2-sample test for equality of proportions without continuity
## correction
##
## data: x
## X-squared = 16.879, df = 1, p-value = 1.992e-05
## alternative hypothesis: greater
## 95 percent confidence interval:
## 0.01923035 1.00000000
## sample estimates:
## prop 1 prop 2
## 0.9355236 0.9034908
Section 7.2.5: Pitfalls of Hypothesis Testing
Section 7.2.6: Power Analysis
## set the parameters
n <- 250
p.star <- 0.48 # data generating process
p <- 0.5 # null value
alpha <- 0.05
## critical value
cr.value <- qnorm(1 - alpha / 2)
## standard errors under the hypothetical data generating process
se.star <- sqrt(p.star * (1 - p.star) / n)
## standard error under the null
se <- sqrt(p * (1 - p) / n)
## power
pnorm(p - cr.value * se, mean = p.star, sd = se.star) +
pnorm(p + cr.value * se, mean = p.star, sd = se.star, lower.tail = FALSE)
## [1] 0.09673114
## parameters
n1 <- 500
n0 <- 500
p1.star <- 0.05
p0.star <- 0.1
## overall call back rate as a weighted average
p <- (n1 * p1.star + n0 * p0.star) / (n1 + n0)
## standard error under the null
se <- sqrt(p * (1 - p) * (1 / n1 + 1 / n0))
## standard error under the hypothetical data generating process
se.star <- sqrt(p1.star * (1 - p1.star) / n1 + p0.star * (1 - p0.star) / n0)
pnorm(-cr.value * se, mean = p1.star - p0.star, sd = se.star) +
pnorm(cr.value * se, mean = p1.star - p0.star, sd = se.star,
lower.tail = FALSE)
## [1] 0.85228
power.prop.test(n = 500, p1 = 0.05, p2 = 0.1, sig.level = 0.05)
##
## Two-sample comparison of proportions power calculation
##
## n = 500
## p1 = 0.05
## p2 = 0.1
## sig.level = 0.05
## power = 0.8522797
## alternative = two.sided
##
## NOTE: n is number in *each* group
power.prop.test(p1 = 0.05, p2 = 0.1, sig.level = 0.05, power = 0.9)
##
## Two-sample comparison of proportions power calculation
##
## n = 581.0821
## p1 = 0.05
## p2 = 0.1
## sig.level = 0.05
## power = 0.9
## alternative = two.sided
##
## NOTE: n is number in *each* group
power.t.test(n = 100, delta = 0.25, sd = 1, type = "one.sample")
##
## One-sample t test power calculation
##
## n = 100
## delta = 0.25
## sd = 1
## sig.level = 0.05
## power = 0.6969757
## alternative = two.sided
power.t.test(power = 0.9, delta = 0.25, sd = 1, type = "one.sample")
##
## One-sample t test power calculation
##
## n = 170.0511
## delta = 0.25
## sd = 1
## sig.level = 0.05
## power = 0.9
## alternative = two.sided
power.t.test(delta = 0.25, sd = 1, type = "two.sample",
alternative = "one.sided", power = 0.9)
##
## Two-sample t test power calculation
##
## n = 274.7222
## delta = 0.25
## sd = 1
## sig.level = 0.05
## power = 0.9
## alternative = one.sided
##
## NOTE: n is number in *each* group
Section 7.3: Linear Regression Model with Uncertainty
Section 7.3.1: Linear Regression as a Generative Model
data("minwage", package = "qss")
## compute proportion of full employment before minimum wage increase
minwage$fullPropBefore <- minwage$fullBefore /
(minwage$fullBefore + minwage$partBefore)
## same thing after minimum wage increase
minwage$fullPropAfter <- minwage$fullAfter /
(minwage$fullAfter + minwage$partAfter)
## an indicator for NJ: 1 if it's located in NJ and 0 if in PA
minwage$NJ <- ifelse(minwage$location == "PA", 0, 1)
fit.minwage <- lm(fullPropAfter ~ -1 + NJ + fullPropBefore +
wageBefore + chain, data = minwage)
## regression result
fit.minwage
##
## Call:
## lm(formula = fullPropAfter ~ -1 + NJ + fullPropBefore + wageBefore +
## chain, data = minwage)
##
## Coefficients:
## NJ fullPropBefore wageBefore chainburgerking
## 0.05422 0.16879 0.08133 -0.11563
## chainkfc chainroys chainwendys
## -0.15080 -0.20639 -0.22013
fit.minwage1 <- lm(fullPropAfter ~ NJ + fullPropBefore +
wageBefore + chain, data = minwage)
fit.minwage1
##
## Call:
## lm(formula = fullPropAfter ~ NJ + fullPropBefore + wageBefore +
## chain, data = minwage)
##
## Coefficients:
## (Intercept) NJ fullPropBefore wageBefore
## -0.11563 0.05422 0.16879 0.08133
## chainkfc chainroys chainwendys
## -0.03517 -0.09076 -0.10451
predict(fit.minwage, newdata = minwage[1, ])
## 1
## 0.2709367
predict(fit.minwage1, newdata = minwage[1, ])
## 1
## 0.2709367
Section 7.3.2: Unbiasedness of Estimated Coefficients
Section 7.3.3: Standard Errors of Estimated Coefficients
Section 7.3.4: Inference About Coefficients
data(women, package = "qss")
fit.women <- lm(water ~ reserved, data = women)
summary(fit.women)
##
## Call:
## lm(formula = water ~ reserved, data = women)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.991 -14.738 -7.865 2.262 316.009
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.738 2.286 6.446 4.22e-10 ***
## reserved 9.252 3.948 2.344 0.0197 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 33.45 on 320 degrees of freedom
## Multiple R-squared: 0.01688, Adjusted R-squared: 0.0138
## F-statistic: 5.493 on 1 and 320 DF, p-value: 0.0197
confint(fit.women) # 95% confidence intervals
## 2.5 % 97.5 %
## (Intercept) 10.240240 19.23640
## reserved 1.485608 17.01924
summary(fit.minwage)
##
## Call:
## lm(formula = fullPropAfter ~ -1 + NJ + fullPropBefore + wageBefore +
## chain, data = minwage)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.48617 -0.18135 -0.02809 0.15127 0.75091
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## NJ 0.05422 0.03321 1.633 0.10343
## fullPropBefore 0.16879 0.05662 2.981 0.00307 **
## wageBefore 0.08133 0.03892 2.090 0.03737 *
## chainburgerking -0.11563 0.17888 -0.646 0.51844
## chainkfc -0.15080 0.18310 -0.824 0.41074
## chainroys -0.20639 0.18671 -1.105 0.26974
## chainwendys -0.22013 0.18840 -1.168 0.24343
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2438 on 351 degrees of freedom
## Multiple R-squared: 0.6349, Adjusted R-squared: 0.6277
## F-statistic: 87.21 on 7 and 351 DF, p-value: < 2.2e-16
## confidence interval just for the `NJ' variable
confint(fit.minwage)["NJ", ]
## 2.5 % 97.5 %
## -0.01109295 0.11953297
Section 7.3.5: Inference About Predictions
## load the data and subset them into two parties
data("MPs", package = "qss")
MPs.labour <- subset(MPs, subset = (party == "labour"))
MPs.tory <- subset(MPs, subset = (party == "tory"))
## two regressions for labour: negative and positive margin
labour.fit1 <- lm(ln.net ~ margin,
data = MPs.labour[MPs.labour$margin < 0, ])
labour.fit2 <- lm(ln.net ~ margin,
data = MPs.labour[MPs.labour$margin > 0, ])
## two regressions for tory: negative and positive margin
tory.fit1 <- lm(ln.net ~ margin, data = MPs.tory[MPs.tory$margin < 0, ])
tory.fit2 <- lm(ln.net ~ margin, data = MPs.tory[MPs.tory$margin > 0, ])
## tory party: prediction at the threshold
tory.y0 <- predict(tory.fit1, interval = "confidence",
newdata = data.frame(margin = 0))
tory.y0
## fit lwr upr
## 1 12.53812 12.11402 12.96221
tory.y1 <- predict(tory.fit2, interval = "confidence",
newdata = data.frame(margin = 0))
tory.y1
## fit lwr upr
## 1 13.1878 12.80691 13.56869
## range of predictors; min to 0 and 0 to max
y1.range <- seq(from = 0, to = min(MPs.tory$margin), by = -0.01)
y2.range <- seq(from = 0, to = max(MPs.tory$margin), by = 0.01)
## prediction using all the values
tory.y0 <- predict(tory.fit1, interval = "confidence",
newdata = data.frame(margin = y1.range))
tory.y1 <- predict(tory.fit2, interval = "confidence",
newdata = data.frame(margin = y2.range))
par(cex = 1.5)
## plotting the first regression with losers
plot(y1.range, tory.y0[, "fit"], type = "l", xlim = c(-0.5, 0.5),
ylim = c(10, 15), xlab = "Margin of victory", ylab = "log net wealth")
abline(v = 0, lty = "dotted")
lines(y1.range, tory.y0[, "lwr"], lty = "dashed") # lower CI
lines(y1.range, tory.y0[, "upr"], lty = "dashed") # upper CI
## plotting the second regression with winners
lines(y2.range, tory.y1[, "fit"], lty = "solid") # point estimates
lines(y2.range, tory.y1[, "lwr"], lty = "dashed") # lower CI
lines(y2.range, tory.y1[, "upr"], lty = "dashed") # upper CI
## re-compute the predicted value and return standard errors
tory.y0 <- predict(tory.fit1, interval = "confidence", se.fit = TRUE,
newdata = data.frame(margin = 0))
tory.y0
## $fit
## fit lwr upr
## 1 12.53812 12.11402 12.96221
##
## $se.fit
## [1] 0.2141793
##
## $df
## [1] 119
##
## $residual.scale
## [1] 1.434283
tory.y1 <- predict(tory.fit2, interval = "confidence", se.fit = TRUE,
newdata = data.frame(margin = 0))
## s.e. of the intercept is the same as s.e. of the predicted value
summary(tory.fit1)
##
## Call:
## lm(formula = ln.net ~ margin, data = MPs.tory[MPs.tory$margin <
## 0, ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.3195 -0.4721 -0.0349 0.6629 3.5798
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.5381 0.2142 58.540 <2e-16 ***
## margin 1.4911 1.2914 1.155 0.251
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.434 on 119 degrees of freedom
## Multiple R-squared: 0.01108, Adjusted R-squared: 0.002769
## F-statistic: 1.333 on 1 and 119 DF, p-value: 0.2506
## standard error
se.diff <- sqrt(tory.y0$se.fit^2 + tory.y1$se.fit^2)
se.diff
## [1] 0.2876281
## point estimate
diff.est <- tory.y1$fit[1, "fit"] - tory.y0$fit[1, "fit"]
diff.est
## [1] 0.6496861
## confidence interval
CI <- c(diff.est - se.diff * qnorm(0.975), diff.est + se.diff * qnorm(0.975))
CI
## [1] 0.0859455 1.2134268
## hypothesis test
z.score <- diff.est / se.diff
p.value <- 2 * pnorm(abs(z.score), lower.tail = FALSE) # two-sided p-value
p.value
## [1] 0.02389759