Chapter 2: Causality

Section 2.1: Racial Discrimination in the Labor Market

data("resume", package = "qss")

dim(resume)
## [1] 4870    4
head(resume)
##   firstname    sex  race call
## 1   Allison female white    0
## 2   Kristen female white    0
## 3   Lakisha female black    0
## 4   Latonya female black    0
## 5    Carrie female white    0
## 6       Jay   male white    0
summary(resume)
##   firstname             sex                race          
##  Length:4870        Length:4870        Length:4870       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##       call        
##  Min.   :0.00000  
##  1st Qu.:0.00000  
##  Median :0.00000  
##  Mean   :0.08049  
##  3rd Qu.:0.00000  
##  Max.   :1.00000
race.call.tab <- table(race = resume$race, call = resume$call)
race.call.tab
##        call
## race       0    1
##   black 2278  157
##   white 2200  235
addmargins(race.call.tab)
##        call
## race       0    1  Sum
##   black 2278  157 2435
##   white 2200  235 2435
##   Sum   4478  392 4870
## overall callback rate: total callbacks divided by the sample size
sum(race.call.tab[, 2]) / nrow(resume)
## [1] 0.08049281
## callback rates for each race
race.call.tab[1, 2] / sum(race.call.tab[1, ]) # black
## [1] 0.06447639
race.call.tab[2, 2] / sum(race.call.tab[2, ]) # white
## [1] 0.09650924
race.call.tab[1, ]  # the first row
##    0    1 
## 2278  157
race.call.tab[, 2]  # the second column
## black white 
##   157   235
mean(resume$call)
## [1] 0.08049281

Section 2.2: Subsetting the Data in R

class(TRUE)
## [1] "logical"
as.integer(TRUE)
## [1] 1
as.integer(FALSE)
## [1] 0
x <- c(TRUE, FALSE, TRUE) # a vector with logical values

mean(x) # proportion of TRUEs
## [1] 0.6666667
sum(x) # number of TRUEs
## [1] 2
FALSE & TRUE
## [1] FALSE
TRUE & TRUE
## [1] TRUE
TRUE | FALSE
## [1] TRUE
FALSE | FALSE
## [1] FALSE
TRUE & FALSE & TRUE
## [1] FALSE
(TRUE | FALSE) & FALSE # the parentheses evaluate to TRUE
## [1] FALSE
TRUE | (FALSE & FALSE) # the parentheses evaluate to FALSE
## [1] TRUE
TF1 <- c(TRUE, FALSE, FALSE)
TF2 <- c(TRUE, FALSE, TRUE)
TF1 | TF2
## [1]  TRUE FALSE  TRUE
TF1 & TF2
## [1]  TRUE FALSE FALSE

Section 2.2.2: Relational Operators

4 > 3
## [1] TRUE
"Hello" == "hello"  # R is case-sensitive
## [1] FALSE
"Hello" != "hello"
## [1] TRUE
x <- c(3, 2, 1, -2, -1)
x >= 2
## [1]  TRUE  TRUE FALSE FALSE FALSE
x != 1
## [1]  TRUE  TRUE FALSE  TRUE  TRUE
## logical conjunction of two vectors with logical values
(x > 0) & (x <= 2)
## [1] FALSE  TRUE  TRUE FALSE FALSE
## logical disjunction of two vectors with logical values
(x > 2) | (x <= -1)
## [1]  TRUE FALSE FALSE  TRUE  TRUE
x.int <- (x > 0) & (x <= 2) # logical vector
x.int
## [1] FALSE  TRUE  TRUE FALSE FALSE
mean(x.int) # proportion of TRUEs
## [1] 0.4
sum(x.int)  # number of TRUEs
## [1] 2

Section 2.2.3: Subsetting

## callback rate for black-sounding names
mean(resume$call[resume$race == "black"])
## [1] 0.06447639
## race of first 5 observations
resume$race[1:5]
## [1] "white" "white" "black" "black" "white"
## comparison of first 5 observations
(resume$race == "black")[1:5]
## [1] FALSE FALSE  TRUE  TRUE FALSE
dim(resume) # dimension of original data frame
## [1] 4870    4
## subset blacks only
resumeB <- resume[resume$race == "black", ]
dim(resumeB) # this data.frame has fewer rows than the original data.frame
## [1] 2435    4
mean(resumeB$call) # callback rate for blacks
## [1] 0.06447639
## keep "call" and "firstname" variables
## also keep observations with black female-sounding names
resumeBf <- subset(resume, select = c("call", "firstname"),
                   subset = (race == "black" & sex == "female"))
head(resumeBf)
##    call firstname
## 3     0   Lakisha
## 4     0   Latonya
## 8     0     Kenya
## 9     0   Latonya
## 11    0     Aisha
## 13    0     Aisha
## ## an alternative syntax with the same results
## resumeBf <- resume[resume$race == "black" & resume$sex == "female",
##                    c("call", "firstname")]
## black male
resumeBm <- subset(resume, subset = (race == "black") & (sex == "male"))
## white female
resumeWf <- subset(resume, subset = (race == "white") & (sex == "female"))
## white male
resumeWm <- subset(resume, subset = (race == "white") & (sex == "male"))
## racial gaps
mean(resumeWf$call) - mean(resumeBf$call) # among females
## [1] 0.03264689
mean(resumeWm$call) - mean(resumeBm$call) # among males
## [1] 0.03040786

Section 2.2.4: Simple Conditional Statements

resume$BlackFemale <- ifelse(resume$race == "black" &
                                 resume$sex == "female", 1, 0)
table(race = resume$race, sex = resume$sex,
      BlackFemale = resume$BlackFemale)
## , , BlackFemale = 0
## 
##        sex
## race    female male
##   black      0  549
##   white   1860  575
## 
## , , BlackFemale = 1
## 
##        sex
## race    female male
##   black   1886    0
##   white      0    0

Section 2.2.5: Factor Variables

resume$type <- NA
resume$type[resume$race == "black" & resume$sex == "female"] <- "BlackFemale"
resume$type[resume$race == "black" & resume$sex == "male"] <- "BlackMale"
resume$type[resume$race == "white" & resume$sex == "female"] <- "WhiteFemale"
resume$type[resume$race == "white" & resume$sex == "male"] <- "WhiteMale"

## check object class
class(resume$type)
## [1] "character"
## coerce new character variable into a factor variable
resume$type <- as.factor(resume$type)
## list all levels of a factor variable
levels(resume$type)
## [1] "BlackFemale" "BlackMale"   "WhiteFemale" "WhiteMale"
## obtain the number of observations for each level
table(resume$type)
## 
## BlackFemale   BlackMale WhiteFemale   WhiteMale 
##        1886         549        1860         575
tapply(resume$call, resume$type, mean)
## BlackFemale   BlackMale WhiteFemale   WhiteMale 
##  0.06627784  0.05828780  0.09892473  0.08869565
## turn first name into a factor variable
resume$firstname <- as.factor(resume$firstname)
## compute callback rate for each first name
callback.name <- tapply(resume$call, resume$firstname, mean)
## sort the result in the increasing order
sort(callback.name)
##      Aisha    Rasheed     Keisha   Tremayne     Kareem    Darnell 
## 0.02222222 0.02985075 0.03825137 0.04347826 0.04687500 0.04761905 
##     Tyrone      Hakim     Tamika    Lakisha    Tanisha       Todd 
## 0.05333333 0.05454545 0.05468750 0.05500000 0.05797101 0.05882353 
##      Jamal       Neil      Brett   Geoffrey    Brendan       Greg 
## 0.06557377 0.06578947 0.06779661 0.06779661 0.07692308 0.07843137 
##      Emily       Anne       Jill     Latoya      Kenya    Matthew 
## 0.07929515 0.08264463 0.08374384 0.08407080 0.08673469 0.08955224 
##    Latonya      Leroy    Allison      Ebony   Jermaine     Laurie 
## 0.09130435 0.09375000 0.09482759 0.09615385 0.09615385 0.09743590 
##      Sarah   Meredith     Carrie    Kristen        Jay       Brad 
## 0.09844560 0.10160428 0.13095238 0.13145540 0.13432836 0.15873016

Section 2.3: Causal Effects and the Counterfactual

resume[1, ]
##   firstname    sex  race call BlackFemale        type
## 1   Allison female white    0           0 WhiteFemale

Section 2.4: Randomized Controlled Trials

Section 2.4.1: The Role of Randomization

Section 2.4.2: Social Pressure and Voter Turnout

data("social", package = "qss") # load the data

summary(social) # summarize the data
##      sex             yearofbirth    primary2004       messages        
##  Length:305866      Min.   :1900   Min.   :0.0000   Length:305866     
##  Class :character   1st Qu.:1947   1st Qu.:0.0000   Class :character  
##  Mode  :character   Median :1956   Median :0.0000   Mode  :character  
##                     Mean   :1956   Mean   :0.4014                     
##                     3rd Qu.:1965   3rd Qu.:1.0000                     
##                     Max.   :1986   Max.   :1.0000                     
##   primary2006         hhsize     
##  Min.   :0.0000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:2.000  
##  Median :0.0000   Median :2.000  
##  Mean   :0.3122   Mean   :2.184  
##  3rd Qu.:1.0000   3rd Qu.:2.000  
##  Max.   :1.0000   Max.   :8.000
## turnout for each group
tapply(social$primary2006, social$messages, mean)
## Civic Duty    Control  Hawthorne  Neighbors 
##  0.3145377  0.2966383  0.3223746  0.3779482
## turnout for control group
mean(social$primary2006[social$messages == "Control"])
## [1] 0.2966383
## subtract control group turnout from each group
tapply(social$primary2006, social$messages, mean) -
    mean(social$primary2006[social$messages == "Control"])
## Civic Duty    Control  Hawthorne  Neighbors 
## 0.01789934 0.00000000 0.02573631 0.08130991
social$age <- 2006 - social$yearofbirth # create age variable
tapply(social$age, social$messages, mean)
## Civic Duty    Control  Hawthorne  Neighbors 
##   49.65904   49.81355   49.70480   49.85294
tapply(social$primary2004, social$messages, mean)
## Civic Duty    Control  Hawthorne  Neighbors 
##  0.3994453  0.4003388  0.4032300  0.4066647
tapply(social$hhsize, social$messages, mean)
## Civic Duty    Control  Hawthorne  Neighbors 
##   2.189126   2.183667   2.180138   2.187770

Section 2.5: Observational Studies

Section 2.5.1: Minimum Wage and Unemployment

data("minwage", package = "qss") # load the data

dim(minwage) # dimension of data
## [1] 358   8
summary(minwage) # summary of data
##     chain             location           wageBefore      wageAfter    
##  Length:358         Length:358         Min.   :4.250   Min.   :4.250  
##  Class :character   Class :character   1st Qu.:4.250   1st Qu.:5.050  
##  Mode  :character   Mode  :character   Median :4.500   Median :5.050  
##                                        Mean   :4.618   Mean   :4.994  
##                                        3rd Qu.:4.987   3rd Qu.:5.050  
##                                        Max.   :5.750   Max.   :6.250  
##    fullBefore       fullAfter        partBefore      partAfter    
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.00   Min.   : 0.00  
##  1st Qu.: 2.125   1st Qu.: 2.000   1st Qu.:11.00   1st Qu.:11.00  
##  Median : 6.000   Median : 6.000   Median :16.25   Median :17.00  
##  Mean   : 8.475   Mean   : 8.362   Mean   :18.75   Mean   :18.69  
##  3rd Qu.:12.000   3rd Qu.:12.000   3rd Qu.:25.00   3rd Qu.:25.00  
##  Max.   :60.000   Max.   :40.000   Max.   :60.00   Max.   :60.00
## subsetting the data into two states
minwageNJ <- subset(minwage, subset = (location != "PA"))
minwagePA <- subset(minwage, subset = (location == "PA"))

## proportion of restaurants whose wage is less than $5.05
mean(minwageNJ$wageBefore < 5.05) # NJ before
## [1] 0.9106529
mean(minwageNJ$wageAfter < 5.05)  # NJ after
## [1] 0.003436426
mean(minwagePA$wageBefore < 5.05) # PA before
## [1] 0.9402985
mean(minwagePA$wageAfter < 5.05)  # PA after
## [1] 0.9552239
## create a variable for proportion of full-time employees in NJ and PA
minwageNJ$fullPropAfter <- minwageNJ$fullAfter /
    (minwageNJ$fullAfter + minwageNJ$partAfter)
minwagePA$fullPropAfter <- minwagePA$fullAfter /
    (minwagePA$fullAfter + minwagePA$partAfter)

## compute the difference in means
mean(minwageNJ$fullPropAfter) - mean(minwagePA$fullPropAfter)
## [1] 0.04811886

Section 2.5.2: Confounding Bias

prop.table(table(minwageNJ$chain))
## 
## burgerking        kfc       roys     wendys 
##  0.4054983  0.2233677  0.2508591  0.1202749
prop.table(table(minwagePA$chain))
## 
## burgerking        kfc       roys     wendys 
##  0.4626866  0.1492537  0.2238806  0.1641791
## subset Burger King only
minwageNJ.bk <- subset(minwageNJ, subset = (chain == "burgerking"))
minwagePA.bk <- subset(minwagePA, subset = (chain == "burgerking"))

## comparison of full-time employment rates
mean(minwageNJ.bk$fullPropAfter) - mean(minwagePA.bk$fullPropAfter)
## [1] 0.03643934
minwageNJ.bk.subset <-
    subset(minwageNJ.bk, subset = ((location != "shoreNJ") &
                                       (location != "centralNJ")))

mean(minwageNJ.bk.subset$fullPropAfter) - mean(minwagePA.bk$fullPropAfter)
## [1] 0.03149853

Section 2.5.3: Before-and-After and Difference-in-Differences Designs

## full-time employment proportion in the previous period for NJ
minwageNJ$fullPropBefore <- minwageNJ$fullBefore /
    (minwageNJ$fullBefore + minwageNJ$partBefore)

## mean difference between before and after the minimum wage increase
NJdiff <- mean(minwageNJ$fullPropAfter) - mean(minwageNJ$fullPropBefore)
NJdiff
## [1] 0.02387474
## full-time employment proportion in the previous period for PA
minwagePA$fullPropBefore <- minwagePA$fullBefore /
    (minwagePA$fullBefore + minwagePA$partBefore)
## mean difference between before and after for PA
PAdiff <- mean(minwagePA$fullPropAfter) - mean(minwagePA$fullPropBefore)
## difference-in-differences
NJdiff - PAdiff
## [1] 0.06155831
## full-time employment proportion in the previous period for PA
minwagePA$fullPropBefore <- minwagePA$fullBefore /
    (minwagePA$fullBefore + minwagePA$partBefore)
## mean difference between before and after for PA
PAdiff <- mean(minwagePA$fullPropAfter) - mean(minwagePA$fullPropBefore)
## difference-in-differences
NJdiff - PAdiff
## [1] 0.06155831

Section 2.6: Descriptive Statistics for a Single Variable

Section 2.6.1: Quantiles

## cross-section comparison between NJ and PA
median(minwageNJ$fullPropAfter) - median(minwagePA$fullPropAfter)
## [1] 0.07291667
## before and after comparison
NJdiff.med <- median(minwageNJ$fullPropAfter) -
    median(minwageNJ$fullPropBefore)
NJdiff.med
## [1] 0.025
## median difference-in-differences
PAdiff.med <- median(minwagePA$fullPropAfter) -
    median(minwagePA$fullPropBefore)
NJdiff.med - PAdiff.med
## [1] 0.03701923
## summary shows quartiles as well as minimum, maximum, and mean
summary(minwageNJ$wageBefore)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    4.25    4.25    4.50    4.61    4.87    5.75
summary(minwageNJ$wageAfter)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   5.000   5.050   5.050   5.081   5.050   5.750
## interquartile range
IQR(minwageNJ$wageBefore)
## [1] 0.62
IQR(minwageNJ$wageAfter)
## [1] 0
## deciles (10 groups)
quantile(minwageNJ$wageBefore, probs = seq(from = 0, to = 1, by = 0.1))
##   0%  10%  20%  30%  40%  50%  60%  70%  80%  90% 100% 
## 4.25 4.25 4.25 4.25 4.50 4.50 4.65 4.75 5.00 5.00 5.75
quantile(minwageNJ$wageAfter, probs = seq(from = 0, to = 1, by = 0.1))
##   0%  10%  20%  30%  40%  50%  60%  70%  80%  90% 100% 
## 5.00 5.05 5.05 5.05 5.05 5.05 5.05 5.05 5.05 5.15 5.75

2.6.2: Standard Deviation

sqrt(mean((minwageNJ$fullPropAfter - minwageNJ$fullPropBefore)^2))
## [1] 0.3014669
mean(minwageNJ$fullPropAfter - minwageNJ$fullPropBefore)
## [1] 0.02387474
## standard deviation
sd(minwageNJ$fullPropBefore)
## [1] 0.2304592
sd(minwageNJ$fullPropAfter)
## [1] 0.2510016
## variance
var(minwageNJ$fullPropBefore)
## [1] 0.05311145
var(minwageNJ$fullPropAfter)
## [1] 0.0630018