39 Code for HSB Example in Chapter 4 of R&B

Author

Luke Miratrix

This script builds everything from Chapter 4 of Raudenbush and Bryk in R. It is a very useful script for getting pretty much all the code you would need for a conventional multilevel analysis. The code is divided by each table or plot from the chapter.

39.1 R Setup

library(foreign) #this lets us read in spss files
library(tidyverse) #this is a broad package that allows us to do lots of data management-y things (and ggplot!)
library(lme4) #this allows us to run MLM
library(arm) #this allows us to display MLM
library( lmerTest ) # this puts p-values on the summary() command for fixed effects

39.2 Load HS&B data

# Read student data
stud.dat = read.spss( "data/hsb1.sav", to.data.frame=TRUE )

# Read in school data
sch.dat = read.spss( "data/hsb2.sav", to.data.frame=TRUE )

# Make single data frame with all variables, keep all students even if they
# don't match to a school
dat = merge( stud.dat, sch.dat, by="id", all.x=TRUE )

39.3 Table 4.1 Descriptive summaries

## Get mean and SD of the Level 1 variables, rounded to 2 decimal places
# math achievement
round(mean(dat$mathach),2)

[1] 12.75

round(sd(dat$mathach),2)

[1] 6.88

# ses
round(mean(dat$ses),2)

[1] 0

round(sd(dat$ses),2)

[1] 0.78

## Get mean and SD of Level 2 variables, round to 2 decimal places
# NOTE: we are getting these from the SCHOOL-LEVEL FILE
# sector
round(mean(sch.dat$sector),2) # this answers "what percent of schools are catholic?"

[1] 0.44

round(sd(sch.dat$sector),2)

[1] 0.5

# mean ses
round(mean(sch.dat$meanses),2) # this answers "what is the average of the school-average SES values?"

[1] 0

round(sd(sch.dat$meanses),2)

[1] 0.41

# NOTE: if we used the student-level or "dat" file, we would be answering the
# following questions:
# * what percent of students attend a catholic school?
# * what is the average student ses? <- this would match what we calculated
# ourselves if we had the entire school in our sample

39.4 Table 4.2: One-Way ANOVA (i.e uncontrolled random intercept)

## Fit the model described 
mod4.2 <- lmer(mathach ~ 1 + (1|id), data=dat)
# Peek at the results
display(mod4.2)

lmer(formula = mathach ~ 1 + (1 | id), data = dat)
coef.est  coef.se 
   12.64     0.24 

Error terms:
 Groups   Name        Std.Dev.
 id       (Intercept) 2.93    
 Residual             6.26    
---
number of obs: 7185, groups: id, 160
AIC = 47122.8, DIC = 47114.8
deviance = 47115.8

## Extract the fixed effect coefficient (and it's standard error)
fixef(mod4.2) # extracts the fixed effect coefficient(s)

(Intercept) 
   12.63697

se.coef(mod4.2)$fixef #extracts the standard errors for the fixed effect(s)

[1] 0.2443936

## Extract the variance components
# Note: in the model display, we see the SDs, not the variance
VarCorr(mod4.2)

 Groups   Name        Std.Dev.
 id       (Intercept) 2.9350  
 Residual             6.2569

# To get the variances, we extract each part and square it
# variance of random intercept
(sigma.hat(mod4.2)$sigma$id)^2

(Intercept) 
   8.614025

# variance of level 1 residual (easier to extract)
sigma(mod4.2)^2

[1] 39.14832

# could also use the more complicated formula that we used with the intercept.
# If we do, we get the same thing
sigma.hat(mod4.2)$sigma$data^2

[1] 39.14832

# Inference on the need for a random intercept
# Thus uses the book's way of calculating a test statistic with a
# chi-squared distribution.

schools = dat %>% group_by( id ) %>%
  summarise( nj = n(),
             Y.bar.j = mean( mathach ) )
gamma.00 = fixef( mod4.2 )[[1]]
sigma.2 = sigma(mod4.2)^2 
H = sum( schools$nj * (schools$Y.bar.j - gamma.00)^2 / sigma.2 )
H

[1] 1660.232

# our p-value
pchisq( H, df = nrow( schools ) - 1, lower.tail = FALSE )

[1] 4.770612e-248

# calculating the ICC
tau.00 = VarCorr(mod4.2)$id[1,1]
rho.hat = tau.00 / (tau.00 + sigma.2 )
rho.hat

[1] 0.1803518

# Calculating reliability for each school mean. (Here it is purely a function of
# students in the school.  More students, more info, and thus more reliable.)
sigma.2 = sigma(mod4.2)^2 
tau.00 = VarCorr(mod4.2)$id[1,1]
lambda = tau.00 / ( tau.00 + sigma.2 / schools$nj )
mean( lambda )

[1] 0.9013773

# A bonus graph of the reliabilities
qplot( lambda )

39.5 Table 4.3 Means as Outcomes Model

# (i.e. random intercept with Level 2 predictor)
## Fit the model described 
mod4.3 <- lmer(mathach ~ 1 + meanses + (1|id), data=dat)

# Peek at the results
display(mod4.3)

lmer(formula = mathach ~ 1 + meanses + (1 | id), data = dat)
            coef.est coef.se
(Intercept) 12.65     0.15  
meanses      5.86     0.36  

Error terms:
 Groups   Name        Std.Dev.
 id       (Intercept) 1.62    
 Residual             6.26    
---
number of obs: 7185, groups: id, 160
AIC = 46969.3, DIC = 46956.9
deviance = 46959.1

## Extract the fixed effect coefficients (and standard errors/t-statistics)
fixef(mod4.3) # extracts the fixed effect coefficients

(Intercept)     meanses 
  12.649435    5.863538

# NOTE: you can call them separately by "indexing" them
# just the intercept
fixef(mod4.3)[1]

(Intercept) 
   12.64944

# just coefficient on mean ses
fixef(mod4.3)[2]

 meanses 
5.863538

se.coef(mod4.3)$fixef #extracts the standard errors for the fixed effect(s)

[1] 0.1492801 0.3614580

## Calculate (or extract) the t-ratio (aka the t-statistic)

# NOTE: the author's don't present this for the intercept, because we often
# don't care. But it is presented here for completeness

# tstats for intercept
fixef(mod4.3)[1]/se.coef(mod4.3)$fixef[1]

(Intercept) 
   84.73622

# tstat mean ses
fixef(mod4.3)[2]/se.coef(mod4.3)$fixef[2]

 meanses 
16.22191

# tstat extracted - this does both variables at once! 
coef(summary(mod4.3))[,"t value"]

(Intercept)     meanses 
   84.73622    16.22191

# NOTE: Let's look at what is happening here:
coef(summary(mod4.3)) # gives us all the fixed effect statistics we could want

             Estimate Std. Error       df  t value      Pr(>|t|)
(Intercept) 12.649435  0.1492801 153.7425 84.73622 6.032590e-131
meanses      5.863538  0.3614580 153.4067 16.22191  4.267894e-35

# the [ ] is called "indexing" - it's a way of subsetting data by telling R
# which [rows,columns] you want to see we are telling R that we want ALL rows "[
# ," but only the column labeled "t value"

## Extract the variance components
# Note: in the model display, we see the SDs, not the variance
VarCorr(mod4.3)

 Groups   Name        Std.Dev.
 id       (Intercept) 1.6244  
 Residual             6.2576

# To get the variances, we extract each part and square it
# variance of random intercept
(sigma.hat(mod4.3)$sigma$id)^2

(Intercept) 
   2.638708

# variance of level 1 residual
sigma(mod4.3)^2

[1] 39.15708

# Range of plausible values for school means for schools with mean SES of 0:
# See page 73-74)
fixef( mod4.3 )[[1]] + c(-1.96, 1.96) * (sigma.hat(mod4.3)$sigma$id)

[1]  9.465592 15.833279

# Compare to our model without mean ses
fixef( mod4.2 )[[1]] + c(-1.96, 1.96) * (sigma.hat(mod4.2)$sigma$id)

[1]  6.884441 18.389507

# Proportion reduction in variance or "variance explained" at level 2
tau.00.anova = (sigma.hat(mod4.2)$sigma$id)^2
tau.00.meanses = (sigma.hat(mod4.3)$sigma$id)^2
(tau.00.anova-tau.00.meanses) / tau.00.anova

(Intercept) 
   0.693673

## Inference on the random effects
schools = merge( schools, sch.dat, by="id" )
gamma.00 = fixef( mod4.3 )[[1]]
gamma.01 = fixef( mod4.3 )[[2]]
schools = mutate( schools, resid = Y.bar.j - gamma.00 - gamma.01*meanses )
H = sum( schools$nj * schools$resid^2 ) / sigma(mod4.3)^2 
H

[1] 633.5175

pchisq( H, nrow( schools ) - 2, lower.tail = FALSE )

[1] 3.617696e-58

## Reliability revisited (from pg 75)
mod4.3

Linear mixed model fit by REML ['lmerModLmerTest']
Formula: mathach ~ 1 + meanses + (1 | id)
   Data: dat
REML criterion at convergence: 46961.28
Random effects:
 Groups   Name        Std.Dev.
 id       (Intercept) 1.624   
 Residual             6.258   
Number of obs: 7185, groups:  id, 160
Fixed Effects:
(Intercept)      meanses  
     12.649        5.864

u.hat = coef( mod4.3 )$id
head( u.hat )

     (Intercept)  meanses
1224    12.32688 5.863538
1288    12.71898 5.863538
1296    10.70101 5.863538
1308    12.92208 5.863538
1317    11.48086 5.863538
1358    11.73878 5.863538

sigma.2 = sigma(mod4.3)^2 
tau.00 = VarCorr(mod4.3)$id[1,1]
sigma.2

[1] 39.15708

tau.00

[1] 2.638708

# These are the individual reliabilities---how well we can separate schools with the same Mean SES
# (So it is _conditional_ on the mean SES of the schools.)
lambda.j = tau.00 / (tau.00 + (sigma.2 / schools$nj))
mean( lambda.j )

[1] 0.7400747

39.6 Table 4.4 Random coefficient model (i.e. random slope)

# group-mean center ses  
dat <- dat %>% group_by( id ) %>% 
  mutate( ses_grpcenter = ses - mean(ses) )

## Fit the model described 
mod4.4 <- lmer(mathach ~ 1 + ses_grpcenter + ( 1 + ses_grpcenter | id ), data=dat)
# Peek at the results
display(mod4.4)

lmer(formula = mathach ~ 1 + ses_grpcenter + (1 + ses_grpcenter | 
    id), data = dat)
              coef.est coef.se
(Intercept)   12.64     0.24  
ses_grpcenter  2.19     0.13  

Error terms:
 Groups   Name          Std.Dev. Corr 
 id       (Intercept)   2.95          
          ses_grpcenter 0.83     0.02 
 Residual               6.06          
---
number of obs: 7185, groups: id, 160
AIC = 46726.2, DIC = 46707.7
deviance = 46711.0

## Extract the fixed effect coefficients (and standard errors/t-statistics)
coef(summary(mod4.4)) #this reproduces the whole first panel, though methods used above also work

               Estimate Std. Error       df  t value      Pr(>|t|)
(Intercept)   12.636193  0.2445047 156.7512 51.68077 2.286893e-100
ses_grpcenter  2.193196  0.1282589 155.2166 17.09976  1.582355e-37

## Extract the variance components
# Note: in the model display, we see the SDs, not the variance
VarCorr(mod4.4)

 Groups   Name          Std.Dev. Corr 
 id       (Intercept)   2.94636       
          ses_grpcenter 0.83307  0.019
 Residual               6.05807

# variance of random effects
(sigma.hat(mod4.4)$sigma$id)^2

  (Intercept) ses_grpcenter 
    8.6810437     0.6939974

# NOTE: to extract one or the other, you can use indexing
(sigma.hat(mod4.4)$sigma$id[1])^2 #this is just the intercept random effect

(Intercept) 
   8.681044

# variance of level 1 residual
sigma(mod4.4)^2

[1] 36.70019

39.7 Table 4.5 Intercepts and Slopes as Outcomes Model

## Fit the model described 
mod4.5 <- lmer(mathach ~ 1 + meanses + sector + ses_grpcenter*(meanses + sector) + ( 1 + ses_grpcenter | id ), data=dat)

# NOTE: The code above allows the coefficients to appear in the same order as in Table 4.5

# R automatically includes the main effects, so this model can be written more
# concisely as shown below:
#
# lmer(mathach ~ 1 + ses_grpcenter*(meanses + sector) + ( 1 + ses_grpcenter | id ), data=dat)

# Peek at the results
display(mod4.5)

lmer(formula = mathach ~ 1 + meanses + sector + ses_grpcenter * 
    (meanses + sector) + (1 + ses_grpcenter | id), data = dat)
                      coef.est coef.se
(Intercept)           12.10     0.20  
meanses                5.33     0.37  
sector                 1.23     0.31  
ses_grpcenter          2.94     0.16  
meanses:ses_grpcenter  1.04     0.30  
sector:ses_grpcenter  -1.64     0.24  

Error terms:
 Groups   Name          Std.Dev. Corr 
 id       (Intercept)   1.54          
          ses_grpcenter 0.32     0.39 
 Residual               6.06          
---
number of obs: 7185, groups: id, 160
AIC = 46523.7, DIC = 46489.2
deviance = 46496.4

## Extract the fixed effect coefficients (and standard errors/t-statistics)
#this reproduces the whole first panel, though methods used above also work
coef(summary(mod4.5))

                       Estimate Std. Error       df   t value      Pr(>|t|)
(Intercept)           12.095997  0.1987329 159.9143 60.865590 1.625101e-112
meanses                5.332898  0.3691567 150.9836 14.446161  2.944282e-30
sector                 1.226453  0.3062674 149.6139  4.004518  9.756638e-05
ses_grpcenter          2.938785  0.1550889 139.2934 18.949039  2.197507e-40
meanses:ses_grpcenter  1.038918  0.2988941 160.5428  3.475873  6.550388e-04
sector:ses_grpcenter  -1.642619  0.2397854 143.3351 -6.850371  2.009493e-10

# NOTE: there is a slight descrepancy in the estimate for meanses:ses_grpcenter and 
# the t-statistics for meanses:ses_grpcenter and sector:ses_grpcenter; nothing that 
# changes the interpretations, however.


# Testing the need for sector  (see page 82)
# (We use a likelihood ratio test with the anova() function)
mod4.5.null <- lmer(mathach ~ 1 + meanses + ses_grpcenter*(meanses) + ( 1 + ses_grpcenter | id ), data=dat)
anova( mod4.5, mod4.5.null )

Data: dat
Models:
mod4.5.null: mathach ~ 1 + meanses + ses_grpcenter * (meanses) + (1 + ses_grpcenter | id)
mod4.5: mathach ~ 1 + meanses + sector + ses_grpcenter * (meanses + sector) + (1 + ses_grpcenter | id)
            npar   AIC   BIC logLik deviance  Chisq Df Pr(>Chisq)    
mod4.5.null    8 46568 46623 -23276    46552                         
mod4.5        10 46516 46585 -23248    46496 55.941  2  7.122e-13 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Testing the need for random slope  (see page 84)
# (We use a likelihood ratio test with the anova() function)
mod4.5.null.slope <- lmer(mathach ~ 1 + meanses + sector + ses_grpcenter*(meanses + sector) + ( 1 | id ), data=dat) 
anova( mod4.5, mod4.5.null.slope )

Data: dat
Models:
mod4.5.null.slope: mathach ~ 1 + meanses + sector + ses_grpcenter * (meanses + sector) + (1 | id)
mod4.5: mathach ~ 1 + meanses + sector + ses_grpcenter * (meanses + sector) + (1 + ses_grpcenter | id)
                  npar   AIC   BIC logLik deviance  Chisq Df Pr(>Chisq)
mod4.5.null.slope    8 46513 46568 -23249    46497                     
mod4.5              10 46516 46585 -23248    46496 1.0039  2     0.6054

39.8 Figure 4.1

NOTE: Figure 4.1 is a graphical display using the results from Model/Table 4.5

The solid line represents the slope of the gamma-01 coefficient; this is the same in public and catholic schools. The dotted lines represent the the slope for individual schools with “prototypical” values of meanses (-1,0,1 standard deviations from mean)

# to calculate this, we should note a few values: 
avg_meanses <- mean(dat$meanses) #average of mean ses var
high_meanses <- mean(dat$meanses) + sd(dat$meanses) # 1 sd above avg meanses
low_meanses <- mean(dat$meanses) - sd(dat$meanses) # 1 sd below avg meanses

fake.students = expand.grid( id = -1,
                             meanses = c( low_meanses, avg_meanses, high_meanses ),
                             sector = c( 0, 1 ),
                             ses_grpcenter = c( -1, 0, 1 ) )
fake.students = mutate( fake.students, ses = meanses + ses_grpcenter )
fake.students$mathach = predict( mod4.5, newdata=fake.students, allow.new.levels = TRUE )
fake.schools = filter( fake.students, ses_grpcenter == 0 )

ggplot( fake.students, aes( ses, mathach ) ) + 
  facet_wrap( ~ sector ) +
  geom_line( aes( group=meanses ), lty = 2 ) +
  geom_line( data=fake.schools, aes( x = ses, y = mathach ) ) +
  geom_point( data=fake.schools, aes( x = ses, y = mathach ) )

39.9 Set-up for remaining tables/figures of chapter

In order to create table 4.6 and the following 2 graphs, we will need to prepare a new dataset. These next lines of code do that.

## Start with school level data frame and keep variables interesting to our model comparison
mod.comp <- dplyr::select( sch.dat, id, meanses, sector )

## Add in number of observations per school 
n_j <- dat %>% group_by( id ) %>%
  dplyr::summarise(n_j = n())

mod.comp <- merge(mod.comp, n_j, by="id")
head( mod.comp )

    id meanses sector n_j
1 1224  -0.428      0  47
2 1288   0.128      0  25
3 1296  -0.420      0  48
4 1308   0.534      1  20
5 1317   0.351      1  48
6 1358  -0.014      0  30

## Run site-specific OLS for each school and save estimates 

# Calculate global (not group) centered ses
dat$ses_centered <- dat$ses - mean(dat$ses)

# This is the "for loop" method of generating an estimate for each of many small
# worlds (schools). See lecture 2.3 code for the "tidyverse" way.
est.ols <- matrix(nrow=160,ncol=2) #create a matrix to store estimates 
se.ols <- matrix(nrow=160,ncol=2) #create matrix to store standard errors

for (i in 1:length(unique(dat$id))){ #looping across the 160 different values of id
    id <- unique(dat$id)[i] #pick the value of id we want
    mod <- lm(mathach ~ 1 + ses_grpcenter, data=dat[dat$id==id,]) #run the model on students in that 1 school
    est.ols[i,] <- coef( mod ) #save the setimates in the matrix we created
    se.ols[i,] <- se.coef( mod ) # and the SEs
}

#convert the matrix to a dataframe and attach the schoolid info
est.ols <- as.data.frame(est.ols)
est.ols$id <- sch.dat$id
names(est.ols) <- c( 'b0_ols', 'b1_ols', 'id' )

#store standard errors for later
se.ols <- as.data.frame(se.ols)
se.ols$id <- sch.dat$id
names(se.ols) <- c( 'se_b0_ols', 'se_b1_ols', 'id' )

mod.comp <- merge(mod.comp, est.ols, by='id')
mod.comp <- merge(mod.comp, se.ols, by='id' )
head( mod.comp )

    id meanses sector n_j    b0_ols    b1_ols se_b0_ols se_b1_ols
1 1224  -0.428      0  47  9.715447 2.5085817 1.0954478  1.765216
2 1288   0.128      0  25 13.510800 3.2554487 1.3637656  2.079675
3 1296  -0.420      0  48  7.635958 1.0759591 0.7740752  1.209016
4 1308   0.534      1  20 16.255500 0.1260242 1.4045813  3.003437
5 1317   0.351      1  48 13.177688 1.2739128 0.7902486  1.435942
6 1358  -0.014      0  30 11.206233 5.0680087 0.8994345  1.391550

# We are done running OLS on each of our schools and storing the results.

## Extract site-specific coefficients from "unconditional model" (model 4.4)
est4.4 <- coef(mod4.4)$id
names(est4.4) <- c('b0_uncond', 'b1_uncond') #rename
est4.4$id = rownames( est4.4 )

## Extract site-specific coefficients from the "conditional model" (model 4.5)
est4.5 <- coef(mod4.5)$id
head( est4.5 )

     (Intercept)  meanses   sector ses_grpcenter meanses:ses_grpcenter
1224    12.02263 5.332898 1.226453      2.933689              1.038918
1288    12.55180 5.332898 1.226453      2.979174              1.038918
1296    10.38509 5.332898 1.226453      2.744066              1.038918
1308    12.12710 5.332898 1.226453      2.923822              1.038918
1317    10.56530 5.332898 1.226453      2.806582              1.038918
1358    11.60500 5.332898 1.226453      2.961265              1.038918
     sector:ses_grpcenter
1224            -1.642619
1288            -1.642619
1296            -1.642619
1308            -1.642619
1317            -1.642619
1358            -1.642619

est4.5$id = rownames( est4.5 )

# Now we need to calculate the point estimates using our individual regression equations
# including our level-2 values for each school
# (This is a bit of a pain.)
est4.5 = merge( est4.5, mod.comp, by="id", suffixes = c( "", ".v" ) )
head( est4.5 )

    id (Intercept)  meanses   sector ses_grpcenter meanses:ses_grpcenter
1 1224    12.02263 5.332898 1.226453      2.933689              1.038918
2 1288    12.55180 5.332898 1.226453      2.979174              1.038918
3 1296    10.38509 5.332898 1.226453      2.744066              1.038918
4 1308    12.12710 5.332898 1.226453      2.923822              1.038918
5 1317    10.56530 5.332898 1.226453      2.806582              1.038918
6 1358    11.60500 5.332898 1.226453      2.961265              1.038918
  sector:ses_grpcenter meanses.v sector.v n_j    b0_ols    b1_ols se_b0_ols
1            -1.642619    -0.428        0  47  9.715447 2.5085817 1.0954478
2            -1.642619     0.128        0  25 13.510800 3.2554487 1.3637656
3            -1.642619    -0.420        0  48  7.635958 1.0759591 0.7740752
4            -1.642619     0.534        1  20 16.255500 0.1260242 1.4045813
5            -1.642619     0.351        1  48 13.177688 1.2739128 0.7902486
6            -1.642619    -0.014        0  30 11.206233 5.0680087 0.8994345
  se_b1_ols
1  1.765216
2  2.079675
3  1.209016
4  3.003437
5  1.435942
6  1.391550

est4.5 = mutate( est4.5, 
                 b0_cond = `(Intercept)` + sector * sector.v + meanses * meanses.v,
                 b1_cond = ses_grpcenter + `sector:ses_grpcenter` * sector.v + `meanses:ses_grpcenter` * meanses.v )

est4.5 = dplyr::select( est4.5, id, b0_cond, b1_cond )


## Combine the MLM estimates into 1 dataset with ids
est.mlm <- merge( est4.4, est4.5, by="id" )

# Merge all the estimates together by school id
mod.comp <- merge(mod.comp,est.mlm,by = 'id',all=TRUE)

head( mod.comp )

    id meanses sector n_j    b0_ols    b1_ols se_b0_ols se_b1_ols b0_uncond
1 1224  -0.428      0  47  9.715447 2.5085817 1.0954478  1.765216  9.956953
2 1288   0.128      0  25 13.510800 3.2554487 1.3637656  2.079675 13.386036
3 1296  -0.420      0  48  7.635958 1.0759591 0.7740752  1.209016  8.039091
4 1308   0.534      1  20 16.255500 0.1260242 1.4045813  3.003437 15.622073
5 1317   0.351      1  48 13.177688 1.2739128 0.7902486  1.435942 13.132771
6 1358  -0.014      0  30 11.206233 5.0680087 0.8994345  1.391550 11.387452
  b1_uncond   b0_cond  b1_cond
1  2.262837  9.740146 2.489033
2  2.375964 13.234409 3.112156
3  1.872247  8.145275 2.307720
4  2.050193 16.201317 1.835985
5  1.997129 13.663596 1.528623
6  2.738390 11.530341 2.946721

39.10 Table 4.6 Comparing site-specific estimates from different models

## Create the list of rows that B&R include in the table p. 87
keeprows <- c(4, 15, 17, 22, 27, 53, 69, 75, 81, 90, 135, 153)

## Limit data to the rows of interest, and print the columns in Table 4.6 in the correct order
tab4.6 <- mod.comp[keeprows, c('b0_ols','b1_ols','b0_uncond','b1_uncond','b0_cond','b1_cond','n_j','meanses','sector') ]


## Print Table 4.6 -- the Empirical Bayes from conditional model (b0_cond, b1_cond) are waaaaaay off
round(tab4.6,2)

    b0_ols b1_ols b0_uncond b1_uncond b0_cond b1_cond n_j meanses sector
4    16.26   0.13     15.62      2.05   16.20    1.84  20    0.53      1
15   15.98   2.15     15.74      2.19   16.01    1.84  53    0.52      1
17   18.11   0.09     17.41      1.95   17.25    3.71  29    0.69      0
22   11.14  -0.78     11.22      1.15   10.89    0.63  67   -0.62      1
27   13.40   4.10     13.32      2.54   12.95    3.00  38   -0.06      0
53    9.52   3.74      9.76      2.75    9.37    2.42  51   -0.64      0
69   11.47   6.18     11.64      2.72   11.92    3.03  25    0.08      0
75    9.06   1.65      9.28      2.01    9.30    0.67  63   -0.59      1
81   15.42   5.26     15.25      3.14   15.53    1.91  66    0.43      1
90   12.14   1.97     12.18      2.14   12.34    3.03  50    0.19      0
135   4.55   0.25      6.42      1.92    8.55    2.63  14    0.03      0
153  10.28   0.76     10.71      2.06    9.67    2.37  19   -0.59      0

39.11 Figure 4.2 : Scatter plots of the estimates from 2 unconstrained models

## Panel (a) and Panel (b) are plotted on the same graph 
ggplot(data=mod.comp,aes()) + 
  geom_point(aes(x=b1_ols,y=b0_ols),color='black',alpha=0.7) + 
  geom_point(aes(x=b1_uncond,y=b0_uncond),color='blue',alpha=0.7) + 
  labs(title="Black=OLS; Blue=Unconditional EB") +
  xlim(-5,8) + ylim(2,20)

39.12 Figure 4.3 : Scatter plots of residuals from the OLS & Constrained MLM model

## Luke: Equation 4.271 and 4.27b (p. 92) are allegedly how we calculate the intercept and slope residuals 
## But I'm not sure where the estimates for the gamma-hat terms come from; the OLS model only includes
## individual-level ses

# trying it here with the predictions from conditional EB
fes = fixef( mod4.5 )
fes

          (Intercept)               meanses                sector 
            12.095997              5.332898              1.226453 
        ses_grpcenter meanses:ses_grpcenter  sector:ses_grpcenter 
             2.938785              1.038918             -1.642619

mod.comp = mutate( mod.comp,
                   u0_ols = b0_ols - (fes[1] + fes[2]*meanses + fes[3]*sector),
                   u1_ols = b1_ols - (fes[4] + fes[5]*meanses + fes[6]*sector)  )


## Panel (a) and (b) plotted on same graph

mod.comp = mutate( mod.comp, 
                   u0_cond = b0_cond - (fes[1] + fes[2]*meanses + fes[3]*sector),
                   u1_cond = b1_cond - (fes[4] + fes[5]*meanses + fes[6]*sector)  )

head( mod.comp )

    id meanses sector n_j    b0_ols    b1_ols se_b0_ols se_b1_ols b0_uncond
1 1224  -0.428      0  47  9.715447 2.5085817 1.0954478  1.765216  9.956953
2 1288   0.128      0  25 13.510800 3.2554487 1.3637656  2.079675 13.386036
3 1296  -0.420      0  48  7.635958 1.0759591 0.7740752  1.209016  8.039091
4 1308   0.534      1  20 16.255500 0.1260242 1.4045813  3.003437 15.622073
5 1317   0.351      1  48 13.177688 1.2739128 0.7902486  1.435942 13.132771
6 1358  -0.014      0  30 11.206233 5.0680087 0.8994345  1.391550 11.387452
  b1_uncond   b0_cond  b1_cond      u0_ols      u1_ols     u0_cond      u1_cond
1  2.262837  9.740146 2.489033 -0.09807014  0.01445354 -0.07337107 -0.005095579
2  2.375964 13.234409 3.112156  0.73219201  0.18368221  0.45580075  0.040389349
3  1.872247  8.145275 2.307720 -2.22022179 -1.42648036 -1.71090544 -0.194719195
4  2.050193 16.201317 1.835985  0.08528223 -1.72492410  0.03109939 -0.014963432
5  1.997129 13.663596 1.528623 -2.01661001 -0.38691354 -1.53070178 -0.132203129
6  2.738390 11.530341 2.946721 -0.81510320  2.14376861 -0.49099553  0.022480378

nrow( mod.comp )

[1] 160

ggplot(data=mod.comp, aes( pch=as.factor(sector)) ) + 
         geom_point(aes(x=u1_ols, y=u0_ols),color='black', alpha=0.7) +   
         geom_point(aes(x=u1_cond, y=u0_cond),color='blue', alpha=0.7) + 
         labs(title = "Black: OLS, Blue: Conditional EB") + 
         xlim(-6,6) + ylim(-8,8)

# To get in two-panel format we need to get our data to long format
mod.comp.ols = data.frame( sector = mod.comp$sector,
                           u0 = mod.comp$u0_ols,
                           u1 = mod.comp$u1_ols )
mod.comp.EB = data.frame(  sector = mod.comp$sector,
                           u0 = mod.comp$u0_cond,
                           u1 = mod.comp$u1_cond )
mod.comp.l = bind_rows( ols=mod.comp.ols, cond = mod.comp.EB, .id = "method" )

ggplot(data=mod.comp.l, aes( u1, u0, pch=as.factor(sector)) ) + 
  facet_wrap( ~ method ) +
  geom_point()

39.13 Table 4.7 : pg 94

# This section is not very good--I would skip.
# Generating confidence intervals for individual random intercepts and slopes is a weird business.

# OLS First:

# Doing it by fitting OLS on our subset
sch.2305 = filter( dat, id == 2305 )
head( sch.2305 )

# A tibble: 6 × 13
# Groups:   id [1]
  id    minority female    ses mathach  size sector pracad disclim himinty
  <chr>    <dbl>  <dbl>  <dbl>   <dbl> <dbl>  <dbl>  <dbl>   <dbl>   <dbl>
1 2305         1      1 -0.738   16.4    485      1   0.69   -1.38       1
2 2305         1      1 -1.18    12.8    485      1   0.69   -1.38       1
3 2305         1      1 -0.308   15.3    485      1   0.69   -1.38       1
4 2305         1      1 -0.358   12.7    485      1   0.69   -1.38       1
5 2305         1      1 -1.52    10.2    485      1   0.69   -1.38       1
6 2305         1      1 -0.518    8.94   485      1   0.69   -1.38       1
# ℹ 3 more variables: meanses <dbl>, ses_grpcenter <dbl>, ses_centered <dbl>

M.2305 = lm( mathach ~ ses_grpcenter, data=sch.2305 )
M.2305


Call:
lm(formula = mathach ~ ses_grpcenter, data = sch.2305)

Coefficients:
  (Intercept)  ses_grpcenter  
      11.1378        -0.7821

confint( M.2305 )

                  2.5 %    97.5 %
(Intercept)    9.911824 12.363698
ses_grpcenter -2.665989  1.101767

sch.8367 = filter( dat, id == 8367 )
head( sch.8367 )

# A tibble: 6 × 13
# Groups:   id [1]
  id    minority female    ses mathach  size sector pracad disclim himinty
  <chr>    <dbl>  <dbl>  <dbl>   <dbl> <dbl>  <dbl>  <dbl>   <dbl>   <dbl>
1 8367         0      0 -0.228  15.9     153      0      0    1.79       0
2 8367         0      1 -0.208   1.86    153      0      0    1.79       0
3 8367         1      0  0.532  -2.83    153      0      0    1.79       0
4 8367         0      1  0.662   2.12    153      0      0    1.79       0
5 8367         0      0 -0.228   6.76    153      0      0    1.79       0
6 8367         0      0 -1.08    0.725   153      0      0    1.79       0
# ℹ 3 more variables: meanses <dbl>, ses_grpcenter <dbl>, ses_centered <dbl>

M.8367 = lm( mathach ~ ses_grpcenter, data=sch.8367 )
M.8367


Call:
lm(formula = mathach ~ ses_grpcenter, data = sch.8367)

Coefficients:
  (Intercept)  ses_grpcenter  
       4.5528         0.2504

confint( M.8367 )

                  2.5 %   97.5 %
(Intercept)    1.872974 7.232598
ses_grpcenter -3.431096 3.931845

# Use SE from earlier to get confint
est4.7 <- mod.comp[c(22,135),]
est4.7

      id meanses sector n_j    b0_ols     b1_ols se_b0_ols se_b1_ols b0_uncond
22  2305  -0.622      1  67 11.137761 -0.7821112 0.6138468  0.943289 11.222551
135 8367   0.032      0  14  4.552786  0.2503748 1.2299413  1.689668  6.423938
    b1_uncond   b0_cond   b1_cond    u0_ols    u1_ols    u0_cond     u1_cond
22   1.149555 10.886600 0.6276417  1.132373 -1.432070  0.8812116 -0.02231763
135  1.924903  8.549007 2.6307047 -7.713864 -2.721656 -3.7176433 -0.34132569

# CI for intercept and slope using our normal and stored SEs.
# (Not taking t distribution into account changes things, as does not
# taking the uncertainty in the fixed effects for the EB CIs.  So this is
# very approximate.)
se_uncond = as.data.frame( se.coef(mod4.4)$id )
head( se_uncond )

     (Intercept) ses_grpcenter
1224   0.8464092     0.7189592
1288   1.1205609     0.7593421
1296   0.8382669     0.7111100
1308   1.2307722     0.8005012
1317   0.8382675     0.7377054
1358   1.0354852     0.7489241

names( se_uncond ) = c("se_b0_uncond","se_b1_uncond" )
se_cond = as.data.frame( se.coef(  mod4.5 )$id )
names( se_cond ) = c("se_b0_cond","se_b1_cond" )
head( se_cond )

     se_b0_cond se_b1_cond
1224  0.7662313  0.2929654
1288  0.9521965  0.2988437
1296  0.7601221  0.2923332
1308  1.0176181  0.3025414
1317  0.7603073  0.2940970
1358  0.8982481  0.2971694

se_uncond$id = rownames( se_uncond )
se_cond$id = rownames( se_cond )
est4.7 = merge( est4.7, se_uncond, by="id" )
est4.7 = merge( est4.7, se_cond, by="id" )

est4.7.int = mutate( est4.7, 
                 CI.low.ols = b0_ols + - 1.96 * se_b0_ols,
                 CI.high.ols = b0_ols + 1.96 * se_b0_ols,
                 CI.low.uncond = b0_uncond + - 1.96 * se_b0_uncond,
                 CI.high.uncond = b0_uncond + 1.96 * se_b0_uncond,
                 CI.low.cond = b0_cond + - 1.96 * se_b0_cond,
                 CI.high.cond = b0_cond + 1.96 * se_b0_cond )

dplyr::select( est4.7.int, starts_with("CI" ) )

  CI.low.ols CI.high.ols CI.low.uncond CI.high.uncond CI.low.cond CI.high.cond
1   9.934621   12.340901      9.815648      12.629455    9.579800     12.19340
2   2.142101    6.963471      3.642797       9.205078    6.361492     10.73652

est4.7.slope = mutate( est4.7, 
                     CI.low.ols = b1_ols + - 1.96 * se_b1_ols,
                     CI.high.ols = b1_ols + 1.96 * se_b1_ols,
                     CI.low.uncond = b1_uncond + - 1.96 * se_b1_uncond,
                     CI.high.uncond = b1_uncond + 1.96 * se_b1_uncond,
                     CI.low.cond = b1_cond + - 1.96 * se_b1_cond,
                     CI.high.cond = b1_cond + 1.96 * se_b1_cond )

dplyr::select( est4.7.slope, starts_with("CI" ) )

  CI.low.ols CI.high.ols CI.low.uncond CI.high.uncond CI.low.cond CI.high.cond
1  -2.630958    1.066735    -0.1675367       2.466647   0.0629627     1.192321
2  -3.061375    3.562124     0.3960110       3.453795   2.0356645     3.225745