This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

We start by loading packages.

library(dplyr)
library(tidyverse)
library(zoo)
library(caret)
library(rstanarm)

And data.

full_dat <- readxl::read_xlsx("Z:\\Harmonized\ CAB\ File\\Current\ Data\\Harmonized-CAB-Phases-1-2-Data-9-18-23.xlsx")

A lot of the models we run in this markdown take too long to fit in tandem with the markdown because the session times out relatively quickly. My solution is to fit these models and progressively save the posterior draws. In the following chunk I load these models.

folder <- "Z:\\HUB\ Shared\ Working\ Folders\\Connection\ LHS\\Current\\Peter\\out\\"
# load model fits. we can't refit these each time, it takes too long
load(file = paste0(folder, "models.RData"))

1 Data processing

We process the data into formats that are conducive to analysis.

Site ID

Each observation occurs within the context of a site and a hub. Confusingly, the Site ID gets reused between some hubs but not between others.

To get a unique Site ID we concatenate it with Hub_ID

full_dat$Site_ID <- paste(full_dat$Hub_ID, full_dat$Site_ID, sep="_")

Missing values

Westat was good about providing detailed codes for missingness. They’re always negative values. In the following chunk, we recode these negative values as NA.

# define function that takes vector and replaces negative values with NA
neg_to_NA <- function(x) {x[x < 0] <- NA; return(x)}

full_dat <-
  full_dat %>% mutate(
    Reason_Discharge = neg_to_NA(Reason_Discharge),
    CSC_Duration_Totalmos = neg_to_NA(CSC_Duration_Totalmos),
    Age_Entry_Months = neg_to_NA(Age_Entry_Months),
    QPR_clean = neg_to_NA(QPR_Score),
    CSI_clean = neg_to_NA(CSI_Score),
    Gender_ID = neg_to_NA(Gender_ID),
    QoL = neg_to_NA(QoL)
  )

We do the same for all the COMPASS-10 items. These we vectorize to make things easier

# also do this with all the COMPASS10 subscales and calculate
C10_cols <- grep('COMPASS10',names(full_dat))
names(full_dat)[C10_cols]
##  [1] "COMPASS10_Depressed"       "COMPASS10_Anxiety"        
##  [3] "COMPASS10_Suicidal"        "COMPASS10_Hostility"      
##  [5] "COMPASS10_Suspiciousness"  "COMPASS10_Unusual"        
##  [7] "COMPASS10_Hallucinations"  "COMPASS10_Disorganization"
##  [9] "COMPASS10_Avolition"       "COMPASS10_Asociality"
full_dat[,C10_cols] <- sapply(full_dat[,C10_cols],neg_to_NA)

While we’re here, let’s calculate the COMPASS-10 negative and positive total scores.

full_dat$COMPASS_NEG <- rowMeans(
    full_dat[c("COMPASS10_Asociality",
    "COMPASS10_Avolition")],
    na.rm=TRUE)
full_dat$COMPASS_POS <- rowMeans(
  full_dat[c("COMPASS10_Disorganization",
             "COMPASS10_Hallucinations",
             "COMPASS10_Unusual"
             )],
  na.rm=TRUE)

Sites were able to make decisions about which symptom scales to administer.

Here’s the approximate rate of missingness for some key potential variables.

mean(full_dat$BPRS_Anxiety < 0) # neg values are NA
## [1] 0.9978471
mean(is.na(full_dat$CSI_clean)) # these neg values were recoded
## [1] 0.7831001
mean(is.na(full_dat$COMPASS_NEG))
## [1] 0.7412391
mean(is.na(full_dat$QoL))
## [1] 0.8322569

QoL and BPRS are both > 80% missing. They just seem unusable.

Feature engineering

Now here is a pretty monster chunk of code that produces lots of variables of interest.

dat <- full_dat %>%
  # first, observation-level variables
  mutate(
    months_in_program = ifelse(CSC_Duration_Totalmos < 0, NA, 
                               CSC_Duration_Totalmos),
    disch_visit = Assessment_Type == 26,
    gender_c = factor(Gender_ID,
                      levels = 1:6,
                      labels = c("Male",
                                 "Female",
                                 "Non-binary",
                                 "Transgender male",
                                 "Transgender female",
                                 "Other")
    ),
    hub_c = factor(Hub_ID,
                   levels = 1:8,
                   labels = c("AC-EPINET",
                              "CLHS",
                              "EPI-CAL",
                              "EPI-MINN",
                              "EPINET-TX",
                              "ESPRITO",
                              "LEAP",
                              "ONTY")
    ),
    discharge_c = 
      recode_factor(Reason_Discharge,
                    `1` = "unilateral_termination", # "Terminated, refused or declined services"
                    `2` = "completed", # "Completed program, graduated, or services no longer indicated due to client improvement"
                    `3` = "ineligible", # "Client does not display signs and symptoms that lead to the inclusion of a covered diagnosis and/or an established level of impairment"
                    `4` = "completed", # "Has reached limit for length of allowable stay"
                    `5` = "pursuing_pos_opportunity", # "Pursuing a positive opportunity elsewhere (e.g., school, employment, training)"
                    `6` = "incarcerated", # "Admitted to state hospital"
                    `7` = "incarcerated", # "Admitted to a residential program"
                    `8` = "transferred_or_moved", # "Transferred services to provider outside CSC program (other than state hospital or residential program)"
                    `9` = "incarcerated", # "Incarcerated"
                    `10` = "transferred_or_moved", # "Moved out of service area because of reasons other than options noted above"
                    `11` = "death", # "Deceased (by suicide)"
                    `12` = "death", # "Deceased (by other means)"
                    `13` = "lost_contact", # "Whereabouts unknown, team unable to contact client"
                    `14` = "other", # "Other"
                    `15` = "death" # "Deceased (unknown)"
      ),
    # "NEET" not in employment, education or training
    NEET = !(Job_6mos == 1 | Internship == 1 | School_Attendance %in% 2:3)
  ) %>%
  # then, person-level variables
  group_by(NDA_ID) %>%
  mutate(male_c = any(gender_c == "Male", na.rm=T),
         NIMH_sex = ifelse(any(Bio_Sex == 1, na.rm=T), "Female",
                               ifelse(any(Bio_Sex == 2, na.rm=T), "Male",
                                          "Unknown/Not Reported")),
         ever_medicaid = any(Health_Insurance_Medicaid == 1, na.rm =T),
         ever_private = any(Health_Insurance_Commercial == 1 |
                              Health_Insurance_Private == 1 |
                              Health_Insurance_Other == 1, na.rm=T),
         ever_uninsured = any(Health_Insurance_None == 1, na.rm=t),
         ever_SSI = any(SSI == 1, na.rm=T),
         black_c =  any(Race_Black == 1, na.rm=T),
         white_c = any(Race_White == 1, na.rm=T),
         gender_c = if_else(!is.na(gender_c), gender_c, NA),
         hispanic_c =  any(Ethnicity == 1, na.rm=T),
         asian_pi_c = any(Race_Asian ==1, na.rm=T) | any(Race_NH ==1, na.rm=T),
         # create single race variable
         # note: zero AI
         race_c =  ifelse(  ((any(Race_White ==1, na.rm=T) +
                              any(Race_Black == 1, na.rm=T) + 
                              any(Race_Asian ==1, na.rm=T) + 
                              any(Race_AI ==1, na.rm=T) + 
                              any(Race_NH ==1, na.rm=T) ) > 1) |
           ( (any(Ethnicity == 1, na.rm=T) + 
           any(Race_Black == 1, na.rm=T) + 
                              any(Race_Asian ==1, na.rm=T) + 
                              any(Race_AI ==1, na.rm=T) + 
                              any(Race_NH ==1, na.rm=T))  > 1), "Multiracial",
                  ifelse(any(Ethnicity == 1), "Hispanic",  
                  ifelse(any(Race_White == 1, na.rm=T), "White",
                  ifelse(any(Race_Black == 1, na.rm=T), "Black",
                   ifelse(any(Race_Asian == 1, na.rm=T), "Asian",
                   ifelse(any(Race_AI == 1, na.rm=T), "AI",
                   ifelse(any(Race_NH ==1, na.rm=T), "API",
                      ifelse(any(Race_Other == 1, na.rm=T), "Other",
                           "unknown or prefer not to say")))))))),
         simple_race =factor(ifelse(
           race_c == "White", "White",
           ifelse(race_c == "Black", "Black",
           ifelse(race_c == "Hispanic", "Hispanic",
           ifelse( (race_c == "Asian" | race_c == "API"), "AsianAPI", 
                  ifelse(race_c == "Multiracial", "Multiracial", "Other")))))),
         NIMH_race = factor(
           ifelse(
             (any(Race_White ==1, na.rm=T) +
             any(Race_Black == 1, na.rm=T) + 
             any(Race_Asian ==1, na.rm=T) + 
             any(Race_AI ==1, na.rm=T) + 
             any(Race_NH ==1, na.rm=T)) > 1,
                  "More than one race",
           ifelse(any(Race_AI ==1, na.rm=T),"American Indian/Alaskan Native",
           ifelse(any(Race_NH == 1, na.rm=T), "Native Hawaiian or Other Pacific Islander",
           ifelse(any(Race_Black == 1, na.rm=T), "Black or African American",
           ifelse(any(Race_White == 1, na.rm=T), "White",
           ifelse(any(Race_Asian == 1, na.rm=T), "Asian",
          "Unknown/Not reported"))))))),
         percent_NEET = mean(NEET == 1, na.rm=T),
         any_NEET = any(NEET == 1, na.rm=T),
         Age_Entry_Months = na.locf(Age_Entry_Months, na.rm=FALSE),
         Age_Entry_Years = ifelse(any(!is.na(Age_Entry_Months)), round(Age_Entry_Months / 12, 1), NA)
  ) %>%
  ungroup() %>%
  # finally, some row-level variables that are a function of person-level variables
  mutate(
    Age_At_Assessment = round( (Age_Entry_Months + CSC_Duration_Totalmos) / 12, 1 )
  )
# verify that the race codes make sense
dat %>% subset(Assessment_Type ==0 ) %>% select(Race_Black, Race_White, Race_Asian, Race_Other,  Ethnicity, hispanic_c, black_c, white_c, race_c, simple_race)
## # A tibble: 5,083 x 10
##    Race_Black Race_White Race_Asian Race_Other Ethnicity hispanic_c black_c
##         <dbl>      <dbl>      <dbl>      <dbl>     <dbl> <lgl>      <lgl>  
##  1         -9         -9         -9          0        -8 FALSE      FALSE  
##  2         -9         -9         -9          0         1 TRUE       FALSE  
##  3         -9         -9         -9          0         1 TRUE       FALSE  
##  4         -9         -9         -9          0         1 TRUE       FALSE  
##  5         -9         -9         -9          0         2 FALSE      FALSE  
##  6         -9         -9         -9          0         1 TRUE       FALSE  
##  7         -9         -9         -9          0         1 TRUE       FALSE  
##  8         -9         -9         -9          0         1 TRUE       FALSE  
##  9         -9         -9         -9          0         2 FALSE      FALSE  
## 10         -9          1          1          0         2 FALSE      FALSE  
## # i 5,073 more rows
## # i 3 more variables: white_c <lgl>, race_c <chr>, simple_race <fct>

Descriptives table

Patient-level descriptives of the full sample

desc <- 
  dat %>%
  group_by(NDA_ID) %>%
  summarise(
    age_at_entry = mean(Age_Entry_Years, na.rm=T),
    simple_race = factor(unique(simple_race)),
    NIMH_race = factor(
      ifelse((any(Race_White == 1, na.rm=T) +
             any(Race_Black == 1, na.rm=T) + 
             any(Race_Asian == 1, na.rm=T) + 
             any(Race_AI == 1, na.rm=T) + 
             any(Race_NH == 1, na.rm=T)) > 1,
                  "More than one race",
           ifelse(any(Race_AI ==1, na.rm=T), "American Indian/Alaskan Native",
           ifelse(any(Race_NH == 1, na.rm=T), "Native Hawaiian or Other Pacific Islander",
           ifelse(any(Race_Black == 1, na.rm=T), "Black or African American",
           ifelse(any(Race_White == 1, na.rm=T), "White",
           ifelse(any(Race_Asian == 1, na.rm=T), "Asian",
          "Unknown/Not reported"))))))),
    hispanic = ifelse(any(Ethnicity == 1, na.rm=T), "hispanic",
                      ifelse(any(Ethnicity == 2, na.rm =T), "not hispanic",
                      "unknown/unreported")),
    male = any(gender_c == "Male", na.rm=T),
    female = any(gender_c == "Female", na.rm=T),
    trans = any(gender_c %in% c("Transgender male", "Transgender female"), na.rm=T),
    other_gender = any(gender_c %in% c("Non-binary", "Other")),
    NIMH_sex = unique(NIMH_sex),
    ever_NEET = unique(any_NEET)
    ) %>%
  ungroup() 
summary(desc)
##     NDA_ID           age_at_entry        simple_race  
##  Length:5166        Min.   :11.30   White      :1525  
##  Class :character   1st Qu.:18.50   Hispanic   : 842  
##  Mode  :character   Median :21.00   Black      :1448  
##                     Mean   :21.48   Multiracial: 269  
##                     3rd Qu.:24.00   Other      : 786  
##                     Max.   :47.00   AsianAPI   : 296  
##                     NA's   :1311                      
##                                      NIMH_race      hispanic        
##  White                                    :1973   Length:5166       
##  Black or African American                :1548   Class :character  
##  More than one race                       : 155   Mode  :character  
##  Unknown/Not reported                     :1153                     
##  American Indian/Alaskan Native           :  33                     
##  Asian                                    : 283                     
##  Native Hawaiian or Other Pacific Islander:  21                     
##     male           female          trans         other_gender   
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:2162      FALSE:3631      FALSE:5126      FALSE:5045     
##  TRUE :3004      TRUE :1535      TRUE :40        TRUE :121      
##                                                                 
##                                                                 
##                                                                 
##                                                                 
##    NIMH_sex         ever_NEET      
##  Length:5166        Mode :logical  
##  Class :character   FALSE:1397     
##  Mode  :character   TRUE :3769     
##                                    
##                                    
##                                    
## 
sd(desc$age_at_entry, na.rm=T)
## [1] 4.106402
# NIMH demographic reporting categories
# note that gender has far less missingness than sex
tab <- with(desc, table(NIMH_race, NIMH_sex, hispanic) )
addmargins(tab)
## , , hispanic = hispanic
## 
##                                            NIMH_sex
## NIMH_race                                   Female Male Unknown/Not Reported
##   White                                         96  220                  132
##   Black or African American                      7   24                   69
##   More than one race                             8   18                    6
##   Unknown/Not reported                         100  118                  176
##   American Indian/Alaskan Native                 2    4                    0
##   Asian                                          2    1                    2
##   Native Hawaiian or Other Pacific Islander      0    2                    1
##   Sum                                          215  387                  386
##                                            NIMH_sex
## NIMH_race                                    Sum
##   White                                      448
##   Black or African American                  100
##   More than one race                          32
##   Unknown/Not reported                       394
##   American Indian/Alaskan Native               6
##   Asian                                        5
##   Native Hawaiian or Other Pacific Islander    3
##   Sum                                        988
## 
## , , hispanic = not hispanic
## 
##                                            NIMH_sex
## NIMH_race                                   Female Male Unknown/Not Reported
##   White                                        330  683                  454
##   Black or African American                    285  563                  500
##   More than one race                            33   57                   30
##   Unknown/Not reported                          90  105                  176
##   American Indian/Alaskan Native                11   10                    5
##   Asian                                         53   75                  140
##   Native Hawaiian or Other Pacific Islander      4    5                    7
##   Sum                                          806 1498                 1312
##                                            NIMH_sex
## NIMH_race                                    Sum
##   White                                     1467
##   Black or African American                 1348
##   More than one race                         120
##   Unknown/Not reported                       371
##   American Indian/Alaskan Native              26
##   Asian                                      268
##   Native Hawaiian or Other Pacific Islander   16
##   Sum                                       3616
## 
## , , hispanic = unknown/unreported
## 
##                                            NIMH_sex
## NIMH_race                                   Female Male Unknown/Not Reported
##   White                                         16   30                   12
##   Black or African American                     12   36                   52
##   More than one race                             0    3                    0
##   Unknown/Not reported                          26   41                  321
##   American Indian/Alaskan Native                 1    0                    0
##   Asian                                          6    2                    2
##   Native Hawaiian or Other Pacific Islander      0    2                    0
##   Sum                                           61  114                  387
##                                            NIMH_sex
## NIMH_race                                    Sum
##   White                                       58
##   Black or African American                  100
##   More than one race                           3
##   Unknown/Not reported                       388
##   American Indian/Alaskan Native               1
##   Asian                                       10
##   Native Hawaiian or Other Pacific Islander    2
##   Sum                                        562
## 
## , , hispanic = Sum
## 
##                                            NIMH_sex
## NIMH_race                                   Female Male Unknown/Not Reported
##   White                                        442  933                  598
##   Black or African American                    304  623                  621
##   More than one race                            41   78                   36
##   Unknown/Not reported                         216  264                  673
##   American Indian/Alaskan Native                14   14                    5
##   Asian                                         61   78                  144
##   Native Hawaiian or Other Pacific Islander      4    9                    8
##   Sum                                         1082 1999                 2085
##                                            NIMH_sex
## NIMH_race                                    Sum
##   White                                     1973
##   Black or African American                 1548
##   More than one race                         155
##   Unknown/Not reported                      1153
##   American Indian/Alaskan Native              33
##   Asian                                      283
##   Native Hawaiian or Other Pacific Islander   21
##   Sum                                       5166

Create discharge data frames

Finally, we create a data.frame which is every assessment for everyone who discharged.

In the process we create some variables that are unique to discharged folks.

dc <- dat %>%
  group_by(NDA_ID) %>%
  filter(any(Assessment_Type == 26)) %>%
  mutate(         
    unilateral_termination = any(discharge_c == "unilateral_termination", na.rm=T),
    completed = any(discharge_c == "completed", na.rm=T),
    lost_contact = any(discharge_c == "lost_contact", na.rm=T),
    transferred_or_moved = any(discharge_c == "transferred_or_moved", na.rm=T),
    avg_COMPASS_NEG = mean(COMPASS_NEG, na.rm=T)
  ) %>%
  ungroup()

We also create a variable that’s composed of discharge assessments only.

dc_vis <- dc %>% 
  subset(Assessment_Type == 26)
nrow(dc_vis)
## [1] 1818

Some people discharged more than once. Specifically, 14 people discharged twice and one person discharged three times. Because this is a very small group, we handle this by taking only their longest stay in the program.

dc_vis <- dc_vis %>%
  group_by(NDA_ID) %>%
  arrange(desc(CSC_Duration_Totalmos)) %>%
  slice(1)
nrow(dc_vis)
## [1] 1787

2 Descriptives

Let’s start by looking at numbers of observations, in general and by hub.

We begin with the number of assessments per hub.

table(dat$hub_c)
## 
## AC-EPINET      CLHS   EPI-CAL  EPI-MINN EPINET-TX   ESPRITO      LEAP      ONTY 
##      1015      2247       768       845       825      2431      1456      7135

And a graph of the same.

ggplot(dat) +
  aes(x = fct_infreq(hub_c)) + 
  geom_bar() +
  xlab("") +
  ggtitle("Number of assessments by hub")

Patients per hub.

dat %>% 
  select(hub_c, NDA_ID) %>%
  distinct() %>%
  group_by(hub_c) %>%
  summarise(number_of_people = n()) %>%
  arrange(desc(number_of_people))
## # A tibble: 8 x 2
##   hub_c     number_of_people
##   <fct>                <int>
## 1 ONTY                  1253
## 2 CLHS                   824
## 3 ESPRITO                775
## 4 LEAP                   766
## 5 AC-EPINET              462
## 6 EPI-CAL                402
## 7 EPINET-TX              398
## 8 EPI-MINN               286
dat %>%
  select(hub_c, NDA_ID) %>%
  distinct() %>%
  ggplot() + 
  aes(x = fct_infreq(hub_c)) +
  geom_bar() + xlab("") +
  ggtitle("Number of people by hub")

Sites/clinics per hub.

dat %>% 
  select(hub_c, Site_ID) %>%
  distinct() %>%
  group_by(hub_c) %>%
  summarise(number_of_sites = n()) %>%
  arrange(desc(number_of_sites))
## # A tibble: 8 x 2
##   hub_c     number_of_sites
##   <fct>               <int>
## 1 ONTY                   26
## 2 CLHS                   23
## 3 EPI-CAL                18
## 4 EPINET-TX              16
## 5 ESPRITO                13
## 6 LEAP                   13
## 7 EPI-MINN                6
## 8 AC-EPINET               5
dat %>%
  select(hub_c, Site_ID) %>%
  distinct() %>%
  ggplot() + 
  aes(x = fct_infreq(hub_c)) +
  geom_bar() + xlab("") +
  ggtitle("Number of sites by hub")

Discharge descriptives

Number of discharges per hub.

table(dat$disch_visit, dat$hub_c)
##        
##         AC-EPINET CLHS EPI-CAL EPI-MINN EPINET-TX ESPRITO LEAP ONTY
##   FALSE       891 1853     725      759       720    1959 1403 6594
##   TRUE        124  394      43       86       105     472   53  541
prop.table(table(dat$disch_visit, dat$hub_c), margin = 2)
##        
##          AC-EPINET       CLHS    EPI-CAL   EPI-MINN  EPINET-TX    ESPRITO
##   FALSE 0.87783251 0.82465510 0.94401042 0.89822485 0.87272727 0.80584122
##   TRUE  0.12216749 0.17534490 0.05598958 0.10177515 0.12727273 0.19415878
##        
##               LEAP       ONTY
##   FALSE 0.96359890 0.92417659
##   TRUE  0.03640110 0.07582341
nrow(dc_vis)
## [1] 1787
ggplot(dc_vis) +
  aes(x=fct_infreq(hub_c)) + 
  geom_bar() + 
  xlab("") +
  ggtitle("Number of discharges by hub")

dat %>% 
  group_by(NDA_ID) %>%
  summarize(hub = unique(hub_c),
            disch = any(disch_visit)) %>%
  ungroup() %>% group_by(hub) %>%
  summarise(disch_rate = round(mean(disch) * 100,1)) %>%
  arrange(desc(disch_rate))
## # A tibble: 8 x 2
##   hub       disch_rate
##   <fct>          <dbl>
## 1 ESPRITO         60  
## 2 CLHS            47.8
## 3 ONTY            41.7
## 4 EPI-MINN        30.1
## 5 EPINET-TX       26.1
## 6 AC-EPINET       26  
## 7 EPI-CAL         10.7
## 8 LEAP             6.9

Type of discharge.

# unrecoded counts
summary(factor(dc_vis$Reason_Discharge))
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 NA's 
##  427  395   85   36   11    6    9  144   16  153    3    8  278  163   10   43
round(prop.table(summary(factor(dc_vis$Reason_Discharge))), 3) * 100
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 NA's 
## 23.9 22.1  4.8  2.0  0.6  0.3  0.5  8.1  0.9  8.6  0.2  0.4 15.6  9.1  0.6  2.4
#recoded counts
summary(dc_vis$discharge_c)
##   unilateral_termination                completed               ineligible 
##                      427                      431                       85 
## pursuing_pos_opportunity             incarcerated     transferred_or_moved 
##                       11                       31                      297 
##                    death             lost_contact                    other 
##                       21                      278                      163 
##                     NA's 
##                       43
#rates
round(prop.table(summary(dc_vis$discharge_c)) * 100, 1)
##   unilateral_termination                completed               ineligible 
##                     23.9                     24.1                      4.8 
## pursuing_pos_opportunity             incarcerated     transferred_or_moved 
##                      0.6                      1.7                     16.6 
##                    death             lost_contact                    other 
##                      1.2                     15.6                      9.1 
##                     NA's 
##                      2.4
ggplot(dc_vis) +
  aes(x = fct_infreq(discharge_c)) +
  geom_bar() + 
  xlab("discharge type") +
  ggtitle("Number of discharge types") +
  theme(axis.text.x = element_text(angle=45, vjust = 1, hjust=1))

Discharge types by hub

# counts
dc_vis %>%
  group_by(hub_c) %>%
  summarize(unilateral_termination = sum(discharge_c == "unilateral_termination", na.rm=T),
            completed = sum(discharge_c == "completed", na.rm=T),
            ineligible = sum(discharge_c == "ineligible", na.rm=T),
            transferred_or_moved = sum(discharge_c == "transferred_or_moved", na.rm=T),
            death = sum(discharge_c == "death", na.rm=T),
            lost_contact = sum(discharge_c == "lost_contact", na.rm=T),
            other = sum(discharge_c == "other", na.rm=T)
  )
## # A tibble: 8 x 8
##   hub_c   unilateral_termination completed ineligible transferred_or_moved death
##   <fct>                    <int>     <int>      <int>                <int> <int>
## 1 AC-EPI~                     27         6          5                   28     2
## 2 CLHS                        67        67         54                   44     0
## 3 EPI-CAL                      8        17          5                    4     0
## 4 EPI-MI~                     10        44          4                    7     0
## 5 EPINET~                     43         6          1                   26     4
## 6 ESPRITO                     85       168         10                   89     5
## 7 LEAP                        18        11          6                    4     0
## 8 ONTY                       169       112          0                   95    10
## # i 2 more variables: lost_contact <int>, other <int>
# rates
dc_vis %>%
  group_by(hub_c) %>%
  summarize(unilateral_termination = mean(discharge_c == "unilateral_termination", na.rm=T),
            completed = mean(discharge_c == "completed", na.rm=T),
            ineligible = mean(discharge_c == "ineligible", na.rm=T),
            transferred_or_moved = mean(discharge_c == "transferred_or_moved", na.rm=T),
            death = mean(discharge_c == "death", na.rm=T),
            lost_contact = mean(discharge_c == "lost_contact", na.rm=T),
            other = mean(discharge_c == "other", na.rm=T)
  ) 
## # A tibble: 8 x 8
##   hub_c  unilateral_termination completed ineligible transferred_or_moved  death
##   <fct>                   <dbl>     <dbl>      <dbl>                <dbl>  <dbl>
## 1 AC-EP~                  0.225    0.05      0.0417                0.233  0.0167
## 2 CLHS                    0.170    0.170     0.137                 0.112  0     
## 3 EPI-C~                  0.186    0.395     0.116                 0.0930 0     
## 4 EPI-M~                  0.128    0.564     0.0513                0.0897 0     
## 5 EPINE~                  0.413    0.0577    0.00962               0.25   0.0385
## 6 ESPRI~                  0.195    0.386     0.0230                0.205  0.0115
## 7 LEAP                    0.36     0.22      0.12                  0.08   0     
## 8 ONTY                    0.324    0.215     0                     0.182  0.0192
## # i 2 more variables: lost_contact <dbl>, other <dbl>

3 Identifying predictors of discharge type

Let’s do some exploratory plotting to understand discharge types.

3.1 Race

Here’s the race breakdown for the cohort that discharged.

table(dc_vis$simple_race)
## 
##       White    Hispanic       Black Multiracial       Other    AsianAPI 
##         525         279         586          80         225          92
table(dc_vis$race_c)
## 
##                           AI                          API 
##                           11                            5 
##                        Asian                        Black 
##                           87                          586 
##                     Hispanic                  Multiracial 
##                          279                           80 
##                        Other unknown or prefer not to say 
##                            3                          211 
##                        White 
##                          525
proportions <- prop.table(with(dc_vis, table(simple_race, discharge_c)), margin = 1)
round(proportions * 100, 1)
##              discharge_c
## simple_race   unilateral_termination completed ineligible
##   White                         25.1      29.4        5.3
##   Hispanic                      30.0      28.9        3.3
##   Black                         24.3      18.9        3.1
##   Multiracial                   26.6      30.4        7.6
##   Other                         17.3      23.6        8.6
##   AsianAPI                      21.7      20.7        6.5
##              discharge_c
## simple_race   pursuing_pos_opportunity incarcerated transferred_or_moved death
##   White                            1.2          2.0                 18.2   1.2
##   Hispanic                         0.4          0.7                 16.3   1.1
##   Black                            0.5          2.1                 17.0   1.4
##   Multiracial                      0.0          2.5                 15.2   1.3
##   Other                            0.0          0.5                 11.8   1.4
##   AsianAPI                         1.1          4.3                 27.2   0.0
##              discharge_c
## simple_race   lost_contact other
##   White                8.5   9.1
##   Hispanic            10.0   9.3
##   Black               26.9   5.9
##   Multiracial         12.7   3.8
##   Other               16.4  20.5
##   AsianAPI             7.6  10.9
proportions <- data.frame(proportions)
full_sample <- as.data.frame(prop.table(table(dc_vis$discharge_c)))
names(full_sample) <- c("discharge_c", "Freq")
full_sample$simple_race <- "Full sample"
proportions <- bind_rows(proportions, full_sample)
proportions <- mutate(
  proportions,
  simple_race = factor(simple_race,
                       levels = c("Full sample","Black", "White", "Hispanic", "AsianAPI", "Multiracial", "Other"),
                       labels=c("Full sample","Black (n=586)", "White (n=525)",  "Hispanic (n=279)", "Asian/Pacific Islander (n=92)", "Multiracial (n=80)", "Other/unknown (n=225)")),
  discharge_c = factor(discharge_c,
                       levels = c("completed",
                                  "unilateral_termination",
                                  "transferred_or_moved",
                                  "lost_contact",
                                  "other",
                                  "ineligible",
                                  "incarcerated",
                                  "death",
                                  "pursuing_pos_opportunity"),
                       labels=c("Completed\n(n=431)",
                                "Unilateral termination\n(n=427)",
                                "Transferred or moved\n(n=297)",
                                "Lost contact\n(n=278)",
                                "Other\n(n=163)",
                                "Ineligible\n(n=85)",
                                "Incarcerated/hospitalized\n(n=31)",
                                "Death\n(n=21)",
                                "Pursuing positive opportunity\n(n=11)"))
)

fill_colors <- c("grey", scales::viridis_pal()(7))
# fill_colors <- scales::viridis_pal(option="magma")(8)

proportions %>%
  ggplot() +
  aes(x = discharge_c, y = Freq, fill = simple_race) + 
  geom_bar(stat="identity", position="dodge") + 
  scale_y_continuous(labels=scales::percent) +
  ylab("") + xlab("Discharge reason") +
  guides(fill=guide_legend(title="Race/ethnicity")) +
  scale_fill_manual(values=fill_colors) + 
  theme_bw()

We can compare expected rates by race with actual rates by race. Black people are much more likely to be lost to follow-up.

rate_black <- mean(dc_vis$simple_race == "Black")
ggplot(dc_vis) +
  aes(x = fct_infreq(discharge_c), fill=(simple_race == "Black")) +
  geom_bar(position="fill") + 
  geom_hline(yintercept = rate_black) + 
  xlab("discharge type") +
  ggtitle("Discharge types. Proportion black", subtitle="line gives expected proportion") +
  theme(axis.text.x = element_text(angle=45, vjust = 1, hjust=1))

Race and gender?

Visually, it kind of looked to me as though there was a two-way interaction between race and gender, with black males particularly unlikely to complete the program.

dc_vis %>%
  mutate(
    race = ifelse(simple_race=="Black", "Black", "Non-black"),
    gender = ifelse(male_c, "Male", "Other"),
    outcome = ifelse(completed, "completed", "did not complete")
  ) %>%
  ggplot() +
  aes(x = outcome, fill=race) +
  geom_bar(position="fill") + 
  xlab("discharge outcome") +
  ylab("proportion") +
  ggtitle("Discharge outcome", subtitle="Interaction between race and gender") +
  theme(axis.text.x = element_text(angle=45, vjust = 1, hjust=1)) +
  facet_wrap(~gender)

However, the effect doesn’t appear to be robust.

summary(glm(completed ~ (simple_race == "Black") + male_c + 
      (simple_race == "Black"):male_c, 
    family="binomial",
    data = dc_vis))
## 
## Call:
## glm(formula = completed ~ (simple_race == "Black") + male_c + 
##     (simple_race == "Black"):male_c, family = "binomial", data = dc_vis)
## 
## Coefficients:
##                                        Estimate Std. Error z value
## (Intercept)                           -0.997223   0.106364  -9.376
## simple_race == "Black"TRUE            -0.221534   0.197178  -1.124
## male_cTRUE                             0.002331   0.134391   0.017
## simple_race == "Black"TRUE:male_cTRUE -0.307069   0.252067  -1.218
##                                                  Pr(>|z|)    
## (Intercept)                           <0.0000000000000002 ***
## simple_race == "Black"TRUE                          0.261    
## male_cTRUE                                          0.986    
## simple_race == "Black"TRUE:male_cTRUE               0.223    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1992.6  on 1786  degrees of freedom
## Residual deviance: 1978.8  on 1783  degrees of freedom
## AIC: 1986.8
## 
## Number of Fisher Scoring iterations: 4

3.2 Age

First some descriptives for the discharged sample.

mean(dc_vis$Age_Entry_Years, na.rm=T)
## [1] 21.87464
sd(dc_vis$Age_Entry_Years, na.rm=T)
## [1] 4.198462
sum(is.na(dc_vis$Age_Entry_Years))
## [1] 178
hist(dc_vis$Age_Entry_Months)

Age is related to discharge

dc_vis %>%
  ggplot() + 
  aes(x = Age_Entry_Years, y = as.numeric(completed)) + 
  geom_jitter(width = 0, height = 0.05) + 
  geom_smooth(method="glm", method.args=list(family="binomial")) + 
  ggtitle("Age at entry is predictive of completion.", 
          subtitle = "Vertical jitter added. Logistic regression fit overlaid.") + 
  xlab("Age at entry") + 
  ylab("") +
  scale_y_continuous(breaks = 0:1, labels=c("Not completed", "Completed"))
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 178 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 178 rows containing missing values (`geom_point()`).

3.3 Time in program

Duration in program is a wonderful predictor of program completion, but in some ways that’s tautological given that patients are approximately defined as completing the program if they last two years.

# duration in program versus program completion
dc_vis %>%
  ggplot() + 
  aes(x = CSC_Duration_Totalmos, y = as.numeric(completed)) + 
  geom_jitter(width = 0, height = 0.05, alpha=0.5) + 
  geom_smooth(method="glm", method.args=list(family="binomial")) + 
  ggtitle("Length of time in program is even more predictive (obviously?)", 
          subtitle = "Vertical jitter added. Logistic regression fit overlaid.") + 
  xlab("Time in program") + 
  ylab("") +
  scale_y_continuous(breaks = 0:1, labels=c("Not completed", "Completed"))
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 25 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 25 rows containing missing values (`geom_point()`).

Each year in the program is associated with approximately 3x greater odds of program completion.

fit <- glm( completed ~ CSC_Duration_Totalmos, 
            data = dc_vis, family=binomial)
exp(coef(fit)[2] * 12) # coefficient for months, times 12, exponentiated
## CSC_Duration_Totalmos 
##              2.718064
# confint
exp(confint(fit)[2,] * 12)
## Waiting for profiling to be done...
##    2.5 %   97.5 % 
## 2.412285 3.077916

Here’s another way of visualizing the relationship between program completion and time in the program. It’s interesting that many people “complete” the program after a very short duration.

dc_vis %>%
  ggplot() + 
  aes(x = CSC_Duration_Totalmos, fill = completed) + 
  geom_histogram() + 
  ggtitle("Another way of visualizing completion versus time in program.") + 
  xlab("Time in program") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 25 rows containing non-finite values (`stat_bin()`).

Age and time in program predict completion even after controlling for each other.

fit <- glm( completed ~ Age_Entry_Years + CSC_Duration_Totalmos, 
            data = dc_vis, family=binomial)
summary(fit)
## 
## Call:
## glm(formula = completed ~ Age_Entry_Years + CSC_Duration_Totalmos, 
##     family = binomial, data = dc_vis)
## 
## Coefficients:
##                        Estimate Std. Error z value             Pr(>|z|)    
## (Intercept)           -4.104642   0.372725 -11.013 < 0.0000000000000002 ***
## Age_Entry_Years        0.055244   0.015718   3.515              0.00044 ***
## CSC_Duration_Totalmos  0.091602   0.005926  15.456 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1763.7  on 1608  degrees of freedom
## Residual deviance: 1400.1  on 1606  degrees of freedom
##   (178 observations deleted due to missingness)
## AIC: 1406.1
## 
## Number of Fisher Scoring iterations: 5

3.4 Symptoms

We plot the observed trajectories in self-reported symptoms and discharge type. Keep in mind that the curves plotted over these graphs do not take into account the issue of differing numbers of observations per person.

dc %>%
  pivot_longer(cols = c(completed,unilateral_termination,lost_contact),
               names_to = "discharge_type", values_to = "outcome") %>%
  ggplot() + 
  aes(x = CSC_Duration_Totalmos, y = CSI_clean, group = NDA_ID, color = outcome) + 
  geom_line(alpha=0.2) + 
  geom_smooth(aes(x = CSC_Duration_Totalmos, y = CSI_clean, group = outcome)) + 
  ggtitle("CSI symptom trajectory versus program completion") + 
  xlab("Assessment time (months)") +
  xlim(c(0,30)) + 
  facet_wrap(~discharge_type)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 18003 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 15349 rows containing missing values (`geom_line()`).

These plots suggest there’s not much effect of self-reported symptom severity, but perhaps something is going on with unilateral termination. However, the above graphs don’t account for the multilevel structure of the data, so let’s use a formal model to verify.

csi_fit <- stan_glmer(unilateral_termination ~
                        # interaction between (scaled) CSI scores and time.
                      (scale(CSI_clean) + scale(CSC_Duration_Totalmos))^2 +
                      # account for clustering
                      (1|hub_c) + (1|Site_ID) + (1|NDA_ID),
                 data = dc, 
                 adapt_delta=0.99,
                 family=binomial,
                 iter = 3000,
                 cores=4)

Duration in the program is a strong predictor of unilateral_termination, but CSI scores and changes in CSI scores over time aren’t significantly predictive.

print(csi_fit, digits=3)
## stan_glmer
##  family:       binomial [logit]
##  formula:      unilateral_termination ~ (scale(CSI_clean) + scale(CSC_Duration_Totalmos))^2 + 
##     (1 | hub_c) + (1 | Site_ID) + (1 | NDA_ID)
##  observations: 1103
## ------
##                                               Median MAD_SD
## (Intercept)                                   -6.808  1.413
## scale(CSI_clean)                              -0.394  0.460
## scale(CSC_Duration_Totalmos)                  -2.013  0.546
## scale(CSI_clean):scale(CSC_Duration_Totalmos)  0.797  0.482
## 
## Error terms:
##  Groups  Name        Std.Dev.
##  NDA_ID  (Intercept) 8.638   
##  Site_ID (Intercept) 2.105   
##  hub_c   (Intercept) 2.858   
## Num. levels: NDA_ID 726, Site_ID 63, hub_c 7 
## 
## ------
## * For help interpreting the printed output see ?print.stanreg
## * For info on the priors used see ?prior_summary.stanreg

Now do the same with negative symptoms.

dc %>%
  pivot_longer(cols = c(completed,unilateral_termination,lost_contact),
               names_to = "discharge_type", values_to = "outcome") %>%
  ggplot() + 
  aes(x = CSC_Duration_Totalmos, y = COMPASS_NEG, group = NDA_ID, color = outcome) + 
  geom_line(alpha=0.2) + 
  geom_smooth(aes(x = CSC_Duration_Totalmos, y = COMPASS_NEG, group = outcome)) + 
  ggtitle("COMPASS-10 negative symptoms versus program completion") + 
  xlab("Assessment time (months)") +
  xlim(c(0,30)) + 
  facet_wrap(~discharge_type)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 17643 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 13607 rows containing missing values (`geom_line()`).

Maybe something is happening for program completion (with higher negative symptoms predicting less likelihood of completion) and lost contact (with greater decreases in negative symptoms predicting retention of contact). We’ll check both.

fit_neg_comp <- stan_glmer(completed ~ 
                   (scale(COMPASS_NEG) + scale(CSC_Duration_Totalmos))^2 +
                   # account for clustering
                   (1|hub_c) + (1|Site_ID) + (1|NDA_ID),
                 data = dc, 
                 family=binomial,
                 cores=4)

# same model, but predicting lost_contact
fit_neg_lost <- update(fit_neg_comp, lost_contact ~ .)

There was no relationship between negative symptoms and lost contact (as parameterized by this model).

print(fit_neg_lost, digits=3)
## stan_glmer
##  family:       binomial [logit]
##  formula:      lost_contact ~ scale(COMPASS_NEG) + scale(CSC_Duration_Totalmos) + 
##     (1 | hub_c) + (1 | Site_ID) + (1 | NDA_ID) + scale(COMPASS_NEG):scale(CSC_Duration_Totalmos)
##  observations: 1184
## ------
##                                                 Median MAD_SD
## (Intercept)                                     -8.617  1.233
## scale(COMPASS_NEG)                               0.047  0.387
## scale(CSC_Duration_Totalmos)                    -2.080  0.546
## scale(COMPASS_NEG):scale(CSC_Duration_Totalmos)  0.047  0.450
## 
## Error terms:
##  Groups  Name        Std.Dev.
##  NDA_ID  (Intercept) 6.479   
##  Site_ID (Intercept) 3.677   
##  hub_c   (Intercept) 1.194   
## Num. levels: NDA_ID 750, Site_ID 83, hub_c 8 
## 
## ------
## * For help interpreting the printed output see ?print.stanreg
## * For info on the priors used see ?prior_summary.stanreg

But people who completed had statistically significantly lower negative symptoms than people who did not complete.

print(fit_neg_comp, digits=3)
## stan_glmer
##  family:       binomial [logit]
##  formula:      completed ~ (scale(COMPASS_NEG) + scale(CSC_Duration_Totalmos))^2 + 
##     (1 | hub_c) + (1 | Site_ID) + (1 | NDA_ID)
##  observations: 1184
## ------
##                                                 Median MAD_SD
## (Intercept)                                     -4.703  1.563
## scale(COMPASS_NEG)                              -1.000  0.460
## scale(CSC_Duration_Totalmos)                     4.087  0.664
## scale(COMPASS_NEG):scale(CSC_Duration_Totalmos) -0.117  0.454
## 
## Error terms:
##  Groups  Name        Std.Dev.
##  NDA_ID  (Intercept) 9.234   
##  Site_ID (Intercept) 4.679   
##  hub_c   (Intercept) 4.406   
## Num. levels: NDA_ID 750, Site_ID 83, hub_c 8 
## 
## ------
## * For help interpreting the printed output see ?print.stanreg
## * For info on the priors used see ?prior_summary.stanreg
posterior_interval(fit_neg_comp, pars="scale(COMPASS_NEG)", prob=0.95)
##                         2.5%      97.5%
## scale(COMPASS_NEG) -1.940931 -0.1113606

Given this finding, we’ll want to include negative symptom scores in later models. To do this, we’ll need to get an average negative symptom score for each participant and stack it onto the discharge dataframe. We do this with a model because it allows us to get a better estimate of the average given the multilevel structure of the data, and because it allows us to impute missing data for people without COMPASS data.

neg_average_model <- stan_lmer(
  COMPASS_NEG ~ 
    # this is an intercept-only model.
    # we get the expected average for each person in each site in each hub
    (1|NDA_ID) +
    (1|Site_ID) +
    (1|hub_c),
  data=dat,
  cores=4,
  iter=2000
  )

We use the above model to get estimated negative symptom scores for each person.

# first whole dataset
neg_est <- posterior_epred(neg_average_model, 
                             newdata=dat %>% select(NDA_ID,Site_ID,hub_c))
dat$NEG_avg <- apply(neg_est, 2, mean)
# then discharge visit only
neg_est <- posterior_epred(neg_average_model, 
                             newdata=dc_vis %>% select(NDA_ID,Site_ID,hub_c))
dc_vis$NEG_avg <- apply(neg_est, 2, mean)

There doesn’t appear to be any relationship between positive symptom severity (or its trajectory) and discharge type, so we don’t examine this variable further.

dc %>%
  pivot_longer(cols = c(completed,unilateral_termination,lost_contact),
               names_to = "discharge_type", values_to = "outcome") %>%
  ggplot() + 
  aes(x = CSC_Duration_Totalmos, y = COMPASS_POS, group = NDA_ID, color = outcome) + 
  geom_line(alpha=0.2) + 
  geom_smooth(aes(x = CSC_Duration_Totalmos, y = COMPASS_POS, group = outcome)) + 
  ggtitle("COMPASS-10 positive symptoms versus program completion") + 
  xlab("Assessment time (months)") +
  xlim(c(0,30)) + 
  facet_wrap(~discharge_type)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 17664 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 13645 rows containing missing values (`geom_line()`).

3.5 NEET

NEET means not employed, in education, or in training. For this dataset I define NEET as not being employed, in school, or in an internship in the six months prior to the assessment timepoint.

dc_vis %>%
  ggplot() + 
  aes(x = percent_NEET, y = as.numeric(completed)) + 
  geom_jitter(width = 0, height = 0.05, alpha=0.2) + 
  geom_smooth(method="glm", method.args=list(family="binomial")) + 
  ggtitle("Percent of timepoints NEET (not in employment, education, or training)", 
          subtitle = "Vertical jitter added. Logistic regression fit overlaid.") + 
  xlab("Percent of timepoints where patient was NEET") + 
  ylab("") +
  scale_y_continuous(breaks = 0:1, labels=c("Not completed", "Completed"))
## `geom_smooth()` using formula = 'y ~ x'

3.6 Insurance status

Insurance status may be a proxy for socioeconomic status among young people in the United States citation, although it’s definitely complicated. In the present sample, being on medicaid or uninsured isn’t predictive of any outcome, but being on private insurance is predictive of completion and of not unilaterally terminating.

## medicaid
par(mfrow= c(1,3))
with(dc_vis, 
     mosaicplot(table(ever_medicaid, completed), shade=TRUE,
                main = ""))
with(dc_vis, 
     mosaicplot(table(ever_medicaid, unilateral_termination), shade=TRUE,
                 main = "medicaid status by outcome"))
with(dc_vis, 
     mosaicplot(table(ever_medicaid, lost_contact), shade=TRUE,
                 main = ""))

## non-insured
with(dc_vis, 
     mosaicplot(table(ever_uninsured, completed), shade=TRUE,
                main = ""))
with(dc_vis, 
     mosaicplot(table(ever_uninsured, unilateral_termination), shade=TRUE,
                 main = "non-insured status by outcome"))
with(dc_vis, 
     mosaicplot(table(ever_uninsured, lost_contact), shade=TRUE,
                 main = ""))

## private
with(dc_vis, 
     mosaicplot(table(ever_private, completed), shade=TRUE,
                main = ""))
with(dc_vis, 
     mosaicplot(table(ever_private, unilateral_termination), shade=TRUE,
                 main = "private insurance status by outcome"))
with(dc_vis, 
     mosaicplot(table(ever_private, lost_contact), shade=TRUE,
                 main = ""))

3.7 Recovery

Recovery (quantified by the QPR) looks like a decent predictor of lost contact, where CSC programs seem to be more likely to lose contact with people who have higher recovery ratings.

dc %>%
  pivot_longer(cols = c(completed,unilateral_termination,lost_contact),
               names_to = "discharge_type", values_to = "outcome") %>%
  ggplot() + 
  aes(x = CSC_Duration_Totalmos, y = QPR_clean, group = NDA_ID, color = outcome) + 
  geom_line(alpha=0.5) + 
  geom_smooth(aes(x = CSC_Duration_Totalmos, y = QPR_clean, group = outcome)) + 
  ggtitle("QPR trajectory versus program completion") + 
  xlab("Assessment time (months)") + 
  xlim(0,30) + 
  facet_wrap(~discharge_type)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 18306 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 15145 rows containing missing values (`geom_line()`).

We estimate this using QPR score as well as it’s interaction with time in order to model the effect of QPR scores and their trajectories.

fit_QPR <- stan_glmer(lost_contact ~ 
                   (scale(QPR_clean) + scale(CSC_Duration_Totalmos))^2 +
                   # account for clustering
                   (1|hub_c) + (1|Site_ID) + (1|NDA_ID),
                 data = dc, 
                 family=binomial,
                 adapt_delta = 0.99,
                 cores=4)

It does appear that people with higher QPR scores have

fit_QPR
## stan_glmer
##  family:       binomial [logit]
##  formula:      lost_contact ~ (scale(QPR_clean) + scale(CSC_Duration_Totalmos))^2 + 
##     (1 | hub_c) + (1 | Site_ID) + (1 | NDA_ID)
##  observations: 1008
## ------
##                                               Median MAD_SD
## (Intercept)                                   -8.2    1.3  
## scale(QPR_clean)                               1.0    0.5  
## scale(CSC_Duration_Totalmos)                  -1.7    0.5  
## scale(QPR_clean):scale(CSC_Duration_Totalmos)  0.5    0.4  
## 
## Error terms:
##  Groups  Name        Std.Dev.
##  NDA_ID  (Intercept) 5.6     
##  Site_ID (Intercept) 1.6     
##  hub_c   (Intercept) 1.7     
## Num. levels: NDA_ID 666, Site_ID 64, hub_c 7 
## 
## ------
## * For help interpreting the printed output see ?print.stanreg
## * For info on the priors used see ?prior_summary.stanreg

Each standard deviation increase in the QPR is associated with between 36% and 606% greater odds of lost contact. Of course this indicates considerable uncertainty, but the effect appears to be present.

exp(posterior_interval(fit_QPR, pars="scale(QPR_clean)", probs = c(0.025, 0.975)))
##                        5%      95%
## scale(QPR_clean) 1.355336 6.057725

We’ll want to use QPR scores in later models. To do this, we’ll need to get an average QPR score for each participant and stack it onto the discharge dataframe.

QPR_average_model <- stan_lmer(
  QPR_clean ~ 
    # this is an intercept-only model.
    # we get the expected average for each person in each site in each hub
    (1|NDA_ID) +
    (1|Site_ID) +
    (1|hub_c),
  data=dat,
  cores=4,
  iter=2000
  )

We use the above model to get estimated QPR scores for each person. This code is actually run as the markdown is created.

# first whole dataset
qpr_est <- posterior_epred(QPR_average_model, 
                             newdata=dat %>% select(NDA_ID,Site_ID,hub_c))
dat$QPR_avg <- apply(qpr_est, 2, mean)
# then discharge visit only
qpr_est <- posterior_epred(QPR_average_model, 
                             newdata=dc_vis %>% select(NDA_ID,Site_ID,hub_c))
dc_vis$QPR_avg <- apply(qpr_est, 2, mean)

4 Modeling discharge outcome

4.1 Program completion

fit_comp <-
  stan_glmer(
    completed ~
      scale(Age_Entry_Years) +
      percent_NEET + 
      male_c +
      ever_private + 
      scale(QPR_avg) +
      scale(NEG_avg) +
      (1|Site_ID)  +
      (1|hub_c) +
      # varying intercept for race
      (1|simple_race) +
      # check if different hubs have different race effects?
      (1|Site_ID:simple_race) +
      (1|hub_c:simple_race),
    data = dc_vis,
    cores=4,
    family=binomial
  )

Interrogate the model.

# overall model fit
print(fit_comp, digits=3)
## stan_glmer
##  family:       binomial [logit]
##  formula:      completed ~ scale(Age_Entry_Years) + percent_NEET + male_c + 
##     ever_private + scale(QPR_avg) + scale(NEG_avg) + (1 | Site_ID) + 
##     (1 | hub_c) + (1 | simple_race) + (1 | Site_ID:simple_race) + 
##     (1 | hub_c:simple_race)
##  observations: 1609
## ------
##                        Median MAD_SD
## (Intercept)            -1.160  0.465
## scale(Age_Entry_Years)  0.287  0.073
## percent_NEET           -0.908  0.283
## male_cTRUE             -0.201  0.145
## ever_privateTRUE        0.596  0.148
## scale(QPR_avg)          0.193  0.162
## scale(NEG_avg)         -0.406  0.109
## 
## Error terms:
##  Groups              Name        Std.Dev.
##  Site_ID:simple_race (Intercept) 0.6657  
##  Site_ID             (Intercept) 0.8147  
##  hub_c:simple_race   (Intercept) 0.2364  
##  hub_c               (Intercept) 1.1438  
##  simple_race         (Intercept) 0.2286  
## Num. levels: Site_ID:simple_race 310, Site_ID 86, hub_c:simple_race 41, hub_c 7, simple_race 6 
## 
## ------
## * For help interpreting the printed output see ?print.stanreg
## * For info on the priors used see ?prior_summary.stanreg
# fixed effects
plot(fit_comp, pars = names(fixef(fit_comp)), prob_outer=0.95, plotfun="areas")

# main race effects
plot(fit_comp, regex_pars = "b\\[\\(Intercept\\)\ simple_race\\:", prob_outer= 0.95, plotfun="areas")

# calculate ORs and confidence intervals using the posterior draws
draws <- as.data.frame(fit_comp, pars=names(fixef(fit_comp)))
# OR for age
mean(exp(draws$`scale(Age_Entry_Years)`))
## [1] 1.33802
# 95%CI
exp(quantile(draws$`scale(Age_Entry_Years)`, prob=c(0.025,0.975)))
##     2.5%    97.5% 
## 1.147427 1.553158
# OR for private insurance coverage
mean(exp(draws$ever_privateTRUE))
## [1] 1.833629
# 95%CI
exp(quantile(draws$ever_privateTRUE, prob=c(0.025,0.975)))
##     2.5%    97.5% 
## 1.369223 2.417438
# OR for NEET
mean(exp(-draws$percent_NEET)) # flip sign to get OR for *not* being NEET
## [1] 2.571403
# 95%CI
exp(quantile(-draws$percent_NEET, prob=c(0.025,0.975)))
##     2.5%    97.5% 
## 1.417653 4.291708
# OR for negative symptoms
mean(exp(draws$`scale(NEG_avg)`))
## [1] 0.6691529
# 95%CI
exp(quantile(draws$`scale(NEG_avg)`, prob=c(0.025,0.975)))
##      2.5%     97.5% 
## 0.5364538 0.8191770

Regarding program completion, there’s substantial variation by hub and by site.

plot(fit_comp, regex_pars = "Sigma", prob_outer=0.95, plotfun="areas")

sigma_draws <- as.data.frame(fit_comp, regex_pars = "Sigma")
means <- data.frame(apply(sigma_draws, 2, mean))
CIs <- t(apply(sigma_draws, 2, function(x){quantile(x, probs=c(0.025,0.975))}))
sd_estimates <- cbind(means, CIs); names(sd_estimates)[1] <- "est"
sd_estimates
##                                                           est          2.5%
## Sigma[Site_ID:simple_race:(Intercept),(Intercept)] 0.44309673 0.04609038721
## Sigma[Site_ID:(Intercept),(Intercept)]             0.66381348 0.24420704599
## Sigma[hub_c:simple_race:(Intercept),(Intercept)]   0.05588683 0.00004686279
## Sigma[hub_c:(Intercept),(Intercept)]               1.30819933 0.06475704902
## Sigma[simple_race:(Intercept),(Intercept)]         0.05227878 0.00005072530
##                                                        97.5%
## Sigma[Site_ID:simple_race:(Intercept),(Intercept)] 1.0038078
## Sigma[Site_ID:(Intercept),(Intercept)]             1.3017215
## Sigma[hub_c:simple_race:(Intercept),(Intercept)]   0.2952444
## Sigma[hub_c:(Intercept),(Intercept)]               4.7157268
## Sigma[simple_race:(Intercept),(Intercept)]         0.3123491

Here are the hub effects (on the logit scale). Note LEAP is not included because they have fully missing age data and so the 53 LEAP discharges drop out of the regression. Completion rates for LEAP can still be estimated by the model using the observed hub-level distribution.

plot(fit_comp, regex_pars = "t\\) hub_c:(O|A|E|C)", prob_outer=0.95, plotfun="areas")

draws_hub <- as.data.frame(fit_comp, regex_pars = "t\\) hub_c:(O|A|E|L|C)")
difference <- draws_hub$`b[(Intercept) hub_c:ESPRITO]` - draws_hub$`b[(Intercept) hub_c:EPINET-TX]`
mean(exp(difference))
## [1] 3.850913
quantile(exp(difference), prob=c(0.025,0.975))
##      2.5%     97.5% 
##  0.995311 11.520883

There is less variability by race, and none of the differences between race are statistically significant.

plot(fit_comp, regex_pars = "t\\) simple_race", prob_outer=0.95, plotfun="areas")

draws_race <- as.data.frame(fit_comp, regex_pars = "t\\) simple_race")
biggest_difference <- draws_race$`b[(Intercept) simple_race:Multiracial]` - draws_race$`b[(Intercept) simple_race:Black]` 
mean(exp(biggest_difference))
## [1] 1.189367
quantile(exp(biggest_difference), prob=c(0.025,0.975))
##      2.5%     97.5% 
## 0.8019533 2.1170134

4.2 Lost contact

fit_lost <- 
  stan_glmer(
    lost_contact ~ 
      scale(Age_Entry_Years) +
      percent_NEET + 
      male_c +
      ever_private + 
      scale(QPR_avg) + 
      scale(NEG_avg) +
      (1|simple_race) +
      (1|Site_ID)  +
      (1|hub_c) +
      # varying intercept for race
      # check if different hubs have different racial trajectories?
      (1  |Site_ID:simple_race) +
      (1 |hub_c:simple_race),
    data = dc_vis,
    adapt_delta=0.99,
    cores=4,
    family=binomial
  )

Interrogate the model.

# overall output
print(fit_lost, digits=3)
## stan_glmer
##  family:       binomial [logit]
##  formula:      lost_contact ~ scale(Age_Entry_Years) + percent_NEET + male_c + 
##     ever_private + scale(QPR_avg) + scale(NEG_avg) + (1 | simple_race) + 
##     (1 | Site_ID) + (1 | hub_c) + (1 | Site_ID:simple_race) + 
##     (1 | hub_c:simple_race)
##  observations: 1609
## ------
##                        Median MAD_SD
## (Intercept)            -2.255  0.424
## scale(Age_Entry_Years) -0.018  0.082
## percent_NEET            0.027  0.294
## male_cTRUE              0.413  0.166
## ever_privateTRUE       -0.450  0.162
## scale(QPR_avg)          0.227  0.161
## scale(NEG_avg)          0.085  0.099
## 
## Error terms:
##  Groups              Name        Std.Dev.
##  Site_ID:simple_race (Intercept) 0.4780  
##  Site_ID             (Intercept) 0.5350  
##  hub_c:simple_race   (Intercept) 0.2986  
##  hub_c               (Intercept) 0.6265  
##  simple_race         (Intercept) 0.6768  
## Num. levels: Site_ID:simple_race 310, Site_ID 86, hub_c:simple_race 41, hub_c 7, simple_race 6 
## 
## ------
## * For help interpreting the printed output see ?print.stanreg
## * For info on the priors used see ?prior_summary.stanreg
# fixed effects
plot(fit_lost, pars = names(fixef(fit_lost)), prob_outer=0.95, plotfun="areas")

# race effects (are substantial for this outcome)
plot(fit_lost, regex_pars = "b\\[\\(Intercept\\)\ simple_race\\:", prob_outer= 0.95, plotfun="areas")

# calculate ORs and confidence intervals using posterior draws
draws <- as.data.frame(fit_lost, pars=names(fixef(fit_lost)))
# effect of being male
mean(exp(draws$male_cTRUE))
## [1] 1.533791
# 95% CI
exp(quantile(draws$male_cTRUE, prob=c(0.025,0.975)))
##     2.5%    97.5% 
## 1.098561 2.102246
# effect of private insurance
mean(exp(draws$ever_privateTRUE))
## [1] 0.645468
# 95% CI
exp(quantile(draws$ever_privateTRUE, prob=c(0.025,0.975)))
##      2.5%     97.5% 
## 0.4593164 0.8844855

There’s substantial race-level variation. Specifically, black people had higher lost contact rates than every other race category (no other race differences significant).

# race effects
draws_race <- as.data.frame(fit_lost, regex_pars = "b\\[\\(Intercept\\)\ simple_race\\:")
# black people substantially more likely to have lost contact than white people
black_white_diff <- with(draws_race, `b[(Intercept) simple_race:Black]` - `b[(Intercept) simple_race:White]`)
#OR
mean(exp(black_white_diff))
## [1] 3.021695
#95%CI
quantile(exp(black_white_diff), prob = c(0.025, 0.975))
##     2.5%    97.5% 
## 1.206379 5.152579
# difference between black and asian also significant
black_asian_diff <- with(draws_race, `b[(Intercept) simple_race:Black]` - `b[(Intercept) simple_race:AsianAPI]`)
#OR
mean(exp(black_asian_diff))
## [1] 3.725463
#95%CI
quantile(exp(black_asian_diff), prob = c(0.025, 0.975))
##     2.5%    97.5% 
## 1.201704 8.494808
# difference between black and hispanic also significant
black_hisp_diff <- with(draws_race, `b[(Intercept) simple_race:Black]` - `b[(Intercept) simple_race:Hispanic]`)
#OR
mean(exp(black_hisp_diff))
## [1] 2.960781
#95%CI
quantile(exp(black_hisp_diff), prob = c(0.025, 0.975))
##     2.5%    97.5% 
## 1.192902 5.408228
# difference between black and multiracial
black_multi_diff <- with(draws_race, `b[(Intercept) simple_race:Black]` - `b[(Intercept) simple_race:Multiracial]`)
#OR
mean(exp(black_multi_diff))
## [1] 2.196098
#95%CI
quantile(exp(black_multi_diff), prob = c(0.025, 0.975))
##      2.5%     97.5% 
## 0.9087752 4.5260508
# difference between black and other
black_other_diff <- with(draws_race, `b[(Intercept) simple_race:Black]` - `b[(Intercept) simple_race:Other]`)
#OR
mean(exp(black_other_diff))
## [1] 2.324933
#95%CI
quantile(exp(black_other_diff), prob = c(0.025, 0.975))
##      2.5%     97.5% 
## 0.9853573 4.5254354

There’s less program- and hub-level variability for lost contact than for program completion, but it’s there. The most substantial variability we see in these random effects is the estimated standard deviation for race. There also appears to be variability in the relationship between race and lost contact across sites.

plot(fit_lost, regex_pars = "Sigma", prob_outer=0.95, plotfun="areas")

sigma_draws <- as.data.frame(fit_lost, regex_pars = "Sigma")
means <- data.frame(apply(sigma_draws, 2, mean))
CIs <- t(apply(sigma_draws, 2, function(x){quantile(x, probs=c(0.025,0.975))}))
sd_estimates <- cbind(means, CIs); names(sd_estimates)[1] <- "est"
sd_estimates
##                                                          est         2.5%
## Sigma[Site_ID:simple_race:(Intercept),(Intercept)] 0.2284999 0.0007348562
## Sigma[Site_ID:(Intercept),(Intercept)]             0.2862249 0.0055102774
## Sigma[hub_c:simple_race:(Intercept),(Intercept)]   0.0891474 0.0001323044
## Sigma[hub_c:(Intercept),(Intercept)]               0.3924918 0.0037405062
## Sigma[simple_race:(Intercept),(Intercept)]         0.4579949 0.0360529004
##                                                        97.5%
## Sigma[Site_ID:simple_race:(Intercept),(Intercept)] 0.7163858
## Sigma[Site_ID:(Intercept),(Intercept)]             0.7498976
## Sigma[hub_c:simple_race:(Intercept),(Intercept)]   0.4685355
## Sigma[hub_c:(Intercept),(Intercept)]               1.6066162
## Sigma[simple_race:(Intercept),(Intercept)]         1.8238435

Here are the hub effects.

plot(fit_lost, regex_pars = "t\\) hub_c:(O|A|E|C|L)", prob_outer=0.95, plotfun="areas")

4.3 Unilateral termination

fit_uni <- 
  stan_glmer(
    unilateral_termination ~ 
      scale(Age_Entry_Years) +
      percent_NEET + 
      male_c +
      ever_private +
      scale(QPR_avg) + 
      scale(NEG_avg) +
      (1|simple_race) +
      (1|Site_ID)  +
      (1|hub_c) +
      # varying intercept for race
      # check if different hubs have different racial trajectories?
      (1  |Site_ID:simple_race) +
      (1 |hub_c:simple_race),
    data = dc_vis,
    cores=4,
    family=binomial
  )

Interrogate the model.

# full model
print(fit_uni, digits=3)
## stan_glmer
##  family:       binomial [logit]
##  formula:      unilateral_termination ~ scale(Age_Entry_Years) + percent_NEET + 
##     male_c + ever_private + scale(QPR_avg) + scale(NEG_avg) + 
##     (1 | simple_race) + (1 | Site_ID) + (1 | hub_c) + (1 | Site_ID:simple_race) + 
##     (1 | hub_c:simple_race)
##  observations: 1609
## ------
##                        Median MAD_SD
## (Intercept)            -1.225  0.283
## scale(Age_Entry_Years) -0.117  0.071
## percent_NEET            0.028  0.229
## male_cTRUE              0.067  0.132
## ever_privateTRUE       -0.143  0.132
## scale(QPR_avg)          0.090  0.129
## scale(NEG_avg)          0.088  0.085
## 
## Error terms:
##  Groups              Name        Std.Dev.
##  Site_ID:simple_race (Intercept) 0.1968  
##  Site_ID             (Intercept) 0.5384  
##  hub_c:simple_race   (Intercept) 0.3121  
##  hub_c               (Intercept) 0.5354  
##  simple_race         (Intercept) 0.2685  
## Num. levels: Site_ID:simple_race 310, Site_ID 86, hub_c:simple_race 41, hub_c 7, simple_race 6 
## 
## ------
## * For help interpreting the printed output see ?print.stanreg
## * For info on the priors used see ?prior_summary.stanreg
# plot of fixed effects
plot(fit_uni, pars = names(fixef(fit_uni)), prob_outer=0.95, plotfun="areas")

# age comes close to (but is not) statistically significant
posterior_interval(fit_uni, pars="scale(Age_Entry_Years)", prob=0.95)
##                              2.5%      97.5%
## scale(Age_Entry_Years) -0.2603549 0.01726058
# no real race effects
plot(fit_uni, regex_pars = "b\\[\\(Intercept\\)\ simple_race\\:", prob_outer= 0.95, plotfun="areas")

There was a fair amount of variability in unilateral termination by program. Less so by hub.

plot(fit_uni, regex_pars = "Sigma", prob_outer=0.95, plotfun = "areas")

sigma_draws <- as.data.frame(fit_uni, regex_pars = "Sigma")
means <- data.frame(apply(sigma_draws, 2, mean))
CIs <- t(apply(sigma_draws, 2, function(x){quantile(x, probs=c(0.025,0.975))}))
sd_estimates <- cbind(means, CIs); names(sd_estimates)[1] <- "est"
sd_estimates
##                                                           est          2.5%
## Sigma[Site_ID:simple_race:(Intercept),(Intercept)] 0.03873115 0.00003343512
## Sigma[Site_ID:(Intercept),(Intercept)]             0.28992637 0.08599491075
## Sigma[hub_c:simple_race:(Intercept),(Intercept)]   0.09742719 0.00071400301
## Sigma[hub_c:(Intercept),(Intercept)]               0.28669581 0.00180987923
## Sigma[simple_race:(Intercept),(Intercept)]         0.07210927 0.00010043213
##                                                        97.5%
## Sigma[Site_ID:simple_race:(Intercept),(Intercept)] 0.1799816
## Sigma[Site_ID:(Intercept),(Intercept)]             0.6087732
## Sigma[hub_c:simple_race:(Intercept),(Intercept)]   0.3248533
## Sigma[hub_c:(Intercept),(Intercept)]               1.2361803
## Sigma[simple_race:(Intercept),(Intercept)]         0.3734211
plot(fit_uni, regex_pars = "t\\) hub_c:(O|A|E|C)", prob_outer=0.95, plotfun="areas")

draws_hub <- as.data.frame(fit_uni, regex_pars = "t\\) hub_c:(O|A|E|L|C)")
difference <- draws_hub$`b[(Intercept) hub_c:EPINET-TX]` - draws_hub$`b[(Intercept) hub_c:CLHS]`
mean(exp(difference))
## [1] 2.438288
quantile(exp(difference), probs=c(0.025, 0.975))
##      2.5%     97.5% 
## 0.9882964 5.4119606

Transferred/moved?

At the request of a reviewer we also include a model for the “transferred or moved” category, although we do not report on this model in the main paper as we find the category both difficult to interpret and in many cases (e.g., moving to a different state) likely unrelated to treatment.

fit_transfer <- 
  stan_glmer(
    transferred_or_moved ~ 
      scale(Age_Entry_Years) +
      percent_NEET + 
      male_c +
      ever_private +
      scale(QPR_avg) + 
      scale(NEG_avg) +
      (1|simple_race) +
      (1|Site_ID)  +
      (1|hub_c) +
      # varying intercept for race
      # check if different hubs have different racial trajectories?
      (1  |Site_ID:simple_race) +
      (1 |hub_c:simple_race),
    data = dc_vis,
    cores=4,
    family=binomial
  )
plot(fit_transfer, pars = names(fixef(fit_transfer)), prob_outer=0.95)

4.4 Plot model results together

We build a plot of all the coefficients of the three primary models.

Start by combining draws for the fixed effects.

draws_uni <- as.data.frame(fit_uni, pars = names(fixef(fit_uni)))
draws_lost <- as.data.frame(fit_lost, pars = names(fixef(fit_lost)))
draws_comp <- as.data.frame(fit_comp, pars = names(fixef(fit_comp)))
# race effects
race_draws_uni <- as.data.frame(fit_uni, regex_pars = "b\\[\\(Intercept\\)\ simple_race\\:")
race_draws_lost <- as.data.frame(fit_lost, regex_pars = "b\\[\\(Intercept\\)\ simple_race\\:")
race_draws_comp <- as.data.frame(fit_comp, regex_pars = "b\\[\\(Intercept\\)\ simple_race\\:")
draws_uni <- cbind(draws_uni, race_draws_uni)
draws_lost <- cbind(draws_lost, race_draws_lost)
draws_comp <- cbind(draws_comp, race_draws_comp)
# drop intercept
draws_uni <- draws_uni[-1]; draws_lost <- draws_lost[-1]; draws_comp <- draws_comp[-1]

# add random effects for race
draws_combined <- rbind(
  data.frame(draws_uni, model = "Unilateral termination"),
  data.frame(draws_lost, model = "Lost contact"),
  data.frame(draws_comp, model = "Program completion")
)

# rename for readability
names(draws_combined) <- c("Age (standardized)", "Percent of visits NEET", "Male","Ever privately insured", "QPR (standardized)","Negative Sx (standardized)", "Race:White", "Race:Hispanic", "Race:Black", "Race:Multiracial", "Race:Other", "Race:Asian/Pacific Islander", "Model")

Then build the plot.

par_labels <- c("Age (standardized)", "Percent of visits NEET", "Male", "Ever privately insured" , "QPR (standardized)","Negative Sx (standardized)","Race:White", "Race:Hispanic", "Race:Black", "Race:Asian/Pacific Islander","Race:Multiracial", "Race:Other")
bounds <- 1.6
coef_plot_uni <- bayesplot::mcmc_intervals(
  draws_combined %>% subset(Model == "Unilateral termination", select=-c(Model)),
  pars = par_labels,
  prob_outer = 0.95
) + xlim(-bounds,bounds) + ggtitle("Unilateral termination")
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
coef_plot_comp <- bayesplot::mcmc_intervals(
  draws_combined %>% subset(Model == "Program completion", select=-c(Model)),
    pars = par_labels,
  prob_outer = 0.95
) + xlim(-bounds,bounds) + ggtitle("Program completion")
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
coef_plot_lost <- bayesplot::mcmc_intervals(
  draws_combined %>% subset(Model == "Lost contact", select=-c(Model)),
    pars = par_labels,
  prob_outer = 0.95
) + xlim(-bounds,bounds) + ggtitle("Lost contact")
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
gridExtra::grid.arrange(coef_plot_comp, coef_plot_lost, coef_plot_uni, ncol=1)

5 Predictive modeling

We think our estimate of completion rate is an underestimate because the longer you stay in CSC the more likely you are to complete, and so all these people in the dataset who haven’t yet discharged become increasingly likely to complete the longer we track them.

We want a lower bound on our completion rate underestimate, so we fit a predictive model (basically the above model but with program duration added as a predictor, which is very, very strongly correlated with program completion).

pred_comp <-
  stan_glmer(
    completed ~
      CSC_Duration_Totalmos + #### THIS IS THE ONLY NEW VARIABLE
      scale(Age_Entry_Years) +
      percent_NEET + 
      male_c +
      ever_private +
      scale(QPR_avg) + 
      scale(NEG_avg) +
      (1|simple_race) +
      (1|Site_ID)  +
      (1|hub_c) +
      (1|Site_ID:simple_race) +
      (1|hub_c:simple_race),
    data = dc_vis,
    cores=4,
    adapt_delta = 0.99,
    family=binomial
    )

We use this model to get predictions for the people who haven’t discharged yet.

non_missing_cases <-
  dc_vis %>% select(CSC_Duration_Totalmos,
                  Age_Entry_Years, 
                  percent_NEET, 
                  simple_race,
                  male_c,
                  ever_private,
                  Site_ID,
                  QPR_avg,
                  hub_c, 
                  completed
                  ) %>%
  complete.cases()
## Adding missing grouping variables: `NDA_ID`
non_missing_discharged <- dc_vis[non_missing_cases,]

Let’s see how good the model is for in-sample predictions

ppred <- posterior_epred(pred_comp, newdata = non_missing_discharged)
pred <- colMeans(ppred)
# model-predicted completion rate
mean(pred)
## [1] 0.2314023
# actual completion rate
mean(non_missing_discharged$completed)
## [1] 0.2374145

Let’s plot model predicted program completion versus actuzal completion.

non_missing_discharged$pred <- pred
non_missing_discharged %>%
  ggplot() + 
  aes(x = pred, y = completed) +
  geom_jitter(width=0, height=0.05, alpha=0.2)

We get good (~88%) accuracy.

confusionMatrix(factor(non_missing_discharged$pred >= 0.5, levels=c(FALSE, TRUE), labels=c("not completed", "completed")), # pred
                factor(non_missing_discharged$completed, levels=c(FALSE, TRUE), labels=c("not completed", "completed")) # truth
)
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      not completed completed
##   not completed          1172       149
##   completed                55       233
##                                                
##                Accuracy : 0.8732               
##                  95% CI : (0.856, 0.8891)      
##     No Information Rate : 0.7626               
##     P-Value [Acc > NIR] : < 0.00000000000000022
##                                                
##                   Kappa : 0.6174               
##                                                
##  Mcnemar's Test P-Value : 0.0000000000745      
##                                                
##             Sensitivity : 0.9552               
##             Specificity : 0.6099               
##          Pos Pred Value : 0.8872               
##          Neg Pred Value : 0.8090               
##              Prevalence : 0.7626               
##          Detection Rate : 0.7284               
##    Detection Prevalence : 0.8210               
##       Balanced Accuracy : 0.7826               
##                                                
##        'Positive' Class : not completed        
## 

We use the above model to predict for the people who haven’t discharged yet

latest <- dat %>%
  group_by(NDA_ID) %>%
  slice(which.max(Assessment_Type)) %>%
  subset(!(Assessment_Type %in% c(26,27))) %>%
  ungroup()

non_missing_latest <-
  latest %>% select(CSC_Duration_Totalmos,
                  Age_Entry_Years, 
                  percent_NEET, 
                  simple_race,
                  male_c,
                  ever_private,
                  QPR_avg,
                  NEG_avg,
                  Site_ID,
                  hub_c
                    ) %>%
  complete.cases()
non_missing_undischarged <- latest[non_missing_latest,]
ppred <- posterior_epred(pred_comp, newdata = non_missing_undischarged)
pred <- colMeans(ppred)

Model-predicted completion rates for people who haven’t discharged yet are actually lower than in the group that’s already discharged.

mean(pred)
## [1] 0.1812476

This may partially be due to the fact that people who haven’t yet discharged are more likely to be earlier in the course of the program, and if they continue (which they will) their predicted and actual completion rates would both rise.

mean(non_missing_undischarged$CSC_Duration_Totalmos)
## [1] 13.68466
mean(non_missing_discharged$CSC_Duration_Totalmos)
## [1] 16.04351
non_missing_undischarged$discharged <- FALSE
non_missing_discharged$discharged <- TRUE
bind_rows(non_missing_undischarged, non_missing_discharged) %>%
  ggplot() + 
  aes(x = CSC_Duration_Totalmos, fill = discharged) + 
  geom_density(alpha=0.5) + 
  ggtitle("distribution of the most recent observation",
          subtitle="by discharge status")

In any case, here is the predicted total hub-wide completion rate under various assumptions.

Assumption 1: that the latest observed enrollment duration is the final enrollment duration.

ppred <- posterior_epred(pred_comp, 
                         newdata = 
                           bind_rows(non_missing_undischarged, 
                                     non_missing_discharged) %>%
                           select(CSC_Duration_Totalmos,
                  Age_Entry_Years, 
                  percent_NEET, 
                  simple_race,
                  male_c,
                  ever_private,
                  QPR_avg,
                  NEG_avg,
                  Site_ID,
                  hub_c
                ))
pred <- colMeans(ppred)
mean(pred)
## [1] 0.2022029

…that the non-discharged folks will discharge at (at least) the mean of the discharged folks

av_disch <- mean(non_missing_discharged$CSC_Duration_Totalmos)
non_missing_undischarged$CSC_Duration_Totalmos <-
  ifelse(non_missing_undischarged$CSC_Duration_Totalmos < av_disch, av_disch,non_missing_undischarged$CSC_Duration_Totalmos )
ppred <- posterior_epred(pred_comp, 
                         newdata = 
                           bind_rows(non_missing_undischarged, 
                                     non_missing_discharged) %>%
                           select(CSC_Duration_Totalmos,
                  Age_Entry_Years, 
                  percent_NEET, 
                  simple_race,
                  male_c,
                  ever_private,
                  QPR_avg,
                  NEG_avg,
                  Site_ID,
                  hub_c
                  ))
pred <- colMeans(ppred)
mean(pred)
## [1] 0.2384769

…that the non-discharged folks will discharge at (at least) the upper quartile of the discharged folks

(upper_quartile <- quantile(non_missing_discharged$CSC_Duration_Totalmos, 0.75)) 
## 75% 
##  24
non_missing_undischarged$CSC_Duration_Totalmos <-
  ifelse(non_missing_undischarged$CSC_Duration_Totalmos < upper_quartile, upper_quartile,non_missing_undischarged$CSC_Duration_Totalmos )

ppred <- posterior_epred(pred_comp, 
                         newdata = 
                           bind_rows(non_missing_undischarged, 
                                     non_missing_discharged) %>%
                           select(CSC_Duration_Totalmos,
                                  Age_Entry_Years, 
                                  percent_NEET, 
                                  simple_race,
                                  male_c,
                                  ever_private,
                                  QPR_avg,
                                  NEG_avg,
                                  Site_ID,
                                  hub_c
                                  ))
pred <- colMeans(ppred)
mean(pred)
## [1] 0.292774