This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
We start by loading packages.
library(dplyr)
library(tidyverse)
library(zoo)
library(caret)
library(rstanarm)
And data.
full_dat <- readxl::read_xlsx("Z:\\Harmonized\ CAB\ File\\Current\ Data\\Harmonized-CAB-Phases-1-2-Data-9-18-23.xlsx")
A lot of the models we run in this markdown take too long to fit in tandem with the markdown because the session times out relatively quickly. My solution is to fit these models and progressively save the posterior draws. In the following chunk I load these models.
folder <- "Z:\\HUB\ Shared\ Working\ Folders\\Connection\ LHS\\Current\\Peter\\out\\"
# load model fits. we can't refit these each time, it takes too long
load(file = paste0(folder, "models.RData"))
We process the data into formats that are conducive to analysis.
Each observation occurs within the context of a site and a hub. Confusingly, the Site ID gets reused between some hubs but not between others.
To get a unique Site ID we concatenate it with Hub_ID
full_dat$Site_ID <- paste(full_dat$Hub_ID, full_dat$Site_ID, sep="_")
Westat was good about providing detailed codes for missingness.
They’re always negative values. In the following chunk, we recode these
negative values as NA.
# define function that takes vector and replaces negative values with NA
neg_to_NA <- function(x) {x[x < 0] <- NA; return(x)}
full_dat <-
full_dat %>% mutate(
Reason_Discharge = neg_to_NA(Reason_Discharge),
CSC_Duration_Totalmos = neg_to_NA(CSC_Duration_Totalmos),
Age_Entry_Months = neg_to_NA(Age_Entry_Months),
QPR_clean = neg_to_NA(QPR_Score),
CSI_clean = neg_to_NA(CSI_Score),
Gender_ID = neg_to_NA(Gender_ID),
QoL = neg_to_NA(QoL)
)
We do the same for all the COMPASS-10 items. These we vectorize to make things easier
# also do this with all the COMPASS10 subscales and calculate
C10_cols <- grep('COMPASS10',names(full_dat))
names(full_dat)[C10_cols]
## [1] "COMPASS10_Depressed" "COMPASS10_Anxiety"
## [3] "COMPASS10_Suicidal" "COMPASS10_Hostility"
## [5] "COMPASS10_Suspiciousness" "COMPASS10_Unusual"
## [7] "COMPASS10_Hallucinations" "COMPASS10_Disorganization"
## [9] "COMPASS10_Avolition" "COMPASS10_Asociality"
full_dat[,C10_cols] <- sapply(full_dat[,C10_cols],neg_to_NA)
While we’re here, let’s calculate the COMPASS-10 negative and positive total scores.
full_dat$COMPASS_NEG <- rowMeans(
full_dat[c("COMPASS10_Asociality",
"COMPASS10_Avolition")],
na.rm=TRUE)
full_dat$COMPASS_POS <- rowMeans(
full_dat[c("COMPASS10_Disorganization",
"COMPASS10_Hallucinations",
"COMPASS10_Unusual"
)],
na.rm=TRUE)
Sites were able to make decisions about which symptom scales to administer.
Here’s the approximate rate of missingness for some key potential variables.
mean(full_dat$BPRS_Anxiety < 0) # neg values are NA
## [1] 0.9978471
mean(is.na(full_dat$CSI_clean)) # these neg values were recoded
## [1] 0.7831001
mean(is.na(full_dat$COMPASS_NEG))
## [1] 0.7412391
mean(is.na(full_dat$QoL))
## [1] 0.8322569
QoL and BPRS are both > 80% missing. They just seem unusable.
Now here is a pretty monster chunk of code that produces lots of variables of interest.
dat <- full_dat %>%
# first, observation-level variables
mutate(
months_in_program = ifelse(CSC_Duration_Totalmos < 0, NA,
CSC_Duration_Totalmos),
disch_visit = Assessment_Type == 26,
gender_c = factor(Gender_ID,
levels = 1:6,
labels = c("Male",
"Female",
"Non-binary",
"Transgender male",
"Transgender female",
"Other")
),
hub_c = factor(Hub_ID,
levels = 1:8,
labels = c("AC-EPINET",
"CLHS",
"EPI-CAL",
"EPI-MINN",
"EPINET-TX",
"ESPRITO",
"LEAP",
"ONTY")
),
discharge_c =
recode_factor(Reason_Discharge,
`1` = "unilateral_termination", # "Terminated, refused or declined services"
`2` = "completed", # "Completed program, graduated, or services no longer indicated due to client improvement"
`3` = "ineligible", # "Client does not display signs and symptoms that lead to the inclusion of a covered diagnosis and/or an established level of impairment"
`4` = "completed", # "Has reached limit for length of allowable stay"
`5` = "pursuing_pos_opportunity", # "Pursuing a positive opportunity elsewhere (e.g., school, employment, training)"
`6` = "incarcerated", # "Admitted to state hospital"
`7` = "incarcerated", # "Admitted to a residential program"
`8` = "transferred_or_moved", # "Transferred services to provider outside CSC program (other than state hospital or residential program)"
`9` = "incarcerated", # "Incarcerated"
`10` = "transferred_or_moved", # "Moved out of service area because of reasons other than options noted above"
`11` = "death", # "Deceased (by suicide)"
`12` = "death", # "Deceased (by other means)"
`13` = "lost_contact", # "Whereabouts unknown, team unable to contact client"
`14` = "other", # "Other"
`15` = "death" # "Deceased (unknown)"
),
# "NEET" not in employment, education or training
NEET = !(Job_6mos == 1 | Internship == 1 | School_Attendance %in% 2:3)
) %>%
# then, person-level variables
group_by(NDA_ID) %>%
mutate(male_c = any(gender_c == "Male", na.rm=T),
NIMH_sex = ifelse(any(Bio_Sex == 1, na.rm=T), "Female",
ifelse(any(Bio_Sex == 2, na.rm=T), "Male",
"Unknown/Not Reported")),
ever_medicaid = any(Health_Insurance_Medicaid == 1, na.rm =T),
ever_private = any(Health_Insurance_Commercial == 1 |
Health_Insurance_Private == 1 |
Health_Insurance_Other == 1, na.rm=T),
ever_uninsured = any(Health_Insurance_None == 1, na.rm=t),
ever_SSI = any(SSI == 1, na.rm=T),
black_c = any(Race_Black == 1, na.rm=T),
white_c = any(Race_White == 1, na.rm=T),
gender_c = if_else(!is.na(gender_c), gender_c, NA),
hispanic_c = any(Ethnicity == 1, na.rm=T),
asian_pi_c = any(Race_Asian ==1, na.rm=T) | any(Race_NH ==1, na.rm=T),
# create single race variable
# note: zero AI
race_c = ifelse( ((any(Race_White ==1, na.rm=T) +
any(Race_Black == 1, na.rm=T) +
any(Race_Asian ==1, na.rm=T) +
any(Race_AI ==1, na.rm=T) +
any(Race_NH ==1, na.rm=T) ) > 1) |
( (any(Ethnicity == 1, na.rm=T) +
any(Race_Black == 1, na.rm=T) +
any(Race_Asian ==1, na.rm=T) +
any(Race_AI ==1, na.rm=T) +
any(Race_NH ==1, na.rm=T)) > 1), "Multiracial",
ifelse(any(Ethnicity == 1), "Hispanic",
ifelse(any(Race_White == 1, na.rm=T), "White",
ifelse(any(Race_Black == 1, na.rm=T), "Black",
ifelse(any(Race_Asian == 1, na.rm=T), "Asian",
ifelse(any(Race_AI == 1, na.rm=T), "AI",
ifelse(any(Race_NH ==1, na.rm=T), "API",
ifelse(any(Race_Other == 1, na.rm=T), "Other",
"unknown or prefer not to say")))))))),
simple_race =factor(ifelse(
race_c == "White", "White",
ifelse(race_c == "Black", "Black",
ifelse(race_c == "Hispanic", "Hispanic",
ifelse( (race_c == "Asian" | race_c == "API"), "AsianAPI",
ifelse(race_c == "Multiracial", "Multiracial", "Other")))))),
NIMH_race = factor(
ifelse(
(any(Race_White ==1, na.rm=T) +
any(Race_Black == 1, na.rm=T) +
any(Race_Asian ==1, na.rm=T) +
any(Race_AI ==1, na.rm=T) +
any(Race_NH ==1, na.rm=T)) > 1,
"More than one race",
ifelse(any(Race_AI ==1, na.rm=T),"American Indian/Alaskan Native",
ifelse(any(Race_NH == 1, na.rm=T), "Native Hawaiian or Other Pacific Islander",
ifelse(any(Race_Black == 1, na.rm=T), "Black or African American",
ifelse(any(Race_White == 1, na.rm=T), "White",
ifelse(any(Race_Asian == 1, na.rm=T), "Asian",
"Unknown/Not reported"))))))),
percent_NEET = mean(NEET == 1, na.rm=T),
any_NEET = any(NEET == 1, na.rm=T),
Age_Entry_Months = na.locf(Age_Entry_Months, na.rm=FALSE),
Age_Entry_Years = ifelse(any(!is.na(Age_Entry_Months)), round(Age_Entry_Months / 12, 1), NA)
) %>%
ungroup() %>%
# finally, some row-level variables that are a function of person-level variables
mutate(
Age_At_Assessment = round( (Age_Entry_Months + CSC_Duration_Totalmos) / 12, 1 )
)
# verify that the race codes make sense
dat %>% subset(Assessment_Type ==0 ) %>% select(Race_Black, Race_White, Race_Asian, Race_Other, Ethnicity, hispanic_c, black_c, white_c, race_c, simple_race)
## # A tibble: 5,083 x 10
## Race_Black Race_White Race_Asian Race_Other Ethnicity hispanic_c black_c
## <dbl> <dbl> <dbl> <dbl> <dbl> <lgl> <lgl>
## 1 -9 -9 -9 0 -8 FALSE FALSE
## 2 -9 -9 -9 0 1 TRUE FALSE
## 3 -9 -9 -9 0 1 TRUE FALSE
## 4 -9 -9 -9 0 1 TRUE FALSE
## 5 -9 -9 -9 0 2 FALSE FALSE
## 6 -9 -9 -9 0 1 TRUE FALSE
## 7 -9 -9 -9 0 1 TRUE FALSE
## 8 -9 -9 -9 0 1 TRUE FALSE
## 9 -9 -9 -9 0 2 FALSE FALSE
## 10 -9 1 1 0 2 FALSE FALSE
## # i 5,073 more rows
## # i 3 more variables: white_c <lgl>, race_c <chr>, simple_race <fct>
Patient-level descriptives of the full sample
desc <-
dat %>%
group_by(NDA_ID) %>%
summarise(
age_at_entry = mean(Age_Entry_Years, na.rm=T),
simple_race = factor(unique(simple_race)),
NIMH_race = factor(
ifelse((any(Race_White == 1, na.rm=T) +
any(Race_Black == 1, na.rm=T) +
any(Race_Asian == 1, na.rm=T) +
any(Race_AI == 1, na.rm=T) +
any(Race_NH == 1, na.rm=T)) > 1,
"More than one race",
ifelse(any(Race_AI ==1, na.rm=T), "American Indian/Alaskan Native",
ifelse(any(Race_NH == 1, na.rm=T), "Native Hawaiian or Other Pacific Islander",
ifelse(any(Race_Black == 1, na.rm=T), "Black or African American",
ifelse(any(Race_White == 1, na.rm=T), "White",
ifelse(any(Race_Asian == 1, na.rm=T), "Asian",
"Unknown/Not reported"))))))),
hispanic = ifelse(any(Ethnicity == 1, na.rm=T), "hispanic",
ifelse(any(Ethnicity == 2, na.rm =T), "not hispanic",
"unknown/unreported")),
male = any(gender_c == "Male", na.rm=T),
female = any(gender_c == "Female", na.rm=T),
trans = any(gender_c %in% c("Transgender male", "Transgender female"), na.rm=T),
other_gender = any(gender_c %in% c("Non-binary", "Other")),
NIMH_sex = unique(NIMH_sex),
ever_NEET = unique(any_NEET)
) %>%
ungroup()
summary(desc)
## NDA_ID age_at_entry simple_race
## Length:5166 Min. :11.30 White :1525
## Class :character 1st Qu.:18.50 Hispanic : 842
## Mode :character Median :21.00 Black :1448
## Mean :21.48 Multiracial: 269
## 3rd Qu.:24.00 Other : 786
## Max. :47.00 AsianAPI : 296
## NA's :1311
## NIMH_race hispanic
## White :1973 Length:5166
## Black or African American :1548 Class :character
## More than one race : 155 Mode :character
## Unknown/Not reported :1153
## American Indian/Alaskan Native : 33
## Asian : 283
## Native Hawaiian or Other Pacific Islander: 21
## male female trans other_gender
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:2162 FALSE:3631 FALSE:5126 FALSE:5045
## TRUE :3004 TRUE :1535 TRUE :40 TRUE :121
##
##
##
##
## NIMH_sex ever_NEET
## Length:5166 Mode :logical
## Class :character FALSE:1397
## Mode :character TRUE :3769
##
##
##
##
sd(desc$age_at_entry, na.rm=T)
## [1] 4.106402
# NIMH demographic reporting categories
# note that gender has far less missingness than sex
tab <- with(desc, table(NIMH_race, NIMH_sex, hispanic) )
addmargins(tab)
## , , hispanic = hispanic
##
## NIMH_sex
## NIMH_race Female Male Unknown/Not Reported
## White 96 220 132
## Black or African American 7 24 69
## More than one race 8 18 6
## Unknown/Not reported 100 118 176
## American Indian/Alaskan Native 2 4 0
## Asian 2 1 2
## Native Hawaiian or Other Pacific Islander 0 2 1
## Sum 215 387 386
## NIMH_sex
## NIMH_race Sum
## White 448
## Black or African American 100
## More than one race 32
## Unknown/Not reported 394
## American Indian/Alaskan Native 6
## Asian 5
## Native Hawaiian or Other Pacific Islander 3
## Sum 988
##
## , , hispanic = not hispanic
##
## NIMH_sex
## NIMH_race Female Male Unknown/Not Reported
## White 330 683 454
## Black or African American 285 563 500
## More than one race 33 57 30
## Unknown/Not reported 90 105 176
## American Indian/Alaskan Native 11 10 5
## Asian 53 75 140
## Native Hawaiian or Other Pacific Islander 4 5 7
## Sum 806 1498 1312
## NIMH_sex
## NIMH_race Sum
## White 1467
## Black or African American 1348
## More than one race 120
## Unknown/Not reported 371
## American Indian/Alaskan Native 26
## Asian 268
## Native Hawaiian or Other Pacific Islander 16
## Sum 3616
##
## , , hispanic = unknown/unreported
##
## NIMH_sex
## NIMH_race Female Male Unknown/Not Reported
## White 16 30 12
## Black or African American 12 36 52
## More than one race 0 3 0
## Unknown/Not reported 26 41 321
## American Indian/Alaskan Native 1 0 0
## Asian 6 2 2
## Native Hawaiian or Other Pacific Islander 0 2 0
## Sum 61 114 387
## NIMH_sex
## NIMH_race Sum
## White 58
## Black or African American 100
## More than one race 3
## Unknown/Not reported 388
## American Indian/Alaskan Native 1
## Asian 10
## Native Hawaiian or Other Pacific Islander 2
## Sum 562
##
## , , hispanic = Sum
##
## NIMH_sex
## NIMH_race Female Male Unknown/Not Reported
## White 442 933 598
## Black or African American 304 623 621
## More than one race 41 78 36
## Unknown/Not reported 216 264 673
## American Indian/Alaskan Native 14 14 5
## Asian 61 78 144
## Native Hawaiian or Other Pacific Islander 4 9 8
## Sum 1082 1999 2085
## NIMH_sex
## NIMH_race Sum
## White 1973
## Black or African American 1548
## More than one race 155
## Unknown/Not reported 1153
## American Indian/Alaskan Native 33
## Asian 283
## Native Hawaiian or Other Pacific Islander 21
## Sum 5166
Finally, we create a data.frame which is every assessment for everyone who discharged.
In the process we create some variables that are unique to discharged folks.
dc <- dat %>%
group_by(NDA_ID) %>%
filter(any(Assessment_Type == 26)) %>%
mutate(
unilateral_termination = any(discharge_c == "unilateral_termination", na.rm=T),
completed = any(discharge_c == "completed", na.rm=T),
lost_contact = any(discharge_c == "lost_contact", na.rm=T),
transferred_or_moved = any(discharge_c == "transferred_or_moved", na.rm=T),
avg_COMPASS_NEG = mean(COMPASS_NEG, na.rm=T)
) %>%
ungroup()
We also create a variable that’s composed of discharge assessments only.
dc_vis <- dc %>%
subset(Assessment_Type == 26)
nrow(dc_vis)
## [1] 1818
Some people discharged more than once. Specifically, 14 people discharged twice and one person discharged three times. Because this is a very small group, we handle this by taking only their longest stay in the program.
dc_vis <- dc_vis %>%
group_by(NDA_ID) %>%
arrange(desc(CSC_Duration_Totalmos)) %>%
slice(1)
nrow(dc_vis)
## [1] 1787
Let’s start by looking at numbers of observations, in general and by hub.
We begin with the number of assessments per hub.
table(dat$hub_c)
##
## AC-EPINET CLHS EPI-CAL EPI-MINN EPINET-TX ESPRITO LEAP ONTY
## 1015 2247 768 845 825 2431 1456 7135
And a graph of the same.
ggplot(dat) +
aes(x = fct_infreq(hub_c)) +
geom_bar() +
xlab("") +
ggtitle("Number of assessments by hub")
Patients per hub.
dat %>%
select(hub_c, NDA_ID) %>%
distinct() %>%
group_by(hub_c) %>%
summarise(number_of_people = n()) %>%
arrange(desc(number_of_people))
## # A tibble: 8 x 2
## hub_c number_of_people
## <fct> <int>
## 1 ONTY 1253
## 2 CLHS 824
## 3 ESPRITO 775
## 4 LEAP 766
## 5 AC-EPINET 462
## 6 EPI-CAL 402
## 7 EPINET-TX 398
## 8 EPI-MINN 286
dat %>%
select(hub_c, NDA_ID) %>%
distinct() %>%
ggplot() +
aes(x = fct_infreq(hub_c)) +
geom_bar() + xlab("") +
ggtitle("Number of people by hub")
Sites/clinics per hub.
dat %>%
select(hub_c, Site_ID) %>%
distinct() %>%
group_by(hub_c) %>%
summarise(number_of_sites = n()) %>%
arrange(desc(number_of_sites))
## # A tibble: 8 x 2
## hub_c number_of_sites
## <fct> <int>
## 1 ONTY 26
## 2 CLHS 23
## 3 EPI-CAL 18
## 4 EPINET-TX 16
## 5 ESPRITO 13
## 6 LEAP 13
## 7 EPI-MINN 6
## 8 AC-EPINET 5
dat %>%
select(hub_c, Site_ID) %>%
distinct() %>%
ggplot() +
aes(x = fct_infreq(hub_c)) +
geom_bar() + xlab("") +
ggtitle("Number of sites by hub")
Number of discharges per hub.
table(dat$disch_visit, dat$hub_c)
##
## AC-EPINET CLHS EPI-CAL EPI-MINN EPINET-TX ESPRITO LEAP ONTY
## FALSE 891 1853 725 759 720 1959 1403 6594
## TRUE 124 394 43 86 105 472 53 541
prop.table(table(dat$disch_visit, dat$hub_c), margin = 2)
##
## AC-EPINET CLHS EPI-CAL EPI-MINN EPINET-TX ESPRITO
## FALSE 0.87783251 0.82465510 0.94401042 0.89822485 0.87272727 0.80584122
## TRUE 0.12216749 0.17534490 0.05598958 0.10177515 0.12727273 0.19415878
##
## LEAP ONTY
## FALSE 0.96359890 0.92417659
## TRUE 0.03640110 0.07582341
nrow(dc_vis)
## [1] 1787
ggplot(dc_vis) +
aes(x=fct_infreq(hub_c)) +
geom_bar() +
xlab("") +
ggtitle("Number of discharges by hub")
dat %>%
group_by(NDA_ID) %>%
summarize(hub = unique(hub_c),
disch = any(disch_visit)) %>%
ungroup() %>% group_by(hub) %>%
summarise(disch_rate = round(mean(disch) * 100,1)) %>%
arrange(desc(disch_rate))
## # A tibble: 8 x 2
## hub disch_rate
## <fct> <dbl>
## 1 ESPRITO 60
## 2 CLHS 47.8
## 3 ONTY 41.7
## 4 EPI-MINN 30.1
## 5 EPINET-TX 26.1
## 6 AC-EPINET 26
## 7 EPI-CAL 10.7
## 8 LEAP 6.9
Type of discharge.
# unrecoded counts
summary(factor(dc_vis$Reason_Discharge))
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 NA's
## 427 395 85 36 11 6 9 144 16 153 3 8 278 163 10 43
round(prop.table(summary(factor(dc_vis$Reason_Discharge))), 3) * 100
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 NA's
## 23.9 22.1 4.8 2.0 0.6 0.3 0.5 8.1 0.9 8.6 0.2 0.4 15.6 9.1 0.6 2.4
#recoded counts
summary(dc_vis$discharge_c)
## unilateral_termination completed ineligible
## 427 431 85
## pursuing_pos_opportunity incarcerated transferred_or_moved
## 11 31 297
## death lost_contact other
## 21 278 163
## NA's
## 43
#rates
round(prop.table(summary(dc_vis$discharge_c)) * 100, 1)
## unilateral_termination completed ineligible
## 23.9 24.1 4.8
## pursuing_pos_opportunity incarcerated transferred_or_moved
## 0.6 1.7 16.6
## death lost_contact other
## 1.2 15.6 9.1
## NA's
## 2.4
ggplot(dc_vis) +
aes(x = fct_infreq(discharge_c)) +
geom_bar() +
xlab("discharge type") +
ggtitle("Number of discharge types") +
theme(axis.text.x = element_text(angle=45, vjust = 1, hjust=1))
Discharge types by hub
# counts
dc_vis %>%
group_by(hub_c) %>%
summarize(unilateral_termination = sum(discharge_c == "unilateral_termination", na.rm=T),
completed = sum(discharge_c == "completed", na.rm=T),
ineligible = sum(discharge_c == "ineligible", na.rm=T),
transferred_or_moved = sum(discharge_c == "transferred_or_moved", na.rm=T),
death = sum(discharge_c == "death", na.rm=T),
lost_contact = sum(discharge_c == "lost_contact", na.rm=T),
other = sum(discharge_c == "other", na.rm=T)
)
## # A tibble: 8 x 8
## hub_c unilateral_termination completed ineligible transferred_or_moved death
## <fct> <int> <int> <int> <int> <int>
## 1 AC-EPI~ 27 6 5 28 2
## 2 CLHS 67 67 54 44 0
## 3 EPI-CAL 8 17 5 4 0
## 4 EPI-MI~ 10 44 4 7 0
## 5 EPINET~ 43 6 1 26 4
## 6 ESPRITO 85 168 10 89 5
## 7 LEAP 18 11 6 4 0
## 8 ONTY 169 112 0 95 10
## # i 2 more variables: lost_contact <int>, other <int>
# rates
dc_vis %>%
group_by(hub_c) %>%
summarize(unilateral_termination = mean(discharge_c == "unilateral_termination", na.rm=T),
completed = mean(discharge_c == "completed", na.rm=T),
ineligible = mean(discharge_c == "ineligible", na.rm=T),
transferred_or_moved = mean(discharge_c == "transferred_or_moved", na.rm=T),
death = mean(discharge_c == "death", na.rm=T),
lost_contact = mean(discharge_c == "lost_contact", na.rm=T),
other = mean(discharge_c == "other", na.rm=T)
)
## # A tibble: 8 x 8
## hub_c unilateral_termination completed ineligible transferred_or_moved death
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AC-EP~ 0.225 0.05 0.0417 0.233 0.0167
## 2 CLHS 0.170 0.170 0.137 0.112 0
## 3 EPI-C~ 0.186 0.395 0.116 0.0930 0
## 4 EPI-M~ 0.128 0.564 0.0513 0.0897 0
## 5 EPINE~ 0.413 0.0577 0.00962 0.25 0.0385
## 6 ESPRI~ 0.195 0.386 0.0230 0.205 0.0115
## 7 LEAP 0.36 0.22 0.12 0.08 0
## 8 ONTY 0.324 0.215 0 0.182 0.0192
## # i 2 more variables: lost_contact <dbl>, other <dbl>
Let’s do some exploratory plotting to understand discharge types.
Here’s the race breakdown for the cohort that discharged.
table(dc_vis$simple_race)
##
## White Hispanic Black Multiracial Other AsianAPI
## 525 279 586 80 225 92
table(dc_vis$race_c)
##
## AI API
## 11 5
## Asian Black
## 87 586
## Hispanic Multiracial
## 279 80
## Other unknown or prefer not to say
## 3 211
## White
## 525
proportions <- prop.table(with(dc_vis, table(simple_race, discharge_c)), margin = 1)
round(proportions * 100, 1)
## discharge_c
## simple_race unilateral_termination completed ineligible
## White 25.1 29.4 5.3
## Hispanic 30.0 28.9 3.3
## Black 24.3 18.9 3.1
## Multiracial 26.6 30.4 7.6
## Other 17.3 23.6 8.6
## AsianAPI 21.7 20.7 6.5
## discharge_c
## simple_race pursuing_pos_opportunity incarcerated transferred_or_moved death
## White 1.2 2.0 18.2 1.2
## Hispanic 0.4 0.7 16.3 1.1
## Black 0.5 2.1 17.0 1.4
## Multiracial 0.0 2.5 15.2 1.3
## Other 0.0 0.5 11.8 1.4
## AsianAPI 1.1 4.3 27.2 0.0
## discharge_c
## simple_race lost_contact other
## White 8.5 9.1
## Hispanic 10.0 9.3
## Black 26.9 5.9
## Multiracial 12.7 3.8
## Other 16.4 20.5
## AsianAPI 7.6 10.9
proportions <- data.frame(proportions)
full_sample <- as.data.frame(prop.table(table(dc_vis$discharge_c)))
names(full_sample) <- c("discharge_c", "Freq")
full_sample$simple_race <- "Full sample"
proportions <- bind_rows(proportions, full_sample)
proportions <- mutate(
proportions,
simple_race = factor(simple_race,
levels = c("Full sample","Black", "White", "Hispanic", "AsianAPI", "Multiracial", "Other"),
labels=c("Full sample","Black (n=586)", "White (n=525)", "Hispanic (n=279)", "Asian/Pacific Islander (n=92)", "Multiracial (n=80)", "Other/unknown (n=225)")),
discharge_c = factor(discharge_c,
levels = c("completed",
"unilateral_termination",
"transferred_or_moved",
"lost_contact",
"other",
"ineligible",
"incarcerated",
"death",
"pursuing_pos_opportunity"),
labels=c("Completed\n(n=431)",
"Unilateral termination\n(n=427)",
"Transferred or moved\n(n=297)",
"Lost contact\n(n=278)",
"Other\n(n=163)",
"Ineligible\n(n=85)",
"Incarcerated/hospitalized\n(n=31)",
"Death\n(n=21)",
"Pursuing positive opportunity\n(n=11)"))
)
fill_colors <- c("grey", scales::viridis_pal()(7))
# fill_colors <- scales::viridis_pal(option="magma")(8)
proportions %>%
ggplot() +
aes(x = discharge_c, y = Freq, fill = simple_race) +
geom_bar(stat="identity", position="dodge") +
scale_y_continuous(labels=scales::percent) +
ylab("") + xlab("Discharge reason") +
guides(fill=guide_legend(title="Race/ethnicity")) +
scale_fill_manual(values=fill_colors) +
theme_bw()
We can compare expected rates by race with actual rates by race. Black people are much more likely to be lost to follow-up.
rate_black <- mean(dc_vis$simple_race == "Black")
ggplot(dc_vis) +
aes(x = fct_infreq(discharge_c), fill=(simple_race == "Black")) +
geom_bar(position="fill") +
geom_hline(yintercept = rate_black) +
xlab("discharge type") +
ggtitle("Discharge types. Proportion black", subtitle="line gives expected proportion") +
theme(axis.text.x = element_text(angle=45, vjust = 1, hjust=1))
Visually, it kind of looked to me as though there was a two-way interaction between race and gender, with black males particularly unlikely to complete the program.
dc_vis %>%
mutate(
race = ifelse(simple_race=="Black", "Black", "Non-black"),
gender = ifelse(male_c, "Male", "Other"),
outcome = ifelse(completed, "completed", "did not complete")
) %>%
ggplot() +
aes(x = outcome, fill=race) +
geom_bar(position="fill") +
xlab("discharge outcome") +
ylab("proportion") +
ggtitle("Discharge outcome", subtitle="Interaction between race and gender") +
theme(axis.text.x = element_text(angle=45, vjust = 1, hjust=1)) +
facet_wrap(~gender)
However, the effect doesn’t appear to be robust.
summary(glm(completed ~ (simple_race == "Black") + male_c +
(simple_race == "Black"):male_c,
family="binomial",
data = dc_vis))
##
## Call:
## glm(formula = completed ~ (simple_race == "Black") + male_c +
## (simple_race == "Black"):male_c, family = "binomial", data = dc_vis)
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -0.997223 0.106364 -9.376
## simple_race == "Black"TRUE -0.221534 0.197178 -1.124
## male_cTRUE 0.002331 0.134391 0.017
## simple_race == "Black"TRUE:male_cTRUE -0.307069 0.252067 -1.218
## Pr(>|z|)
## (Intercept) <0.0000000000000002 ***
## simple_race == "Black"TRUE 0.261
## male_cTRUE 0.986
## simple_race == "Black"TRUE:male_cTRUE 0.223
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1992.6 on 1786 degrees of freedom
## Residual deviance: 1978.8 on 1783 degrees of freedom
## AIC: 1986.8
##
## Number of Fisher Scoring iterations: 4
First some descriptives for the discharged sample.
mean(dc_vis$Age_Entry_Years, na.rm=T)
## [1] 21.87464
sd(dc_vis$Age_Entry_Years, na.rm=T)
## [1] 4.198462
sum(is.na(dc_vis$Age_Entry_Years))
## [1] 178
hist(dc_vis$Age_Entry_Months)
Age is related to discharge
dc_vis %>%
ggplot() +
aes(x = Age_Entry_Years, y = as.numeric(completed)) +
geom_jitter(width = 0, height = 0.05) +
geom_smooth(method="glm", method.args=list(family="binomial")) +
ggtitle("Age at entry is predictive of completion.",
subtitle = "Vertical jitter added. Logistic regression fit overlaid.") +
xlab("Age at entry") +
ylab("") +
scale_y_continuous(breaks = 0:1, labels=c("Not completed", "Completed"))
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 178 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 178 rows containing missing values (`geom_point()`).
Duration in program is a wonderful predictor of program completion, but in some ways that’s tautological given that patients are approximately defined as completing the program if they last two years.
# duration in program versus program completion
dc_vis %>%
ggplot() +
aes(x = CSC_Duration_Totalmos, y = as.numeric(completed)) +
geom_jitter(width = 0, height = 0.05, alpha=0.5) +
geom_smooth(method="glm", method.args=list(family="binomial")) +
ggtitle("Length of time in program is even more predictive (obviously?)",
subtitle = "Vertical jitter added. Logistic regression fit overlaid.") +
xlab("Time in program") +
ylab("") +
scale_y_continuous(breaks = 0:1, labels=c("Not completed", "Completed"))
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 25 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 25 rows containing missing values (`geom_point()`).
Each year in the program is associated with approximately 3x greater odds of program completion.
fit <- glm( completed ~ CSC_Duration_Totalmos,
data = dc_vis, family=binomial)
exp(coef(fit)[2] * 12) # coefficient for months, times 12, exponentiated
## CSC_Duration_Totalmos
## 2.718064
# confint
exp(confint(fit)[2,] * 12)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## 2.412285 3.077916
Here’s another way of visualizing the relationship between program completion and time in the program. It’s interesting that many people “complete” the program after a very short duration.
dc_vis %>%
ggplot() +
aes(x = CSC_Duration_Totalmos, fill = completed) +
geom_histogram() +
ggtitle("Another way of visualizing completion versus time in program.") +
xlab("Time in program")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 25 rows containing non-finite values (`stat_bin()`).
Age and time in program predict completion even after controlling for each other.
fit <- glm( completed ~ Age_Entry_Years + CSC_Duration_Totalmos,
data = dc_vis, family=binomial)
summary(fit)
##
## Call:
## glm(formula = completed ~ Age_Entry_Years + CSC_Duration_Totalmos,
## family = binomial, data = dc_vis)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.104642 0.372725 -11.013 < 0.0000000000000002 ***
## Age_Entry_Years 0.055244 0.015718 3.515 0.00044 ***
## CSC_Duration_Totalmos 0.091602 0.005926 15.456 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1763.7 on 1608 degrees of freedom
## Residual deviance: 1400.1 on 1606 degrees of freedom
## (178 observations deleted due to missingness)
## AIC: 1406.1
##
## Number of Fisher Scoring iterations: 5
We plot the observed trajectories in self-reported symptoms and discharge type. Keep in mind that the curves plotted over these graphs do not take into account the issue of differing numbers of observations per person.
dc %>%
pivot_longer(cols = c(completed,unilateral_termination,lost_contact),
names_to = "discharge_type", values_to = "outcome") %>%
ggplot() +
aes(x = CSC_Duration_Totalmos, y = CSI_clean, group = NDA_ID, color = outcome) +
geom_line(alpha=0.2) +
geom_smooth(aes(x = CSC_Duration_Totalmos, y = CSI_clean, group = outcome)) +
ggtitle("CSI symptom trajectory versus program completion") +
xlab("Assessment time (months)") +
xlim(c(0,30)) +
facet_wrap(~discharge_type)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 18003 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 15349 rows containing missing values (`geom_line()`).
These plots suggest there’s not much effect of self-reported symptom severity, but perhaps something is going on with unilateral termination. However, the above graphs don’t account for the multilevel structure of the data, so let’s use a formal model to verify.
csi_fit <- stan_glmer(unilateral_termination ~
# interaction between (scaled) CSI scores and time.
(scale(CSI_clean) + scale(CSC_Duration_Totalmos))^2 +
# account for clustering
(1|hub_c) + (1|Site_ID) + (1|NDA_ID),
data = dc,
adapt_delta=0.99,
family=binomial,
iter = 3000,
cores=4)
Duration in the program is a strong predictor of
unilateral_termination, but CSI scores and changes in CSI
scores over time aren’t significantly predictive.
print(csi_fit, digits=3)
## stan_glmer
## family: binomial [logit]
## formula: unilateral_termination ~ (scale(CSI_clean) + scale(CSC_Duration_Totalmos))^2 +
## (1 | hub_c) + (1 | Site_ID) + (1 | NDA_ID)
## observations: 1103
## ------
## Median MAD_SD
## (Intercept) -6.808 1.413
## scale(CSI_clean) -0.394 0.460
## scale(CSC_Duration_Totalmos) -2.013 0.546
## scale(CSI_clean):scale(CSC_Duration_Totalmos) 0.797 0.482
##
## Error terms:
## Groups Name Std.Dev.
## NDA_ID (Intercept) 8.638
## Site_ID (Intercept) 2.105
## hub_c (Intercept) 2.858
## Num. levels: NDA_ID 726, Site_ID 63, hub_c 7
##
## ------
## * For help interpreting the printed output see ?print.stanreg
## * For info on the priors used see ?prior_summary.stanreg
Now do the same with negative symptoms.
dc %>%
pivot_longer(cols = c(completed,unilateral_termination,lost_contact),
names_to = "discharge_type", values_to = "outcome") %>%
ggplot() +
aes(x = CSC_Duration_Totalmos, y = COMPASS_NEG, group = NDA_ID, color = outcome) +
geom_line(alpha=0.2) +
geom_smooth(aes(x = CSC_Duration_Totalmos, y = COMPASS_NEG, group = outcome)) +
ggtitle("COMPASS-10 negative symptoms versus program completion") +
xlab("Assessment time (months)") +
xlim(c(0,30)) +
facet_wrap(~discharge_type)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 17643 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 13607 rows containing missing values (`geom_line()`).
Maybe something is happening for program completion (with higher negative symptoms predicting less likelihood of completion) and lost contact (with greater decreases in negative symptoms predicting retention of contact). We’ll check both.
fit_neg_comp <- stan_glmer(completed ~
(scale(COMPASS_NEG) + scale(CSC_Duration_Totalmos))^2 +
# account for clustering
(1|hub_c) + (1|Site_ID) + (1|NDA_ID),
data = dc,
family=binomial,
cores=4)
# same model, but predicting lost_contact
fit_neg_lost <- update(fit_neg_comp, lost_contact ~ .)
There was no relationship between negative symptoms and lost contact (as parameterized by this model).
print(fit_neg_lost, digits=3)
## stan_glmer
## family: binomial [logit]
## formula: lost_contact ~ scale(COMPASS_NEG) + scale(CSC_Duration_Totalmos) +
## (1 | hub_c) + (1 | Site_ID) + (1 | NDA_ID) + scale(COMPASS_NEG):scale(CSC_Duration_Totalmos)
## observations: 1184
## ------
## Median MAD_SD
## (Intercept) -8.617 1.233
## scale(COMPASS_NEG) 0.047 0.387
## scale(CSC_Duration_Totalmos) -2.080 0.546
## scale(COMPASS_NEG):scale(CSC_Duration_Totalmos) 0.047 0.450
##
## Error terms:
## Groups Name Std.Dev.
## NDA_ID (Intercept) 6.479
## Site_ID (Intercept) 3.677
## hub_c (Intercept) 1.194
## Num. levels: NDA_ID 750, Site_ID 83, hub_c 8
##
## ------
## * For help interpreting the printed output see ?print.stanreg
## * For info on the priors used see ?prior_summary.stanreg
But people who completed had statistically significantly lower negative symptoms than people who did not complete.
print(fit_neg_comp, digits=3)
## stan_glmer
## family: binomial [logit]
## formula: completed ~ (scale(COMPASS_NEG) + scale(CSC_Duration_Totalmos))^2 +
## (1 | hub_c) + (1 | Site_ID) + (1 | NDA_ID)
## observations: 1184
## ------
## Median MAD_SD
## (Intercept) -4.703 1.563
## scale(COMPASS_NEG) -1.000 0.460
## scale(CSC_Duration_Totalmos) 4.087 0.664
## scale(COMPASS_NEG):scale(CSC_Duration_Totalmos) -0.117 0.454
##
## Error terms:
## Groups Name Std.Dev.
## NDA_ID (Intercept) 9.234
## Site_ID (Intercept) 4.679
## hub_c (Intercept) 4.406
## Num. levels: NDA_ID 750, Site_ID 83, hub_c 8
##
## ------
## * For help interpreting the printed output see ?print.stanreg
## * For info on the priors used see ?prior_summary.stanreg
posterior_interval(fit_neg_comp, pars="scale(COMPASS_NEG)", prob=0.95)
## 2.5% 97.5%
## scale(COMPASS_NEG) -1.940931 -0.1113606
Given this finding, we’ll want to include negative symptom scores in later models. To do this, we’ll need to get an average negative symptom score for each participant and stack it onto the discharge dataframe. We do this with a model because it allows us to get a better estimate of the average given the multilevel structure of the data, and because it allows us to impute missing data for people without COMPASS data.
neg_average_model <- stan_lmer(
COMPASS_NEG ~
# this is an intercept-only model.
# we get the expected average for each person in each site in each hub
(1|NDA_ID) +
(1|Site_ID) +
(1|hub_c),
data=dat,
cores=4,
iter=2000
)
We use the above model to get estimated negative symptom scores for each person.
# first whole dataset
neg_est <- posterior_epred(neg_average_model,
newdata=dat %>% select(NDA_ID,Site_ID,hub_c))
dat$NEG_avg <- apply(neg_est, 2, mean)
# then discharge visit only
neg_est <- posterior_epred(neg_average_model,
newdata=dc_vis %>% select(NDA_ID,Site_ID,hub_c))
dc_vis$NEG_avg <- apply(neg_est, 2, mean)
There doesn’t appear to be any relationship between positive symptom severity (or its trajectory) and discharge type, so we don’t examine this variable further.
dc %>%
pivot_longer(cols = c(completed,unilateral_termination,lost_contact),
names_to = "discharge_type", values_to = "outcome") %>%
ggplot() +
aes(x = CSC_Duration_Totalmos, y = COMPASS_POS, group = NDA_ID, color = outcome) +
geom_line(alpha=0.2) +
geom_smooth(aes(x = CSC_Duration_Totalmos, y = COMPASS_POS, group = outcome)) +
ggtitle("COMPASS-10 positive symptoms versus program completion") +
xlab("Assessment time (months)") +
xlim(c(0,30)) +
facet_wrap(~discharge_type)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 17664 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 13645 rows containing missing values (`geom_line()`).
NEET means not employed, in education, or in training. For this dataset I define NEET as not being employed, in school, or in an internship in the six months prior to the assessment timepoint.
dc_vis %>%
ggplot() +
aes(x = percent_NEET, y = as.numeric(completed)) +
geom_jitter(width = 0, height = 0.05, alpha=0.2) +
geom_smooth(method="glm", method.args=list(family="binomial")) +
ggtitle("Percent of timepoints NEET (not in employment, education, or training)",
subtitle = "Vertical jitter added. Logistic regression fit overlaid.") +
xlab("Percent of timepoints where patient was NEET") +
ylab("") +
scale_y_continuous(breaks = 0:1, labels=c("Not completed", "Completed"))
## `geom_smooth()` using formula = 'y ~ x'
Insurance status may be a proxy for socioeconomic status among young people in the United States citation, although it’s definitely complicated. In the present sample, being on medicaid or uninsured isn’t predictive of any outcome, but being on private insurance is predictive of completion and of not unilaterally terminating.
## medicaid
par(mfrow= c(1,3))
with(dc_vis,
mosaicplot(table(ever_medicaid, completed), shade=TRUE,
main = ""))
with(dc_vis,
mosaicplot(table(ever_medicaid, unilateral_termination), shade=TRUE,
main = "medicaid status by outcome"))
with(dc_vis,
mosaicplot(table(ever_medicaid, lost_contact), shade=TRUE,
main = ""))
## non-insured
with(dc_vis,
mosaicplot(table(ever_uninsured, completed), shade=TRUE,
main = ""))
with(dc_vis,
mosaicplot(table(ever_uninsured, unilateral_termination), shade=TRUE,
main = "non-insured status by outcome"))
with(dc_vis,
mosaicplot(table(ever_uninsured, lost_contact), shade=TRUE,
main = ""))
## private
with(dc_vis,
mosaicplot(table(ever_private, completed), shade=TRUE,
main = ""))
with(dc_vis,
mosaicplot(table(ever_private, unilateral_termination), shade=TRUE,
main = "private insurance status by outcome"))
with(dc_vis,
mosaicplot(table(ever_private, lost_contact), shade=TRUE,
main = ""))
Recovery (quantified by the QPR) looks like a decent predictor of lost contact, where CSC programs seem to be more likely to lose contact with people who have higher recovery ratings.
dc %>%
pivot_longer(cols = c(completed,unilateral_termination,lost_contact),
names_to = "discharge_type", values_to = "outcome") %>%
ggplot() +
aes(x = CSC_Duration_Totalmos, y = QPR_clean, group = NDA_ID, color = outcome) +
geom_line(alpha=0.5) +
geom_smooth(aes(x = CSC_Duration_Totalmos, y = QPR_clean, group = outcome)) +
ggtitle("QPR trajectory versus program completion") +
xlab("Assessment time (months)") +
xlim(0,30) +
facet_wrap(~discharge_type)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 18306 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 15145 rows containing missing values (`geom_line()`).
We estimate this using QPR score as well as it’s interaction with time in order to model the effect of QPR scores and their trajectories.
fit_QPR <- stan_glmer(lost_contact ~
(scale(QPR_clean) + scale(CSC_Duration_Totalmos))^2 +
# account for clustering
(1|hub_c) + (1|Site_ID) + (1|NDA_ID),
data = dc,
family=binomial,
adapt_delta = 0.99,
cores=4)
It does appear that people with higher QPR scores have
fit_QPR
## stan_glmer
## family: binomial [logit]
## formula: lost_contact ~ (scale(QPR_clean) + scale(CSC_Duration_Totalmos))^2 +
## (1 | hub_c) + (1 | Site_ID) + (1 | NDA_ID)
## observations: 1008
## ------
## Median MAD_SD
## (Intercept) -8.2 1.3
## scale(QPR_clean) 1.0 0.5
## scale(CSC_Duration_Totalmos) -1.7 0.5
## scale(QPR_clean):scale(CSC_Duration_Totalmos) 0.5 0.4
##
## Error terms:
## Groups Name Std.Dev.
## NDA_ID (Intercept) 5.6
## Site_ID (Intercept) 1.6
## hub_c (Intercept) 1.7
## Num. levels: NDA_ID 666, Site_ID 64, hub_c 7
##
## ------
## * For help interpreting the printed output see ?print.stanreg
## * For info on the priors used see ?prior_summary.stanreg
Each standard deviation increase in the QPR is associated with between 36% and 606% greater odds of lost contact. Of course this indicates considerable uncertainty, but the effect appears to be present.
exp(posterior_interval(fit_QPR, pars="scale(QPR_clean)", probs = c(0.025, 0.975)))
## 5% 95%
## scale(QPR_clean) 1.355336 6.057725
We’ll want to use QPR scores in later models. To do this, we’ll need to get an average QPR score for each participant and stack it onto the discharge dataframe.
QPR_average_model <- stan_lmer(
QPR_clean ~
# this is an intercept-only model.
# we get the expected average for each person in each site in each hub
(1|NDA_ID) +
(1|Site_ID) +
(1|hub_c),
data=dat,
cores=4,
iter=2000
)
We use the above model to get estimated QPR scores for each person. This code is actually run as the markdown is created.
# first whole dataset
qpr_est <- posterior_epred(QPR_average_model,
newdata=dat %>% select(NDA_ID,Site_ID,hub_c))
dat$QPR_avg <- apply(qpr_est, 2, mean)
# then discharge visit only
qpr_est <- posterior_epred(QPR_average_model,
newdata=dc_vis %>% select(NDA_ID,Site_ID,hub_c))
dc_vis$QPR_avg <- apply(qpr_est, 2, mean)
fit_comp <-
stan_glmer(
completed ~
scale(Age_Entry_Years) +
percent_NEET +
male_c +
ever_private +
scale(QPR_avg) +
scale(NEG_avg) +
(1|Site_ID) +
(1|hub_c) +
# varying intercept for race
(1|simple_race) +
# check if different hubs have different race effects?
(1|Site_ID:simple_race) +
(1|hub_c:simple_race),
data = dc_vis,
cores=4,
family=binomial
)
Interrogate the model.
# overall model fit
print(fit_comp, digits=3)
## stan_glmer
## family: binomial [logit]
## formula: completed ~ scale(Age_Entry_Years) + percent_NEET + male_c +
## ever_private + scale(QPR_avg) + scale(NEG_avg) + (1 | Site_ID) +
## (1 | hub_c) + (1 | simple_race) + (1 | Site_ID:simple_race) +
## (1 | hub_c:simple_race)
## observations: 1609
## ------
## Median MAD_SD
## (Intercept) -1.160 0.465
## scale(Age_Entry_Years) 0.287 0.073
## percent_NEET -0.908 0.283
## male_cTRUE -0.201 0.145
## ever_privateTRUE 0.596 0.148
## scale(QPR_avg) 0.193 0.162
## scale(NEG_avg) -0.406 0.109
##
## Error terms:
## Groups Name Std.Dev.
## Site_ID:simple_race (Intercept) 0.6657
## Site_ID (Intercept) 0.8147
## hub_c:simple_race (Intercept) 0.2364
## hub_c (Intercept) 1.1438
## simple_race (Intercept) 0.2286
## Num. levels: Site_ID:simple_race 310, Site_ID 86, hub_c:simple_race 41, hub_c 7, simple_race 6
##
## ------
## * For help interpreting the printed output see ?print.stanreg
## * For info on the priors used see ?prior_summary.stanreg
# fixed effects
plot(fit_comp, pars = names(fixef(fit_comp)), prob_outer=0.95, plotfun="areas")
# main race effects
plot(fit_comp, regex_pars = "b\\[\\(Intercept\\)\ simple_race\\:", prob_outer= 0.95, plotfun="areas")
# calculate ORs and confidence intervals using the posterior draws
draws <- as.data.frame(fit_comp, pars=names(fixef(fit_comp)))
# OR for age
mean(exp(draws$`scale(Age_Entry_Years)`))
## [1] 1.33802
# 95%CI
exp(quantile(draws$`scale(Age_Entry_Years)`, prob=c(0.025,0.975)))
## 2.5% 97.5%
## 1.147427 1.553158
# OR for private insurance coverage
mean(exp(draws$ever_privateTRUE))
## [1] 1.833629
# 95%CI
exp(quantile(draws$ever_privateTRUE, prob=c(0.025,0.975)))
## 2.5% 97.5%
## 1.369223 2.417438
# OR for NEET
mean(exp(-draws$percent_NEET)) # flip sign to get OR for *not* being NEET
## [1] 2.571403
# 95%CI
exp(quantile(-draws$percent_NEET, prob=c(0.025,0.975)))
## 2.5% 97.5%
## 1.417653 4.291708
# OR for negative symptoms
mean(exp(draws$`scale(NEG_avg)`))
## [1] 0.6691529
# 95%CI
exp(quantile(draws$`scale(NEG_avg)`, prob=c(0.025,0.975)))
## 2.5% 97.5%
## 0.5364538 0.8191770
Regarding program completion, there’s substantial variation by hub and by site.
plot(fit_comp, regex_pars = "Sigma", prob_outer=0.95, plotfun="areas")
sigma_draws <- as.data.frame(fit_comp, regex_pars = "Sigma")
means <- data.frame(apply(sigma_draws, 2, mean))
CIs <- t(apply(sigma_draws, 2, function(x){quantile(x, probs=c(0.025,0.975))}))
sd_estimates <- cbind(means, CIs); names(sd_estimates)[1] <- "est"
sd_estimates
## est 2.5%
## Sigma[Site_ID:simple_race:(Intercept),(Intercept)] 0.44309673 0.04609038721
## Sigma[Site_ID:(Intercept),(Intercept)] 0.66381348 0.24420704599
## Sigma[hub_c:simple_race:(Intercept),(Intercept)] 0.05588683 0.00004686279
## Sigma[hub_c:(Intercept),(Intercept)] 1.30819933 0.06475704902
## Sigma[simple_race:(Intercept),(Intercept)] 0.05227878 0.00005072530
## 97.5%
## Sigma[Site_ID:simple_race:(Intercept),(Intercept)] 1.0038078
## Sigma[Site_ID:(Intercept),(Intercept)] 1.3017215
## Sigma[hub_c:simple_race:(Intercept),(Intercept)] 0.2952444
## Sigma[hub_c:(Intercept),(Intercept)] 4.7157268
## Sigma[simple_race:(Intercept),(Intercept)] 0.3123491
Here are the hub effects (on the logit scale). Note LEAP is not included because they have fully missing age data and so the 53 LEAP discharges drop out of the regression. Completion rates for LEAP can still be estimated by the model using the observed hub-level distribution.
plot(fit_comp, regex_pars = "t\\) hub_c:(O|A|E|C)", prob_outer=0.95, plotfun="areas")
draws_hub <- as.data.frame(fit_comp, regex_pars = "t\\) hub_c:(O|A|E|L|C)")
difference <- draws_hub$`b[(Intercept) hub_c:ESPRITO]` - draws_hub$`b[(Intercept) hub_c:EPINET-TX]`
mean(exp(difference))
## [1] 3.850913
quantile(exp(difference), prob=c(0.025,0.975))
## 2.5% 97.5%
## 0.995311 11.520883
There is less variability by race, and none of the differences between race are statistically significant.
plot(fit_comp, regex_pars = "t\\) simple_race", prob_outer=0.95, plotfun="areas")
draws_race <- as.data.frame(fit_comp, regex_pars = "t\\) simple_race")
biggest_difference <- draws_race$`b[(Intercept) simple_race:Multiracial]` - draws_race$`b[(Intercept) simple_race:Black]`
mean(exp(biggest_difference))
## [1] 1.189367
quantile(exp(biggest_difference), prob=c(0.025,0.975))
## 2.5% 97.5%
## 0.8019533 2.1170134
fit_lost <-
stan_glmer(
lost_contact ~
scale(Age_Entry_Years) +
percent_NEET +
male_c +
ever_private +
scale(QPR_avg) +
scale(NEG_avg) +
(1|simple_race) +
(1|Site_ID) +
(1|hub_c) +
# varying intercept for race
# check if different hubs have different racial trajectories?
(1 |Site_ID:simple_race) +
(1 |hub_c:simple_race),
data = dc_vis,
adapt_delta=0.99,
cores=4,
family=binomial
)
Interrogate the model.
# overall output
print(fit_lost, digits=3)
## stan_glmer
## family: binomial [logit]
## formula: lost_contact ~ scale(Age_Entry_Years) + percent_NEET + male_c +
## ever_private + scale(QPR_avg) + scale(NEG_avg) + (1 | simple_race) +
## (1 | Site_ID) + (1 | hub_c) + (1 | Site_ID:simple_race) +
## (1 | hub_c:simple_race)
## observations: 1609
## ------
## Median MAD_SD
## (Intercept) -2.255 0.424
## scale(Age_Entry_Years) -0.018 0.082
## percent_NEET 0.027 0.294
## male_cTRUE 0.413 0.166
## ever_privateTRUE -0.450 0.162
## scale(QPR_avg) 0.227 0.161
## scale(NEG_avg) 0.085 0.099
##
## Error terms:
## Groups Name Std.Dev.
## Site_ID:simple_race (Intercept) 0.4780
## Site_ID (Intercept) 0.5350
## hub_c:simple_race (Intercept) 0.2986
## hub_c (Intercept) 0.6265
## simple_race (Intercept) 0.6768
## Num. levels: Site_ID:simple_race 310, Site_ID 86, hub_c:simple_race 41, hub_c 7, simple_race 6
##
## ------
## * For help interpreting the printed output see ?print.stanreg
## * For info on the priors used see ?prior_summary.stanreg
# fixed effects
plot(fit_lost, pars = names(fixef(fit_lost)), prob_outer=0.95, plotfun="areas")
# race effects (are substantial for this outcome)
plot(fit_lost, regex_pars = "b\\[\\(Intercept\\)\ simple_race\\:", prob_outer= 0.95, plotfun="areas")
# calculate ORs and confidence intervals using posterior draws
draws <- as.data.frame(fit_lost, pars=names(fixef(fit_lost)))
# effect of being male
mean(exp(draws$male_cTRUE))
## [1] 1.533791
# 95% CI
exp(quantile(draws$male_cTRUE, prob=c(0.025,0.975)))
## 2.5% 97.5%
## 1.098561 2.102246
# effect of private insurance
mean(exp(draws$ever_privateTRUE))
## [1] 0.645468
# 95% CI
exp(quantile(draws$ever_privateTRUE, prob=c(0.025,0.975)))
## 2.5% 97.5%
## 0.4593164 0.8844855
There’s substantial race-level variation. Specifically, black people had higher lost contact rates than every other race category (no other race differences significant).
# race effects
draws_race <- as.data.frame(fit_lost, regex_pars = "b\\[\\(Intercept\\)\ simple_race\\:")
# black people substantially more likely to have lost contact than white people
black_white_diff <- with(draws_race, `b[(Intercept) simple_race:Black]` - `b[(Intercept) simple_race:White]`)
#OR
mean(exp(black_white_diff))
## [1] 3.021695
#95%CI
quantile(exp(black_white_diff), prob = c(0.025, 0.975))
## 2.5% 97.5%
## 1.206379 5.152579
# difference between black and asian also significant
black_asian_diff <- with(draws_race, `b[(Intercept) simple_race:Black]` - `b[(Intercept) simple_race:AsianAPI]`)
#OR
mean(exp(black_asian_diff))
## [1] 3.725463
#95%CI
quantile(exp(black_asian_diff), prob = c(0.025, 0.975))
## 2.5% 97.5%
## 1.201704 8.494808
# difference between black and hispanic also significant
black_hisp_diff <- with(draws_race, `b[(Intercept) simple_race:Black]` - `b[(Intercept) simple_race:Hispanic]`)
#OR
mean(exp(black_hisp_diff))
## [1] 2.960781
#95%CI
quantile(exp(black_hisp_diff), prob = c(0.025, 0.975))
## 2.5% 97.5%
## 1.192902 5.408228
# difference between black and multiracial
black_multi_diff <- with(draws_race, `b[(Intercept) simple_race:Black]` - `b[(Intercept) simple_race:Multiracial]`)
#OR
mean(exp(black_multi_diff))
## [1] 2.196098
#95%CI
quantile(exp(black_multi_diff), prob = c(0.025, 0.975))
## 2.5% 97.5%
## 0.9087752 4.5260508
# difference between black and other
black_other_diff <- with(draws_race, `b[(Intercept) simple_race:Black]` - `b[(Intercept) simple_race:Other]`)
#OR
mean(exp(black_other_diff))
## [1] 2.324933
#95%CI
quantile(exp(black_other_diff), prob = c(0.025, 0.975))
## 2.5% 97.5%
## 0.9853573 4.5254354
There’s less program- and hub-level variability for lost contact than for program completion, but it’s there. The most substantial variability we see in these random effects is the estimated standard deviation for race. There also appears to be variability in the relationship between race and lost contact across sites.
plot(fit_lost, regex_pars = "Sigma", prob_outer=0.95, plotfun="areas")
sigma_draws <- as.data.frame(fit_lost, regex_pars = "Sigma")
means <- data.frame(apply(sigma_draws, 2, mean))
CIs <- t(apply(sigma_draws, 2, function(x){quantile(x, probs=c(0.025,0.975))}))
sd_estimates <- cbind(means, CIs); names(sd_estimates)[1] <- "est"
sd_estimates
## est 2.5%
## Sigma[Site_ID:simple_race:(Intercept),(Intercept)] 0.2284999 0.0007348562
## Sigma[Site_ID:(Intercept),(Intercept)] 0.2862249 0.0055102774
## Sigma[hub_c:simple_race:(Intercept),(Intercept)] 0.0891474 0.0001323044
## Sigma[hub_c:(Intercept),(Intercept)] 0.3924918 0.0037405062
## Sigma[simple_race:(Intercept),(Intercept)] 0.4579949 0.0360529004
## 97.5%
## Sigma[Site_ID:simple_race:(Intercept),(Intercept)] 0.7163858
## Sigma[Site_ID:(Intercept),(Intercept)] 0.7498976
## Sigma[hub_c:simple_race:(Intercept),(Intercept)] 0.4685355
## Sigma[hub_c:(Intercept),(Intercept)] 1.6066162
## Sigma[simple_race:(Intercept),(Intercept)] 1.8238435
Here are the hub effects.
plot(fit_lost, regex_pars = "t\\) hub_c:(O|A|E|C|L)", prob_outer=0.95, plotfun="areas")
fit_uni <-
stan_glmer(
unilateral_termination ~
scale(Age_Entry_Years) +
percent_NEET +
male_c +
ever_private +
scale(QPR_avg) +
scale(NEG_avg) +
(1|simple_race) +
(1|Site_ID) +
(1|hub_c) +
# varying intercept for race
# check if different hubs have different racial trajectories?
(1 |Site_ID:simple_race) +
(1 |hub_c:simple_race),
data = dc_vis,
cores=4,
family=binomial
)
Interrogate the model.
# full model
print(fit_uni, digits=3)
## stan_glmer
## family: binomial [logit]
## formula: unilateral_termination ~ scale(Age_Entry_Years) + percent_NEET +
## male_c + ever_private + scale(QPR_avg) + scale(NEG_avg) +
## (1 | simple_race) + (1 | Site_ID) + (1 | hub_c) + (1 | Site_ID:simple_race) +
## (1 | hub_c:simple_race)
## observations: 1609
## ------
## Median MAD_SD
## (Intercept) -1.225 0.283
## scale(Age_Entry_Years) -0.117 0.071
## percent_NEET 0.028 0.229
## male_cTRUE 0.067 0.132
## ever_privateTRUE -0.143 0.132
## scale(QPR_avg) 0.090 0.129
## scale(NEG_avg) 0.088 0.085
##
## Error terms:
## Groups Name Std.Dev.
## Site_ID:simple_race (Intercept) 0.1968
## Site_ID (Intercept) 0.5384
## hub_c:simple_race (Intercept) 0.3121
## hub_c (Intercept) 0.5354
## simple_race (Intercept) 0.2685
## Num. levels: Site_ID:simple_race 310, Site_ID 86, hub_c:simple_race 41, hub_c 7, simple_race 6
##
## ------
## * For help interpreting the printed output see ?print.stanreg
## * For info on the priors used see ?prior_summary.stanreg
# plot of fixed effects
plot(fit_uni, pars = names(fixef(fit_uni)), prob_outer=0.95, plotfun="areas")
# age comes close to (but is not) statistically significant
posterior_interval(fit_uni, pars="scale(Age_Entry_Years)", prob=0.95)
## 2.5% 97.5%
## scale(Age_Entry_Years) -0.2603549 0.01726058
# no real race effects
plot(fit_uni, regex_pars = "b\\[\\(Intercept\\)\ simple_race\\:", prob_outer= 0.95, plotfun="areas")
There was a fair amount of variability in unilateral termination by program. Less so by hub.
plot(fit_uni, regex_pars = "Sigma", prob_outer=0.95, plotfun = "areas")
sigma_draws <- as.data.frame(fit_uni, regex_pars = "Sigma")
means <- data.frame(apply(sigma_draws, 2, mean))
CIs <- t(apply(sigma_draws, 2, function(x){quantile(x, probs=c(0.025,0.975))}))
sd_estimates <- cbind(means, CIs); names(sd_estimates)[1] <- "est"
sd_estimates
## est 2.5%
## Sigma[Site_ID:simple_race:(Intercept),(Intercept)] 0.03873115 0.00003343512
## Sigma[Site_ID:(Intercept),(Intercept)] 0.28992637 0.08599491075
## Sigma[hub_c:simple_race:(Intercept),(Intercept)] 0.09742719 0.00071400301
## Sigma[hub_c:(Intercept),(Intercept)] 0.28669581 0.00180987923
## Sigma[simple_race:(Intercept),(Intercept)] 0.07210927 0.00010043213
## 97.5%
## Sigma[Site_ID:simple_race:(Intercept),(Intercept)] 0.1799816
## Sigma[Site_ID:(Intercept),(Intercept)] 0.6087732
## Sigma[hub_c:simple_race:(Intercept),(Intercept)] 0.3248533
## Sigma[hub_c:(Intercept),(Intercept)] 1.2361803
## Sigma[simple_race:(Intercept),(Intercept)] 0.3734211
plot(fit_uni, regex_pars = "t\\) hub_c:(O|A|E|C)", prob_outer=0.95, plotfun="areas")
draws_hub <- as.data.frame(fit_uni, regex_pars = "t\\) hub_c:(O|A|E|L|C)")
difference <- draws_hub$`b[(Intercept) hub_c:EPINET-TX]` - draws_hub$`b[(Intercept) hub_c:CLHS]`
mean(exp(difference))
## [1] 2.438288
quantile(exp(difference), probs=c(0.025, 0.975))
## 2.5% 97.5%
## 0.9882964 5.4119606
At the request of a reviewer we also include a model for the “transferred or moved” category, although we do not report on this model in the main paper as we find the category both difficult to interpret and in many cases (e.g., moving to a different state) likely unrelated to treatment.
fit_transfer <-
stan_glmer(
transferred_or_moved ~
scale(Age_Entry_Years) +
percent_NEET +
male_c +
ever_private +
scale(QPR_avg) +
scale(NEG_avg) +
(1|simple_race) +
(1|Site_ID) +
(1|hub_c) +
# varying intercept for race
# check if different hubs have different racial trajectories?
(1 |Site_ID:simple_race) +
(1 |hub_c:simple_race),
data = dc_vis,
cores=4,
family=binomial
)
plot(fit_transfer, pars = names(fixef(fit_transfer)), prob_outer=0.95)
We build a plot of all the coefficients of the three primary models.
Start by combining draws for the fixed effects.
draws_uni <- as.data.frame(fit_uni, pars = names(fixef(fit_uni)))
draws_lost <- as.data.frame(fit_lost, pars = names(fixef(fit_lost)))
draws_comp <- as.data.frame(fit_comp, pars = names(fixef(fit_comp)))
# race effects
race_draws_uni <- as.data.frame(fit_uni, regex_pars = "b\\[\\(Intercept\\)\ simple_race\\:")
race_draws_lost <- as.data.frame(fit_lost, regex_pars = "b\\[\\(Intercept\\)\ simple_race\\:")
race_draws_comp <- as.data.frame(fit_comp, regex_pars = "b\\[\\(Intercept\\)\ simple_race\\:")
draws_uni <- cbind(draws_uni, race_draws_uni)
draws_lost <- cbind(draws_lost, race_draws_lost)
draws_comp <- cbind(draws_comp, race_draws_comp)
# drop intercept
draws_uni <- draws_uni[-1]; draws_lost <- draws_lost[-1]; draws_comp <- draws_comp[-1]
# add random effects for race
draws_combined <- rbind(
data.frame(draws_uni, model = "Unilateral termination"),
data.frame(draws_lost, model = "Lost contact"),
data.frame(draws_comp, model = "Program completion")
)
# rename for readability
names(draws_combined) <- c("Age (standardized)", "Percent of visits NEET", "Male","Ever privately insured", "QPR (standardized)","Negative Sx (standardized)", "Race:White", "Race:Hispanic", "Race:Black", "Race:Multiracial", "Race:Other", "Race:Asian/Pacific Islander", "Model")
Then build the plot.
par_labels <- c("Age (standardized)", "Percent of visits NEET", "Male", "Ever privately insured" , "QPR (standardized)","Negative Sx (standardized)","Race:White", "Race:Hispanic", "Race:Black", "Race:Asian/Pacific Islander","Race:Multiracial", "Race:Other")
bounds <- 1.6
coef_plot_uni <- bayesplot::mcmc_intervals(
draws_combined %>% subset(Model == "Unilateral termination", select=-c(Model)),
pars = par_labels,
prob_outer = 0.95
) + xlim(-bounds,bounds) + ggtitle("Unilateral termination")
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
coef_plot_comp <- bayesplot::mcmc_intervals(
draws_combined %>% subset(Model == "Program completion", select=-c(Model)),
pars = par_labels,
prob_outer = 0.95
) + xlim(-bounds,bounds) + ggtitle("Program completion")
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
coef_plot_lost <- bayesplot::mcmc_intervals(
draws_combined %>% subset(Model == "Lost contact", select=-c(Model)),
pars = par_labels,
prob_outer = 0.95
) + xlim(-bounds,bounds) + ggtitle("Lost contact")
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
gridExtra::grid.arrange(coef_plot_comp, coef_plot_lost, coef_plot_uni, ncol=1)
We think our estimate of completion rate is an underestimate because the longer you stay in CSC the more likely you are to complete, and so all these people in the dataset who haven’t yet discharged become increasingly likely to complete the longer we track them.
We want a lower bound on our completion rate underestimate, so we fit a predictive model (basically the above model but with program duration added as a predictor, which is very, very strongly correlated with program completion).
pred_comp <-
stan_glmer(
completed ~
CSC_Duration_Totalmos + #### THIS IS THE ONLY NEW VARIABLE
scale(Age_Entry_Years) +
percent_NEET +
male_c +
ever_private +
scale(QPR_avg) +
scale(NEG_avg) +
(1|simple_race) +
(1|Site_ID) +
(1|hub_c) +
(1|Site_ID:simple_race) +
(1|hub_c:simple_race),
data = dc_vis,
cores=4,
adapt_delta = 0.99,
family=binomial
)
We use this model to get predictions for the people who haven’t discharged yet.
non_missing_cases <-
dc_vis %>% select(CSC_Duration_Totalmos,
Age_Entry_Years,
percent_NEET,
simple_race,
male_c,
ever_private,
Site_ID,
QPR_avg,
hub_c,
completed
) %>%
complete.cases()
## Adding missing grouping variables: `NDA_ID`
non_missing_discharged <- dc_vis[non_missing_cases,]
Let’s see how good the model is for in-sample predictions
ppred <- posterior_epred(pred_comp, newdata = non_missing_discharged)
pred <- colMeans(ppred)
# model-predicted completion rate
mean(pred)
## [1] 0.2314023
# actual completion rate
mean(non_missing_discharged$completed)
## [1] 0.2374145
Let’s plot model predicted program completion versus actuzal completion.
non_missing_discharged$pred <- pred
non_missing_discharged %>%
ggplot() +
aes(x = pred, y = completed) +
geom_jitter(width=0, height=0.05, alpha=0.2)
We get good (~88%) accuracy.
confusionMatrix(factor(non_missing_discharged$pred >= 0.5, levels=c(FALSE, TRUE), labels=c("not completed", "completed")), # pred
factor(non_missing_discharged$completed, levels=c(FALSE, TRUE), labels=c("not completed", "completed")) # truth
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction not completed completed
## not completed 1172 149
## completed 55 233
##
## Accuracy : 0.8732
## 95% CI : (0.856, 0.8891)
## No Information Rate : 0.7626
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.6174
##
## Mcnemar's Test P-Value : 0.0000000000745
##
## Sensitivity : 0.9552
## Specificity : 0.6099
## Pos Pred Value : 0.8872
## Neg Pred Value : 0.8090
## Prevalence : 0.7626
## Detection Rate : 0.7284
## Detection Prevalence : 0.8210
## Balanced Accuracy : 0.7826
##
## 'Positive' Class : not completed
##
We use the above model to predict for the people who haven’t discharged yet
latest <- dat %>%
group_by(NDA_ID) %>%
slice(which.max(Assessment_Type)) %>%
subset(!(Assessment_Type %in% c(26,27))) %>%
ungroup()
non_missing_latest <-
latest %>% select(CSC_Duration_Totalmos,
Age_Entry_Years,
percent_NEET,
simple_race,
male_c,
ever_private,
QPR_avg,
NEG_avg,
Site_ID,
hub_c
) %>%
complete.cases()
non_missing_undischarged <- latest[non_missing_latest,]
ppred <- posterior_epred(pred_comp, newdata = non_missing_undischarged)
pred <- colMeans(ppred)
Model-predicted completion rates for people who haven’t discharged yet are actually lower than in the group that’s already discharged.
mean(pred)
## [1] 0.1812476
This may partially be due to the fact that people who haven’t yet discharged are more likely to be earlier in the course of the program, and if they continue (which they will) their predicted and actual completion rates would both rise.
mean(non_missing_undischarged$CSC_Duration_Totalmos)
## [1] 13.68466
mean(non_missing_discharged$CSC_Duration_Totalmos)
## [1] 16.04351
non_missing_undischarged$discharged <- FALSE
non_missing_discharged$discharged <- TRUE
bind_rows(non_missing_undischarged, non_missing_discharged) %>%
ggplot() +
aes(x = CSC_Duration_Totalmos, fill = discharged) +
geom_density(alpha=0.5) +
ggtitle("distribution of the most recent observation",
subtitle="by discharge status")
In any case, here is the predicted total hub-wide completion rate under various assumptions.
Assumption 1: that the latest observed enrollment duration is the final enrollment duration.
ppred <- posterior_epred(pred_comp,
newdata =
bind_rows(non_missing_undischarged,
non_missing_discharged) %>%
select(CSC_Duration_Totalmos,
Age_Entry_Years,
percent_NEET,
simple_race,
male_c,
ever_private,
QPR_avg,
NEG_avg,
Site_ID,
hub_c
))
pred <- colMeans(ppred)
mean(pred)
## [1] 0.2022029
…that the non-discharged folks will discharge at (at least) the mean of the discharged folks
av_disch <- mean(non_missing_discharged$CSC_Duration_Totalmos)
non_missing_undischarged$CSC_Duration_Totalmos <-
ifelse(non_missing_undischarged$CSC_Duration_Totalmos < av_disch, av_disch,non_missing_undischarged$CSC_Duration_Totalmos )
ppred <- posterior_epred(pred_comp,
newdata =
bind_rows(non_missing_undischarged,
non_missing_discharged) %>%
select(CSC_Duration_Totalmos,
Age_Entry_Years,
percent_NEET,
simple_race,
male_c,
ever_private,
QPR_avg,
NEG_avg,
Site_ID,
hub_c
))
pred <- colMeans(ppred)
mean(pred)
## [1] 0.2384769
…that the non-discharged folks will discharge at (at least) the upper quartile of the discharged folks
(upper_quartile <- quantile(non_missing_discharged$CSC_Duration_Totalmos, 0.75))
## 75%
## 24
non_missing_undischarged$CSC_Duration_Totalmos <-
ifelse(non_missing_undischarged$CSC_Duration_Totalmos < upper_quartile, upper_quartile,non_missing_undischarged$CSC_Duration_Totalmos )
ppred <- posterior_epred(pred_comp,
newdata =
bind_rows(non_missing_undischarged,
non_missing_discharged) %>%
select(CSC_Duration_Totalmos,
Age_Entry_Years,
percent_NEET,
simple_race,
male_c,
ever_private,
QPR_avg,
NEG_avg,
Site_ID,
hub_c
))
pred <- colMeans(ppred)
mean(pred)
## [1] 0.292774