This analysis builds a logistic regression model to predict whether outreach patients meet a predefined referral rule. The model uses only simple screening variables that can be collected quickly in the field:
The outcome referral_needed was constructed in Project 1
using a rule that combines diagnostic category and visual acuity. Here,
the goal is to see how well we can approximate that rule using only age
and visual acuity information, without diagnosis.
# Read the cleaned dataset produced in Project 1.
# Path assumes this Rmd is in project3_referral_model/
akurba <- readr::read_csv('C:\\Users\\Micah\\Desktop\\R_portfolio_project\\ophthal_analytics\\project1_clean_qc\\data_processed\\akurba_with_referral.csv')
glimpse(akurba)
## Rows: 243
## Columns: 13
## $ Patient_ID <chr> "P001", "P002", "P003", "P004", "P005", "P006", "P007"…
## $ Age <dbl> 45, 70, 54, 50, 72, 50, 53, 30, 47, 60, 37, 4, 4, 35, …
## $ Sex <chr> "Female", "Male", "Female", "Female", "Female", "Male"…
## $ va_re <chr> "6/36", "6/36", "6/24", "6/36", "6/36", "6/9", "HM", "…
## $ va_le <chr> "6/12", "6/60", "6/18", "6/36", "6/24", "6/9", "6/6", …
## $ diagnosis <chr> "Binasal Pterygium", "Le Senile Cataract", "Presbyopia…
## $ diagnosis_cat <chr> "Conjunctival/surface disorders", "Cataract-related", …
## $ age_group <chr> "Middle_aged", "Elderly", "Middle_aged", "Middle_aged"…
## $ va_re_rank <dbl> 6, 6, 5, 6, 6, 2, 10, 1, 1, 7, 1, NA, NA, 1, 1, 1, 1, …
## $ va_le_rank <dbl> 3, 7, 4, 6, 5, 2, 1, 1, 1, 5, 1, NA, NA, 2, 1, 1, 1, 6…
## $ better_eye_rank <dbl> 3, 6, 4, 6, 5, 2, 1, 1, 1, 5, 1, NA, NA, 1, 1, 1, 1, 6…
## $ who_va_cat <chr> "Mild VI", "Severe VI", "Moderate VI", "Severe VI", "S…
## $ referral_needed <dbl> 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, …
# Prepare factors and drop rows with missing values in the variables used.
akurba <- akurba %>%
mutate(
# Outcome as factor with 0 = no referral, 1 = referral
referral_needed = factor(referral_needed, levels = c(0, 1)),
# Ensure predictors are factors with stable ordering
sex = factor(Sex),
age_group = factor(
age_group,
levels = c("Child", "Adult", "Middle_aged", "Elderly")
),
who_va_cat = factor(
who_va_cat,
levels = c("Normal", "Mild VI", "Moderate VI", "Severe VI", "Blindness"),
ordered = TRUE
)
) %>%
# Keep only rows with complete data for outcome and predictors
drop_na(referral_needed, age_group, sex, who_va_cat)
# Check outcome distribution after cleaning
table(akurba$referral_needed)
##
## 0 1
## 119 108
# Referral status by age group
table(akurba$referral_needed, akurba$age_group)
##
## Child Adult Middle_aged Elderly
## 0 18 66 32 3
## 1 5 20 54 29
# Referral status by better-eye category
table(akurba$referral_needed, akurba$who_va_cat)
##
## Normal Mild VI Moderate VI Severe VI Blindness
## 0 113 6 0 0 0
## 1 24 20 12 32 20
set.seed(2025) # for reproducibility
split <- initial_split(akurba, prop = 0.7, strata = referral_needed)
train_data <- training(split)
test_data <- testing(split)
# Check class proportions in each split
prop.table(table(train_data$referral_needed))
##
## 0 1
## 0.5253165 0.4746835
prop.table(table(test_data$referral_needed))
##
## 0 1
## 0.5217391 0.4782609
model_reduced <- glm(
referral_needed ~ age_group + sex + who_va_cat,
data = train_data,
family = binomial
)
summary(model_reduced)
##
## Call:
## glm(formula = referral_needed ~ age_group + sex + who_va_cat,
## family = binomial, data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 11.1809 1030.6030 0.011 0.991
## age_groupAdult -0.1324 0.7877 -0.168 0.867
## age_groupMiddle_aged 0.5881 0.7678 0.766 0.444
## age_groupElderly 1.5171 1.0859 1.397 0.162
## sexMale -0.1455 0.5133 -0.284 0.777
## who_va_cat.L 18.6835 1976.2905 0.009 0.992
## who_va_cat.Q -6.3473 2522.4210 -0.003 0.998
## who_va_cat.C -5.2438 1765.9681 -0.003 0.998
## who_va_cat^4 6.4111 2802.8319 0.002 0.998
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 218.63 on 157 degrees of freedom
## Residual deviance: 114.67 on 149 degrees of freedom
## AIC: 132.67
##
## Number of Fisher Scoring iterations: 18
model_tidy <- broom::tidy(
model_reduced,
exponentiate = TRUE, # report odds ratios
conf.int = TRUE # include confidence intervals
)
model_tidy
## # A tibble: 9 × 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 7.17e+4 1031. 0.0108 0.991 5.28e-40 NA
## 2 age_groupAdult 8.76e-1 0.788 -0.168 0.867 2.00e- 1 4.79e 0
## 3 age_groupMiddle_aged 1.80e+0 0.768 0.766 0.444 4.30e- 1 9.49e 0
## 4 age_groupElderly 4.56e+0 1.09 1.40 0.162 5.53e- 1 4.32e 1
## 5 sexMale 8.65e-1 0.513 -0.284 0.777 3.07e- 1 2.35e 0
## 6 who_va_cat.L 1.30e+8 1976. 0.00945 0.992 1.84e-37 2.32e209
## 7 who_va_cat.Q 1.75e-3 2522. -0.00252 0.998 2.04e-27 1.61e 22
## 8 who_va_cat.C 5.28e-3 1766. -0.00297 0.998 1.84e-19 8.00e 13
## 9 who_va_cat^4 6.09e+2 2803. 0.00229 0.998 1.67e-21 5.92e 26
test_pred <- test_data %>%
mutate(
# Predicted probability of referral
pred_prob = predict(model_reduced, newdata = test_data, type = "response"),
# Classify using 0.5 threshold
pred_class = if_else(pred_prob >= 0.5, "1", "0"),
pred_class = factor(pred_class, levels = c("0", "1"))
)
conf_mat_0_5 <- yardstick::conf_mat(
test_pred,
truth = referral_needed,
estimate = pred_class
)
conf_mat_0_5
## Truth
## Prediction 0 1
## 0 35 5
## 1 1 28
metrics_0_5 <- yardstick::metrics(
test_pred,
truth = referral_needed,
estimate = pred_class
)
metrics_0_5
## # A tibble: 2 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.913
## 2 kap binary 0.825
roc_obj <- pROC::roc(
response = as.numeric(test_data$referral_needed) - 1,
predictor = test_pred$pred_prob
)
plot(
roc_obj,
main = "ROC curve – referral prediction model",
print.auc = TRUE
)
test_pred <- test_pred %>%
mutate(
pred_class_0_3 = if_else(pred_prob >= 0.3, "1", "0"),
pred_class_0_3 = factor(pred_class_0_3, levels = c("0", "1"))
)
# Confusion matrix at threshold 0.3
conf_mat_0_3 <- yardstick::conf_mat(
test_pred,
truth = referral_needed,
estimate = pred_class_0_3
)
conf_mat_0_3
## Truth
## Prediction 0 1
## 0 35 4
## 1 1 29
# Metrics at threshold 0.3
metrics_0_3 <- yardstick::metrics(
test_pred,
truth = referral_needed,
estimate = pred_class_0_3
)
metrics_0_3
## # A tibble: 2 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.928
## 2 kap binary 0.854
referral_needed is a rule-based label, not
a reflection of clinician decision-making.