Project 3: Referral Prediction Model

1. Introduction

This analysis builds a logistic regression model to predict whether outreach patients meet a predefined referral rule. The model uses only simple screening variables that can be collected quickly in the field:

age group
sex
better-eye visual acuity category

The outcome referral_needed was constructed in Project 1 using a rule that combines diagnostic category and visual acuity. Here, the goal is to see how well we can approximate that rule using only age and visual acuity information, without diagnosis.

2. Data import and preprocessing

# Read the cleaned dataset produced in Project 1.
# Path assumes this Rmd is in project3_referral_model/
akurba <- readr::read_csv('C:\\Users\\Micah\\Desktop\\R_portfolio_project\\ophthal_analytics\\project1_clean_qc\\data_processed\\akurba_with_referral.csv')

glimpse(akurba)

## Rows: 243
## Columns: 13
## $ Patient_ID      <chr> "P001", "P002", "P003", "P004", "P005", "P006", "P007"…
## $ Age             <dbl> 45, 70, 54, 50, 72, 50, 53, 30, 47, 60, 37, 4, 4, 35, …
## $ Sex             <chr> "Female", "Male", "Female", "Female", "Female", "Male"…
## $ va_re           <chr> "6/36", "6/36", "6/24", "6/36", "6/36", "6/9", "HM", "…
## $ va_le           <chr> "6/12", "6/60", "6/18", "6/36", "6/24", "6/9", "6/6", …
## $ diagnosis       <chr> "Binasal Pterygium", "Le Senile Cataract", "Presbyopia…
## $ diagnosis_cat   <chr> "Conjunctival/surface disorders", "Cataract-related", …
## $ age_group       <chr> "Middle_aged", "Elderly", "Middle_aged", "Middle_aged"…
## $ va_re_rank      <dbl> 6, 6, 5, 6, 6, 2, 10, 1, 1, 7, 1, NA, NA, 1, 1, 1, 1, …
## $ va_le_rank      <dbl> 3, 7, 4, 6, 5, 2, 1, 1, 1, 5, 1, NA, NA, 2, 1, 1, 1, 6…
## $ better_eye_rank <dbl> 3, 6, 4, 6, 5, 2, 1, 1, 1, 5, 1, NA, NA, 1, 1, 1, 1, 6…
## $ who_va_cat      <chr> "Mild VI", "Severe VI", "Moderate VI", "Severe VI", "S…
## $ referral_needed <dbl> 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, …

# Prepare factors and drop rows with missing values in the variables used.
akurba <- akurba %>%
  mutate(
    # Outcome as factor with 0 = no referral, 1 = referral
    referral_needed = factor(referral_needed, levels = c(0, 1)),

    # Ensure predictors are factors with stable ordering
    sex = factor(Sex),

    age_group = factor(
      age_group,
      levels = c("Child", "Adult", "Middle_aged", "Elderly")
    ),

     who_va_cat = factor(
      who_va_cat,
      levels = c("Normal", "Mild VI", "Moderate VI", "Severe VI", "Blindness"),
      ordered = TRUE
    )
  ) %>%
  # Keep only rows with complete data for outcome and predictors
  drop_na(referral_needed, age_group, sex, who_va_cat)

# Check outcome distribution after cleaning
table(akurba$referral_needed)

## 
##   0   1 
## 119 108

3. Quick exploration of outcome by predictors

# Referral status by age group
table(akurba$referral_needed, akurba$age_group)

##    
##     Child Adult Middle_aged Elderly
##   0    18    66          32       3
##   1     5    20          54      29

# Referral status by better-eye category
table(akurba$referral_needed, akurba$who_va_cat)

##    
##     Normal Mild VI Moderate VI Severe VI Blindness
##   0    113       6           0         0         0
##   1     24      20          12        32        20

4. Train–test split

set.seed(2025)  # for reproducibility

split <- initial_split(akurba, prop = 0.7, strata = referral_needed)

train_data <- training(split)
test_data  <- testing(split)

# Check class proportions in each split
prop.table(table(train_data$referral_needed))

## 
##         0         1 
## 0.5253165 0.4746835

prop.table(table(test_data$referral_needed))

## 
##         0         1 
## 0.5217391 0.4782609

5. Logistic regression model

model_reduced <- glm(
  referral_needed ~ age_group + sex + who_va_cat,
  data = train_data,
  family = binomial
)

summary(model_reduced)

## 
## Call:
## glm(formula = referral_needed ~ age_group + sex + who_va_cat, 
##     family = binomial, data = train_data)
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)
## (Intercept)            11.1809  1030.6030   0.011    0.991
## age_groupAdult         -0.1324     0.7877  -0.168    0.867
## age_groupMiddle_aged    0.5881     0.7678   0.766    0.444
## age_groupElderly        1.5171     1.0859   1.397    0.162
## sexMale                -0.1455     0.5133  -0.284    0.777
## who_va_cat.L           18.6835  1976.2905   0.009    0.992
## who_va_cat.Q           -6.3473  2522.4210  -0.003    0.998
## who_va_cat.C           -5.2438  1765.9681  -0.003    0.998
## who_va_cat^4            6.4111  2802.8319   0.002    0.998
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 218.63  on 157  degrees of freedom
## Residual deviance: 114.67  on 149  degrees of freedom
## AIC: 132.67
## 
## Number of Fisher Scoring iterations: 18

model_tidy <- broom::tidy(
  model_reduced,
  exponentiate = TRUE,  # report odds ratios
  conf.int = TRUE       # include confidence intervals
)

model_tidy

## # A tibble: 9 × 7
##   term                 estimate std.error statistic p.value conf.low conf.high
##   <chr>                   <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 (Intercept)           7.17e+4  1031.      0.0108    0.991 5.28e-40 NA       
## 2 age_groupAdult        8.76e-1     0.788  -0.168     0.867 2.00e- 1  4.79e  0
## 3 age_groupMiddle_aged  1.80e+0     0.768   0.766     0.444 4.30e- 1  9.49e  0
## 4 age_groupElderly      4.56e+0     1.09    1.40      0.162 5.53e- 1  4.32e  1
## 5 sexMale               8.65e-1     0.513  -0.284     0.777 3.07e- 1  2.35e  0
## 6 who_va_cat.L          1.30e+8  1976.      0.00945   0.992 1.84e-37  2.32e209
## 7 who_va_cat.Q          1.75e-3  2522.     -0.00252   0.998 2.04e-27  1.61e 22
## 8 who_va_cat.C          5.28e-3  1766.     -0.00297   0.998 1.84e-19  8.00e 13
## 9 who_va_cat^4          6.09e+2  2803.      0.00229   0.998 1.67e-21  5.92e 26

6. Performance on the held-out test set

6.1 Predictions and confusion matrix at 0.5 threshold

test_pred <- test_data %>%
  mutate(
    # Predicted probability of referral
    pred_prob = predict(model_reduced, newdata = test_data, type = "response"),

    # Classify using 0.5 threshold
    pred_class = if_else(pred_prob >= 0.5, "1", "0"),
    pred_class = factor(pred_class, levels = c("0", "1"))
  )

conf_mat_0_5 <- yardstick::conf_mat(
  test_pred,
  truth = referral_needed,
  estimate = pred_class
)
conf_mat_0_5

##           Truth
## Prediction  0  1
##          0 35  5
##          1  1 28

metrics_0_5 <- yardstick::metrics(
  test_pred,
  truth = referral_needed,
  estimate = pred_class
)
metrics_0_5

## # A tibble: 2 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.913
## 2 kap      binary         0.825

6.2 ROC curve and AUC

roc_obj <- pROC::roc(
  response  = as.numeric(test_data$referral_needed) - 1,
  predictor = test_pred$pred_prob
)

plot(
  roc_obj,
  main = "ROC curve – referral prediction model",
  print.auc = TRUE
)

7. Sensitivity analysis with a lower threshold

test_pred <- test_pred %>%
  mutate(
    pred_class_0_3 = if_else(pred_prob >= 0.3, "1", "0"),
    pred_class_0_3 = factor(pred_class_0_3, levels = c("0", "1"))
  )

# Confusion matrix at threshold 0.3
conf_mat_0_3 <- yardstick::conf_mat(
  test_pred,
  truth = referral_needed,
  estimate = pred_class_0_3
)
conf_mat_0_3

##           Truth
## Prediction  0  1
##          0 35  4
##          1  1 29

# Metrics at threshold 0.3
metrics_0_3 <- yardstick::metrics(
  test_pred,
  truth = referral_needed,
  estimate = pred_class_0_3
)
metrics_0_3

## # A tibble: 2 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.928
## 2 kap      binary         0.854

8. Limitations and conclusion

The outcome referral_needed is a rule-based label, not a reflection of clinician decision-making.
The model cannot capture more subtle or contextual judgement calls made in practice.
Visual acuity alone explains most of the variance: categories like “Severe” and “Very_poor” lead to near-complete separation.
This analysis demonstrates how far one can get with very simple, low-cost inputs such as age and VA alone.