knitr::opts_chunk$set(
echo = TRUE,
warning = FALSE,
message = FALSE
)

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.2.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(broom)

1. Data and context

This analysis uses outreach data from Akurba community. Each row represents a patient seen during the outreach, with age, sex, presenting VA in each eye, and a clinician-assigned provisional diagnosis.

The goal here is to:

Estimate the prevalence of visual impairment using WHO-aligned categories.

Describe the pattern of eye diseases observed.

Explore how visual impairment varies by age and sex.

2. Load cleaned dataset

# Adjust this path/name to match your cleaned file from Project 1
akurba <- read_csv("data/akurba_with_referral.csv") %>%
  clean_names()

glimpse(akurba)
## Rows: 243
## Columns: 13
## $ patient_id      <chr> "P001", "P002", "P003", "P004", "P005", "P006", "P007"…
## $ age             <dbl> 45, 70, 54, 50, 72, 50, 53, 30, 47, 60, 37, 4, 4, 35, …
## $ sex             <chr> "Female", "Male", "Female", "Female", "Female", "Male"…
## $ va_re           <chr> "6/36", "6/36", "6/24", "6/36", "6/36", "6/9", "HM", "…
## $ va_le           <chr> "6/12", "6/60", "6/18", "6/36", "6/24", "6/9", "6/6", …
## $ diagnosis       <chr> "Binasal Pterygium", "Le Senile Cataract", "Presbyopia…
## $ diagnosis_cat   <chr> "Conjunctival/surface disorders", "Cataract-related", …
## $ age_group       <chr> "Middle_aged", "Elderly", "Middle_aged", "Middle_aged"…
## $ va_re_rank      <dbl> 6, 6, 5, 6, 6, 2, 10, 1, 1, 7, 1, NA, NA, 1, 1, 1, 1, …
## $ va_le_rank      <dbl> 3, 7, 4, 6, 5, 2, 1, 1, 1, 5, 1, NA, NA, 2, 1, 1, 1, 6…
## $ better_eye_rank <dbl> 3, 6, 4, 6, 5, 2, 1, 1, 1, 5, 1, NA, NA, 1, 1, 1, 1, 6…
## $ better_eye_cat  <chr> "Moderate", "Severe", "Moderate", "Severe", "Severe", …
## $ referral_needed <dbl> 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, …

Assumptions (adjust if needed):

age = numeric age in years

sex = coded “M”/“F” or similar

better_eye_cat = WHO-based visual acuity category for the better eye

diagnosis_cat = grouped diagnostic category (e.g. “Refractive”, “Cataract-related”, “Other”)

If your column names differ, fix them here once and use those consistently.

akurba <- akurba %>%
  mutate(
    sex = case_when(
      sex %in% c("M", "Male", "male") ~ "Male",
      sex %in% c("F", "Female", "female") ~ "Female",
      TRUE ~ NA_character_
      ),
    age_group = case_when(
      age < 18 ~ "0–17",
      age >= 18 & age < 40 ~ "18–39",
      age >= 40 & age < 60 ~ "40–59",
      age >= 60 ~ "60+",
      TRUE ~ NA_character_
      ),
    age_group = factor(age_group, levels = c("0–17", "18–39", "40–59", "60+"))
    )

akurba %>% count(sex)
## # A tibble: 2 × 2
##   sex        n
##   <chr>  <int>
## 1 Female   139
## 2 Male     104
akurba %>% count(age_group)
## # A tibble: 4 × 2
##   age_group     n
##   <fct>     <int>
## 1 0–17         33
## 2 18–39        57
## 3 40–59        98
## 4 60+          55

3. WHO visual impairment categories

We treat better_eye_cat as a factor in the order of increasing severity.

akurba <- akurba %>%
  mutate(
    better_eye_cat = fct_relevel(
      better_eye_cat,
      c("Normal", "Mild", "Moderate", "Severe", "Blindness")
      )
    )

akurba %>% count(better_eye_cat)
## # A tibble: 5 × 2
##   better_eye_cat     n
##   <fct>          <int>
## 1 Moderate          38
## 2 Severe            32
## 3 Normal_mild      137
## 4 Very_poor         20
## 5 <NA>              16

Create a binary “any visual impairment” variable (moderate or worse).

akurba <- akurba %>%
  mutate(
    vi_any = case_when(
      better_eye_cat %in% c("Moderate", "Severe", "Blindness") ~ 1L,
      better_eye_cat %in% c("Normal", "Mild") ~ 0L,
      TRUE ~ NA_integer_
      )
    )

akurba %>% count(vi_any)
## # A tibble: 2 × 2
##   vi_any     n
##    <int> <int>
## 1      1    70
## 2     NA   173

4. Overall prevalence of visual impairment

vi_overall <- akurba %>%
  summarise(
    n = sum(!is.na(vi_any)),
    n_vi = sum(vi_any == 1L, na.rm = TRUE),
    prev_vi = n_vi / n) %>%
  mutate(prev_vi_pct = percent(prev_vi, accuracy = 0.1))

vi_overall
## # A tibble: 1 × 4
##       n  n_vi prev_vi prev_vi_pct
##   <int> <int>   <dbl> <chr>      
## 1    70    70       1 100.0%

5. Visual impairment by age and sex

vi_age_sex <- akurba %>%
  filter(!is.na(vi_any), !is.na(age_group), !is.na(sex)) %>%
  group_by(age_group, sex) %>%
  summarise(
    n = n(),
    n_vi = sum(vi_any == 1L),
    prev_vi = n_vi / n,
    .groups = "drop"
    ) %>%
  mutate(prev_vi_pct = percent(prev_vi, accuracy = 0.1))

vi_age_sex
## # A tibble: 8 × 6
##   age_group sex        n  n_vi prev_vi prev_vi_pct
##   <fct>     <chr>  <int> <int>   <dbl> <chr>      
## 1 0–17      Female     1     1       1 100.0%     
## 2 0–17      Male       1     1       1 100.0%     
## 3 18–39     Female     3     3       1 100.0%     
## 4 18–39     Male       3     3       1 100.0%     
## 5 40–59     Female    26    26       1 100.0%     
## 6 40–59     Male       9     9       1 100.0%     
## 7 60+       Female    10    10       1 100.0%     
## 8 60+       Male      17    17       1 100.0%

Visual:

ggplot(vi_age_sex, aes(x = age_group, y = prev_vi, fill = sex)) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = percent_format(accuracy = 1), limits = c(0, 1)) +
  labs(
    title = "Prevalence of visual impairment (Moderate or worse) by age group and sex",
    x = "Age group",
    y = "Prevalence"
    )

6. Pattern of diagnostic categories

Overall frequency of diagnostic groups:

diag_overall <- akurba %>%
  count(diagnosis_cat) %>%
  mutate(
    prop = n / sum(n),
    prop_pct = percent(prop, accuracy = 0.1)
    ) %>%
  arrange(desc(n))

diag_overall
## # A tibble: 7 × 4
##   diagnosis_cat                      n    prop prop_pct
##   <chr>                          <int>   <dbl> <chr>   
## 1 Cataract-related                  72 0.296   29.6%   
## 2 Conjunctival/surface disorders    71 0.292   29.2%   
## 3 Refractive disorders              71 0.292   29.2%   
## 4 Others                            16 0.0658  6.6%    
## 5 Glaucoma-related                   9 0.0370  3.7%    
## 6 Corneal disorders                  3 0.0123  1.2%    
## 7 Trauma-related                     1 0.00412 0.4%

Plot:

ggplot(diag_overall, aes(x = reorder(diagnosis_cat, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Distribution of diagnostic categories – Akurba outreach",
    x = "Diagnostic category",
    y = "Number of patients"
    )

Diagnostic patterns by age group (optional):

diag_age <- akurba %>%
  filter(!is.na(age_group), !is.na(diagnosis_cat)) %>%
  count(age_group, diagnosis_cat) %>%
  group_by(age_group) %>%
  mutate(
    prop = n / sum(n),
    prop_pct = percent(prop, accuracy = 0.1),
    .groups = "drop"
    )

diag_age
## # A tibble: 23 × 6
## # Groups:   age_group [4]
##    age_group diagnosis_cat                      n   prop prop_pct .groups
##    <fct>     <chr>                          <int>  <dbl> <chr>    <chr>  
##  1 0–17      Cataract-related                   3 0.0909 9.1%     drop   
##  2 0–17      Conjunctival/surface disorders    24 0.727  72.7%    drop   
##  3 0–17      Corneal disorders                  1 0.0303 3.0%     drop   
##  4 0–17      Others                             2 0.0606 6.1%     drop   
##  5 0–17      Refractive disorders               3 0.0909 9.1%     drop   
##  6 18–39     Cataract-related                   3 0.0526 5.3%     drop   
##  7 18–39     Conjunctival/surface disorders    31 0.544  54.4%    drop   
##  8 18–39     Corneal disorders                  1 0.0175 1.8%     drop   
##  9 18–39     Glaucoma-related                   2 0.0351 3.5%     drop   
## 10 18–39     Others                             4 0.0702 7.0%     drop   
## # ℹ 13 more rows

Plot:

ggplot(diag_age, aes(x = age_group, y = prop, fill = diagnosis_cat)) +
  geom_col(position = "fill") +
  scale_y_continuous(labels = percent_format(accuracy = 1)) +
  labs(
    title = "Relative distribution of diagnostic categories by age group",
    x = "Age group",
    y = "Proportion"
    )

7. Simple model: predictors of visual impairment

We now fit a basic logistic regression with outcome “any visual impairment” and predictors age and sex. This is not meant to be a publish-ready model, just a demonstration of workflow.

glm_data <- akurba %>%
  filter(!is.na(vi_any), !is.na(age), !is.na(sex)) %>%
  mutate(
    sex = factor(sex),
    vi_any = as.factor(vi_any)
    )

summary(glm_data$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   13.00   50.00   55.00   53.51   60.00   80.00
table(glm_data$sex)
## 
## Female   Male 
##     40     30
fit_vi <- glm(
  vi_any ~ age + sex,
  data = glm_data,
  family = binomial
  )

summary(fit_vi)
## 
## Call:
## glm(formula = vi_any ~ age + sex, family = binomial, data = glm_data)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.657e+01  1.697e+05       0        1
## age          5.120e-16  3.168e+03       0        1
## sexMale      1.405e-14  8.881e+04       0        1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 0.0000e+00  on 69  degrees of freedom
## Residual deviance: 4.0611e-10  on 67  degrees of freedom
## AIC: 6
## 
## Number of Fisher Scoring iterations: 25

Tidy output with odds ratios:

# Get ORs, but skip profile-based CIs to avoid approx() error
fit_vi_tidy <- broom::tidy(fit_vi, exponentiate = TRUE, conf.int = FALSE)

fit_vi_tidy
## # A tibble: 3 × 5
##   term        estimate std.error statistic p.value
##   <chr>          <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept) 2.90e-12   169698. -1.57e- 4   1.000
## 2 age         1   e+ 0     3168.  1.62e-19   1    
## 3 sexMale     1.00e+ 0    88806.  1.58e-19   1

You can interpret this along the lines of:

OR per year increase in age

OR for Female vs Male, adjusted for age

8. Export summary tables

write_csv(vi_overall, "data/akurba_vi_overall.csv")
write_csv(vi_age_sex, "data/akurba_vi_by_age_sex.csv")
write_csv(diag_overall, "data/akurba_diag_overall.csv")
write_csv(diag_age, "data/akurba_diag_by_age.csv")
write_csv(fit_vi_tidy, "data/akurba_vi_glm_results.csv")