Project 2 – Prevalence and Pattern of Eye Diseases and Visual Impairment in Akurba Outreach

1. Data and context
2. Load cleaned dataset
3. WHO visual impairment categories
4. Overall prevalence of visual impairment
5. Visual impairment by age and sex
6. Pattern of diagnostic categories
7. Simple model: predictors of visual impairment
8. Export summary tables

knitr::opts_chunk$set(
echo = TRUE,
warning = FALSE,
message = FALSE
)

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.2.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(janitor)

## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

library(scales)

## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor

library(broom)

1. Data and context

This analysis uses outreach data from Akurba community. Each row represents a patient seen during the outreach, with age, sex, presenting VA in each eye, and a clinician-assigned provisional diagnosis.

The goal here is to:

Estimate the prevalence of visual impairment using WHO-aligned categories.

Describe the pattern of eye diseases observed.

Explore how visual impairment varies by age and sex.

2. Load cleaned dataset

# Adjust this path/name to match your cleaned file from Project 1
akurba <- read_csv("data/akurba_with_referral.csv") %>%
  clean_names()

glimpse(akurba)

## Rows: 243
## Columns: 13
## $ patient_id      <chr> "P001", "P002", "P003", "P004", "P005", "P006", "P007"…
## $ age             <dbl> 45, 70, 54, 50, 72, 50, 53, 30, 47, 60, 37, 4, 4, 35, …
## $ sex             <chr> "Female", "Male", "Female", "Female", "Female", "Male"…
## $ va_re           <chr> "6/36", "6/36", "6/24", "6/36", "6/36", "6/9", "HM", "…
## $ va_le           <chr> "6/12", "6/60", "6/18", "6/36", "6/24", "6/9", "6/6", …
## $ diagnosis       <chr> "Binasal Pterygium", "Le Senile Cataract", "Presbyopia…
## $ diagnosis_cat   <chr> "Conjunctival/surface disorders", "Cataract-related", …
## $ age_group       <chr> "Middle_aged", "Elderly", "Middle_aged", "Middle_aged"…
## $ va_re_rank      <dbl> 6, 6, 5, 6, 6, 2, 10, 1, 1, 7, 1, NA, NA, 1, 1, 1, 1, …
## $ va_le_rank      <dbl> 3, 7, 4, 6, 5, 2, 1, 1, 1, 5, 1, NA, NA, 2, 1, 1, 1, 6…
## $ better_eye_rank <dbl> 3, 6, 4, 6, 5, 2, 1, 1, 1, 5, 1, NA, NA, 1, 1, 1, 1, 6…
## $ better_eye_cat  <chr> "Moderate", "Severe", "Moderate", "Severe", "Severe", …
## $ referral_needed <dbl> 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, …

Assumptions (adjust if needed):

age = numeric age in years

sex = coded “M”/“F” or similar

better_eye_cat = WHO-based visual acuity category for the better eye

diagnosis_cat = grouped diagnostic category (e.g. “Refractive”, “Cataract-related”, “Other”)

If your column names differ, fix them here once and use those consistently.

akurba <- akurba %>%
  mutate(
    sex = case_when(
      sex %in% c("M", "Male", "male") ~ "Male",
      sex %in% c("F", "Female", "female") ~ "Female",
      TRUE ~ NA_character_
      ),
    age_group = case_when(
      age < 18 ~ "0–17",
      age >= 18 & age < 40 ~ "18–39",
      age >= 40 & age < 60 ~ "40–59",
      age >= 60 ~ "60+",
      TRUE ~ NA_character_
      ),
    age_group = factor(age_group, levels = c("0–17", "18–39", "40–59", "60+"))
    )

akurba %>% count(sex)

## # A tibble: 2 × 2
##   sex        n
##   <chr>  <int>
## 1 Female   139
## 2 Male     104

akurba %>% count(age_group)

## # A tibble: 4 × 2
##   age_group     n
##   <fct>     <int>
## 1 0–17         33
## 2 18–39        57
## 3 40–59        98
## 4 60+          55

3. WHO visual impairment categories

We treat better_eye_cat as a factor in the order of increasing severity.

akurba <- akurba %>%
  mutate(
    better_eye_cat = fct_relevel(
      better_eye_cat,
      c("Normal", "Mild", "Moderate", "Severe", "Blindness")
      )
    )

akurba %>% count(better_eye_cat)

## # A tibble: 5 × 2
##   better_eye_cat     n
##   <fct>          <int>
## 1 Moderate          38
## 2 Severe            32
## 3 Normal_mild      137
## 4 Very_poor         20
## 5 <NA>              16

Create a binary “any visual impairment” variable (moderate or worse).

akurba <- akurba %>%
  mutate(
    vi_any = case_when(
      better_eye_cat %in% c("Moderate", "Severe", "Blindness") ~ 1L,
      better_eye_cat %in% c("Normal", "Mild") ~ 0L,
      TRUE ~ NA_integer_
      )
    )

akurba %>% count(vi_any)

## # A tibble: 2 × 2
##   vi_any     n
##    <int> <int>
## 1      1    70
## 2     NA   173

4. Overall prevalence of visual impairment

vi_overall <- akurba %>%
  summarise(
    n = sum(!is.na(vi_any)),
    n_vi = sum(vi_any == 1L, na.rm = TRUE),
    prev_vi = n_vi / n) %>%
  mutate(prev_vi_pct = percent(prev_vi, accuracy = 0.1))

vi_overall

## # A tibble: 1 × 4
##       n  n_vi prev_vi prev_vi_pct
##   <int> <int>   <dbl> <chr>      
## 1    70    70       1 100.0%

5. Visual impairment by age and sex

vi_age_sex <- akurba %>%
  filter(!is.na(vi_any), !is.na(age_group), !is.na(sex)) %>%
  group_by(age_group, sex) %>%
  summarise(
    n = n(),
    n_vi = sum(vi_any == 1L),
    prev_vi = n_vi / n,
    .groups = "drop"
    ) %>%
  mutate(prev_vi_pct = percent(prev_vi, accuracy = 0.1))

vi_age_sex

## # A tibble: 8 × 6
##   age_group sex        n  n_vi prev_vi prev_vi_pct
##   <fct>     <chr>  <int> <int>   <dbl> <chr>      
## 1 0–17      Female     1     1       1 100.0%     
## 2 0–17      Male       1     1       1 100.0%     
## 3 18–39     Female     3     3       1 100.0%     
## 4 18–39     Male       3     3       1 100.0%     
## 5 40–59     Female    26    26       1 100.0%     
## 6 40–59     Male       9     9       1 100.0%     
## 7 60+       Female    10    10       1 100.0%     
## 8 60+       Male      17    17       1 100.0%

Visual:

ggplot(vi_age_sex, aes(x = age_group, y = prev_vi, fill = sex)) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = percent_format(accuracy = 1), limits = c(0, 1)) +
  labs(
    title = "Prevalence of visual impairment (Moderate or worse) by age group and sex",
    x = "Age group",
    y = "Prevalence"
    )

6. Pattern of diagnostic categories

Overall frequency of diagnostic groups:

diag_overall <- akurba %>%
  count(diagnosis_cat) %>%
  mutate(
    prop = n / sum(n),
    prop_pct = percent(prop, accuracy = 0.1)
    ) %>%
  arrange(desc(n))

diag_overall

## # A tibble: 7 × 4
##   diagnosis_cat                      n    prop prop_pct
##   <chr>                          <int>   <dbl> <chr>   
## 1 Cataract-related                  72 0.296   29.6%   
## 2 Conjunctival/surface disorders    71 0.292   29.2%   
## 3 Refractive disorders              71 0.292   29.2%   
## 4 Others                            16 0.0658  6.6%    
## 5 Glaucoma-related                   9 0.0370  3.7%    
## 6 Corneal disorders                  3 0.0123  1.2%    
## 7 Trauma-related                     1 0.00412 0.4%

Plot:

ggplot(diag_overall, aes(x = reorder(diagnosis_cat, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Distribution of diagnostic categories – Akurba outreach",
    x = "Diagnostic category",
    y = "Number of patients"
    )

Diagnostic patterns by age group (optional):

diag_age <- akurba %>%
  filter(!is.na(age_group), !is.na(diagnosis_cat)) %>%
  count(age_group, diagnosis_cat) %>%
  group_by(age_group) %>%
  mutate(
    prop = n / sum(n),
    prop_pct = percent(prop, accuracy = 0.1),
    .groups = "drop"
    )

diag_age

## # A tibble: 23 × 6
## # Groups:   age_group [4]
##    age_group diagnosis_cat                      n   prop prop_pct .groups
##    <fct>     <chr>                          <int>  <dbl> <chr>    <chr>  
##  1 0–17      Cataract-related                   3 0.0909 9.1%     drop   
##  2 0–17      Conjunctival/surface disorders    24 0.727  72.7%    drop   
##  3 0–17      Corneal disorders                  1 0.0303 3.0%     drop   
##  4 0–17      Others                             2 0.0606 6.1%     drop   
##  5 0–17      Refractive disorders               3 0.0909 9.1%     drop   
##  6 18–39     Cataract-related                   3 0.0526 5.3%     drop   
##  7 18–39     Conjunctival/surface disorders    31 0.544  54.4%    drop   
##  8 18–39     Corneal disorders                  1 0.0175 1.8%     drop   
##  9 18–39     Glaucoma-related                   2 0.0351 3.5%     drop   
## 10 18–39     Others                             4 0.0702 7.0%     drop   
## # ℹ 13 more rows

Plot:

ggplot(diag_age, aes(x = age_group, y = prop, fill = diagnosis_cat)) +
  geom_col(position = "fill") +
  scale_y_continuous(labels = percent_format(accuracy = 1)) +
  labs(
    title = "Relative distribution of diagnostic categories by age group",
    x = "Age group",
    y = "Proportion"
    )

7. Simple model: predictors of visual impairment

We now fit a basic logistic regression with outcome “any visual impairment” and predictors age and sex. This is not meant to be a publish-ready model, just a demonstration of workflow.

glm_data <- akurba %>%
  filter(!is.na(vi_any), !is.na(age), !is.na(sex)) %>%
  mutate(
    sex = factor(sex),
    vi_any = as.factor(vi_any)
    )

summary(glm_data$age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   13.00   50.00   55.00   53.51   60.00   80.00

table(glm_data$sex)

## 
## Female   Male 
##     40     30

fit_vi <- glm(
  vi_any ~ age + sex,
  data = glm_data,
  family = binomial
  )

summary(fit_vi)

## 
## Call:
## glm(formula = vi_any ~ age + sex, family = binomial, data = glm_data)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.657e+01  1.697e+05       0        1
## age          5.120e-16  3.168e+03       0        1
## sexMale      1.405e-14  8.881e+04       0        1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 0.0000e+00  on 69  degrees of freedom
## Residual deviance: 4.0611e-10  on 67  degrees of freedom
## AIC: 6
## 
## Number of Fisher Scoring iterations: 25

Tidy output with odds ratios:

# Get ORs, but skip profile-based CIs to avoid approx() error
fit_vi_tidy <- broom::tidy(fit_vi, exponentiate = TRUE, conf.int = FALSE)

fit_vi_tidy

## # A tibble: 3 × 5
##   term        estimate std.error statistic p.value
##   <chr>          <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept) 2.90e-12   169698. -1.57e- 4   1.000
## 2 age         1   e+ 0     3168.  1.62e-19   1    
## 3 sexMale     1.00e+ 0    88806.  1.58e-19   1

You can interpret this along the lines of:

OR per year increase in age

OR for Female vs Male, adjusted for age

8. Export summary tables

write_csv(vi_overall, "data/akurba_vi_overall.csv")
write_csv(vi_age_sex, "data/akurba_vi_by_age_sex.csv")
write_csv(diag_overall, "data/akurba_diag_overall.csv")
write_csv(diag_age, "data/akurba_diag_by_age.csv")
write_csv(fit_vi_tidy, "data/akurba_vi_glm_results.csv")