knitr::opts_chunk$set(
echo = TRUE,
warning = FALSE,
message = FALSE
)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(broom)
This analysis uses outreach data from Akurba community. Each row represents a patient seen during the outreach, with age, sex, presenting VA in each eye, and a clinician-assigned provisional diagnosis.
The goal here is to:
Estimate the prevalence of visual impairment using WHO-aligned categories.
Describe the pattern of eye diseases observed.
Explore how visual impairment varies by age and sex.
# Adjust this path/name to match your cleaned file from Project 1
akurba <- read_csv("data/akurba_with_referral.csv") %>%
clean_names()
glimpse(akurba)
## Rows: 243
## Columns: 13
## $ patient_id <chr> "P001", "P002", "P003", "P004", "P005", "P006", "P007"…
## $ age <dbl> 45, 70, 54, 50, 72, 50, 53, 30, 47, 60, 37, 4, 4, 35, …
## $ sex <chr> "Female", "Male", "Female", "Female", "Female", "Male"…
## $ va_re <chr> "6/36", "6/36", "6/24", "6/36", "6/36", "6/9", "HM", "…
## $ va_le <chr> "6/12", "6/60", "6/18", "6/36", "6/24", "6/9", "6/6", …
## $ diagnosis <chr> "Binasal Pterygium", "Le Senile Cataract", "Presbyopia…
## $ diagnosis_cat <chr> "Conjunctival/surface disorders", "Cataract-related", …
## $ age_group <chr> "Middle_aged", "Elderly", "Middle_aged", "Middle_aged"…
## $ va_re_rank <dbl> 6, 6, 5, 6, 6, 2, 10, 1, 1, 7, 1, NA, NA, 1, 1, 1, 1, …
## $ va_le_rank <dbl> 3, 7, 4, 6, 5, 2, 1, 1, 1, 5, 1, NA, NA, 2, 1, 1, 1, 6…
## $ better_eye_rank <dbl> 3, 6, 4, 6, 5, 2, 1, 1, 1, 5, 1, NA, NA, 1, 1, 1, 1, 6…
## $ better_eye_cat <chr> "Moderate", "Severe", "Moderate", "Severe", "Severe", …
## $ referral_needed <dbl> 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, …
Assumptions (adjust if needed):
age = numeric age in years
sex = coded “M”/“F” or similar
better_eye_cat = WHO-based visual acuity category for the better eye
diagnosis_cat = grouped diagnostic category (e.g. “Refractive”, “Cataract-related”, “Other”)
If your column names differ, fix them here once and use those consistently.
akurba <- akurba %>%
mutate(
sex = case_when(
sex %in% c("M", "Male", "male") ~ "Male",
sex %in% c("F", "Female", "female") ~ "Female",
TRUE ~ NA_character_
),
age_group = case_when(
age < 18 ~ "0–17",
age >= 18 & age < 40 ~ "18–39",
age >= 40 & age < 60 ~ "40–59",
age >= 60 ~ "60+",
TRUE ~ NA_character_
),
age_group = factor(age_group, levels = c("0–17", "18–39", "40–59", "60+"))
)
akurba %>% count(sex)
## # A tibble: 2 × 2
## sex n
## <chr> <int>
## 1 Female 139
## 2 Male 104
akurba %>% count(age_group)
## # A tibble: 4 × 2
## age_group n
## <fct> <int>
## 1 0–17 33
## 2 18–39 57
## 3 40–59 98
## 4 60+ 55
We treat better_eye_cat as a factor in the order of increasing severity.
akurba <- akurba %>%
mutate(
better_eye_cat = fct_relevel(
better_eye_cat,
c("Normal", "Mild", "Moderate", "Severe", "Blindness")
)
)
akurba %>% count(better_eye_cat)
## # A tibble: 5 × 2
## better_eye_cat n
## <fct> <int>
## 1 Moderate 38
## 2 Severe 32
## 3 Normal_mild 137
## 4 Very_poor 20
## 5 <NA> 16
Create a binary “any visual impairment” variable (moderate or worse).
akurba <- akurba %>%
mutate(
vi_any = case_when(
better_eye_cat %in% c("Moderate", "Severe", "Blindness") ~ 1L,
better_eye_cat %in% c("Normal", "Mild") ~ 0L,
TRUE ~ NA_integer_
)
)
akurba %>% count(vi_any)
## # A tibble: 2 × 2
## vi_any n
## <int> <int>
## 1 1 70
## 2 NA 173
vi_overall <- akurba %>%
summarise(
n = sum(!is.na(vi_any)),
n_vi = sum(vi_any == 1L, na.rm = TRUE),
prev_vi = n_vi / n) %>%
mutate(prev_vi_pct = percent(prev_vi, accuracy = 0.1))
vi_overall
## # A tibble: 1 × 4
## n n_vi prev_vi prev_vi_pct
## <int> <int> <dbl> <chr>
## 1 70 70 1 100.0%
vi_age_sex <- akurba %>%
filter(!is.na(vi_any), !is.na(age_group), !is.na(sex)) %>%
group_by(age_group, sex) %>%
summarise(
n = n(),
n_vi = sum(vi_any == 1L),
prev_vi = n_vi / n,
.groups = "drop"
) %>%
mutate(prev_vi_pct = percent(prev_vi, accuracy = 0.1))
vi_age_sex
## # A tibble: 8 × 6
## age_group sex n n_vi prev_vi prev_vi_pct
## <fct> <chr> <int> <int> <dbl> <chr>
## 1 0–17 Female 1 1 1 100.0%
## 2 0–17 Male 1 1 1 100.0%
## 3 18–39 Female 3 3 1 100.0%
## 4 18–39 Male 3 3 1 100.0%
## 5 40–59 Female 26 26 1 100.0%
## 6 40–59 Male 9 9 1 100.0%
## 7 60+ Female 10 10 1 100.0%
## 8 60+ Male 17 17 1 100.0%
Visual:
ggplot(vi_age_sex, aes(x = age_group, y = prev_vi, fill = sex)) +
geom_col(position = "dodge") +
scale_y_continuous(labels = percent_format(accuracy = 1), limits = c(0, 1)) +
labs(
title = "Prevalence of visual impairment (Moderate or worse) by age group and sex",
x = "Age group",
y = "Prevalence"
)
Overall frequency of diagnostic groups:
diag_overall <- akurba %>%
count(diagnosis_cat) %>%
mutate(
prop = n / sum(n),
prop_pct = percent(prop, accuracy = 0.1)
) %>%
arrange(desc(n))
diag_overall
## # A tibble: 7 × 4
## diagnosis_cat n prop prop_pct
## <chr> <int> <dbl> <chr>
## 1 Cataract-related 72 0.296 29.6%
## 2 Conjunctival/surface disorders 71 0.292 29.2%
## 3 Refractive disorders 71 0.292 29.2%
## 4 Others 16 0.0658 6.6%
## 5 Glaucoma-related 9 0.0370 3.7%
## 6 Corneal disorders 3 0.0123 1.2%
## 7 Trauma-related 1 0.00412 0.4%
Plot:
ggplot(diag_overall, aes(x = reorder(diagnosis_cat, n), y = n)) +
geom_col() +
coord_flip() +
labs(
title = "Distribution of diagnostic categories – Akurba outreach",
x = "Diagnostic category",
y = "Number of patients"
)
Diagnostic patterns by age group (optional):
diag_age <- akurba %>%
filter(!is.na(age_group), !is.na(diagnosis_cat)) %>%
count(age_group, diagnosis_cat) %>%
group_by(age_group) %>%
mutate(
prop = n / sum(n),
prop_pct = percent(prop, accuracy = 0.1),
.groups = "drop"
)
diag_age
## # A tibble: 23 × 6
## # Groups: age_group [4]
## age_group diagnosis_cat n prop prop_pct .groups
## <fct> <chr> <int> <dbl> <chr> <chr>
## 1 0–17 Cataract-related 3 0.0909 9.1% drop
## 2 0–17 Conjunctival/surface disorders 24 0.727 72.7% drop
## 3 0–17 Corneal disorders 1 0.0303 3.0% drop
## 4 0–17 Others 2 0.0606 6.1% drop
## 5 0–17 Refractive disorders 3 0.0909 9.1% drop
## 6 18–39 Cataract-related 3 0.0526 5.3% drop
## 7 18–39 Conjunctival/surface disorders 31 0.544 54.4% drop
## 8 18–39 Corneal disorders 1 0.0175 1.8% drop
## 9 18–39 Glaucoma-related 2 0.0351 3.5% drop
## 10 18–39 Others 4 0.0702 7.0% drop
## # ℹ 13 more rows
Plot:
ggplot(diag_age, aes(x = age_group, y = prop, fill = diagnosis_cat)) +
geom_col(position = "fill") +
scale_y_continuous(labels = percent_format(accuracy = 1)) +
labs(
title = "Relative distribution of diagnostic categories by age group",
x = "Age group",
y = "Proportion"
)
We now fit a basic logistic regression with outcome “any visual impairment” and predictors age and sex. This is not meant to be a publish-ready model, just a demonstration of workflow.
glm_data <- akurba %>%
filter(!is.na(vi_any), !is.na(age), !is.na(sex)) %>%
mutate(
sex = factor(sex),
vi_any = as.factor(vi_any)
)
summary(glm_data$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 13.00 50.00 55.00 53.51 60.00 80.00
table(glm_data$sex)
##
## Female Male
## 40 30
fit_vi <- glm(
vi_any ~ age + sex,
data = glm_data,
family = binomial
)
summary(fit_vi)
##
## Call:
## glm(formula = vi_any ~ age + sex, family = binomial, data = glm_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.657e+01 1.697e+05 0 1
## age 5.120e-16 3.168e+03 0 1
## sexMale 1.405e-14 8.881e+04 0 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 0.0000e+00 on 69 degrees of freedom
## Residual deviance: 4.0611e-10 on 67 degrees of freedom
## AIC: 6
##
## Number of Fisher Scoring iterations: 25
Tidy output with odds ratios:
# Get ORs, but skip profile-based CIs to avoid approx() error
fit_vi_tidy <- broom::tidy(fit_vi, exponentiate = TRUE, conf.int = FALSE)
fit_vi_tidy
## # A tibble: 3 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 2.90e-12 169698. -1.57e- 4 1.000
## 2 age 1 e+ 0 3168. 1.62e-19 1
## 3 sexMale 1.00e+ 0 88806. 1.58e-19 1
You can interpret this along the lines of:
OR per year increase in age
OR for Female vs Male, adjusted for age
write_csv(vi_overall, "data/akurba_vi_overall.csv")
write_csv(vi_age_sex, "data/akurba_vi_by_age_sex.csv")
write_csv(diag_overall, "data/akurba_diag_overall.csv")
write_csv(diag_age, "data/akurba_diag_by_age.csv")
write_csv(fit_vi_tidy, "data/akurba_vi_glm_results.csv")