# Install packages if not already installed (optional)
# install.packages(c('tidyverse', 'janitor', 'skimr', 'here', 'naniar', 'scales'))
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(skimr)
library(here)
## here() starts at C:/Users/Micah/Desktop/R_portfolio_project/ophthal_analytics
library(naniar)
##
## Attaching package: 'naniar'
##
## The following object is masked from 'package:skimr':
##
## n_complete
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
data_raw <- read.csv("data_raw/akurba_cleaned_anonymised (1).csv")
glimpse(data_raw)
## Rows: 243
## Columns: 7
## $ Patient_ID <chr> "P001", "P002", "P003", "P004", "P005", "P006", …
## $ Age <int> 45, 70, 54, 50, 72, 50, 53, 30, 47, 60, 37, 4, 4…
## $ Sex <chr> "Female", "Male", "Female", "Female", "Female", …
## $ VA.RE. <chr> "6/36", "6/36", "6/24", "6/36", "6/36", "6/9", "…
## $ VA.LE. <chr> "6/12", "6/60", "6/18", "6/36", "6/24", "6/9", "…
## $ Provisional.Diagnosis <chr> "Binasal Pterygium", "Le Senile Cataract", "Pres…
## $ Diagnostic.Category <chr> "Conjunctival/surface disorders", "Cataract-rela…
names(data_raw)
## [1] "Patient_ID" "Age" "Sex"
## [4] "VA.RE." "VA.LE." "Provisional.Diagnosis"
## [7] "Diagnostic.Category"
Content:
Dimensions: number of rows and columns.
Missingness by variable.
Duplicates on patient_id if present.
Simple range checks: age, VA.
dim(data_raw)
## [1] 243 7
skim(data_raw)
| Name | data_raw |
| Number of rows | 243 |
| Number of columns | 7 |
| _______________________ | |
| Column type frequency: | |
| character | 6 |
| numeric | 1 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Patient_ID | 0 | 1 | 4 | 4 | 0 | 243 | 0 |
| Sex | 0 | 1 | 4 | 6 | 0 | 2 | 0 |
| VA.RE. | 0 | 1 | 0 | 13 | 10 | 20 | 0 |
| VA.LE. | 0 | 1 | 0 | 13 | 11 | 21 | 0 |
| Provisional.Diagnosis | 0 | 1 | 1 | 33 | 0 | 55 | 0 |
| Diagnostic.Category | 0 | 1 | 6 | 30 | 0 | 7 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Age | 0 | 1 | 42.32 | 19.33 | 0 | 30 | 45 | 56 | 87 | ▃▃▇▅▂ |
vis_miss(data_raw)
Check age range:
data_raw %>% summarise(min_age = min(Age, na.rm = TRUE),
max_age = max(Age, na.rm = TRUE))
## min_age max_age
## 1 0 87
data_clean <- data_raw %>%
rename(va_re = VA.RE., va_le = VA.LE., diagnosis=Provisional.Diagnosis, diagnosis_cat=Diagnostic.Category) %>%
mutate(
Sex = case_when(
Sex %in% c("M", "Male", "male") ~ "Male",
Sex %in% c("F", "Female", "female") ~ "Female",
TRUE ~ NA_character_
),
age_group = case_when(
Age < 18 ~ "Child",
Age >= 18 & Age < 45 ~ "Adult",
Age >= 45 & Age < 65 ~ "Middle_aged",
Age >= 65 ~ "Elderly",
TRUE ~ NA_character_
),
age_group = factor(age_group,
levels = c("Child", "Adult", "Middle_aged", "Elderly")
)
)
# Frequency table for 'Sex'
data_clean %>%
count(Sex) %>%
mutate(percentage = n / sum(n) * 100)
## Sex n percentage
## 1 Female 139 57.20165
## 2 Male 104 42.79835
head(data_clean)
## Patient_ID Age Sex va_re va_le diagnosis
## 1 P001 45 Female 6/36 6/12 Binasal Pterygium
## 2 P002 70 Male 6/36 6/60 Le Senile Cataract
## 3 P003 54 Female 6/24 6/18 Presbyopia
## 4 P004 50 Female 6/36 6/36 Bilateral Elo
## 5 P005 72 Female 6/36 6/24 Le Senile Cataract
## 6 P006 50 Male 6/9 6/9 Presbyopia
## diagnosis_cat age_group
## 1 Conjunctival/surface disorders Middle_aged
## 2 Cataract-related Elderly
## 3 Refractive disorders Middle_aged
## 4 Cataract-related Middle_aged
## 5 Cataract-related Elderly
## 6 Refractive disorders Middle_aged
Working va_rank() function according to WHO classification of visual impairment
va_rank <- function(x) {
s <- toupper(trimws(as.character(x)))
s[s %in% c("", "NA", "NAN")] <- NA_character_
out <- rep(NA_integer_, length(s))
# WHO-style very poor vision
out[s %in% c("6/6")] <- 1L
out[s %in% c("6/9")] <- 2L
out[s %in% c("6/12")] <- 3L
out[s %in% c("6/18")] <- 4L # mild VI starts here
out[s %in% c("6/24")] <- 5L
out[s %in% c("6/36")] <- 6L
out[s %in% c("6/60")] <- 7L # moderate VI
out[s %in% c("3/60")] <- 8L # severe VI
out[s %in% c("1/60")] <- 9L # blindness
out[s %in% c("CF", "CF@2M", "CF @ 2M")] <- 9L # treat as <1/60
out[s %in% c("HM", "PL")] <- 10L # blindness
out[s %in% c("NPL", "NLP")] <- 11L # no perception
# Handle general 6/XX entries not explicitly covered
frac_idx <- grepl("^6/[0-9]+", s) & is.na(out)
dens <- suppressWarnings(as.integer(sub("^6/([0-9]+).*", "\\1", s[frac_idx])))
out[frac_idx] <- dplyr::case_when(
dens <= 6 ~ 1L,
dens <= 9 ~ 2L,
dens <= 12 ~ 3L,
dens <= 18 ~ 4L,
dens <= 24 ~ 5L,
dens <= 36 ~ 6L,
dens <= 60 ~ 7L,
dens <= 120 ~ 8L,
TRUE ~ 9L
)
return(out)
}
data_clean <- data_clean %>%
mutate(
# Rank each eye
va_re_rank = va_rank(va_re),
va_le_rank = va_rank(va_le),
# Better eye rank
better_eye_rank = pmin(va_re_rank, va_le_rank, na.rm = TRUE),
better_eye_rank = ifelse(is.infinite(better_eye_rank), NA, better_eye_rank),
# WHO-aligned VA categories
who_va_cat = case_when(
better_eye_rank %in% 1:2 ~ "Normal",
better_eye_rank == 3 ~ "Mild VI", # 6/12
better_eye_rank == 4 ~ "Moderate VI", # 6/18
better_eye_rank %in% 5:6 ~ "Severe VI", # 6/24–6/36
better_eye_rank >= 7 ~ "Blindness", # 6/60 and worse
TRUE ~ NA_character_
)
) %>%
mutate(
# Convert to ordered WHO factor
who_va_cat = factor(
who_va_cat,
levels = c("Normal", "Mild VI", "Moderate VI", "Severe VI", "Blindness"),
ordered = TRUE
)
)
high_risk_cats <- c(
"Cataract-related",
"Glaucoma-related",
"Corneal disorders",
"Trauma-related",
"Others"
)
data_clean <- data_clean %>%
mutate(
referral_needed = case_when(
# 1. High risk diagnostic categories
diagnosis_cat %in% high_risk_cats ~ 1L,
# 2. Moderate VI or worse in better eye
who_va_cat %in% c("Moderate VI", "Severe VI", "Blindness") ~ 1L,
# 3. Cannot assess VA properly
is.na(who_va_cat) ~ 1L,
# 4. Everyone else
TRUE ~ 0L
)
)
head(data_clean)
## Patient_ID Age Sex va_re va_le diagnosis
## 1 P001 45 Female 6/36 6/12 Binasal Pterygium
## 2 P002 70 Male 6/36 6/60 Le Senile Cataract
## 3 P003 54 Female 6/24 6/18 Presbyopia
## 4 P004 50 Female 6/36 6/36 Bilateral Elo
## 5 P005 72 Female 6/36 6/24 Le Senile Cataract
## 6 P006 50 Male 6/9 6/9 Presbyopia
## diagnosis_cat age_group va_re_rank va_le_rank
## 1 Conjunctival/surface disorders Middle_aged 6 3
## 2 Cataract-related Elderly 6 7
## 3 Refractive disorders Middle_aged 5 4
## 4 Cataract-related Middle_aged 6 6
## 5 Cataract-related Elderly 6 5
## 6 Refractive disorders Middle_aged 2 2
## better_eye_rank who_va_cat referral_needed
## 1 3 Mild VI 0
## 2 6 Severe VI 1
## 3 4 Moderate VI 1
## 4 6 Severe VI 1
## 5 5 Severe VI 1
## 6 2 Normal 0
write.csv(data_clean, "data_processed/akurba_with_referral.csv", row.names = FALSE)
ref_by_age <- data_clean %>%
group_by(age_group) %>%
summarise(
n = n(),
referrals = sum(referral_needed == 1, na.rm = TRUE),
referral_rate = referrals / n,
referral_rate_pct = percent(referral_rate, accuracy = 0.1)
) %>%
arrange(age_group)
ref_by_age
## # A tibble: 4 × 5
## age_group n referrals referral_rate referral_rate_pct
## <fct> <int> <int> <dbl> <chr>
## 1 Child 33 15 0.455 45.5%
## 2 Adult 87 21 0.241 24.1%
## 3 Middle_aged 88 56 0.636 63.6%
## 4 Elderly 35 32 0.914 91.4%
#Plot
ggplot(ref_by_age, aes(x = age_group, y = referral_rate)) +
geom_col() +
scale_y_continuous(labels = percent_format(accuracy = 1)) +
labs(
title = "Referral rate by age group",
x = "Age group",
y = "Referral rate"
)
Referral rates increased with age. Children had the lowest referral proportion, while elderly patients had the highest. This is consistent with the expected burden of cataract and other age related ocular disease in older age groups
ref_by_diag <- data_clean %>%
group_by(diagnosis_cat) %>%
summarise(
n = n(),
referrals = sum(referral_needed == 1, na.rm = TRUE),
referral_rate = referrals / n,
referral_rate_pct = percent(referral_rate, accuracy = 0.1)
) %>%
arrange(desc(referral_rate))
ref_by_diag
## # A tibble: 7 × 5
## diagnosis_cat n referrals referral_rate referral_rate_pct
## <chr> <int> <int> <dbl> <chr>
## 1 Cataract-related 72 72 1 100.0%
## 2 Corneal disorders 3 3 1 100.0%
## 3 Glaucoma-related 9 9 1 100.0%
## 4 Others 16 16 1 100.0%
## 5 Trauma-related 1 1 1 100.0%
## 6 Conjunctival/surface disorders 71 13 0.183 18.3%
## 7 Refractive disorders 71 10 0.141 14.1%
ggplot(ref_by_diag,
aes(x = reorder(diagnosis_cat, referral_rate),
y = referral_rate)) +
geom_col() +
coord_flip() +
scale_y_continuous(labels = percent_format(accuracy = 1)) +
labs(
title = "Referral rate by diagnostic category",
x = "Diagnostic category",
y = "Referral rate"
)
Cataract-related and glaucoma-related diagnoses had the highest referral proportions, as expected for conditions requiring specialist evaluation and possible surgery. Conditions grouped as ‘Refractive error’ or ‘Allergic conjunctivitis’ showed lower referral rates, reflecting management at the outreach level
data_clean <- data_clean %>%
mutate(
who_va_cat = factor(
who_va_cat,
levels = c("Normal", "Mild VI", "Moderate VI", "Severe VI", "Blindness")
)
)
who_va_dist <- data_clean %>%
count(who_va_cat) %>%
mutate(
prop = n / sum(n),
prop_pct = percent(prop, accuracy = 0.1)
) %>%
arrange(who_va_cat)
who_va_dist
## who_va_cat n prop prop_pct
## 1 Normal 137 0.56378601 56.4%
## 2 Mild VI 26 0.10699588 10.7%
## 3 Moderate VI 12 0.04938272 4.9%
## 4 Severe VI 32 0.13168724 13.2%
## 5 Blindness 20 0.08230453 8.2%
## 6 <NA> 16 0.06584362 6.6%
ggplot(data_clean, aes(x = who_va_cat)) +
geom_bar() +
labs(
title = "Distribution of better eye visual acuity category",
x = "Better eye category",
y = "Number of patients"
)
Most patients presented with [Normal_mild / Moderate / Severe] visual acuity in the better eye, with [X%] having severe or very poor vision. This highlights the proportion of attendees with advanced visual impairment at first contact.