01_data_clean

# Install packages if not already installed (optional)
# install.packages(c('tidyverse', 'janitor', 'skimr', 'here', 'naniar', 'scales'))

Load packages

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.2.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(janitor)

## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

library(skimr)
library(here)

## here() starts at C:/Users/Micah/Desktop/R_portfolio_project/ophthal_analytics

library(naniar)

## 
## Attaching package: 'naniar'
## 
## The following object is masked from 'package:skimr':
## 
##     n_complete

library(scales)

## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor

Read our data

data_raw <- read.csv("data_raw/akurba_cleaned_anonymised (1).csv")

Let’s take a peek at our dataset

glimpse(data_raw)

## Rows: 243
## Columns: 7
## $ Patient_ID            <chr> "P001", "P002", "P003", "P004", "P005", "P006", …
## $ Age                   <int> 45, 70, 54, 50, 72, 50, 53, 30, 47, 60, 37, 4, 4…
## $ Sex                   <chr> "Female", "Male", "Female", "Female", "Female", …
## $ VA.RE.                <chr> "6/36", "6/36", "6/24", "6/36", "6/36", "6/9", "…
## $ VA.LE.                <chr> "6/12", "6/60", "6/18", "6/36", "6/24", "6/9", "…
## $ Provisional.Diagnosis <chr> "Binasal Pterygium", "Le Senile Cataract", "Pres…
## $ Diagnostic.Category   <chr> "Conjunctival/surface disorders", "Cataract-rela…

View of our columns

names(data_raw)

## [1] "Patient_ID"            "Age"                   "Sex"                  
## [4] "VA.RE."                "VA.LE."                "Provisional.Diagnosis"
## [7] "Diagnostic.Category"

Basic data quality checks

Content:

Dimensions: number of rows and columns.

Missingness by variable.

Duplicates on patient_id if present.

Simple range checks: age, VA.

dim(data_raw)

## [1] 243   7

skim(data_raw)

Data summary
Name	data_raw
Number of rows	243
Number of columns	7
_______________________
Column type frequency:
character	6
numeric	1
________________________
Group variables	None

Variable type: character

skim_variable	complete_rate	min	max	empty	n_unique
Patient_ID	1	4	4	0	243
Sex	1	4	6	0	2
VA.RE.	1	0	13	10	20
VA.LE.	1	0	13	11	21
Provisional.Diagnosis	1	1	33	0	55
Diagnostic.Category	1	6	30	0	7

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
Age	0	1	42.32	19.33	0	30	45	56	87	▃▃▇▅▂

vis_miss(data_raw)

Check age range:

data_raw %>% summarise(min_age = min(Age, na.rm = TRUE),
                         max_age = max(Age, na.rm = TRUE))

##   min_age max_age
## 1       0      87

Key variables

data_clean <- data_raw %>%
  rename(va_re = VA.RE., va_le = VA.LE., diagnosis=Provisional.Diagnosis, diagnosis_cat=Diagnostic.Category) %>%
  mutate(
    Sex = case_when(
      Sex %in% c("M", "Male", "male") ~ "Male",
      Sex %in% c("F", "Female", "female") ~ "Female",
      TRUE ~ NA_character_
    ),
    age_group = case_when(
      Age < 18 ~ "Child",
      Age >= 18 & Age < 45 ~ "Adult",
      Age >= 45 & Age < 65 ~ "Middle_aged",
      Age >= 65 ~ "Elderly",
      TRUE ~ NA_character_
    ),
    age_group = factor(age_group,
      levels = c("Child", "Adult", "Middle_aged", "Elderly")
    )
  )

Sex frequency table

# Frequency table for 'Sex'
data_clean %>%
  count(Sex) %>%
  mutate(percentage = n / sum(n) * 100)

##      Sex   n percentage
## 1 Female 139   57.20165
## 2   Male 104   42.79835

View of cleaned columns

head(data_clean)

##   Patient_ID Age    Sex va_re va_le          diagnosis
## 1       P001  45 Female  6/36  6/12  Binasal Pterygium
## 2       P002  70   Male  6/36  6/60 Le Senile Cataract
## 3       P003  54 Female  6/24  6/18         Presbyopia
## 4       P004  50 Female  6/36  6/36      Bilateral Elo
## 5       P005  72 Female  6/36  6/24 Le Senile Cataract
## 6       P006  50   Male   6/9   6/9         Presbyopia
##                    diagnosis_cat   age_group
## 1 Conjunctival/surface disorders Middle_aged
## 2               Cataract-related     Elderly
## 3           Refractive disorders Middle_aged
## 4               Cataract-related Middle_aged
## 5               Cataract-related     Elderly
## 6           Refractive disorders Middle_aged

Visual acuity feature engineering

Working va_rank() function according to WHO classification of visual impairment

va_rank <- function(x) {
  s <- toupper(trimws(as.character(x)))
  s[s %in% c("", "NA", "NAN")] <- NA_character_

  out <- rep(NA_integer_, length(s))

  # WHO-style very poor vision
  out[s %in% c("6/6")]   <- 1L
  out[s %in% c("6/9")]   <- 2L
  out[s %in% c("6/12")]  <- 3L
  out[s %in% c("6/18")]  <- 4L  # mild VI starts here
  out[s %in% c("6/24")]  <- 5L
  out[s %in% c("6/36")]  <- 6L
  out[s %in% c("6/60")]  <- 7L  # moderate VI
  out[s %in% c("3/60")]  <- 8L  # severe VI
  out[s %in% c("1/60")]  <- 9L  # blindness
  out[s %in% c("CF", "CF@2M", "CF @ 2M")] <- 9L  # treat as <1/60
  out[s %in% c("HM", "PL")] <- 10L  # blindness
  out[s %in% c("NPL", "NLP")] <- 11L  # no perception

  # Handle general 6/XX entries not explicitly covered
  frac_idx <- grepl("^6/[0-9]+", s) & is.na(out)
  dens <- suppressWarnings(as.integer(sub("^6/([0-9]+).*", "\\1", s[frac_idx])))

  out[frac_idx] <- dplyr::case_when(
    dens <= 6  ~ 1L,
    dens <= 9  ~ 2L,
    dens <= 12 ~ 3L,
    dens <= 18 ~ 4L,
    dens <= 24 ~ 5L,
    dens <= 36 ~ 6L,
    dens <= 60 ~ 7L,
    dens <= 120 ~ 8L,
    TRUE        ~ 9L
  )

  return(out)
}

Cleaning VA according to WHO Classification of blindness

data_clean <- data_clean %>%
  mutate(
    # Rank each eye
    va_re_rank = va_rank(va_re),
    va_le_rank = va_rank(va_le),

    # Better eye rank
    better_eye_rank = pmin(va_re_rank, va_le_rank, na.rm = TRUE),
    better_eye_rank = ifelse(is.infinite(better_eye_rank), NA, better_eye_rank),

    # WHO-aligned VA categories
    who_va_cat = case_when(
      better_eye_rank %in% 1:2 ~ "Normal",
      better_eye_rank == 3     ~ "Mild VI",       # 6/12
      better_eye_rank == 4     ~ "Moderate VI",   # 6/18
      better_eye_rank %in% 5:6 ~ "Severe VI",     # 6/24–6/36
      better_eye_rank >= 7     ~ "Blindness",     # 6/60 and worse
      TRUE ~ NA_character_
    )
  ) %>%
  mutate(
    # Convert to ordered WHO factor
    who_va_cat = factor(
      who_va_cat,
      levels = c("Normal", "Mild VI", "Moderate VI", "Severe VI", "Blindness"),
      ordered = TRUE
    )
  )

Categorization of visual impairment

high_risk_cats <- c(
  "Cataract-related",
  "Glaucoma-related",
  "Corneal disorders",
  "Trauma-related",
  "Others"
)

data_clean <- data_clean %>%
  mutate(
    referral_needed = case_when(
      # 1. High risk diagnostic categories
      diagnosis_cat %in% high_risk_cats ~ 1L,

      # 2. Moderate VI or worse in better eye
      who_va_cat %in% c("Moderate VI", "Severe VI", "Blindness") ~ 1L,

      # 3. Cannot assess VA properly
      is.na(who_va_cat) ~ 1L,

      # 4. Everyone else
      TRUE ~ 0L
    )
  )

View of cleaned VA

head(data_clean)

##   Patient_ID Age    Sex va_re va_le          diagnosis
## 1       P001  45 Female  6/36  6/12  Binasal Pterygium
## 2       P002  70   Male  6/36  6/60 Le Senile Cataract
## 3       P003  54 Female  6/24  6/18         Presbyopia
## 4       P004  50 Female  6/36  6/36      Bilateral Elo
## 5       P005  72 Female  6/36  6/24 Le Senile Cataract
## 6       P006  50   Male   6/9   6/9         Presbyopia
##                    diagnosis_cat   age_group va_re_rank va_le_rank
## 1 Conjunctival/surface disorders Middle_aged          6          3
## 2               Cataract-related     Elderly          6          7
## 3           Refractive disorders Middle_aged          5          4
## 4               Cataract-related Middle_aged          6          6
## 5               Cataract-related     Elderly          6          5
## 6           Refractive disorders Middle_aged          2          2
##   better_eye_rank  who_va_cat referral_needed
## 1               3     Mild VI               0
## 2               6   Severe VI               1
## 3               4 Moderate VI               1
## 4               6   Severe VI               1
## 5               5   Severe VI               1
## 6               2      Normal               0

Cleaned dataset Export

write.csv(data_clean, "data_processed/akurba_with_referral.csv", row.names = FALSE)

Summary tables

Referral rate by age_group

ref_by_age <- data_clean %>%
  group_by(age_group) %>%
  summarise(
    n = n(),
    referrals = sum(referral_needed == 1, na.rm = TRUE),
    referral_rate = referrals / n,
    referral_rate_pct = percent(referral_rate, accuracy = 0.1)
  ) %>%
  arrange(age_group)

ref_by_age

## # A tibble: 4 × 5
##   age_group       n referrals referral_rate referral_rate_pct
##   <fct>       <int>     <int>         <dbl> <chr>            
## 1 Child          33        15         0.455 45.5%            
## 2 Adult          87        21         0.241 24.1%            
## 3 Middle_aged    88        56         0.636 63.6%            
## 4 Elderly        35        32         0.914 91.4%

#Plot
ggplot(ref_by_age, aes(x = age_group, y = referral_rate)) +
  geom_col() +
  scale_y_continuous(labels = percent_format(accuracy = 1)) +
  labs(
    title = "Referral rate by age group",
    x = "Age group",
    y = "Referral rate"
  )

Referral rates increased with age. Children had the lowest referral proportion, while elderly patients had the highest. This is consistent with the expected burden of cataract and other age related ocular disease in older age groups

Referral rate by Diagnostic.Category

ref_by_diag <- data_clean %>%
  group_by(diagnosis_cat) %>%
  summarise(
    n = n(),
    referrals = sum(referral_needed == 1, na.rm = TRUE),
    referral_rate = referrals / n,
    referral_rate_pct = percent(referral_rate, accuracy = 0.1)
  ) %>%
  arrange(desc(referral_rate))

ref_by_diag

## # A tibble: 7 × 5
##   diagnosis_cat                      n referrals referral_rate referral_rate_pct
##   <chr>                          <int>     <int>         <dbl> <chr>            
## 1 Cataract-related                  72        72         1     100.0%           
## 2 Corneal disorders                  3         3         1     100.0%           
## 3 Glaucoma-related                   9         9         1     100.0%           
## 4 Others                            16        16         1     100.0%           
## 5 Trauma-related                     1         1         1     100.0%           
## 6 Conjunctival/surface disorders    71        13         0.183 18.3%            
## 7 Refractive disorders              71        10         0.141 14.1%

ggplot(ref_by_diag,
       aes(x = reorder(diagnosis_cat, referral_rate),
           y = referral_rate)) +
  geom_col() +
  coord_flip() +
  scale_y_continuous(labels = percent_format(accuracy = 1)) +
  labs(
    title = "Referral rate by diagnostic category",
    x = "Diagnostic category",
    y = "Referral rate"
  )

Cataract-related and glaucoma-related diagnoses had the highest referral proportions, as expected for conditions requiring specialist evaluation and possible surgery. Conditions grouped as ‘Refractive error’ or ‘Allergic conjunctivitis’ showed lower referral rates, reflecting management at the outreach level

Distribution of better_eye_cat

data_clean <- data_clean %>%
  mutate(
    who_va_cat = factor(
      who_va_cat,
      levels = c("Normal", "Mild VI", "Moderate VI", "Severe VI", "Blindness")
    )
  )

who_va_dist <- data_clean %>%
  count(who_va_cat) %>%
  mutate(
    prop = n / sum(n),
    prop_pct = percent(prop, accuracy = 0.1)
  ) %>%
  arrange(who_va_cat)

who_va_dist

##    who_va_cat   n       prop prop_pct
## 1      Normal 137 0.56378601    56.4%
## 2     Mild VI  26 0.10699588    10.7%
## 3 Moderate VI  12 0.04938272     4.9%
## 4   Severe VI  32 0.13168724    13.2%
## 5   Blindness  20 0.08230453     8.2%
## 6        <NA>  16 0.06584362     6.6%

ggplot(data_clean, aes(x = who_va_cat)) +
  geom_bar() +
  labs(
    title = "Distribution of better eye visual acuity category",
    x = "Better eye category",
    y = "Number of patients"
  )

Most patients presented with [Normal_mild / Moderate / Severe] visual acuity in the better eye, with [X%] having severe or very poor vision. This highlights the proportion of attendees with advanced visual impairment at first contact.

01_data_clean_qc(WHO)