#####Balancing Altruism, Risk and Ethics: Vietnamese Students’ views on Human Challenge Studies.####

---
title: "R Script for manuscript Balancing Altruism, Risk and Ethics: Vietnamese Students’ views on Human Challenge Studies."
output:
  word_document: default
  html_document: default
date: "2026-03-17"
---
  
#load libraries
library(tidyverse)
library(janitor)
library(stringr)
library(readr)
library(stringi)
library(readxl)
library(openxlsx)
library(dplyr)
library(ggplot2)
library(tidyr)
library(knitr)
library(RColorBrewer)
library(scales)
library(stringr)
library(MASS)
library(purrr)
library(brant)
library(ordinal)
library(psych)
library(tibble)
library(summarytools)
library(gtsummary)

#################################################################
## 1. LOAD THE DATASET
#################################################################
cs_t3_raw <- readxl::read_excel("Comprehensive_survey_data_3_raw.xlsx")

cs_t3_working <- cs_t3_raw %>%
  rename_with(
    ~ str_replace_all(.x, " ", "."))

names(cs_t3_working)


## 2. DESCRIPTIVE ANALYSIS OF DEMOGRAPHIC INFORMATION
#################################################################

### Demographic information of students answering surveys

demo <- cs_t3_working %>%
  transmute(
    Sex = factor(Sex),
    Age = Age.at.enrollment,
    Class.school.year = factor(Class.school.year),
    Department = factor(Department),
    Faculty = factor(Faculty),
    Ethnicity = factor(Ethnicity),
    Self_report.SES = factor(Self_report.SES)
  )

tbl_demo <- demo %>%
  tbl_summary(
    type = list(
      everything() ~ "categorical",
      Age ~ "continuous"
    ),
    statistic = list(
      Age ~ "{median} ({p25}, {p75})",
      all_categorical() ~ "{n} ({p}%)"
    ),
    digits = list(
      Age ~ 1,
      all_categorical() ~ c(0, 1)
    ),
    missing = "no"
  ) %>%
  bold_labels()

tbl_demo

### Demographic information of students participating in FGDs and IDIs
fgd_idi_raw <- readxl::read_excel("fgd_idi_raw.xlsx")

fgd_idi <- fgd_idi_raw %>%
  transmute(
    Sex = factor(Sex),
    Age = age_at_enrollment,
    Class_school_year = factor(Class_school_year),
    Department = factor(department),
    Faculty = factor(faculty),
    Ethnicity = factor(ethnicity),
    Self_report_SES = factor(Self_reported_ses)
  )

tbl_fdg_idi <- fgd_idi %>%
  tbl_summary(
    type = list(
      everything() ~ "categorical",
      Age ~ "continuous"
    ),
    statistic = list(
      Age ~ "{median} ({p25}, {p75})",
      all_categorical() ~ "{n} ({p}%)"
    ),
    digits = list(
      Age ~ 1,
      all_categorical() ~ c(0, 1)
    ),
    missing = "no"
  ) %>%
  bold_labels()

tbl_fdg_idi


## 3. RECODE THE DATA SET 
#################################################################

### Five-level Likert Responses  
response_mapping_5 <- c("strongly_disagree" = 1, "very_poor" = 1, "very_negative" = 1, 
                      "disagree" = 2, "poor" = 2, "somewhat_negative" = 2, 
                      "neutral" = 3, "average" = 3,
                      "agree" = 4, "good" = 4, "somewhat_positive" = 4,
                      "strongly_agree" = 5, "excellent" = 5, "very_positive" = 5, 
                      "no" = 0, "yes" = 1)

cs_t3_working_5 <- cs_t3_working %>%
    mutate(across(matches("^(q8_|q9_|q15_|q16_|q17_|q18_|q24_|q25_|q26_|q27_|q33|q34_)"), 
          ~ recode(.x, !!!response_mapping_5, .default = NA_real_)))

likert_5 <- cs_t3_working_5 %>% 
  dplyr::select(No,
                starts_with(c("q8_", "q17_", "q26_", "q16_","q25_",
                              "q34_","q9_", "q18_","q27_")))

#### Descriptive analysis of 5-level Likert responses ####
likert_5_long <- likert_5 %>% 
  pivot_longer(
    cols = -No,
    names_to = "composite", 
    values_to = "response"
  ) %>% 
  separate(composite, 
           into = c("question", "study_type", "section"), 
           sep = "_",  # Split 1st & 2nd _ only
           fill = "right",
           extra = "merge"  # Keeps remaining underscores together
  ) %>% 
  dplyr::select(No, section, question, study_type, response) %>% 
  mutate(
    section = factor(section), 
    question = factor(question), 
    study_type = factor(study_type, levels = c("HCS", "CT", "Obs")), 
    response = factor(response, levels = c(1, 2, 3, 4, 5), 
                      labels = c("Strongly disagree", "Disagree","Neutral", "Agree", "Strongly agree"), ordered = T)
  )

likert_5_summary <- likert_5_long %>%
  group_by(study_type, question, section, response) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(study_type, question, section) %>%
  mutate(
    pct = round(100 * n / sum(n), 1)
  ) %>%
  dplyr::select(question, section, study_type, response, n, pct)

likert_5_summary_wide <- likert_5_summary %>%
  filter(!is.na(response)) %>% 
  mutate(response = factor(response, levels = c(
    "Strongly disagree", "Disagree", "Neutral", "Agree", "Strongly agree"
  ))) %>%
  arrange(response) %>%
  pivot_wider(
    names_from = response,
    values_from = c(n,pct)
  )

##### Save to excel file ##### 
write.xlsx(likert_5_summary_wide, "likert_5_summary_wide.xlsx")

### Converting 5-level Likert scale to three-level Likert scale 
response_mapping_3 <- c("strongly_disagree" = 1, "very_poor" = 1, "very_negative" = 1, 
                        "disagree" = 1, "poor" = 1, "somewhat_negative" = 1, 
                        "neutral" = 2, "average" = 2,
                        "agree" = 3, "good" = 3, "somewhat_positive" = 3,
                        "strongly_agree" = 3, "excellent" = 3, "very_positive" = 3, 
                        "no" = 0, "yes" = 1)


cs_t3_working_3 <- cs_t3_working %>%
  mutate(across(matches("^(q8_|q9_|q15_|q16_|q17_|q18_|q24_|q25_|q26_|q27_|q33_|q34_)"), 
                ~ recode(.x, !!!response_mapping_3, .default = NA_real_)))


likert_3 <- cs_t3_working_3 %>% 
  dplyr::select(No, 
                starts_with(c("q8_", "q17_", "q26_", "q16_","q25_", "q34_",
                              "q9_", "q18_","q27_"))) 

#### Descriptive analysis of 3-level Likert responses ####
likert_3_long <- likert_3 %>% 
  pivot_longer(
    cols = -No,
    names_to = "composite", 
    values_to = "response"
  ) %>% 
  separate(composite, 
           into = c("question", "study_type", "section"), 
           sep = "_",  # Split 1st & 2nd _ only
           fill = "right",
           extra = "merge"  # Keeps remaining underscores together
  ) %>% 
  dplyr::select(No, section, question, study_type, response) %>% 
  mutate(
    section = factor(section), 
    question = factor(question), 
    study_type = factor(study_type, levels = c("HCS", "CT", "Obs")), 
    response = factor(response, levels = c(1, 2, 3), labels = c("Disagree", "Neutral", "Agree"), ordered = T)
  )

likert_3_summary <- likert_3_long %>%
  group_by(study_type, question, section, response) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(study_type, question, section) %>%
  mutate(
    pct = round(100 * n / sum(n), 1)
  ) %>%
  dplyr::select(question, section, study_type, response, n, pct)

likert_3_summary_wide <- likert_3_summary %>%
  dplyr::select(-n) %>%
  filter(!is.na(response)) %>% 
  pivot_wider(
    names_from = response,
    values_from = pct
  ) 
 
###save
write.xlsx(
  x = list(
    Summary       = likert_3_summary,
    Summary_Wide  = likert_3_summary_wide
  ),
  file = "likert_3_summary_combined.xlsx"
)

### 4. MISSING DATA ###
#################################################################

missing_data_question <- likert_3_long %>% 
  group_by(question) %>% 
  summarise(
    total = n(), 
    missing = sum(is.na(response)), 
    percent_missing = 100 *missing / total
  ) %>% 
  arrange(desc(percent_missing))

missing_data_studytype <- likert_3_long %>% 
  group_by(study_type) %>% 
  summarise(
    total = n(), 
    missing = sum(is.na(response)), 
    percent_missing = 100 *missing / total
  ) %>% 
  arrange(desc(percent_missing))

missing_data_question_studytype <- likert_3_long %>% 
  group_by(question, study_type) %>% 
  summarise(
    total = n(), 
    missing = sum(is.na(response)), 
    non_missing = sum(!is.na(response)),
    percent_missing = 100 *missing / total,
    .groups = "drop"
  ) %>% 
  arrange(desc(percent_missing))

### 5. COMPARISON - ORDINAL REGRESSION
#################################################################

#storage function 
store_clmm_results <- function(model, label) {
  # fixed effects
  fixef_df <- summary(model)$coefficients
  
  fixef_table <- as.data.frame(fixef_df) %>%
    tibble::rownames_to_column("term") %>%
    filter(!grepl("\\|", term)) %>%  
    mutate(
      OR = exp(Estimate),
      lower_CI = exp(Estimate - 1.96 * `Std. Error`),
      upper_CI = exp(Estimate + 1.96 * `Std. Error`),
      p_value = `Pr(>|z|)`,
      sig = ifelse(p_value < 0.05, "sig", "ns"),
      model_label = label
    ) %>%
    dplyr::select(model_label, term, Estimate, OR, lower_CI, upper_CI, p_value, sig) 
  
  # random effects
  ranef_summary <- VarCorr(model)
  
  if ("No" %in% names(ranef_summary)) {
    re_sd <- attr(ranef_summary$No, "stddev")
    re_var <- re_sd^2
    
    random_effects_table <- tibble(
      model_label = label,
      term = "random_effect",
      Estimate = NA,
      OR = NA,
      lower_CI = NA,
      upper_CI = NA,
      p_value = NA,
      sig = NA,
      variance = round(re_var, 4),
      sd = round(re_sd, 4)
    )
    
    # Bind fixed and random together
    final_table <- bind_rows(fixef_table, random_effects_table)
    
  } else {
    final_table <- fixef_table
  }
  
  return(final_table)
}

###loop - 5 level likert scale
questions_5 <- likert_5_long %>%
  distinct(section) %>%
  pull(section) %>%
  as.character() 

results_likert_5 <- tibble()

for (q in questions_5) {
  df_q <- likert_5_long %>%
    mutate(study_type = relevel(factor(study_type), ref = "HCS")) %>% 
    filter(section == q)
  model <- clmm(response ~ study_type + (1 | No), data = df_q)
  
  results_likert_5 <- bind_rows(results_likert_5, store_clmm_results(model, q))
}

results_likert_5 <- results_likert_5 %>%
  mutate(
    p_value_adj = ifelse(!is.na(p_value), p.adjust(p_value, method = "fdr"), NA),
    sig_fdr = case_when(
      is.na(p_value_adj) ~ NA_character_,
      p_value_adj < 0.05 ~ "sig",
      TRUE ~ "ns"
    ), 
    p_value_adj = round(p_value_adj, 4)
  )
#save results
write.xlsx(results_likert_5, "results_likert_5.xlsx")

###loop - 3 level Likert scale
questions_3 <- likert_3_long %>%
  distinct(section) %>%
  pull(section) %>%
  as.character() 

results_likert_3 <- tibble()

for (q in questions_3) {
  df_q <- likert_3_long %>%
    mutate(study_type = relevel(factor(study_type), ref = "HCS")) %>% 
    filter(section == q)
  model <- clmm(response ~ study_type + (1 | No), data = df_q)
  
  results_likert_3 <- bind_rows(results_likert_3, store_clmm_results(model, q))
}

results_likert_3 <- results_likert_3 %>%
  mutate(
    p_value_adj = ifelse(!is.na(p_value), p.adjust(p_value, method = "fdr"), NA),
    sig_fdr = case_when(
      is.na(p_value_adj) ~ NA_character_,
      p_value_adj < 0.05 ~ "sig",
      TRUE ~ "ns"
    ), 
    p_value_adj = round(p_value_adj, 4)
  )

#save results
write.csv(results_likert_3, "results_likert_3.csv", row.names = FALSE)

### 6. FIGURES
################################################################################

likert_colors <- c(
  #"Disagree" = "#ca0020",  # strong red
  "Disagree"           = "#f4a582",  # light red
  "Neutral"            = "#eeeeee",  # neutral grey
  "Agree"              = "#92c5de"  # light blue
  #  "Agree"     = "#0571b0"   # strong blue
)


###FIGURE 3. Students’ self-reported knowledge and overall views towards human challenge studies, clinical trials and observational studies. 
################################################################################

likert_3_view <- likert_3_summary_wide %>% 
  filter(question %in% c("q8", "q17", "q26", "q16","q25", "q34"))
section_order <- c( "self_reported_understanding", "over_views")

view_long <- likert_3_view %>%
    mutate(
    Neutral_neg = Neutral / 2,
    Neutral_pos = Neutral / 2
  ) %>%
  pivot_longer(cols = c("Disagree", "Neutral_neg", "Neutral_pos", "Agree"),
               names_to = "response", values_to = "percent") %>%
  mutate(
    response = factor(response, levels = c("Disagree", "Neutral_neg", "Neutral_pos", "Agree")),
    signed_percent = case_when(
      response == "Disagree"     ~ -percent,
      response == "Neutral_neg"  ~ -percent,
      TRUE                       ~ percent
    ),
    section = factor(section, levels = section_order),
    study_type = factor(study_type, levels = c("HCS", "CT", "Obs")),
    response_label = str_replace(response, "_neg|_pos", ""), 
    study_type = fct_recode(study_type)
  )

sig_lines <- results_likert_3 %>%
  filter(term %in% c("study_typeCTs", "study_typeObs"),
         model_label %in% section_order) %>%
  mutate(
    group1 = "HCS",
    group2 = ifelse(term == "study_typeCTs", "CT", "Obs"),
    y_pos = 110,  # adjust based on max height
    label = case_when(
      p_value_adj < 0.001 ~ "***",
      p_value_adj < 0.01  ~ "**",
      p_value_adj < 0.05  ~ "*",
      TRUE                ~ "ns"
    ), 
    y_pos = case_when(
      model_label == "over_views" & group2 == "CT"  ~ 40,
      model_label == "over_views" & group2 == "Obs" ~ 100,
      model_label == "self_reported_understanding" & group2 == "CT"  ~ 40,
      model_label == "self_reported_understanding" & group2 == "Obs" ~ 100,
      TRUE ~ y_pos  
    )) %>%  
  rename(section = model_label) %>% 
  mutate(section = factor(section, levels = section_order))


section_labels <- c(
  over_views          = "Overall views",
  self_reported_understanding            = "Self-reported level of understanding"
)

ggplot(view_long, aes(x = study_type, y = signed_percent, fill = response_label)) +
  geom_bar(stat = "identity", width = 0.7) +
  scale_fill_manual(
    name = "Response (Understanding/Views)", 
    breaks = c("Agree","Neutral","Disagree"),
    labels = c("Good/Positive", "Average/Neutral", "Poor/Negative"),
    values = c(Agree = "#92c5de", Neutral = "#eeeeee", Disagree = "#f4a582")) +
  facet_wrap(~ section, labeller = labeller(section = section_labels)) +
  geom_hline(yintercept = 0, color = "black") +
  scale_y_continuous(breaks = c(-50, -25, 25, 50,75, 100), limits = c(-55, 105)) +
  labs(x = "", y = "Proportion of students (%)") +
  theme_minimal(base_size = 12) +
  theme(text = element_text(family = "Helvetica"),
        axis.text.x = element_text(angle = 0, hjust = 0.5, face = "bold", family = "Helvetica", size = 11),
        #panel.grid.major.y = element_line(color = "grey85", linetype = "solid", size = 0.1),
        #  panel.grid.minor.y = element_blank(),
        # panel.grid.major.x = element_blank(), 
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.title.x = element_blank(), 
        legend.title = element_text(size = 10),
        plot.margin = margin(10, 10, 20, 10)) +
  
  # Significance lines
  geom_segment(data = sig_lines,
               aes(x = group1, xend = group2, y = y_pos, yend = y_pos),
               inherit.aes = FALSE, linewidth = 0.4) +
  geom_segment(data = sig_lines,
               aes(x = group1, xend = group1, y = y_pos, yend = y_pos - 2),
               inherit.aes = FALSE,
               linewidth = 0.4
  ) +
  
  geom_segment(data = sig_lines,
               aes(x = group2, xend = group2, y = y_pos, yend = y_pos - 2),
               inherit.aes = FALSE,
               linewidth = 0.4
  ) +
  
  geom_text(data = sig_lines,
            aes(
              x = ifelse(group2 == "CT", 1.5, 2), 
              y = y_pos + 5, 
              label = label),
            inherit.aes = FALSE,
            size = 4) 

ggsave("Fig_3.png", 
       width = 10, 
       height = 6, 
       dpi = 300)
ggsave("Fig_3.tiff", 
       width = 10, 
       height = 6, 
       dpi = 300)


### FIGURE4: Students’ views on overall acceptability of human challenge studies, clinical trials and observational studies.
################################################################################

likert_3_attitude <- likert_3_summary_wide %>% 
  filter(section %in% c("attitude_asking.parents.permission",
                        "attitude_being.happy.to.participate",
                        "attitude_being.important.to.society",
                        "attitude_ensuring.participants.understanding",
                        "attitude_offering.incentives",
                        "attitude_safe.to.participate"))


attitude_order <- c( "attitude_being.happy.to.participate",
                     "attitude_asking.parents.permission",
                     "attitude_safe.to.participate",
                     "attitude_being.important.to.society",
                     "attitude_ensuring.participants.understanding",
                     "attitude_offering.incentives"
                     )

attitudes_long <- likert_3_attitude %>%
  mutate(
    Neutral_neg = Neutral / 2,
    Neutral_pos = Neutral / 2
  ) %>%
  pivot_longer(cols = c("Disagree", "Neutral_neg", "Neutral_pos", "Agree"),
               names_to = "response", values_to = "percent") %>%
  mutate(
    response = factor(response, levels = c("Disagree", "Neutral_neg", "Neutral_pos", "Agree")),
    signed_percent = case_when(
      response == "Disagree"     ~ -percent,
      response == "Neutral_neg"  ~ -percent,
      TRUE                       ~ percent
    ),
    section = factor(section, levels = attitude_order),
    study_type = factor(study_type, levels = c("HCS", "CT", "Obs")),
    response_label = str_replace(response, "_neg|_pos", ""), 
    study_type = fct_recode(study_type)
  )

sig_lines <- results_likert_3 %>%
  filter(term %in% c("study_typeCT", "study_typeObs"),
         model_label %in% attitude_order) %>%
  mutate(
    group1 = "HCS",
    group2 = ifelse(term == "study_typeCT", "CT", "Obs"),
    y_pos = 110,  # adjust based on max height
    label = case_when(
      p_value_adj < 0.001 ~ "***",
      p_value_adj < 0.01  ~ "**",
      p_value_adj < 0.05  ~ "*",
      TRUE                ~ "ns"
    ), 
    y_pos = case_when(
      model_label == "attitude_being.important.to.society" & group2 == "CT"  ~ 40,
      model_label == "attitude_being.important.to.society" & group2 == "Obs" ~ 100,
      model_label == "attitude_safe.to.participate" & group2 == "CT"  ~ 40,
      model_label == "attitude_safe.to.participate" & group2 == "Obs" ~ 100,
      model_label == "attitude_being.happy.to.participate" & group2 == "CT"    ~ 40,
      model_label == "attitude_being.happy.to.participate" & group2 == "Obs"   ~ 100,
      model_label == "attitude_asking.parents.permission" & group2 == "CT"    ~ 40,
      model_label == "attitude_asking.parents.permission" & group2 == "Obs"   ~ 100,
      model_label == "attitude_ensuring.participants.understanding" & group2 == "CT"    ~ 40,
      model_label == "attitude_ensuring.participants.understanding" & group2 == "Obs"   ~100,
      model_label == "attitude_offering.incentives" & group2 == "CT"    ~ 40,
      model_label == "attitude_offering.incentives" & group2 == "Obs"   ~ 100,
      TRUE ~ y_pos  
    )) %>%  
  rename(section = model_label) %>% 
  mutate(section = factor(section, levels = attitude_order))


attitude_labels <- c(
  attitude_being.happy.to.participate = "I would be happy to participate",
  attitude_asking.parents.permission = "I would ask my parents' permission\n before participating",
  attitude_safe.to.participate= "It is safe to participate",
  attitude_being.important.to.society = "It is important to society",
  attitude_ensuring.participants.understanding = "Researchers should ensure\n participants' understanding",
  attitude_offering.incentives = "Additional incentives should be offered"
)

fig4a <- ggplot(attitudes_long, aes(x = study_type, y = signed_percent, fill = response_label)) +
  geom_bar(stat = "identity", width = 0.7) +
  scale_fill_manual(values = likert_colors, name = "Response", breaks = c("Agree", "Neutral", "Disagree")) +
  facet_wrap(~ section, ncol = 3, labeller = labeller(section = attitude_labels)) +
  geom_hline(yintercept = 0, color = "black") +
  scale_y_continuous(breaks = c(-50, -25, 25, 50,75, 100), limits = c(-55, 105)) +
  labs(x = "", y = "% of students") +
  theme_minimal(base_size = 12) +
  theme(text = element_text(family = "Helvetica"),
        axis.text.x = element_text(angle = 0, hjust = 0.5, face = "bold", family = "Helvetica", size = 11),
        panel.grid.major.y = element_line(color = "grey85", linetype = "solid", linewidth = 0.1),
        panel.grid.minor.y = element_blank(),
        panel.grid.major.x = element_blank()) +
  # Significance lines
  geom_segment(data = sig_lines,
               aes(x = group1, xend = group2, y = y_pos, yend = y_pos),
               inherit.aes = FALSE, 
               linewidth = 0.4) +
  geom_segment(data = sig_lines,
               aes(x = group1, xend = group1, y = y_pos, yend = y_pos - 2),
               inherit.aes = FALSE,
               linewidth = 0.4) +
  geom_segment(data = sig_lines,
               aes(x = group2, xend = group2, y = y_pos, yend = y_pos - 2),
               inherit.aes = FALSE,
               linewidth = 0.4) +
  geom_text(data = sig_lines,
            aes(
              x = ifelse(group2 == "CT", 1.5, 2), 
              y = y_pos + 5, 
              label = label),
            inherit.aes = FALSE,
            size = 4)
# fig4a

ggsave("Fig4A.pdf", width = 10, height = 6, dpi = 300)
ggsave("Fig4A.png", width = 10, height = 6, dpi = 300)
ggsave("Fig4A.tiff", width = 10, height = 6, dpi = 300)

##### Acceptability score #####
#Adding test to delete question 4 and 7 in the "thought" section

likert_5_accept <- cs_t3_working_5 %>% 
  dplyr::select(No, Sex, Class.school.year, Faculty, Ethnicity, Self_report.SES,
                starts_with(c("q9_", "q18_","q27_"))) %>%
  dplyr::select(-c("q9_Obs_attitude_offering.incentives",
                   "q18_CT_attitude_offering.incentives",
                   "q27_HCS_attitude_offering.incentives",
                   "q9_Obs_attitude_asking.parents.permission",
                   "q18_CT_attitude_asking.parents.permission",
                   "q27_HCS_attitude_asking.parents.permission"))

## REFER THE FINAL SUBTOTAL SCORE TO POMP 0–100  
# https://pmc.ncbi.nlm.nih.gov/articles/PMC9634523/
#https://www.tandfonline.com/doi/abs/10.1207/S15327906MBR3403_2
#The POMP scores are defined as follows: POMP = ((observed score – minimum possible) / (maximum possible – minimum possible)) × 100. 
#This represents the percentage of a measure’s total that a specific score represents
#################################################################

pomp_thought <- function(x) 100*(x-1)/4
likert_5_accept$thought_OBS_100 <- pomp_thought(rowMeans(likert_5_accept[, grepl("^q9_", names(likert_5_accept))], na.rm = TRUE))
likert_5_accept$thought_CT_100 <- pomp_thought(rowMeans(likert_5_accept[, grepl("^q18_", names(likert_5_accept))], na.rm = TRUE))
likert_5_accept$thought_HCS_100 <- pomp_thought(rowMeans(likert_5_accept[, grepl("^q27_", names(likert_5_accept))], na.rm = TRUE))

## CALCULATE THE SUBTOTAL SCORE OF THOUGHT SECTION AFTER REFERING TO POMP 100
vars_thought_100 <- c("thought_OBS_100", "thought_CT_100", "thought_HCS_100")
summary(likert_5_accept[, vars_thought_100])
sapply(likert_5_accept[, vars_thought_100], sd) # this is standard deviation (SD)

############################################################
# COMPARE SUBTOTAL SCORE OF THOUGHT SECTION BETWEEN HCT versus OBS
diff_thought_100_HvO <- likert_5_accept$thought_OBS_100 - likert_5_accept$thought_HCS_100
if (shapiro.test(diff_thought_100_HvO)$p.value > .05) {
  # If p< 0.05, reject the hypothesis that the distribution is normal. Use paired Wilcoxon signed-rank
  # If p> 0.05, accept the hypothesis that the distribution is normal. Use paired t-test
  # ≈ Normal distribution  → paired t-test
  tt_thought_HvO <- t.test(likert_5_accept$thought_OBS_100, likert_5_accept$thought_HCS_100,
                           paired = TRUE, conf.level = .95)
  print(tt_thought_HvO)
} else {
  # Abnormal distribution → Wilcoxon signed-rank
  wt_thought_HvO <- wilcox.test(likert_5_accept$thought_OBS_100, likert_5_accept$thought_HCS_100,
                                paired = TRUE, conf.int = TRUE)
  print(wt_thought_HvO)
}

# COMPARE SUBTOTAL SCORE OF THOUGHT SECTION BETWEEN HCT versus CT
diff_thought_100_HvC <- likert_5_accept$thought_CT_100 - likert_5_accept$thought_HCS_100
if (shapiro.test(diff_thought_100_HvC)$p.value > .05) {
  # If p< 0.05, reject the hypothesis that the distribution is normal. Use paired Wilcoxon signed-rank
  # If p> 0.05, accept the hypothesis that the distribution is normal. Use paired t-test
  # ≈ Normal distribution  → paired t-test
  tt_thought_HvC <- t.test(likert_5_accept$thought_CT_100, likert_5_accept$thought_HCS_100,
                           paired = TRUE, conf.level = .95)
  print(tt_thought_HvC)
} else {
  # Abnormal distribution → Wilcoxon signed-rank
  wt_thought_HvC <- wilcox.test(likert_5_accept$thought_CT_100, likert_5_accept$thought_HCS_100,
                                paired = TRUE, conf.int = TRUE)
  print(wt_thought_HvC)
}

##### MAKE BOX PLOT - FIGURE 4B ######
likert_5_accept_long <- likert_5_accept %>%
  dplyr::select(all_of(vars_thought_100)) %>%
  pivot_longer(cols = everything(),
               names_to = "Variable",
               values_to = "Score") %>%
  mutate(Variable = recode(Variable,
                           "thought_HCS_100"  = "HCS",
                           "thought_CT_100"   = "CT",
                           "thought_OBS_100"  = "Obs"),
         Variable = factor(Variable, levels = c("Obs","CT","HCS"))) 

library(ggplot2)
library(ggsignif)

fig4b <- ggplot(data = likert_5_accept_long, aes(x = Variable, y = Score)) +
  geom_boxplot(fill = "#92c5de", color = "#2C3E50") +
  labs(y = "Acceptability score (%)", x = "") +
  coord_flip() + 
  scale_y_continuous(limits = c(-0.5, 101), expand = c(0, 0)) +
  stat_summary(
    fun.min = min, fun.max = min, fun = min,
    geom = "text",
    aes(label = Variable),
    hjust = 1.5,             # nudge left
    fontface = "bold"
  ) +
  geom_signif(
    comparisons = list(c("HCS","CT"), c("HCS","Obs")),
    map_signif_level = FALSE,
    annotations = c("", ""),       # blank text
    y_position = c(68.5, 70),        # where the brackets go
    tip_length = 0.01
  ) +
  annotate("text", x = 2.45, y = 72.5, label = "*\n*\n*", size = 4, hjust = 1, lineheight = 0.3) + # for HCT–CT
  annotate("text", x = 1.95,   y = 74, label = "*\n*\n*", size = 4, hjust = 1, lineheight = 0.3) +
  theme_minimal() + 
  theme(
    legend.position = "none", 
    panel.grid = element_blank(), 
    axis.text.x = element_text(face = "bold"), 
    axis.title.y = element_text(face = "bold"), 
    axis.text.y = element_blank(), 
    axis.line.x = element_line(color = "black", linewidth = 0.5), 
    plot.margin = margin(30, 10, 15, 10))

# fig4b
ggsave("Fig4B.png", width = 10, height = 4, dpi = 300)


###Combine Figure 4A and Figure 4B
library(cowplot)
plot_grid(
  fig4a, fig4b,
  nrow = 2,           # Stack vertically
  rel_heights = c(5, 2),  # A 3x taller than B
  labels = c("A", "B"),   # Add A, B labels
  label_size = 14
  )
ggsave("Fig4.png", width = 10, height = 8, dpi = 300)

###### Differences among various groups of students
library('gtsummary')
likert_5_accept <- likert_5_accept |> 
  mutate(Self_report.SES = fct_na_value_to_level(Self_report.SES, "Missing"))

tbl_output <- list()
for (i in c('Sex', 'Class.school.year', 'Faculty', 'Ethnicity', 'Self_report.SES')) {
  print(i)
  tbl_subout <- likert_5_accept |> 
    tbl_summary(include = c(i, thought_OBS_100, thought_CT_100, thought_HCS_100), 
                by = i,
                type = list(c('thought_OBS_100', 'thought_CT_100', 'thought_HCS_100') ~ 'continuous'), 
                statistic = list(c('thought_OBS_100', 'thought_CT_100', 'thought_HCS_100') ~ "{mean} ({sd})")) |>
    add_n() |> add_p(pvalue_fun = ~style_pvalue(.x, digits = 2))|> as_tibble()  # default run wilcoxson test
  tbl_output <- c(tbl_output, list(tbl_subout))
}
print(tbl_output)

###Tests
kruskal.test(thought_OBS_100 ~ Self_report.SES, data = likert_5_accept)
kruskal.test(thought_CT_100 ~ Self_report.SES, data = likert_5_accept)
kruskal.test(thought_HCS_100 ~ Self_report.SES, data = likert_5_accept)

kruskal.test(thought_OBS_100 ~ Ethnicity, data = likert_5_accept)
kruskal.test(thought_CT_100 ~ Ethnicity, data = likert_5_accept)
kruskal.test(thought_HCS_100 ~ Ethnicity, data = likert_5_accept)

names(tbl_output) <- c("gender", "faculty", "aca_year", "ethinicity","ses")
write.xlsx(tbl_output, file = "demographic_tables.xlsx")


##### FIGURE 5: RISK AND BURDEN TO PARTICIPANTS #####
### Descriptive analysis of students' perception regarding risks and burdens of HCS, CT and Obs   
risk_burden <- cs_t3_working %>% 
  dplyr::select(No, 
                starts_with(c("q14_", "q23_","q32_"))) %>%
  rename(
    "Obs" = "q14_Obs_risk/burden_participants", 
    "CT" = "q23_CT_risk/burden_participants", 
    "HCS" = "q32_HCS_risk/burden_participants"
  )

risk_burden_long <- risk_burden %>%
  pivot_longer(cols = c(Obs, CT, HCS),
               names_to = "study_type",
               values_to = "risks_raw") %>% 
  separate_rows(risks_raw, sep = ",") %>%
  mutate(risks_raw = str_trim(risks_raw)) %>% 
  filter(!risks_raw %in% c("Khác", "NA", "no_risk"))

risk_burden_summary <- risk_burden_long%>%
  group_by(study_type, risks_raw) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(study_type) %>%
  mutate(pct = round(n / 242 * 100, 1)) %>%
  ungroup() 


#chi2

risk_burden_chi <- table(risk_burden_long$study_type, risk_burden_long$risks_raw)

chi2_results <- tibble(risk = colnames(risk_burden_chi)) %>% 
  rowwise() %>% 
  mutate(
    p_CT = chisq.test(risk_burden_chi[c("CT", "HCS"), risk])$p.value, 
    p_Obs = chisq.test(risk_burden_chi[c("Obs", "HCS"), risk])$p.value
  ) %>% 
  ungroup() %>% 
  mutate(
    p_CT_FDR = p.adjust(p_CT, method = "BH"), 
    p_Obs_FDR = p.adjust(p_Obs, method = "BH")
  )
#save the results
write.xlsx(chi2_results, "risk_burden_results.xlsx")

### Drawing figure 5
AE_labels <- c(
  physical_SE      = "Physical",
  mentalhealth_AE  = "Mental health",
  social_AE        = "Social",
  occupation_AE    = "Occupational",
  economic_AE      = "Economic",
  confidential_info_disclosed       = "Disclosure of confidential information"
)

risk_burden_label <- risk_burden_summary %>%
  mutate(
    risks_label = recode(risks_raw, !!!AE_labels)
  )

risk_order <- risk_burden_label %>%
  filter(study_type == "HCS") %>% 
  arrange(pct) %>%
  pull(risks_label)


risk_burden_label <- risk_burden_label %>%
  mutate(
    risks_label = factor(risks_label, levels = risk_order),
    study_type = factor(study_type, levels = c("HCS", "CT", "Obs")), 
    study_type = fct_recode(study_type, HCS = "HCS")
  ) %>%
  mutate(
    light_pct = pmin(pct, 50),
    dark_pct = ifelse(pct > 50, pct - 50, 0)
  )

sig_lines_tally <- chi2_results %>%
  pivot_longer(cols = c(p_CT_FDR, p_Obs_FDR), 
               names_to = "comparison", values_to = "p_adj") %>%
  mutate(
    group2 = ifelse(comparison == "p_CT_FDR", "CT", "Obs"),
    label = case_when(
      p_adj < 0.001 ~ "***",
      p_adj < 0.01  ~ "**",
      p_adj < 0.05  ~ "*",
      TRUE          ~ "ns"
    ),
    study_type = group2,
    risks_label = recode(risk, !!!AE_labels),  # match to the labels in your plot
  ) %>% 
  left_join(risk_burden_label, by = c("study_type", "risks_label")) %>%
  mutate(label_y = pct + 2) %>% 
  mutate(
    risks_label = factor(risks_label, levels = levels(risk_burden_label$risks_label)),
    study_type = factor(study_type, levels = levels(risk_burden_label$study_type))
  )

ct_y_range <- risk_burden_label %>%
  filter(study_type == "CT") %>%
  summarise(
    y_min = min(as.numeric(risks_label)) - 0.2,
    y_max = max(as.numeric(risks_label)) + 0.2
  ) %>% 
  mutate(
    study_type = factor("CT", levels = c("HCS", "CT", "Obs"))  # fix facet order
  )


ggplot(risk_burden_label, aes(x = pct, y = risks_label)) +
  geom_col(aes(x = light_pct), fill = "#b3d7e9", width = 0.7) +
  geom_col(aes(x = dark_pct), 
           fill = "#4393c3", width = 0.7, position = position_nudge(x=50)) +
  facet_wrap(~ study_type, nrow = 1) +
  scale_x_continuous(limits = c(0, 100), breaks = seq(0, 100, 25)) +
  scale_y_discrete(expand = expansion(mult = c(0.01, 0.01))) +
  labs(
    x = "% selected by students",
    y = "Adverse events"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    panel.spacing = unit(0.5, "lines"),
    plot.margin = margin(5, 5, 5, 5),
    strip.text = element_text(face = "bold"),
    axis.text.x = element_text(angle = 0),
    axis.text.y = element_text(face = "bold"),
    axis.title.y = element_text(face = "bold"),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.x = element_blank()
  ) +
  geom_vline(xintercept = c(100), color = "grey40", size = 0.3) +
  # geom_vline(xintercept = c(50), color = "grey40", size = 0.3) +
  # geom_vline(xintercept = c(0,), color = "black", size = 0.6) +
  #geom_vline(data = filter(tally_label, study_type %in% c("HCT", "CT", "ObS")),
  #        aes(xintercept = 100),
  #       color = "grey85", size = 3) +
  geom_text(data = 
              filter(sig_lines_tally, study_type == "Obs"),
            aes(x = label_y, y = risks_label, label = label),
            inherit.aes = FALSE,
            size = 4,
            hjust = 0) +
  geom_segment(data = ct_y_range,
               aes(x = 88, xend = 88, y = y_min, yend = y_max),
               inherit.aes = FALSE,
               size = 0.6) +
  geom_segment(data = ct_y_range,
               aes(x = 86, xend = 88, y = y_max, yend = y_max),
               inherit.aes = FALSE,
               size = 0.6) +
  geom_segment(data = ct_y_range,
               aes(x = 86, xend = 88, y = y_min, yend = y_min),
               inherit.aes = FALSE,
               size = 0.6) +
  geom_text(data = ct_y_range,
            aes(x = 90, y = (y_min + y_max)/2, label = "ns"),
            inherit.aes = FALSE,
            size = 4,
            hjust = 0) 

ggsave("Fig5.pdf", width = 12, height = 4, dpi = 300)
ggsave("Fig5.png", width = 12, height = 4, dpi = 300)
ggsave("Fig5.tiff", width = 12, height = 4, dpi = 300)
