#### AllSeasons: TableA2 G=tests  ###############
# Script author: Peter King (p.king1@leeds.ac.uk)
# Last Edited: 30/09/2025
# Change:
# - Changing gtest to g.test
# - reformatting table to add freq, stat, p,
# - and using anonymised data
# - misc formatting fixes


# *****************************************************************************
#### Section 0: Setting up ####
## Find session information in 00_ReplicatePaper.R
# *****************************************************************************


## Libraries here: ************************************************************
rm(list = ls())
library(tidyverse)
library(magrittr)
library(here)
library(data.table)
library(Rfast)
library(janitor)
library(rstatix)
library(DescTools)


# **************************************************************************
#### Section 1: Import Data  ####
# **************************************************************************


## Latest version:
Data <-
  here("SurveyData", "AllSeasons_dataframe_Step3_Anonymised.csv") %>%
  fread() %>%
  data.frame()



# **************************************************************************
#### Section 2: Transform Data  ####
# **************************************************************************


## New version with more categories
Data$DistanceCategories <- cut(
  Data$MilesDistance,
  breaks = c(-1, 1.609344, 4.02336, 11.265408, 23.335488, 32.18688),
  labels = c(1, 2, 3, 4, 5)
) %>% as.numeric()


## Add bracktes for reporting
Data$AgeLevels <-
  cut(
    Data$ExactAge,
    breaks = c(0, 29, 39, 49, 59, 69, 120),
    labels = c(1, 2, 3, 4, 5, 6)
  ) %>% as.numeric()



## collect the raw levels into brackets for ease of reporting
Data$IncomeBracket <- cut(
  Data$IncomeLevels_PlusMissing,
  breaks = c(-2, 20501, 26800, 51999, 52000),
  labels = c(1, 2, 3, 4)
) %>% as.numeric()


# To simplify analysis I just calculate england or not
## (scotland and wales bundled together here sorry guys)
Data$EnglandDummy <- ifelse(Data$Country == 0, 1, 0)



# **************************************************************************
#### Section 3: Setup functions  ####
# **************************************************************************


# Add MostRecentVisit to Variable vector and reorder alphabetically
Variable <- c(
  "AgeLevels",
  "Country",
  "DistanceCategories",
  "EthnicityDummyWhite",
  "Gender",
  "IncomeBracket",
  "Impairment",
  "ClassDummy",
  "Urbanicity",
  "MostRecentVisit"
)


# Create proper category labels mapping
CategoryLabels <- list(
  Gender = c("0" = "Men", "1" = "Women"),
  AgeLevels = c("1" = "18 – 29", "2" = "30 – 39", "3" = "40 – 49",
                "4" = "50 – 59", "5" = "60 – 69", "6" = "70+"),
  EthnicityDummyWhite = c("0" = "Non-white", "1" = "White"),
  IncomeBracket = c("1" = "< £20,500", "2" = "£20,501 to £26,800",
                    "3" = "£26,801 to £54,000", "4" = "£54,000+"),
  Country = c("0" = "England", "1" = "Wales", "2" = "Scotland"),
  Urbanicity = c("0" = "Rural", "1" = "Urban"),
  ClassDummy = c("0" = "ABC1", "1" = "C2DE"),
  Impairment = c("0" = "None", "1" = "Any"),
  DistanceCategories = c("1" = "<1.6 km", "2" = "1.6 - 4km", "3" = "4.1 - 15km",
                         "4" = "16 - 31km", "5" = "32km+"),
  MostRecentVisit = c("0" = "Did not visit", "1" = "Once or twice a season",
                      "2" = "Once or twice a month", "3" = "Once a week",
                      "4" = "Several times a week", "5" = "Every day")
)


# Variable name labels for table
VariableNames <- c(
  AgeLevels = "Age (years)",
  Country = "Country of residence",
  DistanceCategories = "Distance to nearby forest",
  EthnicityDummyWhite = "Ethnicity",
  Gender = "Gender",
  IncomeBracket = "Household income (gross annual)",
  Impairment = "Sensory impairment",
  ClassDummy = "Social grade",
  Urbanicity = "Urbanicity",
  MostRecentVisit = "Visit frequency to nearby forest in the season of sampling"
)




# --- set new reference season here (Spring = 1) ---
ref_season <- 1L
target_seasons <- c(0L, 2L, 3L)   # comparisons (keeps Comparison = 1,2,3 mapping)
season_label_map <- c("0" = "Winter", "1" = "Spring", "2" = "Summer", "3" = "Autumn")


# **************************************************************************
#### Section 4: Updated function  ####
# **************************************************************************


# Modified function to extract just G-statistic and add stars
AddStarsToStat <- function(test_string) {
  # Extract statistic and p-value from format "stat (pval stars)"
  stat <- as.numeric(sub(" \\(.*", "", test_string))
  pval <- as.numeric(sub(".*\\(([0-9.]+).*", "\\1", test_string))

  stars <- ifelse(pval < 0.01, "***",
                  ifelse(pval < 0.05, "**",
                         ifelse(pval < 0.1, "*", "")))

  paste0(round(stat, 2), stars)
}


## Custom function that adds significance stars
AddStars <- function(Data) {
  ifelse(Data < 0.01,
         paste0("***"),
         ifelse(Data < 0.05,
                paste0("**"),
                ifelse(Data < 0.1,
                       paste0("*"),
                       paste0(" "))))
}


# Robust G-test + descriptives using base table (reference = ref_season)
perform_GT_with_descriptives_perseason_safe <- function(data,
                                                        comparison_label,
                                                        var1,
                                                        ref_season = 1L,
                                                        target_season) {
  # basic checks
  if (!var1 %in% names(data)) stop("variable not found in data: ", var1)
  if (!"Season" %in% names(data)) stop("Season column not found in data")

  # drop rows with NA in var1 or Season (these would break table)
  dat <- data[!is.na(data[[var1]]) & !is.na(data$Season), , drop = FALSE]

  # build contingency table with columns ordered as ref, target
  tbl <- as.matrix(table(factor(dat[[var1]]), factor(dat$Season)))
  # ensure both columns exist in that order
  cols_needed <- as.character(c(ref_season, target_season))
  # if a season column is missing, create it with zeros
  for (cname in cols_needed) {
    if (!cname %in% colnames(tbl)) {
      tbl <- cbind(tbl, setNames(rep(0L, nrow(tbl)), cname))
    }
  }
  tbl <- tbl[, cols_needed, drop = FALSE]

  ref_counts <- as.integer(tbl[, 1])
  other_counts <- as.integer(tbl[, 2])
  cats <- rownames(tbl)
  # if there are zero rows entirely, keep them (so Category labels align)
  # compute formatted "n (pp%)" strings per column (percentage relative to that column)
  sum_ref <- sum(ref_counts)
  sum_other <- sum(other_counts)

  if (sum_ref == 0) {
    ref_formatted <- rep("0 (0.00%)", length(ref_counts))
  } else {
    ref_formatted <- paste0(ref_counts, " (", sprintf("%.2f%%", 100 * ref_counts / sum_ref), ")")
  }

  if (sum_other == 0) {
    other_formatted <- rep("0 (0.00%)", length(other_counts))
  } else {
    other_formatted <- paste0(other_counts, " (", sprintf("%.2f%%", 100 * other_counts / sum_other), ")")
  }

  # G-test: only run if both columns have at least one observation
  if (sum_ref > 0 && sum_other > 0) {
    # expected probabilities from ref column (avoid exact zeros by tiny epsilon if needed)
    pvec <- ref_counts / sum_ref
    if (any(pvec == 0)) {
      pvec[pvec == 0] <- .Machine$double.eps
      pvec <- pvec / sum(pvec)
    }
    # create a 2-column matrix of counts (rows = categories)
    counts_mat <- cbind(ref_counts, other_counts)
    # ensure counts_mat has integer mode
    storage.mode(counts_mat) <- "integer"
    test_res <- tryCatch({
      DescTools::GTest(as.table(counts_mat), p = pvec, rescale.p = TRUE)
    }, error = function(e) NULL)

    if (!is.null(test_res)) {
      TestData_Format <- paste0(round(test_res$statistic, 3),
                                " (",
                                round(test_res$p.value, 3),
                                " ",
                                test_res$p.value %>% AddStars(),
                                ")")
    } else {
      TestData_Format <- NA_character_
    }
  } else {
    TestData_Format <- NA_character_
  }

  out_df <- data.frame(
    Variable = rep(var1, length(cats)),
    Season = rep(comparison_label, length(cats)),
    Category = cats,
    Ref = ref_formatted,
    Other = other_formatted,
    GT = rep(TestData_Format, length(cats)),
    stringsAsFactors = FALSE
  )

  # ensure Category is character (so later matching works)
  out_df$Category <- as.character(out_df$Category)

  return(out_df)
}


# **************************************************************************
#### Section 4: Updated function  ####
# **************************************************************************


# Replace the loop with this (keeps Comparison = 1,2,3 semantics)
results_list_variables <- list()
for (V in Variable) {
  results_temp_season <- list()
  for (i in seq_along(target_seasons)) {
    tgt <- target_seasons[i]
    result <- perform_GT_with_descriptives_perseason_safe(
      data = Data %>% dplyr::filter(Season %in% c(ref_season, tgt)),
      comparison_label = i,
      var1 = V,
      ref_season = ref_season,
      target_season = tgt
    )
    # stable column names: Variable, Comparison, Category, Ref, Other, GT
    colnames(result) <- c("Variable", "Comparison", "Category", "Ref", "Other", "GT")
    results_temp_season[[i]] <- result
  }
  # bind_rows is safe here
  results_list_variables[[V]] <- dplyr::bind_rows(results_temp_season)
}


# continue with your downstream processing as before
Table_GTest_Results <- dplyr::bind_rows(results_list_variables, .id = "Variable") %>% data.frame()


# **************************************************************************
#### Section 5: Updated function  ####
# **************************************************************************


# make a tibble and detect which Variable column exists
tbl <- Table_GTest_Results %>% tibble::as_tibble()


var_col <- if ("Variable.1" %in% names(tbl)) {
  "Variable.1"
} else if ("Variable" %in% names(tbl)) {
  "Variable"
} else {
  stop("No 'Variable' column found in Table_GTest_Results")
}


# clean GT and create Category_Label robustly
tbl <- tbl %>%
  mutate(GT_Clean = sapply(GT, AddStarsToStat))


tbl$Category_Label <- mapply(function(var, cat) {
  # ensure cat is character for lookup
  cat_chr <- as.character(cat)
  if (var %in% names(CategoryLabels)) {
    lbls <- CategoryLabels[[var]]
    if (cat_chr %in% names(lbls)) lbls[[cat_chr]] else cat_chr
  } else {
    cat_chr
  }
}, tbl[[var_col]], tbl$Category, USE.NAMES = FALSE)


# select and pivot using the detected var_col
out_table <- tbl %>%
  select(
    Variable = !!rlang::sym(var_col),
    Category = Category_Label,
    Ref, Other, GT_Clean, Comparison
  ) %>%
  pivot_wider(
    names_from = Comparison,
    values_from = c(Other, GT_Clean),
    names_sep = "_"
  ) %>%
  rename(
    Spring = Ref,
    Winter = Other_1,
    Summer = Other_2,
    Autumn = Other_3
  )

# show result
TableA2_Final <- out_table


# **************************************************************************
#### Section 6: Tidy table  ####
# **************************************************************************


# Add G-test rows
TableA2_WithTests <- TableA2_Final %>%
  group_by(Variable) %>%
  group_modify(~ {
    bind_rows(
      .x,
      data.frame(
        Category = "G test statistic vs Spring",
        Winter = unique(.x$GT_Clean_1)[1],
        Spring = "",
        Summer = unique(.x$GT_Clean_2)[1],
        Autumn = unique(.x$GT_Clean_3)[1]
      )
    )
  }) %>%
  ungroup() %>%
  dplyr::select(Variable, Category, Winter, Spring, Summer, Autumn) %>%
  mutate(
    Variable = ifelse(duplicated(Variable), "", VariableNames[Variable])
  )


# Add column headers with n
colnames(TableA2_WithTests) <- c(
  "Variable",
  "Category",
  "Winter (n=1879)",
  "Spring (n=1769)",
  "Summer (n=1767)",
  "Autumn (n=1748)"
)

TableA2_WithTests



# **************************************************************************
#### Section 7: Export  ####
# **************************************************************************


TableA2_WithTests %>% fwrite(sep = "#",
                             quote = FALSE,
                             file = paste0(here(
                               "OtherOutput/Tables",
                               "TableA2_GTests_VariablesVsSpring.txt"
                             )))

