R-Script Epidemiològic

Anàlisi de Regressió GAM

Aquest codi implementa un Model Additiu Generalitzat per analitzar l'impacte del NO2 sobre la salut, ajustant per meteorologia i estacionalitat.

analisi_ambiental.R
# ==============================================================================
# 1. PREPARATION & LIBRARIES
# ==============================================================================
packages <- c("mgcv", "readr", "dplyr", "lubridate", "ggplot2")
for (p in packages) {
    if (!require(p, character.only = TRUE)) install.packages(p)
    library(p, character.only = TRUE)
}

# ==============================================================================
# 2. DATA LOADING, CLEANING & FEATURE ENGINEERING (Corrected)
# ==============================================================================
# List of public holidays (Catalonia/Spain)
holidays_cat <- c("01-01", "01-06", "05-01", "06-24", "08-15", "09-11", 
                  "10-12", "11-01", "12-06", "12-08", "12-25", "12-26")

df <- read_csv("meteocontamsivicMartorell.csv", show_col_types = FALSE) %>%
    arrange(date) %>% 
    # Renaming native Catalan columns to English for consistency
    rename(
        total_cases = casos_total,
        total_population = poblacio_total
    ) %>%
    mutate(
        date = as.Date(date),
        day_of_week = as.factor(wday(date, week_start = 1)),
        time_numeric = as.numeric(date),
        
        # Seasonality and Inertia
        day_of_year = yday(date), 
        cases_yesterday = log(lag(total_cases, n=1) + 1),
        
        # Moving average for NO2 (Lag 0-2)
        no2_lag02 = (no2 + lag(no2, n=1) + lag(no2, n=2)) / 3,
        
        # Control variables
        pandemic = as.factor(ifelse(date >= as.Date("2020-03-14") & date <= as.Date("2022-03-31"), "Yes", "No")),
        day_month = format(date, "%m-%d"),
        is_holiday = as.factor(ifelse(day_month %in% holidays_cat, "Yes", "No")),
        
        # Threshold Variables for mean temperatures
        degrees_above_25 = ifelse(temp_mean > 25, temp_mean - 25, 0),
        degrees_below_15 = ifelse(temp_mean < 15, 15 - temp_mean, 0),
        
        # EXTREME WEATHER VARIABLES (Heatwaves & Coldwaves)
        heatwave = ifelse(temp_max > 34, temp_max - 34, 0),
        coldwave = ifelse(temp_min < 5, 5 - temp_min, 0)
    ) %>%
    # 🔥 A QUÍ ESTÀ LA CORRECCIÓ: Sense pm10 ni pm2.5
    select(date, total_cases, total_population, no2_lag02, temp_mean, temp_max, temp_min, rh, 
           time_numeric, day_of_week, day_of_year, cases_yesterday, pandemic, is_holiday,
           degrees_above_25, degrees_below_15, heatwave, coldwave) %>%
    filter(
        complete.cases(.)
    )

cat("Clean rows ready for modeling:", nrow(df), "\n")

# ==============================================================================
# 3. BASE GAM MODEL (Advanced Epidemiological Model)
# ==============================================================================
model_base <- gam(
    total_cases ~ 
        s(no2_lag02, bs = "cr", k = 5) +               
        s(temp_mean, bs = "cr", k = 10) +         
        s(rh, bs = "cr", k = 5) +                
        s(day_of_year, bs = "cc", k = 12) +            
        s(time_numeric, bs = "cr", k = 30) +      
        s(cases_yesterday, bs = "cr", k = 5) +          
        day_of_week + is_holiday + pandemic +
        offset(log(total_population)),             
    family = nb(), 
    data = df,
    method = "REML"
)

# ==============================================================================
# 4. MODEL DIAGNOSTICS & SUMMARY
# ==============================================================================
summary(model_base)

par(mfrow = c(2, 2))
gam.check(model_base) 
par(mfrow = c(1, 1))

# ==============================================================================
# 5. RELATIVE RISK: NO2 (Lag 0-2)
# ==============================================================================
model_lin_no2 <- gam(
    total_cases ~ 
        no2_lag02 +  # Linear term
        s(temp_mean, bs = "cr", k = 10) + s(rh, bs = "cr", k = 5) +
        s(day_of_year, bs = "cc", k = 12) + s(time_numeric, bs = "cr", k = 30) +   
        s(cases_yesterday, bs = "cr", k = 5) + day_of_week + is_holiday + pandemic + offset(log(total_population)),
    family = nb(), data = df, method = "REML"
)

coef_no2 <- coef(model_lin_no2)["no2_lag02"]
se_no2 <- summary(model_lin_no2)$se["no2_lag02"]
rr_10units <- exp(coef_no2 * 10)
ic_inf_no2 <- exp((coef_no2 - 1.96 * se_no2) * 10)
ic_sup_no2 <- exp((coef_no2 + 1.96 * se_no2) * 10)

cat("\n--- RR: NO2 (Lag 0-2) ---\n")
cat(sprintf("RR per 10 µg/m³ increase: %.4f [%.4f - %.4f]\n", rr_10units, ic_inf_no2, ic_sup_no2))

# ==============================================================================
# 6. DOSE-RESPONSE PLOT: NO2
# ==============================================================================
plot_data_no2 <- plot(model_base, select = 1) 
df_plot_no2 <- data.frame(
    no2 = plot_data_no2[[1]]$x, effect = plot_data_no2[[1]]$fit, se = plot_data_no2[[1]]$se
)

plot_no2 <- ggplot(df_plot_no2, aes(x = no2, y = effect)) +
    geom_hline(yintercept = 0, linetype = "dashed", color = "red") + 
    geom_ribbon(aes(ymin = effect - 1.96*se, ymax = effect + 1.96*se), alpha = 0.2, fill = "blue") +
    geom_line(color = "darkblue", linewidth = 1.2) +
    labs(title = "Dose-Response: Impact of NO2 on Morbidity", 
         subtitle = "Adjusted for environment, seasonality, and inertia",
         x = "NO2 Concentration (µg/m3, Lag 0-2)", y = "Effect on log(cases)") +
    theme_minimal()
print(plot_no2)

# ==============================================================================
# 7. DOSE-RESPONSE PLOT: MEAN TEMPERATURE
# ==============================================================================
plot_data_temp <- plot(model_base, select = 2) 
df_plot_temp <- data.frame(
    temperature = plot_data_temp[[2]]$x, effect = plot_data_temp[[2]]$fit, se = plot_data_temp[[2]]$se
)

plot_temp <- ggplot(df_plot_temp, aes(x = temperature, y = effect)) +
    geom_hline(yintercept = 0, linetype = "dashed", color = "red", linewidth = 1) + 
    geom_ribbon(aes(ymin = effect - 1.96*se, ymax = effect + 1.96*se), alpha = 0.2, fill = "orange") +
    geom_line(color = "darkorange3", linewidth = 1.2) +
    labs(title = "Dose-Response: Impact of Mean Temperature", 
         subtitle = "Showing the J/U-shaped non-linear effect",
         x = "Daily Mean Temperature (ºC)", y = "Effect on log(cases)") +
    theme_minimal()
print(plot_temp)

# ==============================================================================
# 8. RELATIVE RISK: MODERATE HEAT (>25ºC Mean)
# ==============================================================================
model_heat_mean <- gam(
    total_cases ~ 
        degrees_above_25 + degrees_below_15 + 
        s(no2_lag02, bs = "cr", k = 5) + s(rh, bs = "cr", k = 5) + s(day_of_year, bs = "cc", k = 12) +            
        s(time_numeric, bs = "cr", k = 30) + s(cases_yesterday, bs = "cr", k = 5) +          
        day_of_week + is_holiday + pandemic + offset(log(total_population)),             
    family = nb(), data = df, method = "REML"
)

coef_heat_m <- coef(model_heat_mean)["degrees_above_25"]
se_heat_m <- summary(model_heat_mean)$se["degrees_above_25"]
rr_heat_m <- exp(coef_heat_m * 1)
ic_inf_heat_m <- exp((coef_heat_m - 1.96 * se_heat_m) * 1)
ic_sup_heat_m <- exp((coef_heat_m + 1.96 * se_heat_m) * 1)

cat("\n--- RR: MODERATE HEAT (Mean Temp > 25ºC) ---\n")
cat(sprintf("Increase per 1ºC extra: %.2f%% [%.2f%% to %.2f%%]\n", 
            (rr_heat_m - 1)*100, (ic_inf_heat_m - 1)*100, (ic_sup_heat_m - 1)*100))

# ==============================================================================
# 9. RELATIVE RISK: HEATWAVE (Max Temp > 34ºC)
# ==============================================================================
model_heatwave <- gam(
    total_cases ~ 
        heatwave + 
        s(no2_lag02, bs = "cr", k = 5) + s(rh, bs = "cr", k = 5) + s(day_of_year, bs = "cc", k = 12) +            
        s(time_numeric, bs = "cr", k = 30) + s(cases_yesterday, bs = "cr", k = 5) +          
        day_of_week + is_holiday + pandemic + offset(log(total_population)),             
    family = nb(), data = df, method = "REML"
)

coef_hw <- coef(model_heatwave)["heatwave"]
se_hw <- summary(model_heatwave)$se["heatwave"]
rr_hw <- exp(coef_hw * 1)
ic_inf_hw <- exp((coef_hw - 1.96 * se_hw) * 1)
ic_sup_hw <- exp((coef_hw + 1.96 * se_hw) * 1)

cat("\n--- RR: EXTREME HEATWAVE (Max Temp > 34ºC) ---\n")
cat(sprintf("Increase per 1ºC extra above 34ºC: %.2f%% [%.2f%% to %.2f%%]\n", 
            (rr_hw - 1)*100, (ic_inf_hw - 1)*100, (ic_sup_hw - 1)*100))

# ==============================================================================
# 10. RELATIVE RISK: COLDWAVE (Min Temp < 5ºC)
# ==============================================================================
model_coldwave <- gam(
    total_cases ~ 
        coldwave + 
        s(no2_lag02, bs = "cr", k = 5) + s(rh, bs = "cr", k = 5) + s(day_of_year, bs = "cc", k = 12) +            
        s(time_numeric, bs = "cr", k = 30) + s(cases_yesterday, bs = "cr", k = 5) +          
        day_of_week + is_holiday + pandemic + offset(log(total_population)),             
    family = nb(), data = df, method = "REML"
)

coef_cw <- coef(model_coldwave)["coldwave"]
se_cw <- summary(model_coldwave)$se["coldwave"]
rr_cw <- exp(coef_cw * 1)
ic_inf_cw <- exp((coef_cw - 1.96 * se_cw) * 1)
ic_sup_cw <- exp((coef_cw + 1.96 * se_cw) * 1)

cat("\n--- RR: EXTREME COLDWAVE (Min Temp < 5ºC) ---\n")
cat(sprintf("Increase per 1ºC drop below 5ºC: %.2f%% [%.2f%% to %.2f%%]\n", 
            (rr_cw - 1)*100, (ic_inf_cw - 1)*100, (ic_sup_cw - 1)*100))
cat("======================================================================\n")