## ----setup, include = FALSE---------------------------------------------------
# Check if required packages are available
has_suggested <- requireNamespace("sf", quietly = TRUE) &&
  requireNamespace("disdat", quietly = TRUE) &&
  requireNamespace("purrr", quietly = TRUE) &&
  requireNamespace("tidyr", quietly = TRUE) &&
  requireNamespace("tibble", quietly = TRUE) &&
  requireNamespace("ggplot2", quietly = TRUE)

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  out.width = "100%",
  eval = has_suggested
)

## ----check-packages, echo = FALSE, results = "asis", eval = TRUE--------------
if (!has_suggested) {
  cat("**Note:** This vignette requires the following packages to run: `sf`, `disdat`, `purrr`, `tidyr`, `tibble`, and `ggplot2`. Please install them to view the full vignette output.\n\n")
  cat("```r\n")
  cat('install.packages(c("sf", "disdat", "purrr", "tidyr", "tibble", "ggplot2"))\n')
  cat("```\n\n")
}

## ----libraries, message=FALSE-------------------------------------------------
library(dplyr)
library(purrr)
library(tidyr)
library(tibble)
library(ggplot2)
library(terra)
library(sf)
library(disdat)
library(MIAmaxent)

## ----load-data----------------------------------------------------------------
# Choose a species (change this to model a different species!)
selected_species <- "swi01" # `disdat` has anonymized species codes: swi01 to swi30

# Define predictor variables
predictors <- c(
  "bcc", "calc", "ccc", "ddeg", "nutri", "pday",
  "precyy", "sfroyy", "slope", "sradyy", "swb", "tavecc", "topo"
)

# Extract presence-only (PO) data for the selected species
swi_po <- disPo("SWI") # function from `disdat` package
po_records <- swi_po[swi_po$spid == selected_species, ]
cat("Number of presence records:", nrow(po_records), "\n")

# Background data (10,000 random locations)
background <- disBg("SWI") # function from `disdat` package

# Combine presence and background into training data frame
# RV = 1 for presences, NA for background
train_data <- rbind(
  data.frame(RV = 1, po_records[, predictors]),
  data.frame(RV = NA, background[, predictors])
)

cat("Training presences:", sum(train_data$RV == 1, na.rm = TRUE), "\n")

## ----plot-po, fig.height=4----------------------------------------------------
studyarea <- disBorder("SWI")
po_sf <- st_as_sf(po_records, coords = c("x", "y"), crs = 21781)
ggplot() +
  geom_sf(data = studyarea, fill = "lightgrey") +
  geom_sf(data = po_sf, color = "blue", size = 0.5) +
  labs(
    title = paste("Presence records for species", selected_species),
    x = "Easting (m)", y = "Northing (m)"
  ) +
  theme_minimal()

## -----------------------------------------------------------------------------
# Rename columns to more informative names
col_name_map <- c(
  RV = "RV",
  bcc = "broadleaf.cover",
  calc = "bedrock.calcareous",
  ccc = "conifer.cover",
  ddeg = "growing.degree.days",
  nutri = "soil.nutrients",
  pday = "precip.days",
  precyy = "annual.precip",
  sfroyy = "summer.frost.freq",
  slope = "slope",
  sradyy = "solar.radiation",
  swb = "water.balance",
  tavecc = "temp.coldest.month",
  topo = "topographic.position"
)
names(train_data) <- col_name_map[names(train_data)]

# Convert the categorical variable to factor
train_data$bedrock.calcareous <- as.factor(train_data$bedrock.calcareous)

## ----single-fop, fig.height=4-------------------------------------------------
plotFOP(train_data, "temp.coldest.month")

## ----fop-data-extraction, echo=FALSE, fig.show='hide'-------------------------
# Identify continuous variables (exclude RV and categorical variables)
continuous_vars <- setdiff(names(train_data), c("RV", "bedrock.calcareous"))

# Extract FOP data for all continuous variables
fop_results <- map(continuous_vars, \(x) plotFOP(train_data, x))
names(fop_results) <- continuous_vars

## ----fop-combine-data, echo=FALSE---------------------------------------------
# Combine FOP data into a single data frame
fop_combined <- map2_dfr(
  fop_results, continuous_vars,
  \(x, y) mutate(x$FOPdata, variable = y)
)

# Prepare raw training data for density plots
train_data_long <- train_data %>%
  select(all_of(continuous_vars)) %>%
  pivot_longer(everything(), names_to = "variable", values_to = "value")

## ----fop-faceted-plot, fig.height=10, echo=FALSE------------------------------
# Create a faceted plot to compare FOP across all continuous variables
ggplot() +
  # Grey density showing data distribution (scaled to fit on same axis)
  geom_density(
    data = train_data_long, aes(x = value, y = after_stat(scaled)),
    fill = "grey90", color = NA
  ) +
  # FOP points and loess line
  geom_point(data = fop_combined, aes(x = intEV, y = intRV), size = 1.5) +
  geom_line(data = fop_combined, aes(x = intEV, y = loess), color = "red") +
  facet_wrap(~variable, scales = "free_x", ncol = 3) +
  labs(
    x = "Environmental variable value",
    y = "Frequency of Observed Presence (FOP)"
  ) +
  theme_bw() +
  theme(strip.text = element_text(size = 8))

## ----derive-vars--------------------------------------------------------------
# Create derived variables using multiple transformation types
# L = Linear, M = Monotonic, D = Deviation (unimodal),
# HF = Forward hinge, HR = Reverse hinge, T = Threshold
train_DVs <- deriveVars(train_data,
  transformtype = c("L", "M", "D", "HF", "HR", "T"),
  allsplines = TRUE,
  quiet = TRUE
)

## ----examine-dvs--------------------------------------------------------------
# Example: some of the DVs created for temperature
head(names(train_DVs$dvdata$temp.coldest.month))

## ----select-dvs---------------------------------------------------------------
# Select the best DVs for each individual EV
train_DV_select <- selectDVforEV(train_DVs$dvdata,
  alpha = 0.001,
  quiet = TRUE
)

## ----examine-selected-dvs-----------------------------------------------------
# Example: DVs selected for temperature
names(train_DV_select$dvdata$temp.coldest.month)

## ----select-evs---------------------------------------------------------------
# Select which EVs (represented by their best DVs) to include
train_model <- selectEV(train_DV_select$dvdata,
  alpha = 0.001,
  interaction = FALSE,
  quiet = TRUE
)

# View the selection trail, only the top model from each round
selection_trail <- train_model$selection[!duplicated(train_model$selection$round), ]
knitr::kable(selection_trail, digits = c(0, 0, 0, 3, 1, 0, 3)) # Print the data.frame as a table

## ----final-model--------------------------------------------------------------
# View the final model
train_model$selectedmodel$formula

## ----response-curves, fig.height=4--------------------------------------------
# Get the most important variable (first selected)
top_EV <- train_model$selection$variables[1]

# Plot response curve for the most important variable
plotResp(
  train_model$selectedmodel,
  train_DVs$transformations,
  top_EV
)

## ----compare-fop-response, fig.show='hold', fig.height=4----------------------
plotFOP(train_data, top_EV)

## ----all-responses, include=FALSE---------------------------------------------
evs <- (train_model$selectedmodel$coefficients)[-1] |>
  names() |>
  sub("_.*$", "", x = _) |>
  unique()
for (ev in evs[-1]) {
  plotResp(
    train_model$selectedmodel,
    train_DVs$transformations,
    ev
  )
}
train_model$selectedmodel$formula

## ----load-data-pa-------------------------------------------------------------
# Extract presence-absence (PA) data
swi_pa <- disPa("SWI") # function from `disdat` package

# Merge with environmental data by location
swi_env <- disEnv("SWI") # function from `disdat` package
pa_merged <- merge(swi_pa, swi_env)

# Create validation data frame with RV and environmental variables (parallel to train_data)
validation_data <- data.frame(
  RV = pa_merged[, selected_species],
  pa_merged[, predictors]
)

# Rename columns to match training data
names(validation_data) <- col_name_map[names(validation_data)]

# Convert the categorical variable to factor
validation_data$bedrock.calcareous <- as.factor(validation_data$bedrock.calcareous)

cat("Validation presences:", sum(validation_data$RV == 1), "\n")
cat("Validation absences:", sum(validation_data$RV == 0), "\n")

## ----plot-pa, fig.height=4----------------------------------------------------
pa_sf <- st_as_sf(swi_pa, coords = c("x", "y"), crs = 21781)
# Convert to factor for discrete colors (presence/absence)
pa_sf[[selected_species]] <- as.factor(pa_sf[[selected_species]])
ggplot() +
  geom_sf(data = studyarea, fill = "lightgrey") +
  geom_sf(data = pa_sf, aes(color = .data[[selected_species]]), size = 0.5) +
  scale_color_manual(
    values = c("0" = "red", "1" = "blue"),
    labels = c("0" = "Absence", "1" = "Presence"),
    name = "Record type"
  ) +
  labs(
    title = paste("Presence-absence records for species", selected_species),
    x = "Easting (m)", y = "Northing (m)"
  ) +
  theme_minimal()

## ----validation-auc, fig.height=4---------------------------------------------
# Calculate validation AUC
auc_result <- testAUC(
  model = train_model$selectedmodel,
  transformations = train_DVs$transformations,
  data = validation_data
)

## ----reconstruct-spatial, include=FALSE---------------------------------------
# To create spatial predictions directly, we need raster data
# Here we reconstruct the (incomplete) raster from coordinates, and metadata instead

# Create projection data  (similar to validation data but with coordinates)
proj_data <- data.frame(
  x = pa_merged$x,
  y = pa_merged$y,
  pa_merged[, predictors]
)

# Rename columns to match training data
col_name_map <- c(
  x = "x",
  y = "y",
  col_name_map
)
names(proj_data) <- col_name_map[names(proj_data)]

# Convert the categorical variable to factor
proj_data$bedrock.calcareous <- as.factor(proj_data$bedrock.calcareous)

# SWI
# Prepared by Niklaus E. Zimmermann and Antoine Guisan
# 13 variables:  see SWI\01_metadata_SWI_environment.csv
# Coordinate reference system: Transverse, spheroid Bessel (note all data has had a constant shift applied to it)
# EPSG:21781
# Units: meters
# Raster cell size: 100m

# Create a raster to fill in with predictions
# Here we assume env_data has columns 'x' and 'y' for coordinates
coordinates <- proj_data[, c("x", "y")]
env_raster <- rast(
  xmin = min(coordinates$x),
  xmax = max(coordinates$x),
  ymin = min(coordinates$y),
  ymax = max(coordinates$y),
  resolution = 100,
  crs = "EPSG:21781"
)

# Create a simple prediction map using the environmental data
predictions <- projectModel(
  model = train_model$selectedmodel,
  transformations = train_DVs$transformations,
  data = proj_data
)

# Rasterize the predictions
pred_raster <- rasterize(as.matrix(coordinates), env_raster,
  values = predictions$output$PRO
)
plot(pred_raster, main = "Predicted Probability of Occurrence")

## ----auc-po, include=FALSE----------------------------------------------------
testAUC(
  model = train_model$selectedmodel,
  transformations = train_DVs$transformations,
  data = train_data
)

## ----bv-setup-----------------------------------------------------------------
# Create a tibble to store results for three alpha levels
model_comparison <- tibble(
  alpha = c(1e-1, 1e-3, 1e-9)
)

## ----bv-split-----------------------------------------------------------------
set.seed(123) # For reproducibility

# Create indices for 70/30 split
n_total <- nrow(train_data)
train_idx <- sample(1:n_total, size = floor(0.7 * n_total))

# Split the data into training and test sets
train_70 <- train_data[train_idx, ]
test_30 <- train_data[-train_idx, ]

# Derive variables for the training set
train_DVs_70 <- deriveVars(train_70,
  transformtype = c("L", "M", "D", "HF", "HR", "T"),
  allsplines = TRUE,
  quiet = TRUE
)

## ----bv-fit-models------------------------------------------------------------
# Add model fitting results using map
model_comparison <- model_comparison %>%
  mutate(
    # Step 1: Select DVs for each EV
    dv_selection = map(alpha, \(a) selectDVforEV(train_DVs_70$dvdata,
      alpha = a,
      quiet = TRUE
    )),
    # Step 2: Select EVs for final model
    ev_selection = map2(
      dv_selection, alpha,
      \(dv, a) selectEV(dv$dvdata,
        alpha = a,
        interaction = FALSE,
        quiet = TRUE
      )
    ),
    # Extract number of DVs in final model
    n_dvs = map_int(ev_selection, \(ev) length(ev$selectedmodel$coefficients) - 1)
  )

# Display intermediate results
model_comparison %>%
  select(alpha, n_dvs)

## ----bv-evaluate, fig.show='hide', warning=FALSE------------------------------
# Add AUC calculations
model_comparison <- model_comparison %>%
  mutate(
    # AUC on 30% test set (internal evaluation)
    auc_test = map_dbl(
      ev_selection,
      \(ev) testAUC(
        model = ev$selectedmodel,
        transformations = train_DVs_70$transformations,
        data = test_30
      )
    ),
    # AUC on independent validation set (external evaluation)
    auc_validation = map_dbl(
      ev_selection,
      \(ev) testAUC(
        model = ev$selectedmodel,
        transformations = train_DVs_70$transformations,
        data = validation_data
      )
    )
  )

# Display final results table
results_table <- model_comparison %>%
  select(alpha, n_dvs, auc_test, auc_validation)

knitr::kable(results_table, digits = c(10, 0, 3, 3))