## ----setup, include = FALSE--------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ---- message=FALSE----------------------------------------------------------- library(palaeoSig) library(rioja) library(sf) library(gstat) library(dplyr) library(tibble) library(tidyr) library(purrr) library(ggplot2) theme_set(theme_bw()) set.seed(42) # for reproducibility ## ---- results='hide', warning=FALSE------------------------------------------- # load data data(Atlantic) meta <- c("Core", "Latitude", "Longitude", "summ50") # N Atlantic N_Atlantic <- Atlantic |> filter(Latitude > 3) |> slice_sample(n = 300) # subsample for speed N_Atlantic_meta <- N_Atlantic |> select(one_of(meta)) |> as.data.frame() # to keep rdist.earth happy N_Atlantic <- N_Atlantic |> select(-one_of(meta)) # S Atlantic S_Atlantic <- Atlantic |> filter(Latitude < -3) S_Atlantic_meta <- S_Atlantic |> select(one_of(meta)) S_Atlantic <- S_Atlantic |> select(-one_of(meta)) ## convert N_Atlatic_meta to an sf object N_Atlantic_meta <- st_as_sf( x = N_Atlantic_meta, coords = c("Longitude", "Latitude"), crs = 4326 ) # calculating distances among the sampled points in the # North Atlantic foraminifera data set geodist <- st_distance(N_Atlantic_meta) |> units::set_units("km") |> units::set_units(NULL) # values of h for which h-block cross-validation is calculated threshs <- c(0.01, 300, 600, 900, 1200) # h-block cross-validation of the NA foraminifera dataset for # different values of h res_h <- map(threshs, function(h) { mod <- MAT(N_Atlantic, N_Atlantic_meta$summ50, k = 5, lean = FALSE) mod <- crossval(mod, cv.method = "h-block", h.dist = geodist, h.cutoff = h) tibble( h = h, RMSE = performance(mod)$crossval["N05", "RMSE"], R2 = performance(mod)$crossval["N05", "R2"] ) }) |> list_rbind() ## ---- warning = FALSE, results = 'hide'--------------------------------------- # Leave-one-out cross-validated RMSEP using MAT with k = 5 round(res_h[1, "RMSE"], 2) # Predicting the South Atlantic test set mod_NA <- MAT(N_Atlantic, N_Atlantic_meta$summ50, k = 5) pred_SA <- predict(mod_NA, newdata = S_Atlantic)$fit # Determining RMSEP of the SA test set rmse_mat <- sqrt(mean((pred_SA[, 1] - S_Atlantic_meta$summ50)^2)) # RMSEP of the SA test set using MAT with k = 5 round(rmse_mat, 2) ## ---- echo=FALSE, results = 'hide', fig.cap = "Figure 1: Root mean square error of prediction (RMSEP) as a function of removal distance h. Dashed horizontal line indicates RMSEP found on a spatially independent test set."---- est_h <- approx(y = res_h$h, x = res_h$RMSE, xout = rmse_mat)$y seg_dat <- tibble( x = c(-Inf, est_h), xend = c(est_h, est_h), y = c(rmse_mat, rmse_mat), yend = c(rmse_mat, -Inf) ) ggplot(res_h, aes(x = h, y = RMSE)) + geom_point() + geom_line() + geom_segment(aes(x = x, y = y, xend = xend, yend = yend), data = seg_dat, colour = "red" ) + labs(x = "h [km]", y = "RMSEP [°C]") ## ---- message=FALSE, results = 'hide', fig.cap = "Figure 2: Semi-variogram fitted to detrended residuals of a weighted averaging model."---- # WA model modwa <- crossval(WA(sqrt(N_Atlantic), N_Atlantic_meta$summ50, mono = TRUE)) # residuals of the WA model wa_resid <- residuals(modwa, cv = TRUE) # detrend to remove edge effects (loess with span = 0.1) detrended_resid <- resid(loess(wa_resid[, 1] ~ N_Atlantic_meta$summ50, span = 0.1 )) # variogram of the detrended residuals of the WA model v <- variogram(detrended_resid ~ 1, data = N_Atlantic_meta) # Fitting a spherical variogram (partial sill, range and nugget are # approximately estimated from the empirical variogram) vm <- fit.variogram(v, vgm(psill = 2, "Sph", range = 1500, nugget = 0.5)) plot(v, vm) ## ---- fig.cap = "Figure 3: Semi-variogram model (Matérn class) fitted to the North Atlantic summer sea temperature at 50 m depth."---- # Estimate the variogram model for the environmental variable of interest ve <- variogram(summ50 ~ 1, data = N_Atlantic_meta) vem <- fit.variogram(ve, vgm(40, "Mat", 5000, .1, kappa = 1.8)) plot(ve, vem) # Simulating environmental variables sim <- krige(sim ~ 1, locations = N_Atlantic_meta, dummy = TRUE, nsim = 100, beta = mean(N_Atlantic_meta$summ50), model = vem, newdata = N_Atlantic_meta ) # convert back to a regular data.frame sim <- sim |> st_drop_geometry() ## ---- message = FALSE, warning= FALSE,results='hide', fig.cap="Figure 4: Histogram of squared correlation coefficients between simulated variables and the environmental variable of interest."---- # Function for h-block cross-validating several simulations at a time mat_h1 <- function(y, x, noanalogues, geodist, thresh) { if (!inherits(y, "dist")) { if (is.data.frame(y) || !(ncol(y) == nrow(y) && sum(diag(y)) == 0)) { y <- dist(sqrt(y))^2 # squared chord distance } } y <- as.matrix(y) diag(y) <- Inf if (inherits(geodist, "dist")) { geodist <- as.matrix(geodist) } sapply(seq_len(nrow(y)), function(n) { exneigh <- geodist[n, ] >= thresh x2 <- x[exneigh, ] y2 <- y[n, ][exneigh] analogues <- which(rank(y2, ties.method = "random") <= noanalogues) colMeans(x2[analogues, ]) }) } # h-block cross-validation of the simulated variables simhr <- sapply(threshs, function(h) { hn <- mat_h1(N_Atlantic, sim, noanalogues = 5, geodist = geodist, thresh = h) diag(cor(t(hn), sim)^2) }) # Estimating squared correlation between environmental variable of interest and # simulated variables sim_obs_r2 <- sapply(sim, cor, N_Atlantic_meta$summ50)^2 # Calculating sum of squares between the two squared correlations so_squares <- apply(simhr, 2, function(x) { sum((x - sim_obs_r2)^2) }) ## ----message = FALSE, warning= FALSE,results='hide', fig.cap = "Figure 5: Scatterplot of squared correlation coefficients between simulated variables and the environmental variable of interest and transfer function r^2^."---- simhr |> as.data.frame() |> set_names(threshs) |> mutate(sim_obs_r2 = sim_obs_r2) |> pivot_longer(cols = -sim_obs_r2, names_to = "h", values_to = "value") |> mutate(h = factor(h, levels = threshs)) |> ggplot(aes(x = sim_obs_r2, y = value)) + geom_point() + geom_abline() + facet_wrap(~h) + labs(x = "Simulated-observed environmental r²", y = "Transfer function r²") ## ----fig.cap = "Figure 6: Relationship between the sum of squares between the two r^2^ as function of distance *h*."---- tibble(threshs, so_squares) |> ggplot(aes(x = threshs, y = so_squares)) + geom_point() + labs(x = "h km", y = "Sum of squares")