# One structure
df_sine <- spinebil::data_gen("sine", n = 400, seed = 1)
# Multiple structures at once
df_all <- spinebil::data_gen("all", n = 400, seed = 1)
This vignette is a guide to diagnosing Projection Pursuit Indices in high-dimensional exploratory analysis. We examine how Projection Pursuit Indices behave under different conditions. The goal is to help you decide which index to use, how to tune it, and how to trust its signals.
This vignette presents a workflow built around these functions: `data_gen()`, `noise_gen()`, `ppi_mean()`, `ppi_scale()`, `ppi_noise_threshold()` and `ppi_samplesize_effect()`.
- `data_gen()`: generates structured 2D point sets (e.g., `sine`, `circle`, `cluster`) or orthogonal polynomial features for benchmarking/visualisation.
  - Structure types (`type` like "sine", "circle", …) return a tibble with columns `x`, `y`, and `structure`.
  - The polynomial type (`type = "polynomial"`) returns an n × degree matrix of orthogonal polynomial basis vectors on [0,1].
- `noise_gen()`: generates a numeric vector of noise values (as a tibble with `value` and `type`) from distributions such as: "gaussian", "uniform", "lognormal", "t_distributed", "cauchy", "beta_noise", "exponential", "microstructure".
We’ll preview available structures, create simple polynomial features, sample a few noise types, and combine structure + noise.
data_gen(type = "all", n = 500, degree = NULL, seed = NULL)

- `type`: "linear", "sine", "circle", "cluster", "snake", "outliers", "sparse", "clumpy", "skewed", "striated", "concave", "monotonic", "doughnut", "all", or "polynomial".
- `n`: number of points/samples.
- `degree`: only for "polynomial"; number of basis columns.
- `seed`: optional seed for reproducibility.

noise_gen(n = 500, type = "gaussian", level = 0.1, seed = NULL)

- `n`: number of noise samples.
- `type`: distribution name ("gaussian", "uniform", "lognormal", "t_distributed", "cauchy", "beta_noise", "exponential", "microstructure").
- `level`: scale/strength of noise.
- `seed`: optional seed.

# One structure
df_sine <- spinebil::data_gen("sine", n = 400, seed = 1)
# Multiple structures at once
df_all <- spinebil::data_gen("all", n = 400, seed = 1)

ggplot2::ggplot(df_all, ggplot2::aes(x, y)) +
  ggplot2::geom_point(alpha = 0.6, size = 0.6) +
  ggplot2::facet_wrap(~structure) +
  ggplot2::labs(
    title = "Synthetic 2D Structures from dataGen()",
    subtitle = "Each panel generated with n = 400"
  ) +
  ggplot2::xlim(-1.5, 1.5) +
  ggplot2::ylim(-1.5, 1.5) +
  ggplot2::theme(
    aspect.ratio = 1,
    axis.text = ggplot2::element_blank(),
    axis.title = ggplot2::element_blank(),
    axis.ticks = ggplot2::element_blank()
  )
Polynomial features (`type = "polynomial"`)

# Generate a 4-degree orthogonal polynomial basis
dPoly <- spinebil::data_gen("polynomial", n = 200, degree = 4, seed = 1)

ggplot2::ggplot(dPoly, ggplot2::aes(x = dPoly[, 1], y = dPoly[, 2])) +
  ggplot2::geom_point(alpha = 0.7) +
  ggplot2::xlab("polynomial degree 1") +
  ggplot2::ylab("polynomial degree 2") +
  ggplot2::theme(
    aspect.ratio = 1,
    axis.text = ggplot2::element_blank(),
    axis.ticks = ggplot2::element_blank()
  )
`noise_gen()`: noise distributions

eps_g <- spinebil::noise_gen(n = 500, type = "gaussian", level = 0.06, seed = 340)
eps_t <- spinebil::noise_gen(n = 500, type = "t_distributed", level = 0.15, seed = 341)
eps_cau <- spinebil::noise_gen(n = 500, type = "cauchy", level = 0.04, seed = 342)
As a simple example, let’s take a clean sine wave and add Gaussian noise to `y`.
set.seed(345)
n <- 500
sine_clean <- spinebil::data_gen("sine", n = n) |> dplyr::mutate(y = sin(x))

# Noise to add on y
eps <- spinebil::noise_gen(n = n, type = "gaussian", level = 0.06, seed = 123)$value
sine_noisy <- sine_clean |> dplyr::mutate(y = y + eps)

dplyr::bind_rows(
  sine_clean |> dplyr::mutate(which = "clean"),
  sine_noisy |> dplyr::mutate(which = "noisy (gaussian, level = 0.06)")
) |>
  ggplot2::ggplot(ggplot2::aes(x, y, color = which)) +
  ggplot2::geom_point(alpha = 0.6, size = 0.6) +
  ggplot2::labs(
    title = "Sine Structure with and without Added Noise",
    color = NULL
  ) +
  ggplot2::theme(
    axis.text = ggplot2::element_blank(),
    axis.title = ggplot2::element_blank(),
    axis.ticks = ggplot2::element_blank()
  )
- `data_gen("polynomial", ...)` returns an n × degree matrix.
- `data_gen(<other type>, ...)` returns a tibble with columns `x`, `y`, and `structure`.
- `noise_gen(...)` returns a tibble with columns `value` (numeric) and `type` (factor/character).

`ppi_mean()` simulates and summarizes a projection pursuit index (PPI) over all 2D variable pairs in a dataset. For each pair, it repeats the index evaluation `n_sim` times (sampling `n_obs` observations each time) and returns the mean index per pair.
res <- ppi_mean(
  data, # data frame with ≥ 2 numeric columns
  index_fun, # function taking n×2 matrix and returning a numeric value
  n_sim = 100,
  n_obs = 300
)
data <- as.data.frame(spinebil::data_gen(type = "polynomial", degree = 2))
res <- spinebil::ppi_mean(data, spinebil::scag_index("stringy"), n_sim = 5)
res
#> # A tibble: 1 × 3
#> var_i var_j mean_index
#> <chr> <chr> <dbl>
#> 1 1 2 1
A tibble with one row per variable pair:

- `var_i`, `var_j`: names of the variables forming the pair.
- `mean_index`: the mean of the index across `n_sim` simulations for that pair.

`ppi_scale()` runs simulations to compare a projection pursuit index (PPI) on:

- the structured data (`sigma = 0`), and
- pure noise (`sigma = 1`).
For every pair of variables and across many simulations, it records the index values for both cases so you can see the scale and separation of the index between structure and noise.
ppi_scale(
  data,
  index_fun,
  n_sim = 100,
  n_obs = 500,
  seed = NULL
)
Inputs:

- `data`: data frame/tibble/matrix (>= 2 numeric columns).
- `index_fun`: function that takes a 2-column matrix and returns a numeric scalar (e.g., `scag_index("stringy")`).
- `n_sim`: number of simulations.
- `n_obs`: number of observations.
- `seed`: optional seed for reproducibility.

Let’s evaluate the `stringy` index from the `cassowaryr` package to see how its scale changes between pure noise and structure.
res <- spinebil::ppi_scale(spinebil::data_gen("polynomial", degree = 3), spinebil::scag_index("stringy"), n_sim = 3)
head(res)
#> # A tibble: 6 × 6
#> simulation var_i var_j var_pair sigma index
#> <int> <chr> <chr> <chr> <dbl> <dbl>
#> 1 1 1 2 1-2 0 1
#> 2 1 1 2 1-2 1 0.732
#> 3 1 1 3 1-3 0 1
#> 4 1 1 3 1-3 1 0.783
#> 5 1 2 3 2-3 0 0.996
#> 6 1 2 3 2-3 1 0.719
Here `sigma = 0` is structured and `sigma = 1` is pure noise.
res |>
  dplyr::mutate(sigma = factor(sigma, levels = c(0, 1), labels = c("structured", "noise"))) |>
  ggplot2::ggplot(ggplot2::aes(x = sigma, y = index, fill = sigma)) +
  ggplot2::geom_boxplot() +
  ggplot2::facet_wrap(~var_pair) +
  ggplot2::labs(
    title = "Index scale on structured vs noisy data",
    x = NULL, y = "index"
  )
`ppi_scale()` returns a tibble with columns:

- `simulation`: simulation index
- `var_i`, `var_j`: the two variable names
- `var_pair`: "var_i-var_j" compact label.
- `sigma`: `0 = structured`, `1 = noise`
  - Structured (`sigma = 0`): compute the index on the actual pair.
  - Noise (`sigma = 1`): generate `n_obs` standard-normal `x`, `y`, scale to mean 0 / sd 1, compute the index.
- `index`: the index value from `index_fun`.

`ppi_noise_threshold()` estimates the 95th percentile of a projection pursuit index (PPI) under synthetic noise. This gives you a null threshold: if an observed index on real data exceeds this value, it may indicate non-random structure.

The function supports different noise types via `noise_gen()` (e.g., Gaussian, t-distributed, Cauchy, microstructure) and lets you set the number of simulations and sample size per simulation.
ppi_noise_threshold(
  index_fun,
  n_sim = 100,
  n_obs = 500,
  noise_type = "gaussian",
  noise_level = 0.01,
  seed = NULL
)
Inputs:

- `index_fun`: a function that returns a single numeric value.
- `n_sim`: number of simulations.
- `n_obs`: sample size per simulation.
- `noise_type` / `noise_level`: passed through to `noise_gen()` to control the distribution and its scale.
- `seed`: for reproducibility.

thr <- spinebil::ppi_noise_threshold(
  index_fun = spinebil::scag_index("stringy"),
  noise_type = "cauchy",
  noise_level = 0.1,
  n_sim = 10,
  n_obs = 100
)
thr
#>       95%
#> 0.7865055
`ppi_noise_threshold()` returns a single numeric: the estimated 95th percentile of the index under the specified noise model.

This section demonstrates how to use the `ppi_samplesize_effect()` function to study how projection pursuit index (PPI) values behave under increasing sample sizes in purely noisy (Gaussian) data. For each size, it computes a specified index function over the samples and records the 95th percentile of index values observed. This helps estimate the distributional behavior of the index under pure noise, depending on sample size. These curves are useful as a null reference: above-curve values may indicate non-random structure.
ppi_samplesize_effect(index_fun, n_sim = 100)

- `index_fun`: a function that accepts the data for one simulation and returns a single numeric index value.
- `n_sim`: the number of simulations per sample size (default 100).

Internally, the function evaluates a grid of sample sizes from 30 up to 2000 (denser at small sizes), simulates `n_sim` draws for each, and returns the 95th percentile of the resulting index values.

Let’s evaluate the `stringy` index from the `cassowaryr` package to see how its values change under noise as the sample size increases.
result <- spinebil::ppi_samplesize_effect(spinebil::scag_index("stringy"), n_sim = 4)
head(result)
#> # A tibble: 6 × 2
#> sample_size percentile95
#> <dbl> <dbl>
#> 1 30 0.816
#> 2 35 0.752
#> 3 40 0.832
#> 4 45 0.758
#> 5 50 0.815
#> 6 55 0.800
tail(result)
#> # A tibble: 6 × 2
#> sample_size percentile95
#> <dbl> <dbl>
#> 1 1875 0.739
#> 2 1900 0.744
#> 3 1925 0.736
#> 4 1950 0.737
#> 5 1975 0.731
#> 6 2000 0.736
ggplot2::ggplot(result, ggplot2::aes(sample_size, percentile95)) +
  ggplot2::geom_point(alpha = 0.7, size = 1) +
  ggplot2::geom_smooth(se = FALSE, method = "loess", formula = y ~ x, span = 0.1) +
  ggplot2::labs(
    x = "Sample size (n)",
    y = "95th percentile of index"
  )
`ppi_samplesize_effect()` returns a tibble:

- `sample_size`: the sample size for that block of simulations
- `percentile95`: the empirical 95th percentile of the index across the `n_sim` simulations at that size.