---
title: "Privacy and validation"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Privacy and validation}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r setup, include=FALSE}
# Chunk defaults for the whole vignette: collapsed output prefixed with "#>",
# with messages and warnings hidden. The seed fixes the random example data.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  message = FALSE,
  warning = FALSE
)
set.seed(2)
```

## What the function does (Overview)

`generate_fake_with_privacy()` creates a synthetic copy of your data. It then handles sensitive columns by name.

### Level presets

| level  | category_mode | column_mode | numeric_mode |
|-------:|:--------------|:------------|:-------------|
| low    | preserve      | keep        | range        |
| medium | generic       | generic     | range        |
| high   | generic       | generic     | distribution |

- `sensitive_detect` auto-finds common PII by column name.
- `sensitive_strategy` chooses how to treat those columns: `"fake"` (tokenize) or `"drop"` (remove).
- You can also list sensitive columns yourself with `sensitive = c("id","email", ...)`.

## Levels and strategies

```{r}
library(FakeDataR)

# Toy data set: three identifier-like columns (id, email, phone) plus one
# categorical column and one numeric column.
n_rows <- 50L
row_ids <- seq_len(n_rows)

df <- data.frame(
  id = row_ids,
  email = sprintf("u%02d@x.com", row_ids),
  phone = sprintf("555-01%02d", row_ids),
  dept = sample(c("A", "B", "C"), size = n_rows, replace = TRUE),
  spend = round(runif(n = n_rows, min = 10, max = 200), digits = 2),
  check.names = FALSE
)

# Level "low": sensitive columns are detected from their names and replaced
# with fake values (strategy "fake", the default).
fake_low <- generate_fake_with_privacy(
  data = df,
  n = 60,
  level = "low",
  seed = 1,
  sensitive_detect = TRUE,
  sensitive_strategy = "fake",
  normalize = TRUE
)

# Level "medium": the same detection, but the sensitive columns are removed
# (strategy "drop").
fake_drop <- generate_fake_with_privacy(
  data = df,
  n = 60,
  level = "medium",
  seed = 1,
  sensitive_detect = TRUE,
  sensitive_strategy = "drop",
  normalize = TRUE
)

# Compare the resulting column sets.
names(fake_low)
names(fake_drop)

# Privacy metadata attached to the results as attributes.
attr(fake_low, "sensitive_columns")
attr(fake_drop, "dropped_columns")
attr(fake_low, "name_map")
```

## Explicit 'sensitive' vs auto-detect

You can fully control what’s sensitive.
Here we turn off auto-detect and list columns ourselves:

```{r}
# Auto-detection is disabled; only the columns named in `sensitive` are
# treated as sensitive.
fake_explicit <- generate_fake_with_privacy(
  data = df,
  n = 60,
  seed = 1,
  sensitive = c("id", "email", "phone"),
  sensitive_detect = FALSE,
  sensitive_strategy = "fake",
  normalize = TRUE
)

names(fake_explicit)
attr(fake_explicit, "sensitive_columns")
```

## Extending detection with your own patterns

```{r}
# A broad, configurable pattern set.
# NOTE: apart from "^id$", the alternatives are unanchored, so they match as
# substrings of a column name. Short ones such as "ip", "pin", "sin" or
# "state" will also flag names like "shipping" or "business"; review the
# matches (printed below) and tighten the patterns for your own data.
sensitive_patterns <- c(
  # direct IDs / names
  "^id$", "employee[_-]?id", "user(name|[_-]?id)?$",
  "full[_-]?name", "first[_-]?name", "last[_-]?name",
  # contact
  "email|e-mail", "phone|tel|mobile", "fax",
  # address / geo
  "address|street|road|avenue|apt|unit|suite|zip|postal|postcode|city|state|province|country",
  "lat(itude)?|lon(gitude)?|gps",
  # government IDs (international sampling)
  "RegId|ssn|sin|nin|aadhaar|aadhar|bvn|curp|dni|ced(ul)?a|cpf|pan\\b|tin\\b|ein\\b|pesel",
  # licenses / travel docs
  "passport|visa|license|licence|driver|dl\\b|vin|plate",
  # finance / payments
  "iban|swift|bic|routing|sort[_-]?code|account|acct|bank",
  "credit|debit|card|cvv|cvc|pan[_-]?number",
  # auth / secrets / device
  "password|pass|pwd|pin|otp|secret|token|api[_-]?key|auth|bearer|session|cookie",
  "ip(_address)?|mac(_address)?|imei|imsi|serial|device|udid|android[_-]?id|idfa|gaid",
  # medical / patient
  "mrn|nhs|medicare|medicaid|patient|diagnosis",
  # birthdays
  "dob|date[_-]?of[_-]?birth|birth(day|date)",
  # education
  "student[_-]?id"
)

# Combine everything into a single alternation. Case-insensitivity is
# requested through `ignore.case = TRUE`, which is documented for grepl()'s
# default regex engine, rather than through an inline "(?i)" flag.
rx <- paste0("(", paste(sensitive_patterns, collapse = "|"), ")")
sens_cols <- names(df)[grepl(rx, names(df), ignore.case = TRUE)]
sens_cols

# Pass the matched columns explicitly; "email" is always included, and
# unique() removes it again if the patterns already matched it.
fake_custom_detect <- generate_fake_with_privacy(
  data = df,
  n = 60,
  seed = 1,
  sensitive = unique(c(sens_cols, "email")),
  sensitive_detect = FALSE,
  sensitive_strategy = "fake",
  normalize = TRUE
)

attr(fake_custom_detect, "sensitive_columns")
```

## Validation

```{r}
# Run the package's validation on the original data and the low-privacy
# synthetic copy, then show the first rows of the result.
v1 <- validate_fake(df, fake_low)
head(v1, 5)
```