## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", echo = FALSE ) ## ----setup-------------------------------------------------------------------- library(dedupewider) library(kableExtra) library(magrittr) library(data.table) ## ----display-table------------------------------------------------------------ display_tbl <- function(x) { kbl(x) %>% kable_paper(full_width = FALSE) } ## ----------------------------------------------------------------------------- example_table <- data.frame(Tel_1 = c(111, 222, 444, 555), Tel_2 = c(222, 666, 666, 555), Name = paste0("name", 1:4)) display_tbl(example_table) ## ----initial-table-usage------------------------------------------------------ initial_table <- data.frame(tel_1 = c(111, 222, 444, 555), tel_2 = c(222, 666, 666, 555), tel_3 = c(NA, NA, NA, 555), tel_4 = c(NA, NA, NA, 555), tel_5 = c(NA, NA, NA, 555), name = paste0("name", 1:4), nace = c("01.19", "01.64", "55.90", "09.10")) display_tbl(initial_table) ## ----usage-default, echo=TRUE, results='hide'--------------------------------- library(dedupewider) initial_table <- data.frame(tel_1 = c(111, 222, 444, 555), tel_2 = c(222, 666, 666, 555), tel_3 = c(NA, NA, NA, 555), tel_4 = c(NA, NA, NA, 555), tel_5 = c(NA, NA, NA, 555), name = paste0("name", 1:4), nace = c("01.19", "01.64", "55.90", "09.10")) table_deduplicated <- dedupe_wide(initial_table, cols_dedupe = paste0("tel_", 1:5), cols_expand = "name") table_deduplicated ## ----usage-default-display---------------------------------------------------- display_tbl(table_deduplicated) ## ----usage-multiple-expand, echo=TRUE, results='hide'------------------------- table_deduplicated <- dedupe_wide(initial_table, cols_dedupe = paste0("tel_", 1:5), cols_expand = c("name", "nace")) table_deduplicated ## ----usage-multiple-expand-display-------------------------------------------- display_tbl(table_deduplicated) ## ----usage-max-new-cols, echo=TRUE, results='hide'---------------------------- table_deduplicated <- dedupe_wide(initial_table, cols_dedupe = paste0("tel_", 1:5), cols_expand = c("name", "nace"), max_new_cols = 2) table_deduplicated ## ----usage-max-new-cols-display----------------------------------------------- display_tbl(table_deduplicated) ## ----duplicated-unique-na, echo=TRUE, results='hide'-------------------------- duplicated(c(NA, NA)) # returns FALSE TRUE unique(c(NA, NA)) # returns NA (vector of length 1) ## ----dedupe_wide-na-initial-table--------------------------------------------- table_na <- data.frame(tel_1 = c(111, 222, NA, NA), tel_2 = c(222, 111, NA, NA), name = paste0("name", 5:8)) display_tbl(table_na) ## ----dedupe_wide-na----------------------------------------------------------- table_na_dedupe <- dedupe_wide(table_na, cols_dedupe = c("tel_1", "tel_2"), cols_expand = "name") display_tbl(table_na_dedupe) ## ----usage-parallel, echo=TRUE, results='hide'-------------------------------- data.table::setDTthreads() dedupe_wide(initial_table, cols_dedupe = paste0("tel_", 1:5), cols_expand = "name") ## ----initial-table-algorithm-------------------------------------------------- initial_table <- data.frame(tel_1 = c(111, 222, 333, 444, 333, 333, 666, 777, 333, 333, 333, 333), tel_2 = c(777, 666, NA, 666, NA, NA, NA, NA, NA, 777, 888, 777), tel_3 = c(999, NA, NA, 333, NA, NA, NA, NA, NA, 101, 102, 777), tel_4 = c(102, NA, NA, NA, NA, NA, NA, NA, NA, 103, NA, 103), name = paste0("name", 1:12)) display_tbl(initial_table) ## ----gather-cols-------------------------------------------------------------- x <- data.frame(tel_1 = c(111, 222, 333, 444, 333, 333, 666, 777, 333, 333, 333, 333), tel_2 = c(777, 666, NA, 666, NA, NA, NA, NA, NA, 777, 888, 777), tel_3 = c(999, NA, NA, 333, NA, NA, NA, NA, NA, 101, 102, 777), tel_4 = c(102, NA, NA, NA, NA, NA, NA, NA, NA, 103, NA, 103), name = paste0("name", 1:12)) cols_dedupe <- paste0("tel_", 1:4) setDT(x) x[, ....idx := .I] x_tmp <- suppressWarnings(melt.data.table(x, id.vars = "....idx", measure.vars = cols_dedupe, na.rm = TRUE)) x_tmp[, variable := NULL] display_tbl(head(x_tmp)) ## ----group-by-value----------------------------------------------------------- x_tmp[, V1 := list(list(....idx)), by = "value"][, ....idx := NULL] x_tmp <- unique(x_tmp, by = "value") x_tmp[, `:=`(filter_col = lengths(V1), value = NULL)] x_tmp <- x_tmp[filter_col > 1L] x_tmp[, V1 := lapply(V1, unique)][, `:=`(main_index = unlist(lapply(V1, min), use.names = FALSE), filter_col = lengths(V1))] display_tbl(x_tmp[, c("V1", "main_index")]) ## ----make-pairs-indexes------------------------------------------------------- indexes_more_than_one_occurrence <- x_tmp indexes_more_than_one_occurrence_vector <- indexes_more_than_one_occurrence[, list(indexes = unique(sort(unlist(V1, use.names = FALSE), decreasing = TRUE)))][["indexes"]] indexes_more_than_one_occurrence[, V1 := lapply(V1, function(x) x[-which.min(x)])] indexes_more_than_one_occurrence <- indexes_more_than_one_occurrence[, list(rest_indexes = unlist(V1, use.names = FALSE)), by = main_index] setorder(indexes_more_than_one_occurrence, -rest_indexes, -main_index) indexes_more_than_one_occurrence <- unique(indexes_more_than_one_occurrence) display_tbl(indexes_more_than_one_occurrence) ## ----example-12--------------------------------------------------------------- display_tbl(data.frame(main_index = c(10, 3, 1), rest_indexes = c(12, 10, 3))) ## ----------------------------------------------------------------------------- cascade_indexes <- function(indexes_more_than_one_occurrence_vector, indexes_more_than_one_occurrence) { rest_indexes <- main_index <- filter <- NULL for (i in indexes_more_than_one_occurrence_vector) { which <- which(indexes_more_than_one_occurrence[["rest_indexes"]] == i) value <- shift(indexes_more_than_one_occurrence[which][["main_index"]], fill = i) set(indexes_more_than_one_occurrence, which, "rest_indexes", value) setorder(indexes_more_than_one_occurrence, -rest_indexes, -main_index) } indexes_more_than_one_occurrence[, filter := fifelse(main_index == rest_indexes, FALSE, TRUE)] indexes_more_than_one_occurrence <- indexes_more_than_one_occurrence[filter == TRUE][, filter := NULL] indexes_more_than_one_occurrence <- unique(indexes_more_than_one_occurrence, by = "rest_indexes") indexes_more_than_one_occurrence } indexes_more_than_one_occurrence <- cascade_indexes(indexes_more_than_one_occurrence_vector, indexes_more_than_one_occurrence) display_tbl(indexes_more_than_one_occurrence)