## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
library(pipetime)
library(dplyr)
library(stringr)
library(ggplot2)

## -----------------------------------------------------------------------------
# Build a reproducible synthetic data set of random e-mail addresses, phone
# numbers and free-text strings used by both workflows below.
set.seed(123)

# Number of rows in the synthetic data set.
n_rows <- 1e5

# Return a single string made of `n` letters drawn at random (with
# replacement) from `letters`.
make_str <- function(n) paste(sample(letters, n, TRUE), collapse = "")

# NOTE: the order of the sample()/make_str() calls below determines the random
# stream; keep it unchanged so results stay reproducible for the seed above.
text_data <- data.frame(
  id = seq_len(n_rows),
  email = paste0(
    # vapply() guarantees a character vector regardless of input length.
    vapply(sample(5:15, n_rows, replace = TRUE), make_str, character(1)),
    sample(
      c("@gmail.com", "@yahoo.com", "@hotmail.com"),
      n_rows,
      replace = TRUE
    )
  ),
  phone = paste0(
    "(",
    sample(100:999, n_rows, replace = TRUE),
    ") ",
    sample(100:999, n_rows, replace = TRUE),
    "-",
    sample(1000:9999, n_rows, replace = TRUE)
  ),
  text = vapply(sample(20:100, n_rows, replace = TRUE), make_str, character(1))
)

head(text_data, n = 3)

## -----------------------------------------------------------------------------
library(dplyr)
library(pipetime)

# Keep timings out of the console; they are read back from the logs later.
options(pipetime.console = FALSE)

# Workflow A: Base R
# Each time_pipe() call records a checkpoint in the "base" log.
wf_A <- text_data |>
  mutate(
    domain = sub(".*@", "", email),
    clean_phone = gsub("[^0-9]", "", phone),
    word_count = lengths(strsplit(text, " "))
  ) |>
  time_pipe("extract & clean", log = "base") |>
  filter(grepl("^[a-m]", text)) |>
  time_pipe("filter", log = "base") |>
  mutate(
    text_upper = toupper(text),
    truncated = substr(text, 1, 50)
  ) |>
  time_pipe("transform", log = "base")

# Workflow B: stringr (optimized)
# Same three stages as workflow A, recorded in the "stringr" log.
wf_B <- text_data |>
  mutate(
    domain = str_extract(email, "(?<=@).*"),
    clean_phone = str_remove_all(phone, "[^0-9]"),
    word_count = str_count(text, "\\S+")
  ) |>
  time_pipe("extract & clean", log = "stringr") |>
  filter(str_detect(text, "^[a-m]")) |>
  time_pipe("filter", log = "stringr") |>
  mutate(
    text_upper = str_to_upper(text),
    truncated = str_sub(text, 1, 50)
  ) |>
  time_pipe("transform", log = "stringr")

## ----dpi = 500----------------------------------------------------------------
# Collect both logs into one table, tagging each row with the log it came from.
logs <- get_log() |>
  bind_rows(.id = "workflow") |>
  group_by(workflow) |>
  # Add a starting point (duration 0) as the first row of every workflow.
  group_modify(
    \(rows, key) add_row(rows, duration = 0, label = "start", .before = 1)
  ) |>
  # row_number() is evaluated per workflow, so steps restart at 1 in each group.
  mutate(step = factor(row_number())) |>
  # Drop the grouping so later operations on `logs` are not silently per-group.
  ungroup()

library(ggplot2)

# One line per workflow, with each checkpoint labelled by its step name.
ggplot(
  logs,
  aes(
    x = step,
    y = duration,
    colour = workflow,
    group = workflow
  )
) +
  geom_line(linewidth = 1) +
  geom_point(size = 3) +
  geom_text(aes(label = label), vjust = -0.7, size = 3.5, show.legend = FALSE) +
  labs(
    x = "Step",
    y = "Cumulative time (sec)",
    title = "Base R vs stringr",
    colour = "Workflow"
  ) +
  theme_classic()