## ----echo = FALSE------------------------------------------------------------- knitr::opts_chunk$set(collapse = TRUE, comment = "##") ## ----eval=TRUE, message = FALSE----------------------------------------------- # Load readtext package library("readtext") ## ----------------------------------------------------------------------------- # Get the data directory from readtext DATA_DIR <- system.file("extdata/", package = "readtext") ## ----------------------------------------------------------------------------- # Read in all files from a folder readtext(paste0(DATA_DIR, "/txt/UDHR/*")) ## ----------------------------------------------------------------------------- # Manifestos with docvars from filenames readtext(paste0(DATA_DIR, "/txt/EU_manifestos/*.txt"), docvarsfrom = "filenames", docvarnames = c("unit", "context", "year", "language", "party"), dvsep = "_", encoding = "ISO-8859-1") ## ----------------------------------------------------------------------------- # Recurse through subdirectories readtext(paste0(DATA_DIR, "/txt/movie_reviews/*")) ## ----------------------------------------------------------------------------- # Read in comma-separated values readtext(paste0(DATA_DIR, "/csv/inaugCorpus.csv"), text_field = "texts") ## ----------------------------------------------------------------------------- # Read in tab-separated values readtext(paste0(DATA_DIR, "/tsv/dailsample.tsv"), text_field = "speech") ## ----------------------------------------------------------------------------- ## Read in JSON data readtext(paste0(DATA_DIR, "/json/inaugural_sample.json"), text_field = "texts") ## ----------------------------------------------------------------------------- ## Read in Universal Declaration of Human Rights pdf files (rt_pdf <- readtext(paste0(DATA_DIR, "/pdf/UDHR/*.pdf"), docvarsfrom = "filenames", docvarnames = c("document", "language"), sep = "_")) ## ----------------------------------------------------------------------------- ## Read in Word data (.docx) readtext(paste0(DATA_DIR, "/word/*.docx")) ## ----------------------------------------------------------------------------- # Note: Example required: which URL should we use? ## ----------------------------------------------------------------------------- # Note: Archive file required. The only zip archive included in readtext has # different encodings and is difficult to import (see section 4.2). ## ----------------------------------------------------------------------------- if (require("quanteda")) { # read in comma-separated values with readtext rt_csv <- readtext(paste0(DATA_DIR, "/csv/inaugCorpus.csv"), text_field = "texts") # create quanteda corpus corpus_csv <- corpus(rt_csv) summary(corpus_csv, 5) } ## ----message = FALSE---------------------------------------------------------- # Load stringi package require("stringi") ## ----------------------------------------------------------------------------- # Make some text with page numbers sample_text_a <- "The quick brown fox named Seamus jumps over the lazy dog also named Seamus, page 1 with the newspaper from a boy named quick Seamus, in his mouth. page 2 The quicker brown fox jumped over 2 lazy dogs." sample_text_a # Remove "page" and respective digit sample_text_a2 <- unlist(stri_split_fixed(sample_text_a, '\n'), use.names = FALSE) sample_text_a2 <- stri_replace_all_regex(sample_text_a2, "page \\d*", "") sample_text_a2 <- stri_trim_both(sample_text_a2) sample_text_a2 <- sample_text_a2[sample_text_a2 != ''] stri_paste(sample_text_a2, collapse = '\n') ## ----------------------------------------------------------------------------- sample_text_b <- "The quick brown fox named Seamus - 1 - jumps over the lazy dog also named Seamus, with - 2 - the newspaper from a boy named quick Seamus, in his mouth. - 33 - The quicker brown fox jumped over 2 lazy dogs." sample_text_b sample_text_b2 <- unlist(stri_split_fixed(sample_text_b, '\n'), use.names = FALSE) sample_text_b2 <- stri_replace_all_regex(sample_text_b2, "[-] \\d* [-]", "") sample_text_b2 <- stri_trim_both(sample_text_b2) sample_text_b2 <- sample_text_b2[sample_text_b2 != ''] stri_paste(sample_text_b2, collapse = '\n') ## ----------------------------------------------------------------------------- # create a temporary directory to extract the .zip file FILEDIR <- tempdir() # unzip file unzip(system.file("extdata", "data_files_encodedtexts.zip", package = "readtext"), exdir = FILEDIR) ## ----------------------------------------------------------------------------- # get encoding from filename filenames <- list.files(FILEDIR, "^(Indian|UDHR_).*\\.txt$") head(filenames) # Strip the extension filenames <- gsub(".txt$", "", filenames) parts <- strsplit(filenames, "_") fileencodings <- sapply(parts, "[", 3) head(fileencodings) # Check whether certain file encodings are not supported notAvailableIndex <- which(!(fileencodings %in% iconvlist())) fileencodings[notAvailableIndex] ## ----------------------------------------------------------------------------- txts <- readtext(paste0(DATA_DIR, "/data_files_encodedtexts.zip"), encoding = fileencodings, docvarsfrom = "filenames", docvarnames = c("document", "language", "input_encoding")) print(txts, n = 50) ## ----------------------------------------------------------------------------- if (require("quanteda")) { corpus_txts <- corpus(txts) summary(corpus_txts, 5) }