## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----setup--------------------------------------------------------------------
library(tlda)

## ----fig.height=1, fig.width=3.5, fig.align='center', echo=FALSE--------------
# Schematic: a horizontal interval from "Min" to "Max" with a tick mark in
# between. The grey row above the interval is labelled "Dispersion scores"
# (.01, .05, .11); the black row below is labelled "Frequency adjustment"
# (0, .40, 1).

# Set `mar` and `xpd` in a single par() call so that BOTH previous values are
# captured and restored by par(oldpar) at the end of the chunk.
oldpar <- par(mar = c(0, 0, 0, 0), xpd = TRUE)

# Empty canvas; only the axis limits matter here.
plot(
  c(1, 2), c(1, 1),
  xlim = c(-.2, 2.1), ylim = c(.7, 1.6),
  axes = FALSE, type = "n"
)

# Interval with flat end caps (double-headed arrow, 90-degree heads).
arrows(x0 = 1, x1 = 2, y0 = 1, y1 = 1, code = 3, angle = 90, length = .05)
points(x = 1.4, y = 1, pch = "|")

# Row below the interval.
text(x = c(1, 2, 1.4), y = .8, labels = c("0", "1", ".40"), cex = .8)

# Row above the interval.
text(
  x = c(1, 2, 1.4), y = 1.2, labels = c(".01", ".11", ".05"),
  col = "grey40", cex = .8
)

# Right-aligned row captions and column headings.
text(
  x = .7, y = 1.19, labels = "Dispersion scores",
  adj = 1, col = "grey40", cex = .8
)
text(x = .7, y = .79, labels = "Frequency adjustment", adj = 1, cex = .8)
text(x = 1:2, y = 1.5, labels = c("Min", "Max"), cex = .8)

par(oldpar)

## ----echo=FALSE---------------------------------------------------------------
# Illustrative corpus: 30 parts, with the size of each part and the number of
# occurrences of the item in each part.
partsizes <- c(
  30, 28, 25, 22, 20, 17, 16, 15, 15, 15,
  14, 14, 14, 13, 13, 12, 12, 11, 11, 10,
  10, 10,  9,  9,  8,  7,  5,  4,  3,  2)

subfreqs <- c(
  8, 3, 7, 2, 5, 3, 1, 4, 3, 0,
  2, 0, 2, 4, 1, 0, 5, 1, 0, 1,
  2, 0, 3, 1, 2, 0, 1, 0, 0, 0)

# Build a character matrix of colours with `n_slots` rows and one column per
# element of `counts`: in column j the first `counts[j]` cells are set to
# `colour`, every other cell is "transparent". A count of 0 leaves the whole
# column transparent.
cell_colours <- function(counts, n_slots, colour) {
  is_marked <- outer(seq_len(n_slots), counts, FUN = "<=")
  ifelse(is_marked, colour, "transparent")
}

# Draw a grid of squares (pch = 22), one column per corpus part. `outline` and
# `fill` are colour matrices of identical dimensions giving the border and the
# background colour of every square. Graphical parameters are restored on exit.
draw_corpus_parts <- function(outline, fill) {
  oldpar <- par(mar = c(2, 0, 0, 0), xpd = TRUE)
  on.exit(par(oldpar), add = TRUE)

  # Coordinates are generated column by column, which matches the
  # column-major order in which the colour matrices are read.
  plot(
    x = rep(seq_len(ncol(fill)), each = nrow(fill)),
    y = rep(seq_len(nrow(fill)), times = ncol(fill)),
    col = outline, pch = 22, bg = fill,
    axes = FALSE, xlab = "", ylab = NA,
    asp = .6, cex = .8, lwd = .5
  )
  text(x = 15.5, y = -3, labels = "Corpus parts (texts)", cex = .8)

  invisible(NULL)
}

## ----fig.width=3.1, fig.height=2.35, fig.align='center', echo=FALSE, fig.cap="Illustrative set of data."----
# Black outlines show the size of each part, grey fill the observed
# occurrences.
plot_col <- cell_colours(partsizes, max(partsizes), "black")
plot_fill <- cell_colours(subfreqs, max(partsizes), "grey40")

draw_corpus_parts(plot_col, plot_fill)

## ----fig.width=3.1, fig.height=2.35, fig.align='center', echo=FALSE, fig.cap="A **minimally pervasive** distribution."----
subfreq_min_disp <- find_min_disp(
  subfreq = subfreqs,
  partsize = partsizes,
  freq_adjust_method = "pervasive")

plot_fill_min <- cell_colours(subfreq_min_disp, max(partsizes), "grey40")

draw_corpus_parts(plot_col, plot_fill_min)

## ----fig.width=3.1, fig.height=2.35, fig.align='center', echo=FALSE, fig.cap="A **maximally pervasive** distribution."----
subfreq_max_disp <- find_max_disp(
  subfreq = subfreqs,
  partsize = partsizes,
  freq_adjust_method = "pervasive")

plot_fill_max <- cell_colours(subfreq_max_disp, max(partsizes), "grey40")

draw_corpus_parts(plot_col, plot_fill_max)

## ----fig.width=3.1, fig.height=2.35, fig.align='center', echo=FALSE, fig.cap="A **minimally even** distribution."----
subfreq_min_disp <- find_min_disp(
  subfreq = subfreqs,
  partsize = partsizes,
  freq_adjust_method = "even")

# Move the occurrences assigned to part 20 over to part 22. Both parts have
# the same size (10). NOTE(review): presumably done for display purposes only
# -- confirm.
subfreq_min_disp[22] <- subfreq_min_disp[20]
subfreq_min_disp[20] <- 0

plot_fill_min <- cell_colours(subfreq_min_disp, max(partsizes), "grey40")

draw_corpus_parts(plot_col, plot_fill_min)

## ----fig.width=3.1, fig.height=2.35, fig.align='center', echo=FALSE, fig.cap="A **maximally even** distribution."----
subfreq_max_disp <- find_max_disp(
  subfreq = subfreqs,
  partsize = partsizes,
  freq_adjust_method = "even")

plot_fill_max <- cell_colours(subfreq_max_disp, max(partsizes), "grey40")

draw_corpus_parts(plot_col, plot_fill_max)

## ----fig.width=4, fig.height=2.5, warning=FALSE, message=FALSE, out.width="30%"----
# Histogram of the first row of the term-document matrix, plotted as the size
# of the corpus parts.
oldpar <- par(mar = c(4, 4, 1, 0.3), xpd = TRUE)

hist(
  biber150_spokenBNC1994[1, ],
  main = NULL,
  xlab = "Size of corpus parts (speakers)",
  breaks = seq(0, 70000, length.out = 40),
  col = "grey60")

par(oldpar)
## -----------------------------------------------------------------------------
# Frequency-adjusted dispersion scores for every row of the term-document
# matrix, once per adjustment method. The first row supplies the part sizes
# (row_partsize = "first"), and scores are not forced into the unit interval
# (unit_interval = FALSE).
DM_even <- disp_tdm(
  biber150_spokenBNC1994,
  row_partsize = "first",
  freq_adjust = TRUE,
  freq_adjust_method = "even",
  unit_interval = FALSE,
  print_score = FALSE,
  verbose = FALSE,
  suppress_warning = TRUE
)

DM_pervasive <- disp_tdm(
  biber150_spokenBNC1994,
  row_partsize = "first",
  freq_adjust = TRUE,
  freq_adjust_method = "pervasive",
  unit_interval = FALSE,
  print_score = FALSE,
  verbose = FALSE,
  suppress_warning = TRUE
)

## -----------------------------------------------------------------------------
# Smallest and largest score per column, ignoring missing values.
round(apply(DM_even, MARGIN = 2, FUN = range, na.rm = TRUE), digits = 2)

## -----------------------------------------------------------------------------
# Number of non-missing scores that lie outside the interval [0, 1].
count_outside_unit <- function(scores) {
  sum(scores < 0 | scores > 1, na.rm = TRUE)
}

apply(DM_even, MARGIN = 2, FUN = count_outside_unit)

## -----------------------------------------------------------------------------
round(apply(DM_pervasive, MARGIN = 2, FUN = range, na.rm = TRUE), digits = 2)

## -----------------------------------------------------------------------------
apply(
  DM_pervasive,
  MARGIN = 2,
  FUN = function(scores) {
    sum(scores < 0 | scores > 1, na.rm = TRUE)
  }
)