library(data.table) library(ggplot2) library(magrittr) library(pbapply) large <- fread("ERR260132_genes.csv") #' Sample a rarefied version of a count vector. #' #' @param x A named vector of counts. #' @param n Total number of counts in the new vector. #' @return A named vector with counts for each observed event in the sample. #' @examples #' counts <- 1:100 #' subsample(counts, 10) #' @export subsample <- function(x, n) { sa <- tabulate(sample(1:length(x), n, replace=TRUE, prob=x)) names(sa) <- names(x)[1:length(sa)] return(sa[sa > 0]) } #' Orlitsky's diminishing attenuation estimator (q1/3). #' #' @param data A named vector of counts for which to approximate discrete #' probablities. #' @return A named vector `p` assigning a probability to each event in data. #' Those do not sum up to one since there is also a remaining probability to #' observe a new event given as p(new) = 1 - sum(p). #' @examples #' x <- sample(1:10, 100, replace=T) #' p <- orlitsky(x) #' @export orlitsky <- function(data) { n <- length(data) phi <- c(tabulate(data), 0) cn <- ceiling((n+1)^1/3) new <- max(cn, phi[1] + 1) probs <- (data + 1) * pmax(cn, phi[data + 1] + 1) / pmax(cn, phi[data]) names(probs) <- names(data) return(probs/sum(c(probs, new))) } real <- large[, expected_count] names(real) <- large[, name] #real <- rep(1, 1000) #names(real) <- as.character(1:1000) real <- real/sum(real) error <- function(x, y) { return(median(x / y[names(x)])) } #' Benchmark different probability estimators. #' #' @param real The real discrete distribution from which to generate samples. #' @param n A series of sample sizes. #' @return Nothing. #' @examples #' p <- (1:10)/10 #' benchmark(p) #' @export benchmark <- function(real, n = 10^seq(1, 8, length.out=100)) { samples <- pblapply(10^seq(1, 8, length.out=100), function(k) { x <- subsample(real, k) po <- orlitsky(x) # diminishing attenuation pp <- x/sum(x) # proportions pc <- (x + 1)/(sum(x + 1) + 1) # pseudo counts data.table(n=k, orlitsky=error(po, real), orlitsky_new=1 - sum(po), proportion=error(pp, real), pseudo=error(pc, real)) }) return(rbindlist(samples)) } bench <- benchmark(real) bench_long <- melt(bench, id.vars="n") att_plot <- ggplot(rare_long[!grepl("new", variable)], aes(x=n, y=value, col=variable)) + geom_point() + geom_smooth() + scale_x_log10() + scale_y_log10()