Sampling parameters and simulating data

Christian Bartels
Allschwil, June-2014

Example from

Efficient generic integration algorithm to determine confidence intervals and p-values for hypothesis testing. Christian Bartels. figshare.

http://dx.doi.org/10.6084/m9.figshare.1054694

Introduction

Setup

# clean up
rm(list = ls())

# print out version information, path, date
R.Version()
## $platform
## [1] "x86_64-w64-mingw32"
## 
## $arch
## [1] "x86_64"
## 
## $os
## [1] "mingw32"
## 
## $system
## [1] "x86_64, mingw32"
## 
## $status
## [1] ""
## 
## $major
## [1] "3"
## 
## $minor
## [1] "1.0"
## 
## $year
## [1] "2014"
## 
## $month
## [1] "04"
## 
## $day
## [1] "10"
## 
## $`svn rev`
## [1] "65387"
## 
## $language
## [1] "R"
## 
## $version.string
## [1] "R version 3.1.0 (2014-04-10)"
## 
## $nickname
## [1] "Spring Dance"
date()
## [1] "Thu Jun 12 18:34:16 2014"
Sys.info()
##                      sysname                      release 
##                    "Windows"                      "7 x64" 
##                      version                     nodename 
## "build 7601, Service Pack 1"               "CHRISTIAN-PC" 
##                      machine                        login 
##                     "x86-64"                  "Christian" 
##                         user               effective_user 
##                  "Christian"                  "Christian"

# get libraries
library(rstan)
## Loading required package: Rcpp
## Loading required package: inline
## 
## Attaching package: 'inline'
## 
## Das folgende Objekt ist maskiert from 'package:Rcpp':
## 
##     registerPlugin
## 
## rstan (Version 2.2.0, packaged: 2014-02-14 04:29:17 UTC, GitRev: 52d7b230aaa0)
library(ggplot2)
library(data.table)


set_cppo("fast")
## mode fast without NDEBUG for compiling C++ code has been set already

Define functions

# my likelihood function x ... observations or samples of observations col:
# repeated measures (evaluated with the same parameter) row: different
# samples (evaluate with different parameters) par ... data.frame of
# parameters col: different parameters, e.g., mean and sd row: samples
# return vector of log-likelihood of samples
my.ll <- function(x, par) {
    return(rowSums(dnbinom(x, size = par$size, prob = par$prob, log = TRUE)))
}

# my maximum likelihood estimator - use numerical optimization (slow!)
my.opt <- function(parc, x, fix = data.frame(size = NA, prob = NA)) {
    parf <- fix
    parf[, is.na(parf[1, ])] <- parc
    return(-my.ll(x, parf))
}
my.maxll <- function(x) {
    df1 <- do.call(rbind, apply(x, 1, function(x1) {
        n <- length(x1)
        op <- optim(c(0.08, 0.06), my.opt, x = matrix(x1, nrow = 1), control = list(maxit = 100))
        dft <- data.frame(size = op$par[1], prob = op$par[2], mll = -op$value)  # max. likelihood parameters
        return(dft)
    }))
    return(df1)
}

# my log prior par ... data.frame of parameters col: different parameters,
# e.g., mean and sd row: samples return vector of log-priors of samples
my.prior <- function(par) {
    return(0)
}

# my posterior
my.post <- function(x, par) return(my.ll(x, par) + my.prior(par))

# my simulation function par ... data.frame of parameters col: different
# parameters, e.g., mean and sd row: samples n ... number of repeated
# measures
my.sim <- function(n, par) {
    m <- nrow(par)
    return(matrix(rnbinom(n * m, size = par$size, prob = par$prob), ncol = n))
}

Generate data to be analyzed

# make reproducible
set.seed(2)

# generate observed data - series of zeros
n = 10  # number of repeated observations
x <- matrix(c(10, 1, rep(0, n - 2)), ncol = n)  # observed data

Parameter sampling

Use Stan to sample “reasonable” parameters

# list model
modelSample = "mcmc6.model"  # model to sample parameters
cbind(system(paste("cat ", modelSample, sep = ""), intern = TRUE))
##       [,1]                                                     
##  [1,] "data {"                                                 
##  [2,] "  int<lower=0> N;"                                      
##  [3,] "  int x[N];"                                            
##  [4,] "} "                                                     
##  [5,] "parameters {"                                           
##  [6,] "  real<lower=1E-4,upper=1E4> alpha;"                    
##  [7,] "  real<lower=1E-4,upper=1E4> beta;"                     
##  [8,] "} "                                                     
##  [9,] "model {"                                                
## [10,] "    x  ~ neg_binomial(alpha, beta);  // fit"            
## [11,] "    increment_log_prob(-log(alpha)-log(beta)); // prior"
## [12,] "} "                                                     
## [13,] "generated quantities { "                                
## [14,] "  real<lower=0,upper=1> prob;"                          
## [15,] "  prob <- beta/(1+beta);"                               
## [16,] "}"                                                      
## [17,] ""

# compile model
parSampler <- stan_model(modelSample, verbose = FALSE)
## 
## TRANSLATING MODEL 'mcmc6' FROM Stan CODE TO C++ CODE NOW.
## COMPILING THE C++ CODE FOR MODEL 'mcmc6' NOW.
## cygwin warning:
##   MS-DOS style path detected: C:/PROGRA~1/R/R-31~1.0/etc/x64/Makeconf
##   Preferred POSIX equivalent is: /cygdrive/c/PROGRA~1/R/R-31~1.0/etc/x64/Makeconf
##   CYGWIN environment variable option "nodosfilewarning" turns off this warning.
##   Consult the user's guide for more details about POSIX paths:
##     http://cygwin.com/cygwin-ug-net/using.html#using-pathnames
## C:/Program Files/R/R-3.1.0/library/rstan/include//stansrc/stan/agrad/rev/var_stack.hpp:49:17: warning: 'void stan::agrad::free_memory()' defined but not used [-Wunused-function]
## C:/Program Files/R/R-3.1.0/library/rstan/include//stansrc/stan/agrad/rev/chainable.hpp:87:17: warning: 'void stan::agrad::set_zero_all_adjoints()' defined but not used [-Wunused-function]

# call Stan
dat <- list(N = n, x = c(x))
parSample <- sampling(parSampler, data = dat, init = 0, iter = 8000, thin = 4, 
    seed = 123)
## SAMPLING FOR MODEL 'mcmc6' NOW (CHAIN 1).
## 
Iteration:    1 / 8000 [  0%]  (Warmup)
Iteration:  800 / 8000 [ 10%]  (Warmup)
Iteration: 1600 / 8000 [ 20%]  (Warmup)
Iteration: 2400 / 8000 [ 30%]  (Warmup)
Iteration: 3200 / 8000 [ 40%]  (Warmup)
Iteration: 4000 / 8000 [ 50%]  (Warmup)
Iteration: 4800 / 8000 [ 60%]  (Sampling)
Iteration: 5600 / 8000 [ 70%]  (Sampling)
Iteration: 6400 / 8000 [ 80%]  (Sampling)
Iteration: 7200 / 8000 [ 90%]  (Sampling)
Iteration: 8000 / 8000 [100%]  (Sampling)
## Elapsed Time: 0.109 seconds (Warm-up)
##               0.091 seconds (Sampling)
##               0.2 seconds (Total)
## 
## SAMPLING FOR MODEL 'mcmc6' NOW (CHAIN 2).
## 
Iteration:    1 / 8000 [  0%]  (Warmup)
Iteration:  800 / 8000 [ 10%]  (Warmup)
Iteration: 1600 / 8000 [ 20%]  (Warmup)
Iteration: 2400 / 8000 [ 30%]  (Warmup)
Iteration: 3200 / 8000 [ 40%]  (Warmup)
Iteration: 4000 / 8000 [ 50%]  (Warmup)
Iteration: 4800 / 8000 [ 60%]  (Sampling)
Iteration: 5600 / 8000 [ 70%]  (Sampling)
Iteration: 6400 / 8000 [ 80%]  (Sampling)
Iteration: 7200 / 8000 [ 90%]  (Sampling)
Iteration: 8000 / 8000 [100%]  (Sampling)
## Elapsed Time: 0.107 seconds (Warm-up)
##               0.117 seconds (Sampling)
##               0.224 seconds (Total)
## 
## SAMPLING FOR MODEL 'mcmc6' NOW (CHAIN 3).
## 
Iteration:    1 / 8000 [  0%]  (Warmup)
Iteration:  800 / 8000 [ 10%]  (Warmup)
Iteration: 1600 / 8000 [ 20%]  (Warmup)
Iteration: 2400 / 8000 [ 30%]  (Warmup)
Iteration: 3200 / 8000 [ 40%]  (Warmup)
Iteration: 4000 / 8000 [ 50%]  (Warmup)
Iteration: 4800 / 8000 [ 60%]  (Sampling)
Iteration: 5600 / 8000 [ 70%]  (Sampling)
Iteration: 6400 / 8000 [ 80%]  (Sampling)
Iteration: 7200 / 8000 [ 90%]  (Sampling)
Iteration: 8000 / 8000 [100%]  (Sampling)
## Elapsed Time: 0.109 seconds (Warm-up)
##               0.121 seconds (Sampling)
##               0.23 seconds (Total)
## 
## SAMPLING FOR MODEL 'mcmc6' NOW (CHAIN 4).
## 
Iteration:    1 / 8000 [  0%]  (Warmup)
Iteration:  800 / 8000 [ 10%]  (Warmup)
Iteration: 1600 / 8000 [ 20%]  (Warmup)
Iteration: 2400 / 8000 [ 30%]  (Warmup)
Iteration: 3200 / 8000 [ 40%]  (Warmup)
Iteration: 4000 / 8000 [ 50%]  (Warmup)
Iteration: 4800 / 8000 [ 60%]  (Sampling)
Iteration: 5600 / 8000 [ 70%]  (Sampling)
Iteration: 6400 / 8000 [ 80%]  (Sampling)
Iteration: 7200 / 8000 [ 90%]  (Sampling)
Iteration: 8000 / 8000 [100%]  (Sampling)
## Elapsed Time: 0.108 seconds (Warm-up)
##               0.097 seconds (Sampling)
##               0.205 seconds (Total)

# output from stan
print(parSample, digits = 3)
## Inference for Stan model: mcmc6.
## 4 chains, each with iter=8000; warmup=4000; thin=4; 
## post-warmup draws per chain=1000, total post-warmup draws=4000.
## 
##          mean se_mean    sd    2.5%     25%     50%     75%   97.5% n_eff
## alpha   0.078   0.002 0.088   0.006   0.026   0.050   0.097   0.313  1365
## beta    0.080   0.003 0.127   0.000   0.006   0.032   0.100   0.419  1966
## prob    0.064   0.002 0.084   0.000   0.006   0.031   0.091   0.296  2021
## lp__  -11.202   0.021 1.056 -13.939 -11.660 -10.920 -10.436 -10.088  2446
##        Rhat
## alpha 1.004
## beta  1.003
## prob  1.003
## lp__  1.001
## 
## Samples were drawn using NUTS(diag_e) at Thu Jun 12 18:35:27 2014.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at 
## convergence, Rhat=1).

plot(parSample)

plot of chunk unnamed-chunk-4

traceplot(parSample)

plot of chunk unnamed-chunk-4


# get samples of parameters (representing integral)
par <- as.data.table(as.data.frame(parSample))

# rename col-names to match rest of code
setnames(par, c("alpha", "prob"), c("size", "prob"))

# add identifier & key
par[, `:=`(ipar, 1:nrow(par))]
##          size      beta      prob   lp__ ipar
##    1: 0.02843 0.3428034 0.2552893 -12.98    1
##    2: 0.04396 0.0025133 0.0025070 -11.13    2
##    3: 0.17007 0.0615702 0.0579992 -10.89    3
##    4: 0.02535 0.0007464 0.0007459 -11.56    4
##    5: 0.02767 0.0062750 0.0062359 -10.89    5
##   ---                                        
## 3996: 0.04112 0.0947016 0.0865090 -10.58 3996
## 3997: 0.04456 0.0496515 0.0473029 -10.30 3997
## 3998: 0.10385 0.0285465 0.0277542 -10.58 3998
## 3999: 0.01512 0.0072160 0.0071643 -11.49 3999
## 4000: 0.12845 0.1243502 0.1105974 -10.17 4000
setkey(par, ipar)

Simulate data, used sampled parameters to guide sampling

# simulate
dat <- as.data.table(my.sim(n, par))

# add identifier & key
dat[, `:=`(idat, 1:nrow(dat))]
##       V1 V2 V3 V4 V5 V6  V7 V8 V9 V10 idat
##    1:  0  0  0  0  0  0   0  0  0   0    1
##    2:  0  0  0  0  0  0   0  0 16   0    2
##    3: 13  5  2  0  5  0   0  1  0   1    3
##    4:  0  0  0  0  0  0   0  0  0   0    4
##    5:  0  0  0  8  0  0   0  0  0   0    5
##   ---                                     
## 3996:  0  0  0  0  0  0   0  0  0   0 3996
## 3997:  0  0  0  0  0  0   0  0  0   0 3997
## 3998:  0  4  1  0  1  0 141  0  0   1 3998
## 3999:  0  0  0  0 70  0   0  0  0   0 3999
## 4000:  0  0  0  0  0  0   0  0  0   0 4000
setkey(dat, idat)

# indices of data columns
iix <- match(grep("V", colnames(dat), value = TRUE), colnames(dat))

Prepare further calculations - log-likelihoods and priors

Evaluate statistical tests for data

Given individual parameter vector as null hypothesis and complementary space as alternative hypothesis

# test c: likelihood ratio test ll ... log likelihood of null hypothesis,
# i.e., data given selected parameter mll ... log likelihood of alternative
# hypothesis, i.e., data and max. likl. parameter
ftc <- function(ll, mll) {
    return(-2 * ll + 2 * mll)
}

# test d: posterior test - probability of parameter given data
ftd <- function(wn.par) {
    return(1/wn.par)
}


# evaluate tests for simulated data
llm[, `:=`(tc, ftc(ll, dat[idat, mll])), by = idat]
llm[, `:=`(td, ftd(wn.par)), by = idat]

# evaluate tests for observed data
par[, `:=`(tcr, ftc(ll, mll.obs$mll))]
par[, `:=`(tdr, ftd(1/nrow(par)))]

Calculate p-values

Plot p-values

ggplot(par, aes(x = size, y = prob, colour = cut_number(p.c, n = 10))) + geom_point() + 
    guides(colour = guide_legend("p-value")) + ggtitle("Likelihood-Ratio Test")

plot of chunk negBinom


ggplot(par, aes(x = size, y = prob, colour = cut_number(p.d, n = 10))) + geom_point() + 
    guides(colour = guide_legend("p-value")) + ggtitle("Posterior Test")

plot of chunk negBinom


ggplot(par, aes(x = p.c, y = p.d)) + geom_point()

plot of chunk negBinom


ggplot(par, aes(x = tcr, y = p.c)) + geom_point(aes(y = 1 - pchisq(tcr, 2)), 
    colour = "yellow") + geom_point(alpha = 0.5) + xlab("Log Likelihood Ratio") + 
    ylab("p-value")

plot of chunk negBinom