Generic and consistent confidence and credible regions - example: normal distribution

Christian Bartels
Allschwil, 31-Aug-2015

Implementation of the algorithm described in:
Generic and consistent confidence and credible regions. Christian Bartels. figshare.
http://dx.doi.org/10.6084/m9.figshare.1528163

Available at:
Code - Generic and consistent confidence and credible regions. Christian Bartels. figshare.
http://dx.doi.org/10.6084/m9.figshare.1528187

Setup

# clean up
rm(list = ls())

# get libraries
library(rstan)
## Loading required package: Rcpp
## Loading required package: inline
## Warning: package 'inline' was built under R version 3.1.3
## 
## Attaching package: 'inline'
## 
## The following object is masked from 'package:Rcpp':
## 
##     registerPlugin
## 
## rstan (Version 2.6.0, packaged: 2015-02-06 21:02:34 UTC, GitRev: 198082f07a60)
library(ggplot2)
library(data.table)
require(matrixStats) # to get logSumExp()
## Loading required package: matrixStats
## Warning: package 'matrixStats' was built under R version 3.1.3
## matrixStats v0.14.2 (2015-06-23) successfully loaded. See ?matrixStats for help.
theme_set(theme_bw())

set_cppo("fast")
## mode fast without NDEBUG for compiling C++ code has been set already

Define functions

# my likelihood function
# x ... observations or samples of observations
#    col: repeated measures (evaluated with the same parameter)
#    row: different samples (evaluated with different parameters)
# par ... data.frame of parameters
#    col: different parameters, e.g., mean and sd
#    row: samples
# return vector of log-likelihood of samples
my.ll <- function(x,par) {
   # dnorm recycles par$mean and par$sd down the rows of x,
   # so row i of x is evaluated with the parameters in row i of par
   return(rowSums(dnorm(x,par$mean,par$sd,log=TRUE)))
}
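
A quick check of the vectorization (arbitrary values): each row of x is evaluated with the corresponding row of par, relying on dnorm recycling the parameter vectors down the rows.

p.chk <- data.frame(mean=c(0,1),sd=c(1,2))
x.chk <- matrix(rnorm(6),nrow=2)   # two samples, three repeated measures
my.ll(x.chk,p.chk)                 # two log-likelihoods, one per row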



# my simulation function
# par ... data.frame of parameters
#    col: different parameters, e.g., mean and sd
#    row: samples
# n ... number of repeated measures
my.sim <- function(n,par) {
  m<-nrow(par)
  return(matrix(rnorm(n*m,par$mean,par$sd),ncol=n))
  # matrix(rnorm(10,c(0,10),c(0.01,0.01)),ncol=5)
}
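
Correspondingly for the simulation (arbitrary values): row i of the returned matrix is drawn with the parameters in row i of par.

p.chk <- data.frame(mean=c(0,10),sd=c(0.01,0.01))
my.sim(5,p.chk)   # first row scatters around 0, second row around 10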

Generate data to be analyzed

Data is simulated from a normal distribution.

# make reproducible
set.seed(1)

# generate observed data
n=10  # number of repeated observations
p0<-data.frame(mean=1,sd=1) # 'unknown truth'
x<-my.sim(n,p0) # observed data
# my.ll(x,p0)

10 values are sampled from a normal distribution with mean 1 and standard deviation 1.
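
The empirical mean and standard deviation of the draw can be inspected directly (the values depend on the seed set above):

c(mean=mean(x),sd=sd(x))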

1. Sample parameters theta

Use Stan to sample “reasonable” parameters, i.e., parameters that are plausible given the observed data under a convenient sampling prior.

# list model
modelSample = "mcmc7.model" # model to sample parameters
cbind(system(paste("cat ",modelSample,sep=""),intern=TRUE))
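
The listing produced by the cat call above (the contents of mcmc7.model) is not reproduced in this write-up. Purely for orientation, the following R string sketches a hypothetical model with the interface the rest of the code relies on: parameters m and s, and a generated quantity prior recording the log density of the sampling prior. The actual file may well differ, in particular in its choice of sampling prior.

# hypothetical sketch - NOT the original mcmc7.model
modelSketch <- "
data {
  int<lower=1> N;
  real x[N];
}
parameters {
  real m;
  real<lower=0> s;
}
model {
  x ~ normal(m, s);  // likelihood; a sampling prior would be added here
}
generated quantities {
  real prior;
  prior <- 0;  // log density of the sampling prior (flat prior assumed in this sketch)
}
"
# parSampler <- stan_model(model_code=modelSketch)  # alternative to the file-based call below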

# compile model 
parSampler <- stan_model(modelSample,verbose=FALSE) 

# call Stan
dat <- list(N=n, x=c(x)) 
parSample <- sampling(parSampler, data = dat, init = 0, iter = 20000, thin=10, seed=123)

# output from stan
print(parSample,digits=3)

plot(parSample)

traceplot(parSample)

# get samples of parameters (these represent the integration measure over parameters)
par<-as.data.table(as.data.frame(parSample))

# rename col-names to match rest of code
setnames(par,c("m","s"),c("mean","sd"))

# add identifier & key
par[,ipar:=1:nrow(par)]
setkey(par,ipar)

2. Sample data points

Simulate data; use the sampled parameters to guide the sampling.

# simulate
dat<-as.data.table(my.sim(n,par))

# add identifier & key
dat[,idat:=1:nrow(dat)]
setkey(dat,idat)

# indices of data columns
iix <- grep("V",colnames(dat))

Prepare further calculations - log-likelihoods

# likelihood of observed data for sampled parameters - f(thetaj,x0)
par[,ll:=my.ll(x,.SD[1,]),by=ipar]

# check consistency of Stan's log density (lp__ less the log(s) Jacobian
# of the bounded scale) with the recomputed log-likelihood and recorded prior
ggplot(par,aes(x=lp__-log(sd),y=ll-prior)) + geom_point()

# likelihood of simulated data given sampled parameters - f(thetaj,xi)
iix <- grep("V",colnames(dat))
llm <- par[,list(idat=dat$idat,ll=my.ll(as.matrix(dat[,iix,with=FALSE]),.SD[1,])),by=list(ipar)]
setkey(llm,ipar,idat)

# log likelihood used for simulation of data 
dat <- dat[llm[,list(lp.sample=logSumExp(ll)),by=idat]]

3. Measure for integrals over data given parameter

# weights of data to evaluate integral over data for given parameter (independent of prior)
llm[,w.sample:=ll-dat[idat,lp.sample]]  
llm[,wn.sample:=(w.sample-logSumExp(w.sample)),by=ipar]      
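
The simulated data sets were drawn from the mixture over all sampled parameters, so exp(w.sample) is the importance weight f(x_i|theta_j)/sum_k f(x_i|theta_k), and wn.sample normalizes the weights to sum to one within each parameter. As an optional diagnostic (a sketch, not part of the original algorithm), the effective number of simulated data sets informing each parameter can be computed with the standard importance-sampling formula:

# effective sample size per parameter, 1/sum(w^2); small values flag unreliable integrals
ess <- llm[,list(ess=1/sum(exp(2*wn.sample))),by=ipar]
summary(ess$ess)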

4. Determine reference prior

# measure of the sampled parameters 
# parameters were sampled with a sampling prior 
llm[,w.par.d:=ll-par[ipar,ll+prior]]  
llm[,wn.par.d:=(w.par.d-logSumExp(w.par.d)),by=idat]        


# calculate log of reference prior via Kullback-Leibler divergence
llm[,kl.p:=exp(wn.sample)*wn.par.d]
par[,kl.pr:=NULL]
par <- par[llm[,list(kl.pr=(sum(kl.p))),by=ipar]]
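
Up to an additive constant, kl.pr implements one fixed-point iteration for the reference prior (see the paper cited above), starting from the flat-prior posterior represented by the normalized weights wn.par.d:

\log \pi(\theta_j) \;\approx\; \sum_i e^{\mathrm{wn.sample}_{ij}} \, \mathrm{wn.par.d}_{ij} \;\approx\; \mathrm{E}_{x \sim f(\cdot \mid \theta_j)} \big[ \log p(\theta_j \mid x) \big]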

5. Measure for integrals over parameters given data

# measure of joint distribution
llm[,w.joint:=ll-dat[idat,lp.sample]+par[ipar,kl.pr]]      
llm[,wn.joint:=w.joint-logSumExp(w.joint)]      

# marginal of parameters (=reference prior)
par <- par[llm[,list(wn.marg.par=logSumExp(wn.joint)),by=ipar]]
ggplot(par,aes(x=(wn.marg.par),y=kl.pr)) + geom_point()

# marginal of data
dat <- dat[llm[,list(wn.marg.dat=logSumExp(wn.joint)),by=idat]]

# log mutual information
llm[,mut:=wn.joint-dat[idat,wn.marg.dat]-par[ipar,wn.marg.par]]
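
In formula form, mut is the pointwise log mutual information between parameter and data under the joint measure:

\mathrm{mut}_{ij} \;=\; \mathrm{wn.joint}_{ij} - \mathrm{wn.marg.dat}_{i} - \mathrm{wn.marg.par}_{j} \;=\; \log \frac{p(\theta_j, x_i)}{p(\theta_j)\, p(x_i)}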

# conditional on parameters - same as wn.sample (redundant code)
llm[,wn.cond.dat:=wn.joint-logSumExp(wn.joint),by=ipar]

# conditional on data - same as wn.par (redundant code)
llm[,wn.cond.par:=wn.joint-logSumExp(wn.joint),by=idat]


# measure of parameters to evaluate integral over parameters given observed data
# using reference prior
par[,w.par:=(kl.pr+ll)]
par[,wn.par:=w.par-logSumExp(w.par)]              # do normalization

# measure of parameters to evaluate integral over parameters for any sampled data
# using reference prior
llm[,w.par:=(par[ipar,kl.pr]+ll)]  # unnormalized weight
llm[,wn.par:=w.par-logSumExp(w.par),by=idat]              # do normalization
ggplot(llm[ipar==4],aes(x=(wn.cond.par),y=(wn.par))) + geom_point()

Some illustrations

# calculate mean and sd of sampled data
iix <- grep("V",colnames(dat))
dat[,mean.i:=rowMeans(as.matrix(.SD)),.SDcols=iix]
dat[,sd.i:=rowSds(as.matrix(.SD)),.SDcols=iix]

# sampled parameters - colour: log-likelihood of observed data (measure used for sampling)
ggplot(par,aes(x=mean,y=sd,colour=cut_number(ll,n=10))) + 
  geom_point() +
  geom_hline(yintercept=sd(x)) +
  geom_vline(xintercept=mean(x)) 

# sampled parameters - colour: reference prior - measure used for sampling 
ggplot(par,aes(x=mean,y=sd,colour=cut_number(kl.pr+ll,n=10))) + 
  geom_point() +
  geom_hline(yintercept=sd(x)) +
  geom_vline(xintercept=mean(x)) 

# sampled parameters - colour: measure of parameters given observed data
ggplot(par,aes(x=mean,y=sd,colour=cut_number(wn.par,n=10))) + 
  geom_point() +
  geom_hline(yintercept=sd(x)) +
  geom_vline(xintercept=mean(x)) 

# sampled data
ggplot(dat,aes(x=mean.i,y=sd.i,colour=cut_number(lp.sample,n=10))) + 
  geom_point() +
  geom_hline(yintercept=sd(x)) +
  geom_vline(xintercept=mean(x)) 

# measure used for sampling versus reference prior
ggplot(par,aes(x=-ll,y=kl.pr)) + 
  geom_point() 

# difference between the reference measure and the sampling measure versus SD
# match of the sampled reference prior with the analytic solution;
# the fitted slope should be close to -2, i.e., a prior contribution proportional to 1/sd^2
lm(I(kl.pr+ll)~log(sd),data=par)

ggplot(par,aes(x=-2*log(sd),y=kl.pr+ll)) + 
  geom_point() +
  geom_smooth(method='lm',formula=y~x)

# difference between analytic solution and sampled solution of reference prior
ggplot(par,aes(x=log(sd),y=kl.pr+ll+2*log(sd),colour=cut_number(mean,n=10))) + 
  geom_point() 

ggplot(par,aes(x=mean,y=sd,colour=cut_number(kl.pr+ll+2*log(sd),n=10))) + 
  geom_point() +
  geom_hline(yintercept=sd(x)) +
  geom_vline(xintercept=mean(x)) 

6. Evaluate statistical integrals

a) 95% Confidence Interval (via explicit determination of decision set)

# order observations by prob of parameter given observation
setkey(llm,ipar,wn.par)

# calculate coverage from prob of observation given parameter
llm[,covFreq:=cumsum(exp(wn.sample)),by=ipar]

# decision set to have at least 95% coverage 
# - no randomization; less than 0.05 false negative
llm[,dsf:=as.double(covFreq-exp(wn.sample)>0.05),by=ipar]
# - randomization; exactly 0.05 false negative
llm[dsf!=1&covFreq>0.05,dsf:=(covFreq-0.05)/exp(wn.sample),by=ipar]
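
The construction can be checked in isolation with a toy example (hypothetical probabilities): observations are ordered from least to most compatible with the parameter, excluded until just before the excluded mass would exceed 0.05, and the boundary observation is kept with a fractional weight so that the excluded mass is exactly 0.05.

# toy example with five equally likely observations, target coverage 0.95
p.i <- rep(0.2,5)                   # probabilities of the ordered observations
cf <- cumsum(p.i)                   # 0.20 0.40 0.60 0.80 1.00
ds <- as.double(cf-p.i>0.05)        # 0 1 1 1 1 - the first point is the boundary
i.b <- ds!=1 & cf>0.05              # boundary point
ds[i.b] <- (cf[i.b]-0.05)/p.i[i.b]  # 0.75 - keep with probability 0.75
sum((1-ds)*p.i)                     # excluded mass is exactly 0.05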

# check coverage and illustrate size
par[,size.r:=NULL]
par <- par[llm[,list(size.r=sum(dsf*dat[idat,exp(wn.marg.dat)])),by=ipar]]
dat <- dat[llm[,list(size.rb=sum(dsf*par[ipar,exp(wn.marg.par)])),keyby=idat]]
par[,cov:=NULL]
par <- par[llm[,list(cov=sum(dsf*exp(wn.sample))),by=ipar]]
dat <- dat[llm[,list(cov.b=sum(dsf*exp(wn.par))),keyby=idat]]
if(any(abs(par$cov-0.95)>1E-10)) stop("Wrong coverage")
ggplot(par,aes(x=mean,y=sd,colour=cut_number(size.r,n=10))) + geom_point()

ggplot(dat,aes(x=mean.i,y=sd.i,colour=cut_number(size.rb,n=10))) + geom_point()

ggplot(dat,aes(x=mean.i,y=sd.i,colour=cut_number(cov.b,n=10))) + geom_point()

# confidence set
# check if observed data would be included in decision set
par[,coi95:=NULL]
par <- par[llm[dsf==1,list(coi95=min(wn.par)<par[ipar,wn.par]),by=ipar]] 
par[,coi95b:=NULL]
par <- par[llm[dsf!=0,list(coi95b=min(wn.par)<par[ipar,wn.par]),by=ipar]] 

# illustrate 95% confidence interval
ggplot(par,aes(x=mean,y=sd,colour=coi95)) + 
  geom_point()+
  geom_hline(yintercept=sd(x)) +
  geom_vline(xintercept=mean(x))

b) P-value

# order observations by prob of parameter given observation
setkey(llm,ipar,wn.par)

# calculate p-value
# probability of observations further away from the parameters than the observed data
par[,pVal:=NULL]
par <- par[llm[,list(pVal=sum(exp(wn.sample)*(wn.par<par[ipar,wn.par]))),by=ipar]]
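
Written out, the p-value of parameter theta_j is the probability, under f(.|theta_j) as represented by the importance weights, of simulated data for which theta_j is less probable than it is given the observed data x0:

\mathrm{pVal}(\theta_j) \;=\; \sum_i e^{\mathrm{wn.sample}_{ij}} \; \mathbf{1}\big\{ \mathrm{wn.par}_{ij} < \mathrm{wn.par}_{j}(x_0) \big\}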

# illustrate p-value
ggplot(par,aes(x=mean,y=sd,colour=cut_number(pVal,n=10))) + 
  geom_point() +
  geom_hline(yintercept=sd(x)) +
  geom_vline(xintercept=mean(x)) +
  xlab("Mean") + ylab("Standard Deviation") +
  guides(colour = guide_legend("P-value",ncol=2)) +
  theme_bw() +
  theme(legend.position = c(0.3, 0.8)) 

c) 95% Confidence Interval from p-value

# check
par[coi95!=(pVal>0.05)]
par[coi95b!=(pVal>0.05)]
# illustrate 95% confidence interval
ggplot(par,aes(x=mean,y=sd,colour=(pVal>0.05))) + 
  geom_point()+
  geom_hline(yintercept=sd(x)) +
  geom_vline(xintercept=mean(x))

d) 95% Credible Interval

# order parameters by prob of observation given parameter
setkey(par,ll)

# calculate coverage 
par[,covBayes:=cumsum(exp(wn.par))]

# back to default ordering to enable joins with llm
setkey(par,ipar)
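
The retained set {covBayes > 0.05} drops the lowest-likelihood parameters carrying 5% of the posterior mass; since no randomization is applied here, the retained mass can slightly exceed 0.95 depending on the boundary point:

# check: posterior mass of the retained credible set
par[covBayes>0.05,sum(exp(wn.par))]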


# illustrate 95% credible interval
ggplot(par,aes(x=mean,y=sd,colour=(covBayes>0.05))) + 
  geom_point() +
  geom_hline(yintercept=sd(x)) +
  geom_vline(xintercept=mean(x))

# illustrate Bayesian coverage
ggplot(par,aes(x=mean,y=sd,colour=cut_number(covBayes,n=10))) + 
  geom_point() +
  geom_hline(yintercept=sd(x)) +
  geom_vline(xintercept=mean(x))

# compare Bayesian coverage to p-value
ggplot(par,aes(x=covBayes,y=pVal)) + geom_point()

# compare 95% confidence interval and 95% credible interval
ggplot(par,aes(x=mean,y=sd,colour=paste((covBayes>0.05),(pVal>0.05)))) + 
  geom_point() +
  geom_hline(yintercept=sd(x)) +
  geom_vline(xintercept=mean(x))

e) P-value based on likelihood ratio test

setkey(par,ipar)

# my maximum likelihood estimator
my.maxll <- function(x) {
  df1 <- do.call(rbind,apply(x,1,function(x1) {
        n<-length(x1)
        dft<-data.frame(mean=mean(x1),sd=sd(x1)*sqrt((n-1)/n))   # max. likelihood parameters
        dft$mll<-my.ll(matrix(x1,nrow=1),dft)                                  # max. likelihood
        return(dft)
      }
    ))
  return(df1)
}
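
A quick property check (hypothetical draw): the maximized log-likelihood can never fall below the log-likelihood at the true parameters.

x.chk <- my.sim(5,p0)
my.maxll(x.chk)$mll >= my.ll(x.chk,p0)   # TRUE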

# maximum likelihood estimate for simulated data
iix <- grep("V",colnames(dat))
dat<-cbind(dat,my.maxll(as.matrix(dat[,iix,with=FALSE])))

# maximum likelihood for observed data
mll.obs <- my.maxll(x)

# likelihood ratio test
# ll ... log likelihood of null hypothesis, i.e., data given selected parameter
# mll ... log likelihood of alternative hypothesis, i.e., data given max. likelihood parameter
ftc <- function(ll,mll) {
  return(-2*ll+2*mll)
}
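
The statistic is the usual log likelihood ratio:

T(\theta, x) \;=\; -2 \log \frac{f(x \mid \theta)}{\max_{\theta'} f(x \mid \theta')} \;=\; 2\,(\mathrm{mll} - \mathrm{ll})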

# likelihood ratio for sampled and observed data
llm[,tc:=ftc(ll,dat[idat,mll]),by=idat]
par[,tcr:=ftc(ll,mll.obs$mll)]

# calculate p-value
# probability of observations further away from the parameters than the observed data
par[,pVal.lr:=NULL]
par <- par[llm[,list(pVal.lr=sum(exp(wn.sample)*(tc>par[ipar,tcr]))),by=ipar]]

# compare to chi-square distribution with 2 degrees of freedom
# (Wilks' approximation for two free parameters, shown in yellow)
ggplot(par,aes(x=tcr,y=pVal.lr)) + 
  geom_point(alpha=1.0) + 
  geom_point(aes(y=1-pchisq(tcr,2)),colour="yellow")  + 
  xlab("Log Likelihood Ratio") + ylab("P-value")

# illustrate p-value
ggplot(par,aes(x=mean,y=sd,colour=cut_number(pVal.lr,n=10))) + 
  geom_point() +
  geom_hline(yintercept=sd(x)) +
  geom_vline(xintercept=mean(x))

# compare p-values of likelihood ratio and optimal size test
ggplot(par,aes(x=pVal,y=pVal.lr)) + 
  geom_abline(slope=1,size=2,colour="grey") +
  geom_point() +
  xlab("P-value mutual information test") + ylab("P-value likelihood-ratio test") 

ggplot(par,aes(x=pVal,y=pVal.lr)) + 
  geom_abline(slope=1,size=2,colour="grey") +
  geom_point() + xlim(0,0.1) + ylim(0,0.1)

# compare p-values of likelihood ratio to Bayes coverage
ggplot(par,aes(x=covBayes,y=pVal.lr)) + 
  geom_abline(slope=1,size=2,colour="grey") +
  geom_point() 

# ratio of the p-values of likelihood ratio and optimal size test across the parameter space
ggplot(par,aes(x=mean,y=sd,colour=cut_number(pVal.lr/pVal,n=10))) + 
  geom_point() +
  geom_hline(yintercept=sd(x)) +
  geom_vline(xintercept=mean(x))

# compare 95% confidence intervals and 95% credible interval
ggplot(par,aes(x=mean,y=sd,colour=paste((covBayes>0.05),(pVal>0.05),(pVal.lr>0.05)))) + 
  geom_point() +
  geom_hline(yintercept=sd(x)) +
  geom_vline(xintercept=mean(x))

# compare 95% confidence intervals of likelihood ratio test and mutual information test
ggplot(par,aes(x=mean,y=sd,colour=paste((pVal>0.05),(pVal.lr>0.05)))) + 
  geom_point() +
  geom_hline(yintercept=sd(x)) +
  geom_vline(xintercept=mean(x)) +
  xlab("Mean") + ylab("Standard Deviation") +
  # guides(colour = guide_legend("P-value",ncol=2)) +
  theme(legend.position = c(0.3, 0.8)) 

# compare likelihood ratio test statistics to posterior test statistics
ggplot(llm[ipar==20],aes(x=tc,y=(wn.par))) + 
  geom_point() 

f) LR-Test 95% Confidence Interval (via explicit determination of decision set)

# order observations by decreasing likelihood ratio statistic (most extreme first)
llm[,tc.tmp:=-tc]
setkey(llm,ipar,tc.tmp)

# calculate coverage from prob of observation given parameter
llm[,covFreq.lr:=cumsum(exp(wn.sample)),by=ipar]

# decision set to have at least 95% coverage 
# - no randomization; less than 0.05 false negative
llm[,dsf.lr:=as.double(covFreq.lr-exp(wn.sample)>0.05),by=ipar]
# - randomization; exactly 0.05 false negative
llm[dsf.lr!=1&covFreq.lr>0.05,dsf.lr:=(covFreq.lr-0.05)/exp(wn.sample),by=ipar]


# check coverage and illustrate size
par[,size.lr:=NULL]
par <- par[llm[,list(size.lr=sum(dsf.lr*dat[idat,exp(wn.marg.dat)])),by=ipar]]
dat <- dat[llm[,list(size.lrb=sum(dsf.lr*par[ipar,exp(wn.marg.par)])),keyby=idat]]
par[,cov.lr:=NULL]
par <- par[llm[,list(cov.lr=sum(dsf.lr*exp(wn.sample))),by=ipar]]
dat <- dat[llm[,list(cov.lrb=sum(dsf.lr*exp(wn.par))),keyby=idat]]

if(any(abs(par$cov.lr-0.95)>1E-10)) stop("Wrong coverage")
ggplot(par,aes(x=mean,y=sd,colour=cut_number(size.lr,n=10))) + geom_point()

ggplot(dat,aes(x=mean.i,y=sd.i,colour=cut_number(size.lrb,n=10))) + geom_point()

ggplot(dat,aes(x=mean.i,y=sd.i,colour=cut_number(cov.lrb,n=10))) + geom_point()

# compare sizes
ggplot(par,aes(x=(size.lr),y=(size.r))) + 
  geom_abline(slope=1,size=2,colour="grey") +
  geom_point() 

# print out version information, date, and system information
R.Version()
## $platform
## [1] "x86_64-w64-mingw32"
## 
## $arch
## [1] "x86_64"
## 
## $os
## [1] "mingw32"
## 
## $system
## [1] "x86_64, mingw32"
## 
## $status
## [1] ""
## 
## $major
## [1] "3"
## 
## $minor
## [1] "1.2"
## 
## $year
## [1] "2014"
## 
## $month
## [1] "10"
## 
## $day
## [1] "31"
## 
## $`svn rev`
## [1] "66913"
## 
## $language
## [1] "R"
## 
## $version.string
## [1] "R version 3.1.2 (2014-10-31)"
## 
## $nickname
## [1] "Pumpkin Helmet"
date()
## [1] "Mon Aug 31 18:03:00 2015"
Sys.info()
##        sysname        release        version       nodename        machine 
##      "Windows"        "7 x64"   "build 9200"  "CHRISTIANPC"       "x86-64" 
##          login           user effective_user 
##    "Christian"    "Christian"    "Christian"