Christian Bartels
Allschwil, 31-Aug-2015
Implementation of the algorithm described in:
Generic and consistent confidence and credible regions. Christian Bartels. figshare.
http://dx.doi.org/10.6084/m9.figshare.1528163
Code available at:
Code - Generic and consistent confidence and credible regions. Christian Bartels. figshare.
http://dx.doi.org/10.6084/m9.figshare.1528187
# clean up
rm(list = ls())
# get libraries
library(rstan)
## Loading required package: Rcpp
## Loading required package: inline
## Warning: package 'inline' was built under R version 3.1.3
##
## Attaching package: 'inline'
##
## The following object is masked from 'package:Rcpp':
##
## registerPlugin
##
## rstan (Version 2.6.0, packaged: 2015-02-06 21:02:34 UTC, GitRev: 198082f07a60)
library(ggplot2)
library(data.table)
require(matrixStats) # to get logSumExp()
## Loading required package: matrixStats
## Warning: package 'matrixStats' was built under R version 3.1.3
## matrixStats v0.14.2 (2015-06-23) successfully loaded. See ?matrixStats for help.
theme_set(theme_bw())
set_cppo("fast")
## mode fast without NDEBUG for compiling C++ code has been set already
# my likelihood function
# x ... observations or samples of observations
#       col: repeated measures (evaluated with the same parameter)
#       row: different samples (evaluated with different parameters)
# par ... data.frame of parameters
#       col: different parameters, e.g., mean and sd
#       row: samples
# returns a vector of log-likelihoods, one per sample (row)
my.ll <- function(x,par) {
  # dnorm() recycles par$mean and par$sd column-major, so row i of x is
  # evaluated with parameter row i; rowSums() then adds up the log-densities
  # of the repeated measures within each row
  return(rowSums(dnorm(x,par$mean,par$sd,log=TRUE)))
}
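A small illustration of the row-wise recycling that my.ll relies on, with made-up numbers:
# two parameter rows, each scoring its own row of three repeated measures
my.ll(matrix(c(1,1,1,2,2,2),nrow=2,byrow=TRUE),
      data.frame(mean=c(0,1),sd=c(1,1)))
# returns a vector of two log-likelihoods, one per parameter row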
# my simulation function
# par ... data.frame of parameters
#       col: different parameters, e.g., mean and sd
#       row: samples
# n ... number of repeated measures
# returns a matrix with one row of n simulated observations per parameter row
my.sim <- function(n,par) {
  m<-nrow(par)
  # rnorm() recycles par$mean and par$sd; filling the matrix column-major
  # ensures that row i is simulated with parameter row i
  return(matrix(rnorm(n*m,par$mean,par$sd),ncol=n))
}
Data is simulated from a normal distribution.
# make reproducible
set.seed(1)
# generate observed data
n <- 10 # number of repeated observations
p0<-data.frame(mean=1,sd=1) # 'unknown truth'
x<-my.sim(n,p0) # observed data
# my.ll(x,p0)
Ten values are sampled from a normal distribution with mean 1 and standard deviation 1.
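A quick sanity check of the simulated sample:
# sample statistics of the observed data; both are expected to be roughly 1
mean(x)
sd(x)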
Use Stan to sample “reasonable” parameters
# list model
modelSample = "mcmc7.model" # model to sample parameters
writeLines(readLines(modelSample)) # portable alternative to system("cat ...")
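The model file itself is not reproduced in this document. As an orientation, here is a hypothetical sketch of what mcmc7.model might contain; the normal likelihood, the parameter names m and s, and the exported log sampling prior (the column prior used below) are inferred from how the Stan output is used later, while the exact form of the sampling prior is an assumption.
// hypothetical sketch of mcmc7.model -- not the actual file
data {
  int<lower=1> N; // number of repeated observations
  real x[N];      // observed data
}
parameters {
  real m;          // mean
  real<lower=0> s; // standard deviation
}
model {
  increment_log_prob(-log(s)); // assumed sampling prior p(m,s) proportional to 1/s
  x ~ normal(m, s);            // likelihood of the repeated observations
}
generated quantities {
  real prior;
  prior <- -log(s); // log of the sampling prior, exported so R can undo it later
}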
# compile model
parSampler <- stan_model(modelSample,verbose=FALSE)
# call Stan
dat <- list(N=n, x=c(x))
parSample <- sampling(parSampler, data = dat, init = 0, iter = 20000, thin=10, seed=123)
# output from stan
print(parSample,digits=3)
plot(parSample)
traceplot(parSample)
# get samples of parameters (representing integral)
par<-as.data.table(as.data.frame(parSample))
# rename col-names to match rest of code
setnames(par,c("m","s"),c("mean","sd"))
# add identifier & key
par[,ipar:=1:nrow(par)]
setkey(par,ipar)
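A quick look at the first few sampled parameter rows (the actual values depend on the seed and the Stan version):
head(par)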
Simulate data; use sampled parameters to guide sampling
# simulate
dat<-as.data.table(my.sim(n,par))
# add identifier & key
dat[,idat:=1:nrow(dat)]
setkey(dat,idat)
# indices of data columns
iix <- grep("^V",colnames(dat))
# likelihood of observed data for sampled parameters - f(thetaj,x0)
par[,ll:=my.ll(x,.SD[1,]),by=ipar]
# check
ggplot(par,aes(x=lp__-log(sd),y=ll-prior)) + geom_point()
# likelihood of simulated data given sampled parameters - f(thetaj,xi)
iix <- grep("^V",colnames(dat))
llm <- par[,list(idat=dat$idat,ll=my.ll(as.matrix(dat[,iix,with=FALSE]),.SD[1,])),by=list(ipar)]
setkey(llm,ipar,idat)
# log likelihood used for simulation of data
dat <- dat[llm[,list(lp.sample=logSumExp(ll)),by=idat]]
# weights of data to evaluate integral over data for given parameter (independent of prior)
llm[,w.sample:=ll-dat[idat,lp.sample]]
llm[,wn.sample:=(w.sample-logSumExp(w.sample)),by=ipar]
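Since wn.sample is normalized with logSumExp within each parameter, the weights must sum to one per parameter; a quick check:
# normalized weights should sum to 1 within each parameter (up to rounding)
stopifnot(llm[,abs(sum(exp(wn.sample))-1)<1e-8,by=ipar][,all(V1)])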
# measure of the sampled parameters
# parameters were sampled with a sampling prior
llm[,w.par.d:=ll-par[ipar,ll+prior]]
llm[,wn.par.d:=(w.par.d-logSumExp(w.par.d)),by=idat]
# calculate log of reference prior via Kullback-Leibler divergence
llm[,kl.p:=exp(wn.sample)*wn.par.d]
par[,kl.pr:=NULL] # drop the column if it already exists (allows re-running this chunk)
par <- par[llm[,list(kl.pr=(sum(kl.p))),by=ipar]]
# measure of joint distribution
llm[,w.joint:=ll-dat[idat,lp.sample]+par[ipar,kl.pr]]
llm[,wn.joint:=w.joint-logSumExp(w.joint)]
# marginal of parameters (=reference prior)
par <- par[llm[,list(wn.marg.par=logSumExp(wn.joint)),by=ipar]]
ggplot(par,aes(x=(wn.marg.par),y=kl.pr)) + geom_point()
# marginal of data
dat <- dat[llm[,list(wn.marg.dat=logSumExp(wn.joint)),by=idat]]
# log mutual information
llm[,mut:=wn.joint-dat[idat,wn.marg.dat]-par[ipar,wn.marg.par]]
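The weighted average of mut over the joint measure is a plug-in estimate of the mutual information between parameters and data; being a Kullback-Leibler divergence between the joint and the product of its own marginals, it should come out non-negative:
# plug-in estimate of the mutual information; expected to be >= 0
llm[,sum(exp(wn.joint)*mut)]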
# conditional on parameters - same as wn.sample (redundant code)
llm[,wn.cond.dat:=wn.joint-logSumExp(wn.joint),by=ipar]
# conditional on data - same as wn.par (redundant code)
llm[,wn.cond.par:=wn.joint-logSumExp(wn.joint),by=idat]
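A quick numerical check of the redundancy noted in the comments above; the differences should be zero up to rounding:
summary(llm[,wn.cond.dat-wn.sample])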
# measure of parameters to evaluate integral over parameters given observed data
# using reference prior
par[,w.par:=(kl.pr+ll)]
par[,wn.par:=w.par-logSumExp(w.par)] # do normalization
# measure of parameters to evaluate integral over parameters for any sampled data
# using reference prior
llm[,w.par:=(par[ipar,kl.pr]+ll)] # unnormalized weight
llm[,wn.par:=w.par-logSumExp(w.par),by=idat] # do normalization
ggplot(llm[ipar==4],aes(x=(wn.cond.par),y=(wn.par))) + geom_point()
# calculate mean and sd of each simulated data set
iix <- grep("^V",colnames(dat))
dat[,mean.i:=rowMeans(.SD),.SDcols=iix]
dat[,sd.i:=apply(.SD,1,sd),.SDcols=iix]
# sampled parameters - colour: log-likelihood of observed data (measure used for sampling)
ggplot(par,aes(x=mean,y=sd,colour=cut_number(ll,n=10))) +
geom_point() +
geom_hline(yintercept=sd(x)) +
geom_vline(xintercept=mean(x))
# sampled parameters - colour: reference prior - measure used for sampling
ggplot(par,aes(x=mean,y=sd,colour=cut_number(kl.pr+ll,n=10))) +
geom_point() +
geom_hline(yintercept=sd(x)) +
geom_vline(xintercept=mean(x))
# sampled parameters - colour: measure of parameters given observed data
ggplot(par,aes(x=mean,y=sd,colour=cut_number(wn.par,n=10))) +
geom_point() +
geom_hline(yintercept=sd(x)) +
geom_vline(xintercept=mean(x))
# sampled data
ggplot(dat,aes(x=mean.i,y=sd.i,colour=cut_number(lp.sample,n=10))) +
geom_point() +
geom_hline(yintercept=sd(x)) +
geom_vline(xintercept=mean(x))
# measure used for sampling versus reference prior
ggplot(par,aes(x=-ll,y=kl.pr)) +
geom_point()
# difference between reference measure and sampling measure versus SD
# match of sampled prior with analytic solution
lm(I(kl.pr+ll)~log(sd),data=par)
ggplot(par,aes(x=-2*log(sd),y=kl.pr+ll)) +
geom_point() +
geom_smooth(method='lm',formula=y~x)
# difference between analytic solution and sampled solution of reference prior
ggplot(par,aes(x=log(sd),y=kl.pr+ll+2*log(sd),colour=cut_number(mean,n=10))) +
geom_point()
ggplot(par,aes(x=mean,y=sd,colour=cut_number(kl.pr+ll+2*log(sd),n=10))) +
geom_point() +
geom_hline(yintercept=sd(x)) +
geom_vline(xintercept=mean(x))
# order observations by prob of parameter given observation
setkey(llm,ipar,wn.par)
# calculate coverage from prob of observation given parameter
llm[,covFreq:=cumsum(exp(wn.sample)),by=ipar]
# decision set to have at least 95% coverage
# - no randomization; less than 0.05 false negative
llm[,dsf:=as.double(covFreq-exp(wn.sample)>0.05),by=ipar]
# - randomization; exactly 0.05 false negative
llm[dsf!=1&covFreq>0.05,dsf:=(covFreq-0.05)/exp(wn.sample),by=ipar]
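By construction the randomized set memberships lie between 0 and 1; a small consistency check:
stopifnot(llm[,all(dsf>=0 & dsf<=1)])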
# check coverage and illustrate size
par[,size.r:=NULL]
par <- par[llm[,list(size.r=sum(dsf*dat[idat,exp(wn.marg.dat)])),by=ipar]]
dat <- dat[llm[,list(size.rb=sum(dsf*par[ipar,exp(wn.marg.par)])),keyby=idat]]
par[,cov:=NULL]
par <- par[llm[,list(cov=sum(dsf*exp(wn.sample))),by=ipar]]
dat <- dat[llm[,list(cov.b=sum(dsf*exp(wn.par))),keyby=idat]]
if(any(abs(par$cov-0.95)>1E-10)) stop("Wrong coverage")
ggplot(par,aes(x=mean,y=sd,colour=cut_number(size.r,n=10))) + geom_point()
ggplot(dat,aes(x=mean.i,y=sd.i,colour=cut_number(size.rb,n=10))) + geom_point()
ggplot(dat,aes(x=mean.i,y=sd.i,colour=cut_number(cov.b,n=10))) + geom_point()
# confidence set
# check if observed data would be included in decision set
par[,coi95:=NULL]
par <- par[llm[dsf==1,list(coi95=min(wn.par)<par[ipar,wn.par]),by=ipar]]
par[,coi95b:=NULL]
par <- par[llm[dsf!=0,list(coi95b=min(wn.par)<par[ipar,wn.par]),by=ipar]]
# illustrate 95% confidence interval
ggplot(par,aes(x=mean,y=sd,colour=coi95)) +
geom_point()+
geom_hline(yintercept=sd(x)) +
geom_vline(xintercept=mean(x))
# order observations by prob of parameter given observation
setkey(llm,ipar,wn.par)
# calculate p-value
# probability of observations further away from the parameters than the observed data
par[,pVal:=NULL]
par <- par[llm[,list(pVal=sum(exp(wn.sample)*(wn.par<par[ipar,wn.par]))),by=ipar]]
# illustrate p-value
ggplot(par,aes(x=mean,y=sd,colour=cut_number(pVal,n=10))) +
geom_point() +
geom_hline(yintercept=sd(x)) +
geom_vline(xintercept=mean(x)) +
xlab("Mean") + ylab("Standard Deviation") +
guides(colour = guide_legend("P-value",ncol=2)) +
theme_bw() +
theme(legend.position = c(0.3, 0.8))
# check
par[coi95!=(pVal>0.05)]
par[coi95b!=(pVal>0.05)]
# illustrate 95% confidence interval
ggplot(par,aes(x=mean,y=sd,colour=(pVal>0.05))) +
geom_point()+
geom_hline(yintercept=sd(x)) +
geom_vline(xintercept=mean(x))
# order parameters by prob of observation given parameter
setkey(par,ll)
# calculate coverage
par[,covBayes:=cumsum(exp(wn.par))]
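Since wn.par is normalized over all sampled parameters, the cumulative coverage must end at one:
stopifnot(abs(max(par$covBayes)-1)<1e-8)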
# back to default ordering to enable joins with llm
setkey(par,ipar)
# illustrate 95% credible interval
ggplot(par,aes(x=mean,y=sd,colour=(covBayes>0.05))) +
geom_point() +
geom_hline(yintercept=sd(x)) +
geom_vline(xintercept=mean(x))
# illustrate Bayesian coverage
ggplot(par,aes(x=mean,y=sd,colour=cut_number(covBayes,n=10))) +
geom_point() +
geom_hline(yintercept=sd(x)) +
geom_vline(xintercept=mean(x))
# compare Bayesian coverage to p-value
ggplot(par,aes(x=covBayes,y=pVal)) + geom_point()
# compare 95% confidence interval and 95% credible interval
ggplot(par,aes(x=mean,y=sd,colour=paste((covBayes>0.05),(pVal>0.05)))) +
geom_point() +
geom_hline(yintercept=sd(x)) +
geom_vline(xintercept=mean(x))
setkey(par,ipar)
# my maximum likelihood estimator
my.maxll <- function(x) {
  df1 <- do.call(rbind,apply(x,1,function(x1) {
    n <- length(x1)
    dft <- data.frame(mean=mean(x1),sd=sd(x1)*sqrt((n-1)/n)) # closed-form normal MLEs
    dft$mll <- my.ll(matrix(x1,nrow=1),dft) # maximum log-likelihood
    return(dft)
  }))
  return(df1)
}
# maximum likelihood estimate for simulated data
iix <- grep("^V",colnames(dat))
dat<-cbind(dat,my.maxll(as.matrix(dat[,iix,with=FALSE])))
# maximum likelihood for observed data
mll.obs <- my.maxll(x)
# likelihood ratio test
# ll ... log-likelihood of the null hypothesis, i.e., data given the selected parameter
# mll ... log-likelihood of the alternative hypothesis, i.e., data given the maximum-likelihood parameter
ftc <- function(ll,mll) {
  return(-2*ll+2*mll)
}
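As a tiny worked example with made-up numbers, the statistic is the usual deviance, twice the log-likelihood gap between alternative and null:
ftc(-12,-10) # returns 4, since -2*(-12)+2*(-10) = 4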
# likelihood ratio for sampled and observed data
llm[,tc:=ftc(ll,dat[idat,mll]),by=idat]
par[,tcr:=ftc(ll,mll.obs$mll)]
# calculate p-value
# probability of observations further away from the parameters than the observed data
par[,pVal.lr:=NULL]
par <- par[llm[,list(pVal.lr=sum(exp(wn.sample)*(tc>par[ipar,tcr]))),by=ipar]]
# compare to chi-square distribution
ggplot(par,aes(x=tcr,y=pVal.lr)) +
geom_point(alpha=1.0) +
geom_point(aes(y=1-pchisq(tcr,2)),colour="yellow") +
xlab("Log Likelihood Ratio") + ylab("P-value")
# illustrate p-value
ggplot(par,aes(x=mean,y=sd,colour=cut_number(pVal.lr,n=10))) +
geom_point() +
geom_hline(yintercept=sd(x)) +
geom_vline(xintercept=mean(x))
# compare p-values of likelihood ratio and optimal size test
ggplot(par,aes(x=pVal,y=pVal.lr)) +
geom_abline(slope=1,size=2,colour="grey") +
geom_point() +
xlab("P-value mutual information test") + ylab("P-value likelihood-ratio test")
ggplot(par,aes(x=pVal,y=pVal.lr)) +
geom_abline(slope=1,size=2,colour="grey") +
geom_point() + xlim(0,0.1) + ylim(0,0.1)
# compare p-values of likelihood ratio to Bayes coverage
ggplot(par,aes(x=covBayes,y=pVal.lr)) +
geom_abline(slope=1,size=2,colour="grey") +
geom_point()
# compare p-values of likelihood ratio and optimal size test
ggplot(par,aes(x=mean,y=sd,colour=cut_number(pVal.lr/pVal,n=10))) +
geom_point() +
geom_hline(yintercept=sd(x)) +
geom_vline(xintercept=mean(x))
# compare 95% confidence intervals and 95% credible interval
ggplot(par,aes(x=mean,y=sd,colour=paste((covBayes>0.05),(pVal>0.05),(pVal.lr>0.05)))) +
geom_point() +
geom_hline(yintercept=sd(x)) +
geom_vline(xintercept=mean(x))
# compare 95% confidence intervals of likelihood ratio test and mutual information test
ggplot(par,aes(x=mean,y=sd,colour=paste((pVal>0.05),(pVal.lr>0.05)))) +
geom_point() +
geom_hline(yintercept=sd(x)) +
geom_vline(xintercept=mean(x)) +
xlab("Mean") + ylab("Standard Deviation") +
# guides(colour = guide_legend("P-value",ncol=2)) +
theme(legend.position = c(0.3, 0.8))
# compare likelihood ratio test statistics to posterior test statistics
ggplot(llm[ipar==20],aes(x=tc,y=(wn.par))) +
geom_point()
# order observations by likelihood ratio
# (negate tc so that the ascending key sort puts the most extreme observations first,
# matching the ordering convention used for wn.par above)
llm[,tc.tmp:=-tc]
setkey(llm,ipar,tc.tmp)
# calculate coverage from prob of observation given parameter
llm[,covFreq.lr:=cumsum(exp(wn.sample)),by=ipar]
# decision set to have at least 95% coverage
# - no randomization; less than 0.05 false negative
llm[,dsf.lr:=as.double(covFreq.lr-exp(wn.sample)>0.05),by=ipar]
# - randomization; exactly 0.05 false negative
llm[dsf.lr!=1&covFreq.lr>0.05,dsf.lr:=(covFreq.lr-0.05)/exp(wn.sample),by=ipar]
# check coverage and illustrate size
par[,size.lr:=NULL]
par <- par[llm[,list(size.lr=sum(dsf.lr*dat[idat,exp(wn.marg.dat)])),by=ipar]]
dat <- dat[llm[,list(size.lrb=sum(dsf.lr*par[ipar,exp(wn.marg.par)])),keyby=idat]]
par[,cov.lr:=NULL]
par <- par[llm[,list(cov.lr=sum(dsf.lr*exp(wn.sample))),by=ipar]]
dat <- dat[llm[,list(cov.lrb=sum(dsf.lr*exp(wn.par))),keyby=idat]]
if(any(abs(par$cov.lr-0.95)>1E-10)) stop("Wrong coverage") # check cov.lr here, not cov
ggplot(par,aes(x=mean,y=sd,colour=cut_number(size.lr,n=10))) + geom_point()
ggplot(dat,aes(x=mean.i,y=sd.i,colour=cut_number(size.lrb,n=10))) + geom_point()
ggplot(dat,aes(x=mean.i,y=sd.i,colour=cut_number(cov.lrb,n=10))) + geom_point()
# compare sizes
ggplot(par,aes(x=(size.lr),y=(size.r))) +
geom_abline(slope=1,size=2,colour="grey") +
geom_point()
# print out version information, path, date
R.Version()
## $platform
## [1] "x86_64-w64-mingw32"
##
## $arch
## [1] "x86_64"
##
## $os
## [1] "mingw32"
##
## $system
## [1] "x86_64, mingw32"
##
## $status
## [1] ""
##
## $major
## [1] "3"
##
## $minor
## [1] "1.2"
##
## $year
## [1] "2014"
##
## $month
## [1] "10"
##
## $day
## [1] "31"
##
## $`svn rev`
## [1] "66913"
##
## $language
## [1] "R"
##
## $version.string
## [1] "R version 3.1.2 (2014-10-31)"
##
## $nickname
## [1] "Pumpkin Helmet"
date()
## [1] "Mon Aug 31 18:03:00 2015"
Sys.info()
## sysname release version nodename machine
## "Windows" "7 x64" "build 9200" "CHRISTIANPC" "x86-64"
## login user effective_user
## "Christian" "Christian" "Christian"