--- title: "Replication studies considered harmful" author: "Martin Shepperd" date: "8/2/2018" output: html_document --- #Replication studies considered harmful This is a deliberately contentious title for a short paper accepted for the ACM/IEEE ICSE2018 New and Interesting Results section. ##How similar must a replication be? Even if we assume no publication bias, and an exact reproduction of the materials and experimental design, sampling and measurement error can have a substantial impact upon results such that the original and replication studies may differ considerably. Some R initialisation and define the main function RepSim and a second function ```{r} require(effsize) set.seed(4159) sizenames <- c("Large Neg","Med Neg","Small Neg","None","Small Pos","Med Pos","Large Pos") sizes <- c(-0.8,-0.5,-0.2,0,0.2,0.5,0.8) names(sizes) <- sizenames RepSim <- function(nSims=10000,x.nParts=30,y.nParts=30,x.mean=100,y.mean=100, x.sd=20,y.sd=20,depndt=T,x.mixProb=0.0,y.mixProb=0.0) { # Martin Shepperd - 18.10.2017 # # nSims = number of simulated experiments # x.nParts = number of participants for treatment X # y.nParts = number of participants for treatment Y # depndt = whether X and Y are paired (for the effect sz calcs) # x.mixProb = the probability that the X distribution is contaminated with x3 sd # y.mixProb = the probability that the Y distribution is contaminated with x3 sd p <-numeric(nSims) #set up empty container for all simulated p-values es <- numeric(nSims) #set up empty container for all simulated effect sizes es.lo <- es.hi <- numeric(nSims) #set up empty containers for effect size confidence limits sizenames <- c("Large Neg","Med Neg","Small Neg","None Neg","None Pos","Small Pos","Med Pos","Large Pos") sizes <- c(-0.8,-0.5,-0.2,0,0.2,0.5,0.8,1.0) names(sizes) <- sizenames contam.level <- 10 for(i in 1:nSims){ #for each simulated experiment if (x.mixProb > 0) { sd.choice <- sample(c(x.sd,(x.sd*contam.level)), x.nParts, replace=T, prob=c(1-x.mixProb,x.mixProb)) x <- rnorm(x.nParts, x.mean, sd.choice) } else { x<-rnorm(n=x.nParts, mean = x.mean, sd = x.sd) } if (y.mixProb > 0) { sd.choice <- sample(c(y.sd,(y.sd*contam.level)), y.nParts, replace=T, prob=c(1-y.mixProb,y.mixProb)) y <- rnorm(y.nParts, y.mean, sd.choice) } else { y<-rnorm(n=y.nParts, mean = y.mean, sd = y.sd) } z<-t.test(x,y) #perform the t-test p[i]<-z$p.value #get the p-value and store it temp<-cohen.d(x,y,pooled=T,paired=depndt) es[i] <- as.numeric(temp[3]) temp1 <- temp[4] es.lo[i] <- as.numeric(temp1$conf.int["inf"]) es.hi[i] <- as.numeric(temp1$conf.int["sup"]) } hist(es, main="Histogram of Effect Sizes (cohen d)", xlab=("Observed d")) message(c("Number of expts with at least a small effect sz: ",length(which(es >= 0.2)))) sizeresults <- ecdf(es)(sizes) names(sizeresults) <- sizenames message(c("Cummulative distribution of effect sizes: ",sizeresults)) hist(p, main="Histogram of p-values (true group difference)", xlab=("Observed p-value")) percntl <- ecdf(p) message("5th percentile is: ", percntl(0.05)) return(cbind(p,es,es.lo,es.hi)) } ###### end function RepSim CompStudiesSim <- function(nSims=10000,r) { # This function randomly draws 2 studies from the nSims that have been simulated, # compares them and repeats until no studies are left. nComps <- nSims/2 # since we don't want to use any simulated study more than once effectdiff <- numeric(nComps) signdiff <- character(nComps) t1 <-sample(1:10000,size = nSims,replace = F) t1.pos <- 1 for (i in 1:nComps) { orig.d <- as.numeric(r[t1[t1.pos],2]) rep.d <- as.numeric(r[t1[t1.pos+1],2]) effectdiff[i] <- orig.d - rep.d if ((orig.d > 0) & (rep.d > 0)) { signdiff[i] <- "++" } else if (orig.d > 0 & rep.d <= 0) { signdiff[i] <- "+-" } else if (orig.d <= 0 & rep.d > 0) { signdiff[i] <- "-+" } else { signdiff[i] <- "--" } t1.pos <- t1.pos+2 } as.factor(signdiff) as.numeric(effectdiff) return(cbind.data.frame(effectdiff,signdiff)) } ##### end of function CompStudiesSim ``` ### A basic simulation, with same means and sd's Here we assume equal mean, sd and two independent samples size 30, in other words there should be *no effect*. ```{r} r0 <- RepSim(nSims=10000,x.nParts=30,y.nParts=30,x.mean=100,y.mean=100,x.sd=20,y.sd=20,depndt=F) sizeresults <- ecdf(r0[,2])(sizes) names(sizeresults) <- sizenames sizeresults ``` To interpret the above, even though there is no effect only just over half (55%) report no effect. Then we must consider the likelihood of replicating what could be an incorrect result. An astonishing 0.3% of studies would report at least a large effect when there is none. The confusion matrix (showing probabilities rather than counts is ...) By simply comparing results yields almost no insight. ###Different means but similar variances Consider the situation where there is a true positive (small) effect size with d=0.2 (100-96/20). ```{r} r1 <- RepSim(nSims=10000,x.nParts=30,y.nParts=30,x.mean=100,y.mean=96,x.sd=20,y.sd=20,depndt=F) sizeresults <- ecdf(r1[,2])(sizes) names(sizeresults) <- sizenames sizeresults ``` Here the distribution of p values is shifted more towards lower values. Unlike the no-effect case we see ~11% of all experiments find a significant effect (i.e., p<0.05) which given there is a true effect is not encouraging. ###Contaminated normal distributions We restrict our mixtures to those within the parametric family, specifically Gaussian. So we add an additional small proportion (typically the *mixture weights* are 0.2) with a greater variance (x10 typically) to yield a heavy tailed distribution. So we generate two new cases of no-effect (r2) and small effect (r3) with contaminated distributions. ```{r} r2 <- RepSim(nSims=10000,x.nParts=30,y.nParts=30,x.mean=100,y.mean=100,x.sd=20,y.sd=20,depndt=F,x.mixProb = 0.2,y.mixProb = 0.2) r3 <- RepSim(nSims=10000,x.nParts=30,y.nParts=30,x.mean=100,y.mean=96,x.sd=20,y.sd=20,depndt=F,x.mixProb = 0.2,y.mixProb = 0.2) ``` ## Simulating Replications ```{r} CompStudiesSim <- function(nSims=10000,r) { # This function randomly draws 2 studies from the nSims that have been simulated, # compares them and repeats until no studies are left. nComps <- nSims/2 # since we don't want to use any simulated study more than once effectdiff <- numeric(nComps) signdiff <- character(nComps) t1 <-sample(1:10000,size = nSims,replace = F) t1.pos <- 1 for (i in 1:nComps) { orig.d <- as.numeric(r[t1[t1.pos],2]) rep.d <- as.numeric(r[t1[t1.pos+1],2]) effectdiff[i] <- orig.d - rep.d if ((orig.d > 0) & (rep.d > 0)) { signdiff[i] <- "++" } else if (orig.d > 0 & rep.d <= 0) { signdiff[i] <- "+-" } else if (orig.d <= 0 & rep.d > 0) { signdiff[i] <- "-+" } else { signdiff[i] <- "--" } t1.pos <- t1.pos+2 } as.factor(signdiff) as.numeric(effectdiff) return(cbind.data.frame(effectdiff,signdiff)) } ##### end of function CompStudiesSim ``` Now to test for each distribution. ```{r} rep0 <- CompStudiesSim(10000,r0) table(rep0$signdiff) rep1 <- CompStudiesSim(10000,r1) table(rep1$signdiff) rep2 <- CompStudiesSim(10000,r2) table(rep2$signdiff) rep3 <- CompStudiesSim(10000,r3) table(rep3$signdiff) ``` ## Finally some simple meta-analysis This illustrates for Briand experiment how to combine results from two studies (Briand et al 1997; Briand et al 2001). ```{r} require(metafor) require(esc) s1 <- escalc("SMD",m1i=98.1,m2i = 82.9,sd1i = 4.5,sd2i = 13.8,n1i = 7,n2i = 6, slab = "Briand et al 1997") s2 <- escalc("SMD",m1i=5.64,m2i = 4.31,sd1i = 1.03,sd2i = 1.38,n1i =33,n2i = 32, slab = "Briand et al 2001") mta <- rbind(s1,s2) res <- rma(yi, vi, data = mta) forest(res,refline = 1.14) # 1.14 shows the smmary effect visually ```