Data and useful functions

library(tidyverse)
library(bigsnpr)
library(doParallel)
celiac2 <- snp_attach("backingfiles/celiacQC_sub1.rds")
CHR <- celiac2$map$chromosome
POS <- celiac2$map$physical.pos
G <- celiac2$genotypes
ind.HLA <- snp_indLRLDR(CHR, POS, subset(LD.wiki34, ID == "hild12"))

Get maximum AUCs

K <- 0.3

Estimation from Wray 2010

Computation of the maximum AUC from equation (3) of Wray et al. (2010). Approximation is less accurate for high heritabilities.

sapply(c(0.5, 0.8), function(h2) {
  T0 <- qnorm(1 - K)
  z <- dnorm(T0)
  i <- z / K
  v <- -i * K / (1 - K)
  num <- (i - v) * h2
  deno.part <- 1 - h2 * i * (i - T0) + (1 - h2 * v * (v - T0))
  pnorm(num / sqrt(h2 * deno.part))
})
## [1] 0.8406370 0.9301013

Common estimation for all simulations

Assuming that \(y \sim N(0, h^2)\) and knowing that \(\epsilon \sim N(0, 1-h^2)\), we can estimate the theoretical value of the AUC that can be achieved given the heritability \(h^2\) and a liability threshold model on \(y + \epsilon\) with a prevalence of 30%.

nsimu <- 1e7

registerDoParallel(cl <- makeCluster(nb_cores()))
aucs <- foreach(i = seq_len(100), .combine = "cbind") %:%
  foreach(h2 = c(0.5, 0.8), .combine = "rbind") %dopar% {
    
    S <- rnorm(nsimu, mean = 0, sd = sqrt(h2))
    E <- rnorm(nsimu, mean = 0, sd = sqrt(1 - h2))
    Y <- (S + E) > qnorm(1 - K)
    
    bigstatsr::AUC(S, Y)
  }
stopCluster(cl)

apply(aucs, 1, summary)
##          result.1  result.2
## Min.    0.8407055 0.9404302
## 1st Qu. 0.8409396 0.9405599
## Median  0.8410290 0.9405946
## Mean    0.8410334 0.9405949
## 3rd Qu. 0.8411200 0.9406333
## Max.    0.8413765 0.9407566

Different estimations for all simulations

get_oracle <- function(
  G,                                        ## matrix of genotypes
  h2,                                       ## heritability 
  M,                                        ## nbs of causal variants
  ind.possible = cols_along(G),             ## indices of possible causal variants
  effects.dist = c("gaussian", "laplace"),  ## distribution of effects 
  model = c("simple", "fancy"),             ## model for simulation
  K = 0.3                                   ## prevalence
) {
  
  effects.dist  <- match.arg(effects.dist)
  model <- match.arg(model)
  
  set <- sample(ind.possible, size = M)
  effects <- `if`(effects.dist == "gaussian", 
                  rnorm(M, sd = sqrt(h2 / M)),
                  rmutil::rlaplace(M, s = sqrt(h2 / (2*M))))
  
  if (model == "simple") {
    # only linear
    y.simu <- scale(G[, set]) %*% effects
  } else {
    sets <- split(1:M, sample(rep_len(1:3, M)))
    # linear
    ind1 <- sets[[1]]
    y.simu <- scale(G[, set[ind1]]) %*% effects[ind1]
    # recessive / dominant
    ind2 <- sets[[2]]
    y.simu <- y.simu + scale(G[, set[ind2]] > 0.5) %*% effects[ind2]
    # interactions
    ind3 <- matrix(sets[[3]], ncol = 2)
    y.simu <- y.simu + scale(G[, set[ind3[, 1]]] * G[, set[ind3[, 2]]]) %*% 
      effects[ind3[, 1]] * sqrt(2)
  }
  
  y.simu <- y.simu / sd(y.simu) * sqrt(h2)
  stopifnot(all.equal(drop(var(y.simu)), h2))
  S <- y.simu
  y.simu <- S + rnorm(nrow(G), sd = sqrt(1 - h2))
  pheno <- as.numeric(y.simu > qnorm(1 - K))
  bigstatsr::AUC(S, pheno)
}
params.grid2 <- expand.grid(
  par.causal = list(c(30, "all"), c(300, "all"), c(3000, "all"), c(30, "HLA")), 
  par.dist   = c("gaussian", "laplace"), 
  par.h2     = c(0.5, 0.8), 
  par.model  = c("simple", "fancy"),
  num.simu   = 1:100,
  stringsAsFactors = FALSE
) 
AUC_oracle <- function(G, params, ind.HLA) {
  
  par.causal <- params[["par.causal"]][[1]]
  
  # Simulate phenotypes
  AUC <- get_oracle(
    G,    
    h2 = params[["par.h2"]], 
    M = as.integer(par.causal[1]), 
    ind.possible = `if`(par.causal[2] == "all", bigstatsr::cols_along(G), ind.HLA),
    effects.dist = params[["par.dist"]], 
    model = params[["par.model"]], 
    K = 0.3                 
  )
  
  params$AUC <- AUC
  params
}

With all 22 chromosomes

registerDoParallel(cl <- makeCluster(nb_cores()))
oracles <- foreach(i = rows_along(params.grid2), .combine = "rbind") %dopar% {
  AUC_oracle(G, params.grid2[i, ], ind.HLA)
}
stopCluster(cl)
boot <- pkg.paper.PRS::boot
oracles %>% 
  mutate(
    par.causal = factor(map_chr(par.causal, ~paste(.x[1], .x[2], sep = " in ")),
                        levels = c("30 in HLA", paste(3 * 10^(1:3), "in all")))) %>%
  group_by_at(vars(starts_with("par"))) %>%
  summarise(AUC_mean = round(mean(AUC), 4),
            AUC_mean_boot = round(boot(AUC), 4)) %>%
  arrange(AUC_mean) %>%
  print(n = Inf)
## # A tibble: 32 x 6
## # Groups:   par.causal, par.dist, par.h2 [16]
##    par.causal  par.dist par.h2 par.model AUC_mean AUC_mean_boot
##    <fct>       <chr>     <dbl> <chr>        <dbl>         <dbl>
##  1 30 in HLA   laplace     0.5 fancy        0.831      0.0018  
##  2 30 in all   laplace     0.5 fancy        0.832      0.0018  
##  3 30 in HLA   gaussian    0.5 fancy        0.833      0.0011  
##  4 30 in all   gaussian    0.5 fancy        0.834      0.0011  
##  5 30 in all   laplace     0.5 simple       0.839      0.001   
##  6 300 in all  laplace     0.5 fancy        0.839      0.0005  
##  7 300 in all  gaussian    0.5 fancy        0.840      0.0005  
##  8 30 in all   gaussian    0.5 simple       0.841      0.0005  
##  9 300 in all  gaussian    0.5 simple       0.841      0.0004  
## 10 3000 in all gaussian    0.5 simple       0.841      0.0005  
## 11 3000 in all laplace     0.5 fancy        0.841      0.0004  
## 12 300 in all  laplace     0.5 simple       0.841      0.0005  
## 13 3000 in all gaussian    0.5 fancy        0.841      0.0005  
## 14 30 in HLA   laplace     0.5 simple       0.841      0.0009  
## 15 3000 in all laplace     0.5 simple       0.841      0.0005  
## 16 30 in HLA   gaussian    0.5 simple       0.842      0.0007  
## 17 30 in HLA   laplace     0.8 fancy        0.932      0.0015  
## 18 30 in all   laplace     0.8 fancy        0.934      0.0011  
## 19 30 in HLA   gaussian    0.8 fancy        0.936      0.0008  
## 20 30 in all   gaussian    0.8 fancy        0.937      0.0007  
## 21 300 in all  laplace     0.8 fancy        0.940      0.000300
## 22 300 in all  gaussian    0.8 fancy        0.940      0.000300
## 23 30 in all   laplace     0.8 simple       0.940      0.0007  
## 24 30 in all   gaussian    0.8 simple       0.94       0.000300
## 25 3000 in all laplace     0.8 simple       0.940      0.000300
## 26 3000 in all gaussian    0.8 fancy        0.940      0.0002  
## 27 3000 in all gaussian    0.8 simple       0.940      0.000300
## 28 3000 in all laplace     0.8 fancy        0.941      0.0002  
## 29 300 in all  gaussian    0.8 simple       0.941      0.0002  
## 30 300 in all  laplace     0.8 simple       0.941      0.000300
## 31 30 in HLA   gaussian    0.8 simple       0.941      0.0005  
## 32 30 in HLA   laplace     0.8 simple       0.941      0.0007

With chromosome 6 only

G6 <- big_copy(G, ind.col = which(CHR == 6))
ind.HLA6 <- snp_indLRLDR(CHR[CHR == 6], POS[CHR == 6], 
                         subset(LD.wiki34, ID == "hild12"))

registerDoParallel(cl <- makeCluster(nb_cores()))
oracles_chr6 <- foreach(i = rows_along(params.grid2), .combine = "rbind") %dopar% {
  AUC_oracle(G6, params.grid2[i, ], ind.HLA6)
}
stopCluster(cl)
oracles_chr6 %>% 
  mutate(
    par.causal = factor(map_chr(par.causal, ~paste(.x[1], .x[2], sep = " in ")),
                        levels = c("30 in HLA", paste(3 * 10^(1:3), "in all")))) %>%
  group_by_at(vars(starts_with("par"))) %>%
  summarise(AUC_mean = round(mean(AUC), 4),
            AUC_mean_boot = round(boot(AUC), 4)) %>%
  arrange(AUC_mean) %>%
  print(n = Inf)
## # A tibble: 32 x 6
## # Groups:   par.causal, par.dist, par.h2 [16]
##    par.causal  par.dist par.h2 par.model AUC_mean AUC_mean_boot
##    <fct>       <chr>     <dbl> <chr>        <dbl>         <dbl>
##  1 30 in HLA   laplace     0.5 fancy        0.827      0.0021  
##  2 30 in all   gaussian    0.5 fancy        0.833      0.001   
##  3 30 in HLA   gaussian    0.5 fancy        0.834      0.001   
##  4 30 in all   laplace     0.5 fancy        0.834      0.0013  
##  5 300 in all  laplace     0.5 fancy        0.839      0.0005  
##  6 30 in all   laplace     0.5 simple       0.840      0.0008  
##  7 30 in HLA   gaussian    0.5 simple       0.840      0.000600
##  8 300 in all  gaussian    0.5 fancy        0.840      0.0004  
##  9 30 in all   gaussian    0.5 simple       0.840      0.000600
## 10 30 in HLA   laplace     0.5 simple       0.840      0.0011  
## 11 3000 in all gaussian    0.5 simple       0.841      0.0005  
## 12 300 in all  laplace     0.5 simple       0.841      0.0005  
## 13 3000 in all laplace     0.5 fancy        0.841      0.0005  
## 14 3000 in all gaussian    0.5 fancy        0.841      0.0005  
## 15 3000 in all laplace     0.5 simple       0.841      0.0005  
## 16 300 in all  gaussian    0.5 simple       0.842      0.0005  
## 17 30 in HLA   laplace     0.8 fancy        0.932      0.0015  
## 18 30 in all   laplace     0.8 fancy        0.933      0.0013  
## 19 30 in HLA   gaussian    0.8 fancy        0.935      0.0008  
## 20 30 in all   gaussian    0.8 fancy        0.936      0.0008  
## 21 300 in all  laplace     0.8 fancy        0.940      0.000300
## 22 300 in all  gaussian    0.8 fancy        0.940      0.000300
## 23 30 in HLA   laplace     0.8 simple       0.94       0.000600
## 24 30 in all   gaussian    0.8 simple       0.940      0.0004  
## 25 3000 in all laplace     0.8 fancy        0.940      0.000300
## 26 300 in all  laplace     0.8 simple       0.940      0.000300
## 27 3000 in all laplace     0.8 simple       0.940      0.000300
## 28 3000 in all gaussian    0.8 simple       0.940      0.000300
## 29 300 in all  gaussian    0.8 simple       0.941      0.0002  
## 30 3000 in all gaussian    0.8 fancy        0.941      0.0002  
## 31 30 in HLA   gaussian    0.8 simple       0.941      0.0004  
## 32 30 in all   laplace     0.8 simple       0.941      0.0005

References

Wray, Naomi R, Jian Yang, Michael E Goddard, and Peter M Visscher. 2010. “The Genetic Interpretation of Area Under the Roc Curve in Genomic Profiling.” PLoS Genetics 6 (2). Public Library of Science: e1000864.