library(tidyverse)
library(bigsnpr)
library(doParallel)
celiac2 <- snp_attach("backingfiles/celiacQC_sub1.rds")
CHR <- celiac2$map$chromosome
POS <- celiac2$map$physical.pos
G <- celiac2$genotypes
ind.HLA <- snp_indLRLDR(CHR, POS, subset(LD.wiki34, ID == "hild12"))
K <- 0.3
Computation of the maximum AUC from equation (3) of Wray et al. (2010). Approximation is less accurate for high heritabilities.
sapply(c(0.5, 0.8), function(h2) {
T0 <- qnorm(1 - K)
z <- dnorm(T0)
i <- z / K
v <- -i * K / (1 - K)
num <- (i - v) * h2
deno.part <- 1 - h2 * i * (i - T0) + (1 - h2 * v * (v - T0))
pnorm(num / sqrt(h2 * deno.part))
})
## [1] 0.8406370 0.9301013
Assuming that \(y \sim N(0, h^2)\) and knowing that \(\epsilon \sim N(0, 1-h^2)\), we can estimate the theoretical value of the AUC that can be achieved given the heritability \(h^2\) and a liability threshold model on \(y + \epsilon\) with a prevalence of 30%.
nsimu <- 1e7
registerDoParallel(cl <- makeCluster(nb_cores()))
aucs <- foreach(i = seq_len(100), .combine = "cbind") %:%
foreach(h2 = c(0.5, 0.8), .combine = "rbind") %dopar% {
S <- rnorm(nsimu, mean = 0, sd = sqrt(h2))
E <- rnorm(nsimu, mean = 0, sd = sqrt(1 - h2))
Y <- (S + E) > qnorm(1 - K)
bigstatsr::AUC(S, Y)
}
stopCluster(cl)
apply(aucs, 1, summary)
## result.1 result.2
## Min. 0.8407055 0.9404302
## 1st Qu. 0.8409396 0.9405599
## Median 0.8410290 0.9405946
## Mean 0.8410334 0.9405949
## 3rd Qu. 0.8411200 0.9406333
## Max. 0.8413765 0.9407566
get_oracle <- function(
G, ## matrix of genotypes
h2, ## heritability
M, ## nbs of causal variants
ind.possible = cols_along(G), ## indices of possible causal variants
effects.dist = c("gaussian", "laplace"), ## distribution of effects
model = c("simple", "fancy"), ## model for simulation
K = 0.3 ## prevalence
) {
effects.dist <- match.arg(effects.dist)
model <- match.arg(model)
set <- sample(ind.possible, size = M)
effects <- `if`(effects.dist == "gaussian",
rnorm(M, sd = sqrt(h2 / M)),
rmutil::rlaplace(M, s = sqrt(h2 / (2*M))))
if (model == "simple") {
# only linear
y.simu <- scale(G[, set]) %*% effects
} else {
sets <- split(1:M, sample(rep_len(1:3, M)))
# linear
ind1 <- sets[[1]]
y.simu <- scale(G[, set[ind1]]) %*% effects[ind1]
# recessive / dominant
ind2 <- sets[[2]]
y.simu <- y.simu + scale(G[, set[ind2]] > 0.5) %*% effects[ind2]
# interactions
ind3 <- matrix(sets[[3]], ncol = 2)
y.simu <- y.simu + scale(G[, set[ind3[, 1]]] * G[, set[ind3[, 2]]]) %*%
effects[ind3[, 1]] * sqrt(2)
}
y.simu <- y.simu / sd(y.simu) * sqrt(h2)
stopifnot(all.equal(drop(var(y.simu)), h2))
S <- y.simu
y.simu <- S + rnorm(nrow(G), sd = sqrt(1 - h2))
pheno <- as.numeric(y.simu > qnorm(1 - K))
bigstatsr::AUC(S, pheno)
}
params.grid2 <- expand.grid(
par.causal = list(c(30, "all"), c(300, "all"), c(3000, "all"), c(30, "HLA")),
par.dist = c("gaussian", "laplace"),
par.h2 = c(0.5, 0.8),
par.model = c("simple", "fancy"),
num.simu = 1:100,
stringsAsFactors = FALSE
)
AUC_oracle <- function(G, params, ind.HLA) {
par.causal <- params[["par.causal"]][[1]]
# Simulate phenotypes
AUC <- get_oracle(
G,
h2 = params[["par.h2"]],
M = as.integer(par.causal[1]),
ind.possible = `if`(par.causal[2] == "all", bigstatsr::cols_along(G), ind.HLA),
effects.dist = params[["par.dist"]],
model = params[["par.model"]],
K = 0.3
)
params$AUC <- AUC
params
}
registerDoParallel(cl <- makeCluster(nb_cores()))
oracles <- foreach(i = rows_along(params.grid2), .combine = "rbind") %dopar% {
AUC_oracle(G, params.grid2[i, ], ind.HLA)
}
stopCluster(cl)
boot <- pkg.paper.PRS::boot
oracles %>%
mutate(
par.causal = factor(map_chr(par.causal, ~paste(.x[1], .x[2], sep = " in ")),
levels = c("30 in HLA", paste(3 * 10^(1:3), "in all")))) %>%
group_by_at(vars(starts_with("par"))) %>%
summarise(AUC_mean = round(mean(AUC), 4),
AUC_mean_boot = round(boot(AUC), 4)) %>%
arrange(AUC_mean) %>%
print(n = Inf)
## # A tibble: 32 x 6
## # Groups: par.causal, par.dist, par.h2 [16]
## par.causal par.dist par.h2 par.model AUC_mean AUC_mean_boot
## <fct> <chr> <dbl> <chr> <dbl> <dbl>
## 1 30 in HLA laplace 0.5 fancy 0.831 0.0018
## 2 30 in all laplace 0.5 fancy 0.832 0.0018
## 3 30 in HLA gaussian 0.5 fancy 0.833 0.0011
## 4 30 in all gaussian 0.5 fancy 0.834 0.0011
## 5 30 in all laplace 0.5 simple 0.839 0.001
## 6 300 in all laplace 0.5 fancy 0.839 0.0005
## 7 300 in all gaussian 0.5 fancy 0.840 0.0005
## 8 30 in all gaussian 0.5 simple 0.841 0.0005
## 9 300 in all gaussian 0.5 simple 0.841 0.0004
## 10 3000 in all gaussian 0.5 simple 0.841 0.0005
## 11 3000 in all laplace 0.5 fancy 0.841 0.0004
## 12 300 in all laplace 0.5 simple 0.841 0.0005
## 13 3000 in all gaussian 0.5 fancy 0.841 0.0005
## 14 30 in HLA laplace 0.5 simple 0.841 0.0009
## 15 3000 in all laplace 0.5 simple 0.841 0.0005
## 16 30 in HLA gaussian 0.5 simple 0.842 0.0007
## 17 30 in HLA laplace 0.8 fancy 0.932 0.0015
## 18 30 in all laplace 0.8 fancy 0.934 0.0011
## 19 30 in HLA gaussian 0.8 fancy 0.936 0.0008
## 20 30 in all gaussian 0.8 fancy 0.937 0.0007
## 21 300 in all laplace 0.8 fancy 0.940 0.000300
## 22 300 in all gaussian 0.8 fancy 0.940 0.000300
## 23 30 in all laplace 0.8 simple 0.940 0.0007
## 24 30 in all gaussian 0.8 simple 0.94 0.000300
## 25 3000 in all laplace 0.8 simple 0.940 0.000300
## 26 3000 in all gaussian 0.8 fancy 0.940 0.0002
## 27 3000 in all gaussian 0.8 simple 0.940 0.000300
## 28 3000 in all laplace 0.8 fancy 0.941 0.0002
## 29 300 in all gaussian 0.8 simple 0.941 0.0002
## 30 300 in all laplace 0.8 simple 0.941 0.000300
## 31 30 in HLA gaussian 0.8 simple 0.941 0.0005
## 32 30 in HLA laplace 0.8 simple 0.941 0.0007
G6 <- big_copy(G, ind.col = which(CHR == 6))
ind.HLA6 <- snp_indLRLDR(CHR[CHR == 6], POS[CHR == 6],
subset(LD.wiki34, ID == "hild12"))
registerDoParallel(cl <- makeCluster(nb_cores()))
oracles_chr6 <- foreach(i = rows_along(params.grid2), .combine = "rbind") %dopar% {
AUC_oracle(G6, params.grid2[i, ], ind.HLA6)
}
stopCluster(cl)
oracles_chr6 %>%
mutate(
par.causal = factor(map_chr(par.causal, ~paste(.x[1], .x[2], sep = " in ")),
levels = c("30 in HLA", paste(3 * 10^(1:3), "in all")))) %>%
group_by_at(vars(starts_with("par"))) %>%
summarise(AUC_mean = round(mean(AUC), 4),
AUC_mean_boot = round(boot(AUC), 4)) %>%
arrange(AUC_mean) %>%
print(n = Inf)
## # A tibble: 32 x 6
## # Groups: par.causal, par.dist, par.h2 [16]
## par.causal par.dist par.h2 par.model AUC_mean AUC_mean_boot
## <fct> <chr> <dbl> <chr> <dbl> <dbl>
## 1 30 in HLA laplace 0.5 fancy 0.827 0.0021
## 2 30 in all gaussian 0.5 fancy 0.833 0.001
## 3 30 in HLA gaussian 0.5 fancy 0.834 0.001
## 4 30 in all laplace 0.5 fancy 0.834 0.0013
## 5 300 in all laplace 0.5 fancy 0.839 0.0005
## 6 30 in all laplace 0.5 simple 0.840 0.0008
## 7 30 in HLA gaussian 0.5 simple 0.840 0.000600
## 8 300 in all gaussian 0.5 fancy 0.840 0.0004
## 9 30 in all gaussian 0.5 simple 0.840 0.000600
## 10 30 in HLA laplace 0.5 simple 0.840 0.0011
## 11 3000 in all gaussian 0.5 simple 0.841 0.0005
## 12 300 in all laplace 0.5 simple 0.841 0.0005
## 13 3000 in all laplace 0.5 fancy 0.841 0.0005
## 14 3000 in all gaussian 0.5 fancy 0.841 0.0005
## 15 3000 in all laplace 0.5 simple 0.841 0.0005
## 16 300 in all gaussian 0.5 simple 0.842 0.0005
## 17 30 in HLA laplace 0.8 fancy 0.932 0.0015
## 18 30 in all laplace 0.8 fancy 0.933 0.0013
## 19 30 in HLA gaussian 0.8 fancy 0.935 0.0008
## 20 30 in all gaussian 0.8 fancy 0.936 0.0008
## 21 300 in all laplace 0.8 fancy 0.940 0.000300
## 22 300 in all gaussian 0.8 fancy 0.940 0.000300
## 23 30 in HLA laplace 0.8 simple 0.94 0.000600
## 24 30 in all gaussian 0.8 simple 0.940 0.0004
## 25 3000 in all laplace 0.8 fancy 0.940 0.000300
## 26 300 in all laplace 0.8 simple 0.940 0.000300
## 27 3000 in all laplace 0.8 simple 0.940 0.000300
## 28 3000 in all gaussian 0.8 simple 0.940 0.000300
## 29 300 in all gaussian 0.8 simple 0.941 0.0002
## 30 3000 in all gaussian 0.8 fancy 0.941 0.0002
## 31 30 in HLA gaussian 0.8 simple 0.941 0.0004
## 32 30 in all laplace 0.8 simple 0.941 0.0005
Wray, Naomi R, Jian Yang, Michael E Goddard, and Peter M Visscher. 2010. “The Genetic Interpretation of Area Under the Roc Curve in Genomic Profiling.” PLoS Genetics 6 (2). Public Library of Science: e1000864.