library(tidyverse)
# devtools::install_github("privefl/bigsnpr")
library(bigsnpr)
# devtools::install_github("privefl/paper2-PRS/pkg.paper.PRS")
library(pkg.paper.PRS) ## See code source for functions' code.
Each method’s function returns a tibble (data frame) with 4 columns:
10^(-c(0, -log10(5e-8), exp(seq(log(0.1), log(100), length.out = 100)))) %>%
sprintf("%.1e", .) %>%
paste(collapse = ", ")
## [1] "1.0e+00, 5.0e-08, 7.9e-01, 7.8e-01, 7.7e-01, 7.5e-01, 7.4e-01, 7.2e-01, 7.0e-01, 6.9e-01, 6.7e-01, 6.5e-01, 6.3e-01, 6.1e-01, 5.9e-01, 5.7e-01, 5.4e-01, 5.2e-01, 5.0e-01, 4.7e-01, 4.5e-01, 4.2e-01, 3.9e-01, 3.7e-01, 3.4e-01, 3.2e-01, 2.9e-01, 2.7e-01, 2.4e-01, 2.2e-01, 2.0e-01, 1.8e-01, 1.5e-01, 1.3e-01, 1.2e-01, 1.0e-01, 8.5e-02, 7.1e-02, 5.8e-02, 4.8e-02, 3.8e-02, 3.0e-02, 2.3e-02, 1.8e-02, 1.3e-02, 9.8e-03, 7.0e-03, 4.9e-03, 3.3e-03, 2.2e-03, 1.4e-03, 8.8e-04, 5.3e-04, 3.1e-04, 1.7e-04, 9.2e-05, 4.7e-05, 2.3e-05, 1.1e-05, 4.6e-06, 1.9e-06, 7.3e-07, 2.6e-07, 8.8e-08, 2.7e-08, 7.7e-09, 2.0e-09, 4.7e-10, 1.0e-10, 1.9e-11, 3.2e-12, 4.7e-13, 6.0e-14, 6.7e-15, 6.3e-16, 5.0e-17, 3.3e-18, 1.8e-19, 8.1e-21, 2.9e-22, 7.9e-24, 1.7e-25, 2.7e-27, 3.3e-29, 2.9e-31, 1.8e-33, 7.7e-36, 2.2e-38, 4.3e-41, 5.2e-44, 3.8e-47, 1.7e-50, 4.3e-54, 6.0e-58, 4.4e-62, 1.6e-66, 2.8e-71, 2.3e-76, 7.7e-82, 1.1e-87, 5.5e-94, 1.0e-100"
celiac2 <- snp_attach("backingfiles/celiacQC_sub1.rds")
CHR <- celiac2$map$chromosome
POS <- celiac2$map$physical.pos
(G <- celiac2$genotypes)
## A Filebacked Big Matrix of type 'code 256' with 7100 rows and 281122 columns.
(G2 <- big_attach("backingfiles/celiacQC_sub1_tripled1.rds"))
## A Filebacked Big Matrix of type 'code 256' with 7100 rows and 843366 columns.
covar.all <- readRDS("backingfiles/PCA2.rds")$u
n <- nrow(G)
ind.HLA <- snp_indLRLDR(CHR, POS, subset(LD.wiki34, ID == "hild12"))
params.grid1 <- expand.grid(
n.train = 6000,
par.causal = list(c(30, "all"), c(300, "all"), c(3000, "all"), c(30, "HLA")),
par.dist = c("gaussian", "laplace"),
par.h2 = 0.8,
par.model = c("simple", "fancy"),
num.simu = 1:5,
stringsAsFactors = FALSE
)
if (!dir.exists("results1")) dir.create("results1")
for (i in rows_along(params.grid1)) {
res.file <- paste0("results1/simu_", i, ".rds")
if (file.exists(res.file)) next
params <- params.grid1[i, ]
par.causal <- params[["par.causal"]][[1]]
# Simulate phenotypes
simu_pheno <- get_pheno(
G,
h2 = params[["par.h2"]],
M = as.integer(par.causal[1]),
ind.possible = `if`(par.causal[2] == "all", cols_along(G), ind.HLA),
effects.dist = params[["par.dist"]],
model = params[["par.model"]]
)
pheno.all <- simu_pheno$pheno
params[["true_set"]] <- list(simu_pheno$set)
# Split in training/test sets
ind.train <- sort(sample(n, size = params[["n.train"]]))
ind.test <- setdiff(1:n, ind.train)
params[["pheno"]] <- list(pheno.all[ind.test])
# Get results from all methods
res <- bind_rows(
# PRS(G, CHR, POS, pheno.all, covar.all, ind.train, ind.test),
logit.CMSA(G, pheno.all, covar.all, ind.train, ind.test, "logit-simple"),
# logit.CMSA(G2, pheno.all, covar.all, ind.train, ind.test, "logit-triple"),
ttrees("../TTree-source/TTree", "backingfiles/ttrees",
pheno.all, ind.train, ind.test, n.trees = 100)
)
params[["res"]] <- list(res)
saveRDS(unnest(params, res, .drop = FALSE), file = res.file)
}
params.grid2 <- expand.grid(
n.train = 6000,
par.causal = list(c(30, "all"), c(300, "all"), c(3000, "all"), c(30, "HLA")),
par.dist = c("gaussian", "laplace"),
par.h2 = c(0.5, 0.8),
par.model = c("simple", "fancy"),
num.simu = 1:100,
stringsAsFactors = FALSE
)
if (!dir.exists("results2")) dir.create("results2")
for (i in rows_along(params.grid2)) {
res.file <- paste0("results2/simu_", i, ".rds")
if (file.exists(res.file)) next
params <- params.grid2[i, ]
par.causal <- params[["par.causal"]][[1]]
# Simulate phenotypes
simu_pheno <- get_pheno(
G,
h2 = params[["par.h2"]],
M = as.integer(par.causal[1]),
ind.possible = `if`(par.causal[2] == "all", cols_along(G), ind.HLA),
effects.dist = params[["par.dist"]],
model = params[["par.model"]]
)
pheno.all <- simu_pheno$pheno
params[["true_set"]] <- list(simu_pheno$set)
# Split in training/test sets
ind.train <- sort(sample(n, size = params[["n.train"]]))
ind.test <- setdiff(1:n, ind.train)
params[["pheno"]] <- list(pheno.all[ind.test])
# Get results from all methods
res <- bind_rows(
PRS(G, CHR, POS, pheno.all, covar.all, ind.train, ind.test),
logit.CMSA(G, pheno.all, covar.all, ind.train, ind.test, "logit-simple"),
logit.CMSA(G2, pheno.all, covar.all, ind.train, ind.test, "logit-triple")
)
params[["res"]] <- list(res)
saveRDS(unnest(params, res, .drop = FALSE), file = res.file)
}
params.grid4 <- expand.grid(
n.train = 6000,
par.causal = list(c(30, "all"), c(300, "all"), c(3000, "all"), c(30, "HLA")),
par.dist = c("gaussian", "laplace"),
par.h2 = c(0.5, 0.8),
par.model = "simple",
num.simu = 1:100,
stringsAsFactors = FALSE
)
if (!dir.exists("results4")) dir.create("results4")
G6 <- big_copy(G, ind.col = which(CHR == 6))
ind.HLA6 <- snp_indLRLDR(CHR[CHR == 6], POS[CHR == 6],
subset(LD.wiki34, ID == "hild12"))
for (i in rows_along(params.grid4)) {
res.file <- paste0("results4/simu_", i, ".rds")
if (file.exists(res.file)) next
params <- params.grid4[i, ]
par.causal <- params[["par.causal"]][[1]]
# Simulate phenotypes
simu_pheno <- get_pheno(
G6,
h2 = params[["par.h2"]],
M = as.integer(par.causal[1]),
ind.possible = `if`(par.causal[2] == "all", cols_along(G6), ind.HLA6),
effects.dist = params[["par.dist"]],
model = params[["par.model"]]
)
pheno.all <- simu_pheno$pheno
params[["true_set"]] <- list(simu_pheno$set)
# Split in training/test sets
ind.train <- sort(sample(n, size = params[["n.train"]]))
ind.test <- setdiff(1:n, ind.train)
params[["pheno"]] <- list(pheno.all[ind.test])
# Get results from all methods
res <- bind_rows(
PRS(G6, CHR[CHR == 6], POS[CHR == 6], pheno.all, covar.all, ind.train, ind.test),
logit.CMSA(G6, pheno.all, covar.all, ind.train, ind.test, "logit-simple")
)
params[["res"]] <- list(res)
saveRDS(unnest(params, res, .drop = FALSE), file = res.file)
}
params.grid5 <- expand.grid(
n.train = 1:5 * 1000,
par.causal = list(c(300, "all")),
par.dist = c("gaussian", "laplace"),
par.h2 = c(0.5, 0.8),
par.model = "simple",
num.simu = 1:100,
stringsAsFactors = FALSE
)
if (!dir.exists("results5")) dir.create("results5")
for (i in rows_along(params.grid5)) {
res.file <- paste0("results5/simu_", i, ".rds")
if (file.exists(res.file)) next
params <- params.grid5[i, ]
par.causal <- params[["par.causal"]][[1]]
# Simulate phenotypes
simu_pheno <- get_pheno(
G,
h2 = params[["par.h2"]],
M = as.integer(par.causal[1]),
ind.possible = `if`(par.causal[2] == "all", cols_along(G), ind.HLA),
effects.dist = params[["par.dist"]],
model = params[["par.model"]]
)
pheno.all <- simu_pheno$pheno
params[["true_set"]] <- list(simu_pheno$set)
# Split in training/test sets
ind.train <- sort(sample(n, size = params[["n.train"]]))
ind.test <- setdiff(1:n, ind.train)
params[["pheno"]] <- list(pheno.all[ind.test])
# Get results from all methods
res <- bind_rows(
PRS(G, CHR, POS, pheno.all, covar.all, ind.train, ind.test),
logit.CMSA(G, pheno.all, covar.all, ind.train, ind.test, "logit-simple")
)
params[["res"]] <- list(res)
saveRDS(unnest(params, res, .drop = FALSE), file = res.file)
}
library(biglasso)
library(Matrix)
G3 <- bigmemory::big.matrix(nrow(G), ncol(G) + ncol(covar.all),
backingfile = "G-PC",
backingpath = "backingfiles")
big_apply(G, function(X, ind) { G3[, ind] <- X[, ind]; NULL }, a.combine = 'c')
G3[, ncol(G) + cols_along(covar.all)] <- covar.all
G3 <- bigmemory::attach.big.matrix("backingfiles/G-PC.desc")
logit.biglasso <- function(G3, pheno.all, covar.all, ind.train, ind.test) {
timing <- system.time({
biglasso <- biglasso(G3, pheno.all, ind.train, ncores = nb_cores(),
penalty = "lasso", alpha = 1, family = "binomial")
preds <- 1 / (1 + exp(-predict(biglasso, G3, ind.test)))
})[3]
aucs <- apply(preds, 2, bigstatsr::AUC, target = pheno.all[ind.test])
ind.max <- which.max(aucs)
tibble(
method = "biglasso",
pred = list(preds[, ind.max]),
timing = timing,
alpha = 1,
set = list(which(head(biglasso$beta[, ind.max], -ncol(covar.all)) != 0))
)
}
params.grid7 <- expand.grid(
n.train = 6000,
par.causal = list(c(30, "all"), c(300, "all"), c(3000, "all"), c(30, "HLA")),
par.dist = c("gaussian", "laplace"),
par.h2 = c(0.8),
par.model = c("simple"),
num.simu = 1:100,
stringsAsFactors = FALSE
)
if (!dir.exists("results7")) dir.create("results7")
for (i in rows_along(params.grid7)) {
res.file <- paste0("results7/simu_", i, ".rds")
if (file.exists(res.file)) next
params <- params.grid7[i, ]
par.causal <- params[["par.causal"]][[1]]
# Simulate phenotypes
simu_pheno <- get_pheno(
G,
h2 = params[["par.h2"]],
M = as.integer(par.causal[1]),
ind.possible = `if`(par.causal[2] == "all", cols_along(G), ind.HLA),
effects.dist = params[["par.dist"]],
model = params[["par.model"]]
)
pheno.all <- simu_pheno$pheno
params[["true_set"]] <- list(simu_pheno$set)
# Split in training/test sets
ind.train <- sort(sample(n, size = params[["n.train"]]))
ind.test <- setdiff(1:n, ind.train)
params[["pheno"]] <- list(pheno.all[ind.test])
# Get results from all methods
res <- bind_rows(
logit.CMSA(G, pheno.all, covar.all, ind.train, ind.test, "logit-simple", alphas = 1),
logit.biglasso(G3, pheno.all, covar.all, ind.train, ind.test)
)
params[["res"]] <- list(res)
saveRDS(unnest(params, res, .drop = FALSE), file = res.file)
}