library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ purrr 0.2.5
## ✔ tibble 2.0.0.9000 ✔ dplyr 0.8.0.9000
## ✔ tidyr 0.8.2 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ──────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(bigsnpr)
## Loading required package: bigstatsr
library(pkg.paper.PRS)
pAUC <- pkg.paper.PRS:::pAUC
# Read file, see "Preprocessing" notebook
celiac <- snp_attach("backingfiles/celiacQC.rds")
G <- celiac$genotypes
CHR <- celiac$map$chromosome
POS <- celiac$map$physical.pos
NCORES <- nb_cores()
y <- celiac$fam$affection - 1
obj.svd <- readRDS("backingfiles/PCA.rds")
G2 <- big_attach("backingfiles/celiacQC_tripled1.rds")
# Divide in training/test sets
dim(G)
## [1] 15155 281122
set.seed(1)
ind.train <- sort(sample(nrow(G), size = 12e3))
ind.test <- setdiff(rows_along(G), ind.train)
# GWAS
system.time(
gwas.train <- big_univLogReg(
G, y[ind.train], ind.train = ind.train,
covar.train = obj.svd$u[ind.train, , drop = FALSE],
ncores = NCORES
)
)
## user system elapsed
## 0.744 0.072 143.566
# Q-Q plot
snp_qq(gwas.train) +
coord_cartesian(ylim = c(0, 15))
gwas.train.gc <- snp_gc(gwas.train)
# Manhattan plot
labels <- 1:22; labels[c(11, 13, 15, 17, 18, 20, 21)] <- ""
cowplot::plot_grid(
snp_manhattan(gwas.train.gc, CHR, POS, labels = labels),
snp_manhattan(gwas.train.gc, CHR, POS, labels = labels) +
coord_cartesian(ylim = c(0, 22)) +
geom_hline(yintercept = -log10(5e-8), color = "red", linetype = 3),
align = "hv", ncol = 1, labels = LETTERS[1:2], label_size = 25, scale = 0.95
)
# Clumping on the test set
ind.keep <- snp_clumping(G, infos.chr = CHR,
ind.row = ind.test,
thr.r2 = 0.2,
S = abs(gwas.train.gc$score),
size = 500,
is.size.in.bp = TRUE,
infos.pos = POS,
ncores = NCORES)
# C+T
thrs <- c(0, -log10(5e-8), exp(seq(log(0.1), log(100), length.out = 100)))
lpS <- -predict(gwas.train.gc)
prs <- snp_PRS(G, betas.keep = gwas.train.gc$estim[ind.keep],
ind.test = ind.test,
ind.keep = ind.keep,
lpS.keep = lpS[ind.keep],
thr.list = thrs)
nb.pred <- sapply(thrs, function(thr) sum(lpS[ind.keep] > thr))
aucs <- apply(prs, 2, AUC, target = y[ind.test])
summary(aucs)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.8121 0.8159 0.8168 0.8177 0.8191 0.8246
nb.pred[which.max(aucs)]
## [1] 9573
plot_density_AUC <- function(pred, true = y[ind.test]) {
true_fct <- forcats::fct_recode(as.factor(true), Control = "0", Case = "1")
data.frame(Score = pred, Status = true_fct) %>%
ggplot() +
geom_density(aes(Score, fill = Status), alpha = 0.4) +
ggtitle(sprintf("AUC = %s%% | pAUC = %s",
round(100 * AUC(pred, true), 2),
round(pAUC(pred, true), 4))) +
theme_bigstatsr() +
theme(plot.title = element_text(size = rel(1.5), hjust = 0.5))
}
(p1 <- plot_density_AUC(prs[, which.max(aucs)]))
Cross-Model Selection and Averaging (CMSA):
# Train PLR
system.time(
logit <- big_spLogReg(X = G, y01.train = y[ind.train],
ind.train = ind.train,
covar.train = obj.svd$u[ind.train, ],
ncores = NCORES, alphas = 1)
)
## user system elapsed
## 2.572 0.356 33.471
pred1 <- predict(logit, X = G, ind.row = ind.test,
covar.row = obj.svd$u[ind.test, ])
(nb.pred1 <- summary(logit, best.only = TRUE)$nb_var)
## [1] 2215
(aucs1 <- AUC(pred1, target = y[ind.test]))
## [1] 0.8870682
(p2 <- plot_density_AUC(pred1))
system.time(
logit2 <- big_spLogReg(X = G2, y01.train = y[ind.train],
ind.train = ind.train,
covar.train = obj.svd$u[ind.train, ],
ncores = NCORES, alphas = 1)
)
## user system elapsed
## 3.596 0.816 140.317
pred2 <- predict(logit2, X = G2, ind.row = ind.test,
covar.row = obj.svd$u[ind.test, ])
(nb.pred2 <- summary(logit2, best.only = TRUE)$nb_var)
## [1] 4389
(aucs2 <- AUC(pred2, target = y[ind.test]))
## [1] 0.8925798
(p3 <- plot_density_AUC(pred2))
list(p1, p2, p3) %>%
lapply(function(p) p + theme(legend.position = "none")) %>%
cowplot::plot_grid(plotlist = ., ncol = 1, labels = LETTERS[1:3], label_size = 20, scale = 0.95) %>%
cowplot::plot_grid(., cowplot::get_legend(p1), rel_widths = c(1, 0.15), scale = 0.95)
ggsave("figures/supp-score-densities.pdf", scale = 1/100, width = 780, height = 900)
library(plotROC)
scores_tidy <- tibble(
d = y[ind.test],
"C+T-max" = prs[, which.max(aucs)],
"PLR" = pred1,
"PLR3" = pred2
) %>%
gather(key = "Method", value = "Score", -d)
methods.color <- setNames(scales::hue_pal()(6),
c("PLR", "C+T-max", "C+T-stringent",
"T-Trees", "PLR3", "C+T-all"))
ggplot(scores_tidy, aes(d = d, m = Score, color = Method, linetype = Method)) +
style_roc(xlab = "1 - Specificity", ylab = "Sensitivity") +
theme_bigstatsr() +
geom_roc(n.cuts = 0, size = 2) +
theme(legend.position = c(0.7, 0.3), legend.key.width = unit(4, "line")) +
coord_equal() +
scale_color_manual(values = methods.color)
ggsave("figures/celiac-roc.pdf", scale = 1/90, width = 700, height = 700)
Each method’s function returns a tibble (data frame) with 4 columns:
if (!dir.exists("results6")) dir.create("results6")
pheno.all <- y
covar.all <- obj.svd$u
for (i in 1:100) {
res.file <- paste0("results6/simu_", i, ".rds")
if (file.exists(res.file)) next
# Split in training/test sets
ind.train <- sort(sample(nrow(G), size = 12e3))
ind.test <- setdiff(rows_along(G), ind.train)
# Get results from all methods
res <- bind_rows(
filter(PRS(G, CHR, POS, pheno.all, covar.all, ind.train, ind.test), thr.r2 == 0.2),
logit.CMSA(G, pheno.all, covar.all, ind.train, ind.test, "logit-simple"),
logit.CMSA(G2, pheno.all, covar.all, ind.train, ind.test, "logit-triple")
)
res$pheno <- list(pheno.all[ind.test])
saveRDS(res, file = res.file)
}
list.files(path = "results6/", full.names = TRUE) %>%
map_df(readRDS) %>%
as_tibble() %>%
mutate(
method = fct_recode(sub("PRS", "C+T", method),
PLR = "logit-simple", PLR3 = "logit-triple"),
AUC = map2_dbl(pred, pheno, ~bigstatsr::AUC(.x, .y)),
pAUC = map2_dbl(pred, pheno, ~pAUC(.x, .y)),
nb.preds = map_int(set, length)
) %>%
group_by(method) %>%
summarize_at(c("AUC", "pAUC", "nb.preds", "timing"), function(x) {
glue::glue("{signif(mean(x), 3)} ({signif(boot(x), 3)})")
}) %>%
print() %>%
xtable::xtable()
## # A tibble: 5 x 5
## method AUC pAUC nb.preds timing
## <fct> <S3: glue> <S3: glue> <S3: glue> <S3: glue>
## 1 C+T-all 0.818 (0.000662) 0.0299 (0.000167) 96100 (6.78) 136 (0.43)
## 2 C+T-max 0.827 (0.000663) 0.0292 (0.000184) 10100 (781) 136 (0.431)
## 3 C+T-stringent 0.819 (0.000689) 0.0276 (0.000167) 185 (0.329) 136 (0.428)
## 4 PLR 0.889 (0.000489) 0.0418 (0.000199) 3250 (63.6) 273 (1.29)
## 5 PLR3 0.894 (0.000489) 0.0432 (0.000195) 4530 (83.4) 438 (2.38)
## % latex table generated in R 3.4.4 by xtable 1.8-3 package
## % Wed Jan 23 20:09:56 2019
## \begin{table}[ht]
## \centering
## \begin{tabular}{rlllll}
## \hline
## & method & AUC & pAUC & nb.preds & timing \\
## \hline
## 1 & C+T-all & 0.818 (0.000662) & 0.0299 (0.000167) & 96100 (6.78) & 136 (0.43) \\
## 2 & C+T-max & 0.827 (0.000663) & 0.0292 (0.000184) & 10100 (781) & 136 (0.431) \\
## 3 & C+T-stringent & 0.819 (0.000689) & 0.0276 (0.000167) & 185 (0.329) & 136 (0.428) \\
## 4 & PLR & 0.889 (0.000489) & 0.0418 (0.000199) & 3250 (63.6) & 273 (1.29) \\
## 5 & PLR3 & 0.894 (0.000489) & 0.0432 (0.000195) & 4530 (83.4) & 438 (2.38) \\
## \hline
## \end{tabular}
## \end{table}