Analysis of celiac disease

library(tidyverse)

## ── Attaching packages ───────────────────────────────────────────────────────────── tidyverse 1.2.1 ──

## ✔ ggplot2 3.1.0          ✔ purrr   0.2.5     
## ✔ tibble  2.0.0.9000     ✔ dplyr   0.8.0.9000
## ✔ tidyr   0.8.2          ✔ stringr 1.3.1     
## ✔ readr   1.1.1          ✔ forcats 0.3.0

## ── Conflicts ──────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(bigsnpr)

## Loading required package: bigstatsr

library(pkg.paper.PRS)
pAUC <- pkg.paper.PRS:::pAUC

Data

# Read file, see "Preprocessing" notebook
celiac <- snp_attach("backingfiles/celiacQC.rds")
G <- celiac$genotypes
CHR <- celiac$map$chromosome
POS <- celiac$map$physical.pos
NCORES <- nb_cores()
y <- celiac$fam$affection - 1
obj.svd <- readRDS("backingfiles/PCA.rds")

G2 <- big_attach("backingfiles/celiacQC_tripled1.rds")

Predictive models and plots

# Divide in training/test sets
dim(G)

## [1]  15155 281122

set.seed(1)
ind.train <- sort(sample(nrow(G), size = 12e3))
ind.test <- setdiff(rows_along(G), ind.train)

C+T

# GWAS
system.time(
  gwas.train <- big_univLogReg(
    G, y[ind.train], ind.train = ind.train, 
    covar.train = obj.svd$u[ind.train, , drop = FALSE], 
    ncores = NCORES
  )
)

##    user  system elapsed 
##   0.744   0.072 143.566

# Q-Q plot
snp_qq(gwas.train) +
  coord_cartesian(ylim = c(0, 15))

gwas.train.gc <- snp_gc(gwas.train)

# Manhattan plot
labels <- 1:22; labels[c(11, 13, 15, 17, 18, 20, 21)] <- ""
cowplot::plot_grid(
  snp_manhattan(gwas.train.gc, CHR, POS, labels = labels),
  snp_manhattan(gwas.train.gc, CHR, POS, labels = labels) +
    coord_cartesian(ylim = c(0, 22)) + 
    geom_hline(yintercept = -log10(5e-8), color = "red", linetype = 3),
  align = "hv", ncol = 1, labels = LETTERS[1:2], label_size = 25, scale = 0.95
)

# Clumping on the test set
ind.keep <- snp_clumping(G, infos.chr = CHR,
                         ind.row = ind.test,
                         thr.r2 = 0.2, 
                         S = abs(gwas.train.gc$score),
                         size = 500,
                         is.size.in.bp = TRUE,
                         infos.pos = POS,
                         ncores = NCORES)

# C+T
thrs <- c(0, -log10(5e-8), exp(seq(log(0.1), log(100), length.out = 100)))
lpS <- -predict(gwas.train.gc)
prs <- snp_PRS(G, betas.keep = gwas.train.gc$estim[ind.keep],
               ind.test = ind.test,
               ind.keep = ind.keep,
               lpS.keep = lpS[ind.keep], 
               thr.list = thrs)
nb.pred <- sapply(thrs, function(thr) sum(lpS[ind.keep] > thr))
aucs <- apply(prs, 2, AUC, target = y[ind.test])
summary(aucs)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.8121  0.8159  0.8168  0.8177  0.8191  0.8246

nb.pred[which.max(aucs)]

## [1] 9573

plot_density_AUC <- function(pred, true = y[ind.test]) {
  true_fct <- forcats::fct_recode(as.factor(true), Control = "0", Case = "1")
  data.frame(Score = pred, Status = true_fct) %>%
  ggplot() +
  geom_density(aes(Score, fill = Status), alpha = 0.4) +
  ggtitle(sprintf("AUC = %s%% | pAUC = %s", 
                  round(100 * AUC(pred, true), 2),
                  round(pAUC(pred, true), 4))) + 
    theme_bigstatsr() +
    theme(plot.title = element_text(size = rel(1.5), hjust = 0.5))
}
(p1 <- plot_density_AUC(prs[, which.max(aucs)]))

Logistic regression

Cross-Model Selection and Averaging (CMSA):

# Train PLR
system.time(
  logit <- big_spLogReg(X = G, y01.train = y[ind.train], 
                        ind.train = ind.train, 
                        covar.train = obj.svd$u[ind.train, ],
                        ncores = NCORES, alphas = 1)
)

##    user  system elapsed 
##   2.572   0.356  33.471

pred1 <- predict(logit, X = G, ind.row = ind.test, 
                  covar.row = obj.svd$u[ind.test, ])
(nb.pred1 <- summary(logit, best.only = TRUE)$nb_var)

## [1] 2215

(aucs1 <- AUC(pred1, target = y[ind.test]))

## [1] 0.8870682

(p2 <- plot_density_AUC(pred1))

Logistic regression with feature engineering (PLR3)

system.time(
  logit2 <- big_spLogReg(X = G2, y01.train = y[ind.train], 
                         ind.train = ind.train, 
                         covar.train = obj.svd$u[ind.train, ],
                         ncores = NCORES, alphas = 1)
)

##    user  system elapsed 
##   3.596   0.816 140.317

pred2 <- predict(logit2, X = G2, ind.row = ind.test, 
                  covar.row = obj.svd$u[ind.test, ])
(nb.pred2 <- summary(logit2, best.only = TRUE)$nb_var)

## [1] 4389

(aucs2 <- AUC(pred2, target = y[ind.test]))

## [1] 0.8925798

(p3 <- plot_density_AUC(pred2))

list(p1, p2, p3) %>%
  lapply(function(p) p + theme(legend.position = "none")) %>%
  cowplot::plot_grid(plotlist = ., ncol = 1, labels = LETTERS[1:3], label_size = 20, scale = 0.95) %>%
  cowplot::plot_grid(., cowplot::get_legend(p1), rel_widths = c(1, 0.15), scale = 0.95)

ggsave("figures/supp-score-densities.pdf", scale = 1/100, width = 780, height = 900)

ROC Curves

library(plotROC)
scores_tidy <- tibble(
  d = y[ind.test], 
  "C+T-max" = prs[, which.max(aucs)],
  "PLR"     = pred1,
  "PLR3"    = pred2
) %>%
  gather(key = "Method", value = "Score", -d)
  
methods.color <- setNames(scales::hue_pal()(6), 
                          c("PLR", "C+T-max", "C+T-stringent",
                            "T-Trees", "PLR3", "C+T-all")) 

ggplot(scores_tidy, aes(d = d, m = Score, color = Method, linetype = Method)) +
  style_roc(xlab = "1 - Specificity", ylab = "Sensitivity") +
  theme_bigstatsr() +
  geom_roc(n.cuts = 0, size = 2) +
  theme(legend.position = c(0.7, 0.3), legend.key.width = unit(4, "line")) +
  coord_equal() +
  scale_color_manual(values = methods.color)

ggsave("figures/celiac-roc.pdf", scale = 1/90, width = 700, height = 700)

Predictive methods (multiple times)

Methods’ functions

Each method’s function returns a tibble (data frame) with 4 columns:

the name of the method,
the predictive scores and true phenotypes for the test set, as a list-column,
the timing of the main computations (in seconds),
the number of SNPs used for the prediction.

Run multiple times

if (!dir.exists("results6")) dir.create("results6")
pheno.all <- y
covar.all <- obj.svd$u

for (i in 1:100) {
  
  res.file <- paste0("results6/simu_", i, ".rds")
  if (file.exists(res.file)) next
  
  # Split in training/test sets
  ind.train <- sort(sample(nrow(G), size = 12e3))
  ind.test <- setdiff(rows_along(G), ind.train)
    
  # Get results from all methods
  res <- bind_rows(
    filter(PRS(G, CHR, POS, pheno.all, covar.all, ind.train, ind.test), thr.r2 == 0.2),
    logit.CMSA(G,  pheno.all, covar.all, ind.train, ind.test, "logit-simple"),
    logit.CMSA(G2, pheno.all, covar.all, ind.train, ind.test, "logit-triple")
  )
  res$pheno <- list(pheno.all[ind.test])
  
  saveRDS(res, file = res.file)
}

list.files(path = "results6/", full.names = TRUE) %>%
  map_df(readRDS) %>%
  as_tibble() %>%
  mutate(
    method = fct_recode(sub("PRS", "C+T", method), 
                        PLR = "logit-simple", PLR3 = "logit-triple"),
    AUC = map2_dbl(pred, pheno, ~bigstatsr::AUC(.x, .y)),
    pAUC = map2_dbl(pred, pheno, ~pAUC(.x, .y)),
    nb.preds = map_int(set, length)
  ) %>%
  group_by(method) %>%
  summarize_at(c("AUC", "pAUC", "nb.preds", "timing"), function(x) {
    glue::glue("{signif(mean(x), 3)} ({signif(boot(x), 3)})")
  }) %>%
  print() %>%
  xtable::xtable()

## # A tibble: 5 x 5
##   method        AUC              pAUC              nb.preds     timing     
##   <fct>         <S3: glue>       <S3: glue>        <S3: glue>   <S3: glue> 
## 1 C+T-all       0.818 (0.000662) 0.0299 (0.000167) 96100 (6.78) 136 (0.43) 
## 2 C+T-max       0.827 (0.000663) 0.0292 (0.000184) 10100 (781)  136 (0.431)
## 3 C+T-stringent 0.819 (0.000689) 0.0276 (0.000167) 185 (0.329)  136 (0.428)
## 4 PLR           0.889 (0.000489) 0.0418 (0.000199) 3250 (63.6)  273 (1.29) 
## 5 PLR3          0.894 (0.000489) 0.0432 (0.000195) 4530 (83.4)  438 (2.38)

## % latex table generated in R 3.4.4 by xtable 1.8-3 package
## % Wed Jan 23 20:09:56 2019
## \begin{table}[ht]
## \centering
## \begin{tabular}{rlllll}
##   \hline
##  & method & AUC & pAUC & nb.preds & timing \\ 
##   \hline
## 1 & C+T-all & 0.818 (0.000662) & 0.0299 (0.000167) & 96100 (6.78) & 136 (0.43) \\ 
##   2 & C+T-max & 0.827 (0.000663) & 0.0292 (0.000184) & 10100 (781) & 136 (0.431) \\ 
##   3 & C+T-stringent & 0.819 (0.000689) & 0.0276 (0.000167) & 185 (0.329) & 136 (0.428) \\ 
##   4 & PLR & 0.889 (0.000489) & 0.0418 (0.000199) & 3250 (63.6) & 273 (1.29) \\ 
##   5 & PLR3 & 0.894 (0.000489) & 0.0432 (0.000195) & 4530 (83.4) & 438 (2.38) \\ 
##    \hline
## \end{tabular}
## \end{table}

Analysis of celiac disease

Florian Privé

March 1, 2018

Data

Predictive models and plots

C+T

Logistic regression

Logistic regression with feature engineering (PLR3)

ROC Curves

Predictive methods (multiple times)

Methods’ functions

Run multiple times