library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## âś” ggplot2 3.1.0           âś” purrr   0.2.5      
## âś” tibble  2.0.0           âś” dplyr   0.7.99.9000
## âś” tidyr   0.8.2           âś” stringr 1.3.1      
## âś” readr   1.1.1           âś” forcats 0.3.0
## ── Conflicts ─────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## âś– dplyr::filter() masks stats::filter()
## âś– dplyr::lag()    masks stats::lag()
# devtools::install_github("privefl/bigsnpr")
library(bigsnpr)
## Loading required package: bigstatsr
# devtools::install_github("privefl/paper2-PRS/pkg.paper.PRS")
library(pkg.paper.PRS)  ## See code source for functions' code.
options(nboot = 1e5)

Useful functions

COLORS <- scales::hue_pal()(6)
methods.color <- setNames(
  c(COLORS, COLORS[c(4, 2, 5)], "black"), 
  c("PLR", "C+T-max", "C+T-stringent", "T-Trees", "PLR3", "C+T-all",
    "C+T-max-0.05", "C+T-max-0.2", "C+T-max-0.8", "biglasso")
) 
myggplot <- function(..., coeff = 1) {
  ggplot(...) + bigstatsr::theme_bigstatsr(size.rel = coeff)
} 
barplot_causal <- function(results) {
  
  results %>%
    myggplot(aes(par.causal, AUC_mean, fill = method, color = method)) +
    geom_hline(yintercept = 0.5, linetype = 2) +
    geom_col(position = position_dodge(), alpha = 0.5, color = "black", size = 1) +
    geom_errorbar(aes(ymin = AUC_mean - 2 * AUC_boot, ymax = AUC_mean + 2 * AUC_boot),
                  position = position_dodge(width = 0.9), color = "black", width = 0.2, size = 1) + 
    scale_y_continuous(limits = c(0.5, NA), minor_breaks = 0:20 / 20,
                       oob = scales::rescale_none) +
    labs(x = "Causal SNPs (number and location)", y = "Mean of 100 AUCs",
         fill = "Method", color = "Method") +
    scale_fill_manual(values = methods.color) +
    scale_color_manual(values = methods.color)
}

barplot_causal_one <- function(results, h2) {
  
  auc_max <- `if`(h2 == 0.8, 0.94, 0.84)
  
  results %>%
    filter(par.h2 == h2) %>%
    barplot_causal() +
    geom_hline(yintercept = auc_max, linetype = 3, color = "blue") +
    facet_grid(par.dist ~ .)
}

barplot_causal_all <- function(results) {
  
  results %>%
    barplot_causal() +
    geom_hline(aes(yintercept = auc_max), linetype = 3, color = "blue",
               data = data.frame(par.h2 = c(0.5, 0.8), auc_max = c(0.84, 0.94))) +
    facet_grid(par.dist ~ par.h2)
}
compare_PLR <- function(results, with_method) {
  
  results <- filter(
    results, method %in% c("PLR", with_method), par.h2 == 0.8)
  
  results.summary <- results %>%
    group_by_at(c(vars(starts_with("par")), "method")) %>%
    summarise(AUC_mean = mean(AUC), AUC_boot = boot(AUC), N = n())
  
  nsimu <- results.summary$N[[1]]
  # stopifnot(all(results.summary$N == nsimu))
  
  plot_extra <- function(y, ylab = y) {
    
    results %>%
      myggplot() +
      geom_boxplot(aes_string("par.causal", y, color = "method", 
                              fill = "method"), alpha = 0.3) + 
      theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
      facet_grid(par.model ~ par.dist) +
      labs(x = "Causal SNPs (number and location)", y = ylab, 
           fill = "Method", color = "Method") +
      theme(legend.position = "none") +
      scale_fill_manual(values = methods.color) +
      scale_color_manual(values = methods.color) +
      scale_y_continuous(limits = c(0, NA))
  }
  
  p1 <- results.summary %>%
    barplot_causal_one(h2 = 0.8) +
    facet_grid(par.model ~ par.dist) +
    labs(y = sprintf("Mean of %d AUCs", nsimu))
  
  p2 <- cowplot::plot_grid(
    plot_extra("nb.preds", "Number of predictors"),
    plot_extra("timing", "Execution time (in seconds)"),
    ncol = 1, align = "hv", scale = 0.95, 
    labels = LETTERS[2:3], label_size = 20
  )
  
  p12 <- cowplot::plot_grid(
    p1 + theme(legend.position = "none"), p2, 
    ncol = 2, scale = 0.95, rel_widths = c(6, 4), 
    labels = c("A", ""), label_size = 20
  )
  
  cowplot::plot_grid(
    cowplot::get_legend(p1 + theme(legend.direction = "horizontal")), p12, 
    rel_heights = c(0.1, 1), ncol = 1
  )
}
G <- snp_attach("backingfiles/celiacQC_sub1.rds")$genotypes
corr <- snp_cor(G, size = 100)
saveRDS(corr, "backingfiles/corr2.rds")
corr <- readRDS("backingfiles/corr2.rds")

Results with T-Trees

results1 <- list.files("results1", full.names = TRUE) %>%
  read_format_results(corr)
compare_PLR(results1, with_method = "T-Trees")

ggsave("figures/supp-ttrees.pdf", scale = 1/100, width = 1400, height = 1060)

Results with PLR3 & model COMP

results2 <- list.files("results2", full.names = TRUE) %>%
  read_format_results(corr)

Results with PLR3

compare_PLR(results2, with_method = "PLR3")

ggsave("figures/supp-triple.pdf", scale = 1/100, width = 1400, height = 1060)
results2 %>%
  filter(grepl("PLR", method)) %>%
  group_by_at(c(vars(starts_with("par")), "method")) %>%
  summarise(AUC_mean = mean(AUC), AUC_boot = boot(AUC), N = n()) %>%
  barplot_causal_one(h2 = 0.5) +
  facet_grid(par.model ~ par.dist)

ggsave("figures/supp-AUC-triple.pdf", scale = 1/110, width = 1242, height = 951)

Results with model COMP

results2 %>%
  filter(par.model == "COMP", method %in% c("PLR", "C+T-max")) %>%
  filter(thr.r2 == 0.2 | is.na(thr.r2)) %>%
  group_by_at(c(vars(starts_with("par")), "method")) %>%
  summarise(AUC_mean = mean(AUC), AUC_boot = boot(AUC), N = n()) %>%
  barplot_causal_all()

ggsave("figures/supp-AUC-logit-fancy.pdf", scale = 1/110, width = 1242, height = 951)
results2 %>%
  filter(par.model == "COMP", grepl("C\\+T", method)) %>%
  filter(thr.r2 == 0.2 | is.na(thr.r2)) %>%
  group_by_at(c(vars(starts_with("par")), "method")) %>%
  summarise(AUC_mean = mean(AUC), AUC_boot = boot(AUC), N = n()) %>%
  barplot_causal_all()

ggsave("figures/supp-AUC-PRS-fancy.pdf", scale = 1/110, width = 1242, height = 951)

Results with only PLR & C+T

Different r2 thresholds

results2.all.r2 <- results2 %>%
  filter(par.model == "ADD", method %in% c("PLR", "C+T-max")) %>%
  mutate(method = c_method_r2(method, thr.r2)) %>%
  group_by_at(c(vars(starts_with("par")), "method")) %>%
  summarise(AUC_mean = mean(AUC), AUC_boot = boot(AUC), N = n())
  
barplot_causal_all(results2.all.r2)

ggsave("figures/supp-AUC-all-r2.pdf", scale = 1/110, width = 1242, height = 951)
results2 %>%
  filter(par.model == "ADD", method %in% c("PLR", "C+T-max")) %>%
  mutate(method = c_method_r2(method, thr.r2)) %>%
  group_by_at(c(vars(starts_with("par")), "method")) %>%
  summarise(AltSens_mean = mean(AltSens), AltSens_boot = boot(AltSens), N = n()) %>%
  myggplot(aes(par.causal, AltSens_mean, fill = method, color = method)) + 
  geom_col(position = position_dodge(), alpha = 0.3) +
  geom_errorbar(aes(ymin = AltSens_mean - 2 * AltSens_boot,
                    ymax = AltSens_mean + 2 * AltSens_boot),
                position = position_dodge(width = 0.9), color = "black", width = 0.2) + 
  scale_y_continuous(limits = c(0, 1)) +
  labs(x = "Causal SNPs (number and location)", y = "Mean of 100 AltSens",
       fill = "Method", color = "Method") +
  scale_fill_manual(values = methods.color) +
  scale_color_manual(values = methods.color) +
  facet_grid(par.dist ~ par.h2)

ggsave("figures/supp-AltSens-all-r2.pdf", scale = 1/110, width = 1242, height = 951)

Correlation between predictive measures

results3 <- results2 %>%
  filter(thr.r2 == 0.2 | is.na(thr.r2)) %>%
  filter(par.model == "ADD", method != "PLR3")
results3 %>%
  filter(method %in% c("PLR", "C+T-max")) %>%
  mutate(Parameters  = interaction(par.h2, par.dist, method, sep = " | ")) %>%
  myggplot(aes(AUC, pAUC, color = Parameters)) + 
  geom_point(size = 0.6, alpha = 0.5) +
  geom_smooth(aes(linetype = Parameters), method = "loess", se = FALSE, size = 2) +
  theme(legend.key.width = unit(3.5, "line"), 
        legend.direction = "horizontal") + 
  labs(y = "Partial AUC (specificity: [0.9, 1])") +
  theme(legend.position = "top", axis.title = element_text(size = rel(2.2))) +
  

ggsave("figures/supp-AUC-corr.pdf", scale = 1/100, width = 1525, height = 983)

cor(results3$AUC, results3$pAUC, method = "spearman")
## [1] 0.9804009

6000 in training & all chromosomes

results3.summary <- results3 %>%
  group_by_at(c(vars(starts_with("par")), "method")) %>%
  summarise(AUC_mean = mean(AUC), AUC_boot = boot(AUC), N = n())
results3.summary %>%
  filter(method %in% c("PLR", "C+T-max"), par.model == "ADD") %>%
  barplot_causal_all()

ggsave("figures/main-AUC-logit.pdf", scale = 1/110, width = 1242, height = 951)
results3.summary %>%
  filter(grepl("C\\+T", method)) %>%
  barplot_causal_all()

ggsave("figures/supp-AUC-PRS.pdf", scale = 1/110, width = 1242, height = 951)

6000 in training & only chromosome 6

results4 <- list.files("results4", full.names = TRUE) %>%
  read_format_results(corr)
results4 %>%
  filter(method %in% c("PLR", "C+T-max")) %>%
  mutate(method = c_method_r2(method, thr.r2)) %>%
  group_by_at(c(vars(starts_with("par")), "method")) %>%
  summarise(AUC_mean = mean(AUC), AUC_boot = boot(AUC), N = n()) %>%
  barplot_causal_all() + 
  geom_col(aes(par.causal, AUC_mean), data = results2.all.r2, 
           position = position_dodge(), color = "black", alpha = 0)

ggsave("figures/supp-AUC-chr6-all-r2.pdf", scale = 1/110, width = 1242, height = 951)

Varying training size & all shromosomes

results5 <- list.files("results5", full.names = TRUE) %>%
  read_format_results(corr)

results6 <- results2 %>%
  filter(par.model == "ADD", par.causal == "300 in all") %>%
  bind_rows(results5) %>%
  filter(method %in% c("PLR", "C+T-max")) %>%
  mutate(method = c_method_r2(method, thr.r2),
         par.causal = as.character(par.causal))
results6 %>%
  group_by_at(c(vars(starts_with("par")), "method", "n.train")) %>%
  summarise(AUC_mean = mean(AUC), AUC_boot = boot(AUC), N = n()) %>%
  myggplot(aes(n.train, AUC_mean, color = method, linetype = method)) +
  geom_hline(yintercept = 0.5, linetype = 2) +
  geom_hline(aes(yintercept = auc_max), linetype = 3, color = "blue",
             data = data.frame(par.h2 = c(0.5, 0.8), auc_max = c(0.84, 0.94))) +
  geom_line(size = 1.5) +
  geom_errorbar(aes(ymin = AUC_mean - 2 * AUC_boot, 
                    ymax = AUC_mean + 2 * AUC_boot), 
                size = 1.5, width = 0) +
  facet_grid(par.dist ~ par.h2) +
  scale_x_continuous(breaks = 1:6 * 1000, minor_breaks = NULL) + 
  scale_y_continuous(minor_breaks = 0:20 / 20) +
  scale_color_manual(values = methods.color) +
  labs(x = "Size of the training set", y = "Mean of 100 AUCs", 
       color = "Method", linetype = "Method") +
  theme(legend.key.width = unit(3, "line"))

ggsave("figures/main-AUC-ntrain.pdf", scale = 1/100, width = 1050, height = 800)

Comparison with biglasso

list.files("results7", full.names = TRUE) %>%
  read_format_results(corr) %>%
  compare_PLR("biglasso")

ggsave("figures/supp-biglasso.pdf", scale = 1/100, width = 1600, height = 1000)