Methods’ functions

library(tidyverse)
# devtools::install_github("privefl/bigsnpr")
library(bigsnpr)
# devtools::install_github("privefl/paper2-PRS/pkg.paper.PRS")
library(pkg.paper.PRS)  ## See code source for functions' code.

Each method’s function returns a tibble (data frame) with 4 columns:

  1. the name of the method,
  2. the predictive scores and true phenotypes for the test set, as a list-column,
  3. the timing of the main computations (in seconds),
  4. the number of SNPs used for the prediction.
10^(-c(0, -log10(5e-8), exp(seq(log(0.1), log(100), length.out = 100)))) %>%
  sprintf("%.1e", .) %>%
  paste(collapse = ", ")
## [1] "1.0e+00, 5.0e-08, 7.9e-01, 7.8e-01, 7.7e-01, 7.5e-01, 7.4e-01, 7.2e-01, 7.0e-01, 6.9e-01, 6.7e-01, 6.5e-01, 6.3e-01, 6.1e-01, 5.9e-01, 5.7e-01, 5.4e-01, 5.2e-01, 5.0e-01, 4.7e-01, 4.5e-01, 4.2e-01, 3.9e-01, 3.7e-01, 3.4e-01, 3.2e-01, 2.9e-01, 2.7e-01, 2.4e-01, 2.2e-01, 2.0e-01, 1.8e-01, 1.5e-01, 1.3e-01, 1.2e-01, 1.0e-01, 8.5e-02, 7.1e-02, 5.8e-02, 4.8e-02, 3.8e-02, 3.0e-02, 2.3e-02, 1.8e-02, 1.3e-02, 9.8e-03, 7.0e-03, 4.9e-03, 3.3e-03, 2.2e-03, 1.4e-03, 8.8e-04, 5.3e-04, 3.1e-04, 1.7e-04, 9.2e-05, 4.7e-05, 2.3e-05, 1.1e-05, 4.6e-06, 1.9e-06, 7.3e-07, 2.6e-07, 8.8e-08, 2.7e-08, 7.7e-09, 2.0e-09, 4.7e-10, 1.0e-10, 1.9e-11, 3.2e-12, 4.7e-13, 6.0e-14, 6.7e-15, 6.3e-16, 5.0e-17, 3.3e-18, 1.8e-19, 8.1e-21, 2.9e-22, 7.9e-24, 1.7e-25, 2.7e-27, 3.3e-29, 2.9e-31, 1.8e-33, 7.7e-36, 2.2e-38, 4.3e-41, 5.2e-44, 3.8e-47, 1.7e-50, 4.3e-54, 6.0e-58, 4.4e-62, 1.6e-66, 2.8e-71, 2.3e-76, 7.7e-82, 1.1e-87, 5.5e-94, 1.0e-100"

Simulations

Data

celiac2 <- snp_attach("backingfiles/celiacQC_sub1.rds")
CHR <- celiac2$map$chromosome
POS <- celiac2$map$physical.pos
(G <- celiac2$genotypes)
## A Filebacked Big Matrix of type 'code 256' with 7100 rows and 281122 columns.
(G2 <- big_attach("backingfiles/celiacQC_sub1_tripled1.rds"))
## A Filebacked Big Matrix of type 'code 256' with 7100 rows and 843366 columns.
covar.all <- readRDS("backingfiles/PCA2.rds")$u

n <- nrow(G)
ind.HLA <- snp_indLRLDR(CHR, POS, subset(LD.wiki34, ID == "hild12"))

Scenario n°1 (with T-Trees)

params.grid1 <- expand.grid(
  n.train    = 6000,
  par.causal = list(c(30, "all"), c(300, "all"), c(3000, "all"), c(30, "HLA")), 
  par.dist   = c("gaussian", "laplace"), 
  par.h2     = 0.8, 
  par.model  = c("simple", "fancy"),
  num.simu   = 1:5,
  stringsAsFactors = FALSE
) 
if (!dir.exists("results1")) dir.create("results1")

for (i in rows_along(params.grid1)) {
  
  res.file <- paste0("results1/simu_", i, ".rds")
  if (file.exists(res.file)) next
  
  params <- params.grid1[i, ]
  par.causal <- params[["par.causal"]][[1]]
  
  # Simulate phenotypes
  simu_pheno <- get_pheno(
    G,    
    h2 = params[["par.h2"]], 
    M = as.integer(par.causal[1]), 
    ind.possible = `if`(par.causal[2] == "all", cols_along(G), ind.HLA),
    effects.dist = params[["par.dist"]], 
    model = params[["par.model"]]
  )
  pheno.all <- simu_pheno$pheno
  params[["true_set"]] <- list(simu_pheno$set)
  
  # Split in training/test sets
  ind.train <- sort(sample(n, size = params[["n.train"]]))
  ind.test <- setdiff(1:n, ind.train)
  params[["pheno"]] <- list(pheno.all[ind.test])
    
  # Get results from all methods
  res <- bind_rows(
    # PRS(G, CHR, POS, pheno.all, covar.all, ind.train, ind.test),
    logit.CMSA(G,  pheno.all, covar.all, ind.train, ind.test, "logit-simple"),
    # logit.CMSA(G2, pheno.all, covar.all, ind.train, ind.test, "logit-triple"),
    ttrees("../TTree-source/TTree", "backingfiles/ttrees", 
           pheno.all, ind.train, ind.test, n.trees = 100)
  )
  params[["res"]] <- list(res)
  saveRDS(unnest(params, res, .drop = FALSE), file = res.file)
}

Scenario n°1 (without T-Trees)

params.grid2 <- expand.grid(
  n.train = 6000,
  par.causal = list(c(30, "all"), c(300, "all"), c(3000, "all"), c(30, "HLA")), 
  par.dist   = c("gaussian", "laplace"), 
  par.h2     = c(0.5, 0.8), 
  par.model  = c("simple", "fancy"),
  num.simu   = 1:100,
  stringsAsFactors = FALSE
) 
if (!dir.exists("results2")) dir.create("results2")

for (i in rows_along(params.grid2)) {
  
  res.file <- paste0("results2/simu_", i, ".rds")
  if (file.exists(res.file)) next
  
  params <- params.grid2[i, ]
  par.causal <- params[["par.causal"]][[1]]
  
  # Simulate phenotypes
  simu_pheno <- get_pheno(
    G,    
    h2 = params[["par.h2"]], 
    M = as.integer(par.causal[1]), 
    ind.possible = `if`(par.causal[2] == "all", cols_along(G), ind.HLA),
    effects.dist = params[["par.dist"]], 
    model = params[["par.model"]]
  )
  pheno.all <- simu_pheno$pheno
  params[["true_set"]] <- list(simu_pheno$set)
  
  # Split in training/test sets
  ind.train <- sort(sample(n, size = params[["n.train"]]))
  ind.test <- setdiff(1:n, ind.train)
  params[["pheno"]] <- list(pheno.all[ind.test])
    
  # Get results from all methods
  res <- bind_rows(
    PRS(G, CHR, POS, pheno.all, covar.all, ind.train, ind.test),
    logit.CMSA(G,  pheno.all, covar.all, ind.train, ind.test, "logit-simple"),
    logit.CMSA(G2, pheno.all, covar.all, ind.train, ind.test, "logit-triple")
  )
  params[["res"]] <- list(res)
  saveRDS(unnest(params, res, .drop = FALSE), file = res.file)
}

Scenario n°2 (dataset with only chromosome 6)

params.grid4 <- expand.grid(
  n.train    = 6000,
  par.causal = list(c(30, "all"), c(300, "all"), c(3000, "all"), c(30, "HLA")), 
  par.dist   = c("gaussian", "laplace"), 
  par.h2     = c(0.5, 0.8), 
  par.model  = "simple",
  num.simu   = 1:100,
  stringsAsFactors = FALSE
) 
if (!dir.exists("results4")) dir.create("results4")

G6 <- big_copy(G, ind.col = which(CHR == 6))
ind.HLA6 <- snp_indLRLDR(CHR[CHR == 6], POS[CHR == 6], 
                         subset(LD.wiki34, ID == "hild12"))

for (i in rows_along(params.grid4)) {
  
  res.file <- paste0("results4/simu_", i, ".rds")
  if (file.exists(res.file)) next
  
  params <- params.grid4[i, ]
  par.causal <- params[["par.causal"]][[1]]
  
  # Simulate phenotypes
  simu_pheno <- get_pheno(
    G6,    
    h2 = params[["par.h2"]], 
    M = as.integer(par.causal[1]), 
    ind.possible = `if`(par.causal[2] == "all", cols_along(G6), ind.HLA6),
    effects.dist = params[["par.dist"]], 
    model = params[["par.model"]]
  )
  pheno.all <- simu_pheno$pheno
  params[["true_set"]] <- list(simu_pheno$set)
  
  # Split in training/test sets
  ind.train <- sort(sample(n, size = params[["n.train"]]))
  ind.test <- setdiff(1:n, ind.train)
  params[["pheno"]] <- list(pheno.all[ind.test])
    
  # Get results from all methods
  res <- bind_rows(
    PRS(G6, CHR[CHR == 6], POS[CHR == 6], pheno.all, covar.all, ind.train, ind.test),
    logit.CMSA(G6, pheno.all, covar.all, ind.train, ind.test, "logit-simple")
  )
  params[["res"]] <- list(res)
  saveRDS(unnest(params, res, .drop = FALSE), file = res.file)
}

Scenario n°3 (varying training size)

params.grid5 <- expand.grid(
  n.train    = 1:5 * 1000,
  par.causal = list(c(300, "all")), 
  par.dist   = c("gaussian", "laplace"), 
  par.h2     = c(0.5, 0.8), 
  par.model  = "simple",
  num.simu   = 1:100,
  stringsAsFactors = FALSE
) 
if (!dir.exists("results5")) dir.create("results5")

for (i in rows_along(params.grid5)) {
  
  res.file <- paste0("results5/simu_", i, ".rds")
  if (file.exists(res.file)) next
  
  params <- params.grid5[i, ]
  par.causal <- params[["par.causal"]][[1]]
  
  # Simulate phenotypes
  simu_pheno <- get_pheno(
    G,    
    h2 = params[["par.h2"]], 
    M = as.integer(par.causal[1]), 
    ind.possible = `if`(par.causal[2] == "all", cols_along(G), ind.HLA),
    effects.dist = params[["par.dist"]], 
    model = params[["par.model"]]
  )
  pheno.all <- simu_pheno$pheno
  params[["true_set"]] <- list(simu_pheno$set)
  
  # Split in training/test sets
  ind.train <- sort(sample(n, size = params[["n.train"]]))
  ind.test <- setdiff(1:n, ind.train)
  params[["pheno"]] <- list(pheno.all[ind.test])
    
  # Get results from all methods
  res <- bind_rows(
    PRS(G, CHR, POS, pheno.all, covar.all, ind.train, ind.test),
    logit.CMSA(G, pheno.all, covar.all, ind.train, ind.test, "logit-simple")
  )
  params[["res"]] <- list(res)
  saveRDS(unnest(params, res, .drop = FALSE), file = res.file)
}

Scenario n°1 (comparison with biglasso)

library(biglasso)
library(Matrix)
G3 <- bigmemory::big.matrix(nrow(G), ncol(G) + ncol(covar.all),
                            backingfile = "G-PC",                 
                            backingpath = "backingfiles")
big_apply(G, function(X, ind) { G3[, ind] <- X[, ind]; NULL }, a.combine = 'c')
G3[, ncol(G) + cols_along(covar.all)] <- covar.all
G3 <- bigmemory::attach.big.matrix("backingfiles/G-PC.desc")
logit.biglasso <- function(G3, pheno.all, covar.all, ind.train, ind.test) {
  
  timing <- system.time({
    
    biglasso <- biglasso(G3, pheno.all, ind.train, ncores = nb_cores(),
                         penalty = "lasso", alpha = 1, family = "binomial")
    
    preds <- 1 / (1 + exp(-predict(biglasso, G3, ind.test)))
  })[3]
  
  aucs <- apply(preds, 2, bigstatsr::AUC, target = pheno.all[ind.test])
  ind.max <- which.max(aucs)
  
  tibble(
    method   = "biglasso", 
    pred     = list(preds[, ind.max]),
    timing   = timing,
    alpha    = 1,
    set      = list(which(head(biglasso$beta[, ind.max], -ncol(covar.all)) != 0))
  )
}
params.grid7 <- expand.grid(
  n.train = 6000,
  par.causal = list(c(30, "all"), c(300, "all"), c(3000, "all"), c(30, "HLA")), 
  par.dist   = c("gaussian", "laplace"), 
  par.h2     = c(0.8), 
  par.model  = c("simple"),
  num.simu   = 1:100,
  stringsAsFactors = FALSE
) 
if (!dir.exists("results7")) dir.create("results7")

for (i in rows_along(params.grid7)) {
  
  res.file <- paste0("results7/simu_", i, ".rds")
  if (file.exists(res.file)) next
  
  params <- params.grid7[i, ]
  par.causal <- params[["par.causal"]][[1]]
  
  # Simulate phenotypes
  simu_pheno <- get_pheno(
    G,    
    h2 = params[["par.h2"]], 
    M = as.integer(par.causal[1]), 
    ind.possible = `if`(par.causal[2] == "all", cols_along(G), ind.HLA),
    effects.dist = params[["par.dist"]], 
    model = params[["par.model"]]
  )
  pheno.all <- simu_pheno$pheno
  params[["true_set"]] <- list(simu_pheno$set)
  
  # Split in training/test sets
  ind.train <- sort(sample(n, size = params[["n.train"]]))
  ind.test <- setdiff(1:n, ind.train)
  params[["pheno"]] <- list(pheno.all[ind.test])
    
  # Get results from all methods
  res <- bind_rows(
    logit.CMSA(G, pheno.all, covar.all, ind.train, ind.test, "logit-simple", alphas = 1),
    logit.biglasso(G3, pheno.all, covar.all, ind.train, ind.test)
  )
  params[["res"]] <- list(res)
  saveRDS(unnest(params, res, .drop = FALSE), file = res.file)
}