Simulations

Data

# Attach the genotype dataset from its backing file and extract the per-SNP
# chromosome and physical position columns of its map.
celiac2 <- snp_attach("backingfiles/celiacQC_sub1.rds")
CHR <- celiac2$map$chromosome
POS <- celiac2$map$physical.pos
# The outer parentheses print the object in addition to assigning it.
(G <- celiac2$genotypes)

## A Filebacked Big Matrix of type 'code 256' with 7100 rows and 281122 columns.

# Second genotype matrix; per the printed dimensions it has the same number of
# rows as G and three times as many columns. It is used below under the label
# "logit-triple".
(G2 <- big_attach("backingfiles/celiacQC_sub1_tripled1.rds"))

## A Filebacked Big Matrix of type 'code 256' with 7100 rows and 843366 columns.

# Covariates passed to every method: the `u` element of the saved PCA object
# (presumably the matrix of principal component scores -- confirm).
covar.all <- readRDS("backingfiles/PCA2.rds")$u

# Number of individuals (rows of G); used to draw the training/test splits.
n <- nrow(G)
# Indices of the SNPs located in the LD.wiki34 region whose ID is "hild12"
# (presumably the HLA region, given the variable name -- confirm).
ind.HLA <- snp_indLRLDR(CHR, POS, subset(LD.wiki34, ID == "hild12"))

Scenario n°1 (with T-Trees)

# Parameter grid: one row per simulation (4 x 2 x 2 x 5 = 80 rows).
params.grid1 <- expand.grid(
  n.train    = 6000,
  # Each element is c(<number of causal SNPs>, <"all" or "HLA">). c() coerces
  # both entries to character, hence the as.integer() when it is read back.
  par.causal = list(c(30, "all"), c(300, "all"), c(3000, "all"), c(30, "HLA")), 
  par.dist   = c("gaussian", "laplace"), 
  par.h2     = 0.8, 
  par.model  = c("simple", "fancy"),
  num.simu   = 1:5,
  stringsAsFactors = FALSE
)

if (!dir.exists("results1")) dir.create("results1")

# Run every simulation of the grid, writing one result file per grid row.
for (i in rows_along(params.grid1)) {
  
  # Skip simulations already computed, so that the loop can be resumed.
  res.file <- paste0("results1/simu_", i, ".rds")
  if (file.exists(res.file)) next
  
  params <- params.grid1[i, ]
  # par.causal is a list-column: unwrap the character vector of this row.
  par.causal <- params[["par.causal"]][[1]]
  
  # Simulate phenotypes
  simu_pheno <- get_pheno(
    G,    
    h2 = params[["par.h2"]], 
    M = as.integer(par.causal[1]), 
    # Candidate causal SNPs: every column of G, or only those in ind.HLA.
    ind.possible = `if`(par.causal[2] == "all", cols_along(G), ind.HLA),
    effects.dist = params[["par.dist"]], 
    model = params[["par.model"]]
  )
  pheno.all <- simu_pheno$pheno
  # Stored as a list-column so that it fits in the one-row data frame.
  params[["true_set"]] <- list(simu_pheno$set)
  
  # Split in training/test sets
  # No seed is set in this chunk: the split (and any randomness in
  # get_pheno()) differs between runs unless a seed is set elsewhere.
  ind.train <- sort(sample(n, size = params[["n.train"]]))
  ind.test <- setdiff(1:n, ind.train)
  # Only the phenotypes of the test individuals are saved with the results.
  params[["pheno"]] <- list(pheno.all[ind.test])
    
  # Get results from all methods
  res <- bind_rows(
    # PRS(G, CHR, POS, pheno.all, covar.all, ind.train, ind.test),
    logit.CMSA(G,  pheno.all, covar.all, ind.train, ind.test, "logit-simple"),
    # logit.CMSA(G2, pheno.all, covar.all, ind.train, ind.test, "logit-triple"),
    ttrees("../TTree-source/TTree", "backingfiles/ttrees", 
           pheno.all, ind.train, ind.test, n.trees = 100)
  )
  params[["res"]] <- list(res)
  # Expand the nested results so that the parameters are repeated on each
  # result row. NOTE(review): `.drop` belongs to the pre-1.0.0 tidyr interface
  # of unnest() and is deprecated in later versions.
  saveRDS(unnest(params, res, .drop = FALSE), file = res.file)
}

Scenario n°1 (without T-Trees)

# Parameter grid: one row per simulation (4 x 2 x 2 x 2 x 100 = 3200 rows).
params.grid2 <- expand.grid(
  n.train = 6000,
  # Each element is c(<number of causal SNPs>, <"all" or "HLA">). c() coerces
  # both entries to character, hence the as.integer() when it is read back.
  par.causal = list(c(30, "all"), c(300, "all"), c(3000, "all"), c(30, "HLA")), 
  par.dist   = c("gaussian", "laplace"), 
  par.h2     = c(0.5, 0.8), 
  par.model  = c("simple", "fancy"),
  num.simu   = 1:100,
  stringsAsFactors = FALSE
)

if (!dir.exists("results2")) dir.create("results2")

# Run every simulation of the grid, writing one result file per grid row.
for (i in rows_along(params.grid2)) {
  
  # Skip simulations already computed, so that the loop can be resumed.
  res.file <- paste0("results2/simu_", i, ".rds")
  if (file.exists(res.file)) next
  
  params <- params.grid2[i, ]
  # par.causal is a list-column: unwrap the character vector of this row.
  par.causal <- params[["par.causal"]][[1]]
  
  # Simulate phenotypes
  simu_pheno <- get_pheno(
    G,    
    h2 = params[["par.h2"]], 
    M = as.integer(par.causal[1]), 
    # Candidate causal SNPs: every column of G, or only those in ind.HLA.
    ind.possible = `if`(par.causal[2] == "all", cols_along(G), ind.HLA),
    effects.dist = params[["par.dist"]], 
    model = params[["par.model"]]
  )
  pheno.all <- simu_pheno$pheno
  # Stored as a list-column so that it fits in the one-row data frame.
  params[["true_set"]] <- list(simu_pheno$set)
  
  # Split in training/test sets
  # No seed is set in this chunk: the split (and any randomness in
  # get_pheno()) differs between runs unless a seed is set elsewhere.
  ind.train <- sort(sample(n, size = params[["n.train"]]))
  ind.test <- setdiff(1:n, ind.train)
  # Only the phenotypes of the test individuals are saved with the results.
  params[["pheno"]] <- list(pheno.all[ind.test])
    
  # Get results from all methods
  # The two logit.CMSA() calls differ only in the genotype matrix (G vs. G2)
  # and in the label given as last argument.
  res <- bind_rows(
    PRS(G, CHR, POS, pheno.all, covar.all, ind.train, ind.test),
    logit.CMSA(G,  pheno.all, covar.all, ind.train, ind.test, "logit-simple"),
    logit.CMSA(G2, pheno.all, covar.all, ind.train, ind.test, "logit-triple")
  )
  params[["res"]] <- list(res)
  # Expand the nested results so that the parameters are repeated on each
  # result row. NOTE(review): `.drop` belongs to the pre-1.0.0 tidyr interface
  # of unnest() and is deprecated in later versions.
  saveRDS(unnest(params, res, .drop = FALSE), file = res.file)
}

Scenario n°2 (dataset with only chromosome 6)

# Parameter grid: one row per simulation (4 x 2 x 2 x 100 = 1600 rows).
params.grid4 <- expand.grid(
  n.train    = 6000,
  # Each element is c(<number of causal SNPs>, <"all" or "HLA">). c() coerces
  # both entries to character, hence the as.integer() when it is read back.
  par.causal = list(c(30, "all"), c(300, "all"), c(3000, "all"), c(30, "HLA")), 
  par.dist   = c("gaussian", "laplace"), 
  par.h2     = c(0.5, 0.8), 
  par.model  = "simple",
  num.simu   = 1:100,
  stringsAsFactors = FALSE
)

if (!dir.exists("results4")) dir.create("results4")

# Copy of G restricted to the SNPs of chromosome 6. Only columns are
# subsetted, so `n` and `covar.all` are reused as is below.
G6 <- big_copy(G, ind.col = which(CHR == 6))
# Same region lookup as for the full data, but computed from the chromosome 6
# map only (presumably so that the indices refer to columns of G6 -- confirm).
ind.HLA6 <- snp_indLRLDR(CHR[CHR == 6], POS[CHR == 6], 
                         subset(LD.wiki34, ID == "hild12"))

# Run every simulation of the grid, writing one result file per grid row.
for (i in rows_along(params.grid4)) {
  
  # Skip simulations already computed, so that the loop can be resumed.
  res.file <- paste0("results4/simu_", i, ".rds")
  if (file.exists(res.file)) next
  
  params <- params.grid4[i, ]
  # par.causal is a list-column: unwrap the character vector of this row.
  par.causal <- params[["par.causal"]][[1]]
  
  # Simulate phenotypes
  simu_pheno <- get_pheno(
    G6,    
    h2 = params[["par.h2"]], 
    M = as.integer(par.causal[1]), 
    # Candidate causal SNPs: every column of G6, or only those in ind.HLA6.
    ind.possible = `if`(par.causal[2] == "all", cols_along(G6), ind.HLA6),
    effects.dist = params[["par.dist"]], 
    model = params[["par.model"]]
  )
  pheno.all <- simu_pheno$pheno
  # Stored as a list-column so that it fits in the one-row data frame.
  params[["true_set"]] <- list(simu_pheno$set)
  
  # Split in training/test sets
  # No seed is set in this chunk: the split (and any randomness in
  # get_pheno()) differs between runs unless a seed is set elsewhere.
  ind.train <- sort(sample(n, size = params[["n.train"]]))
  ind.test <- setdiff(1:n, ind.train)
  # Only the phenotypes of the test individuals are saved with the results.
  params[["pheno"]] <- list(pheno.all[ind.test])
    
  # Get results from all methods
  # PRS() receives the chromosome 6 subsets of CHR and POS to match G6.
  res <- bind_rows(
    PRS(G6, CHR[CHR == 6], POS[CHR == 6], pheno.all, covar.all, ind.train, ind.test),
    logit.CMSA(G6, pheno.all, covar.all, ind.train, ind.test, "logit-simple")
  )
  params[["res"]] <- list(res)
  # Expand the nested results so that the parameters are repeated on each
  # result row. NOTE(review): `.drop` belongs to the pre-1.0.0 tidyr interface
  # of unnest() and is deprecated in later versions.
  saveRDS(unnest(params, res, .drop = FALSE), file = res.file)
}

Scenario n°3 (varying training size)

# Parameter grid: one row per simulation (5 x 2 x 2 x 100 = 2000 rows).
params.grid5 <- expand.grid(
  # Training set sizes: 1000, 2000, ..., 5000.
  n.train    = 1:5 * 1000,
  # c(<number of causal SNPs>, <"all" or "HLA">); c() coerces both entries to
  # character, hence the as.integer() when it is read back.
  par.causal = list(c(300, "all")), 
  par.dist   = c("gaussian", "laplace"), 
  par.h2     = c(0.5, 0.8), 
  par.model  = "simple",
  num.simu   = 1:100,
  stringsAsFactors = FALSE
)

if (!dir.exists("results5")) dir.create("results5")

# Run every simulation of the grid, writing one result file per grid row.
for (i in rows_along(params.grid5)) {
  
  # Skip simulations already computed, so that the loop can be resumed.
  res.file <- paste0("results5/simu_", i, ".rds")
  if (file.exists(res.file)) next
  
  params <- params.grid5[i, ]
  # par.causal is a list-column: unwrap the character vector of this row.
  par.causal <- params[["par.causal"]][[1]]
  
  # Simulate phenotypes
  simu_pheno <- get_pheno(
    G,    
    h2 = params[["par.h2"]], 
    M = as.integer(par.causal[1]), 
    # Candidate causal SNPs: every column of G, or only those in ind.HLA.
    ind.possible = `if`(par.causal[2] == "all", cols_along(G), ind.HLA),
    effects.dist = params[["par.dist"]], 
    model = params[["par.model"]]
  )
  pheno.all <- simu_pheno$pheno
  # Stored as a list-column so that it fits in the one-row data frame.
  params[["true_set"]] <- list(simu_pheno$set)
  
  # Split in training/test sets
  # The test set is every individual not drawn for training, so its size
  # (n - n.train) varies together with the training size.
  # No seed is set in this chunk: the split (and any randomness in
  # get_pheno()) differs between runs unless a seed is set elsewhere.
  ind.train <- sort(sample(n, size = params[["n.train"]]))
  ind.test <- setdiff(1:n, ind.train)
  # Only the phenotypes of the test individuals are saved with the results.
  params[["pheno"]] <- list(pheno.all[ind.test])
    
  # Get results from all methods
  res <- bind_rows(
    PRS(G, CHR, POS, pheno.all, covar.all, ind.train, ind.test),
    logit.CMSA(G, pheno.all, covar.all, ind.train, ind.test, "logit-simple")
  )
  params[["res"]] <- list(res)
  # Expand the nested results so that the parameters are repeated on each
  # result row. NOTE(review): `.drop` belongs to the pre-1.0.0 tidyr interface
  # of unnest() and is deprecated in later versions.
  saveRDS(unnest(params, res, .drop = FALSE), file = res.file)
}

Scenario n°1 (comparison with biglasso)

library(biglasso)
library(Matrix)

# Build a file-backed bigmemory matrix holding the genotypes followed by the
# covariates: same rows as G, ncol(G) + ncol(covar.all) columns.
# NOTE(review): this creation step is not guarded against an already existing
# backing file, unlike the result directories/files elsewhere in this script;
# confirm how it behaves on a re-run.
G3 <- bigmemory::big.matrix(nrow(G), ncol(G) + ncol(covar.all),
                            backingfile = "G-PC",                 
                            backingpath = "backingfiles")
# Copy G into the first ncol(G) columns of G3, one block of columns at a time.
# The function returns NULL because only the write into G3 matters.
big_apply(G, function(X, ind) { G3[, ind] <- X[, ind]; NULL }, a.combine = 'c')
# The covariates fill the trailing ncol(covar.all) columns.
G3[, ncol(G) + cols_along(covar.all)] <- covar.all

# Re-attach the matrix from the descriptor file written next to the backing
# file.
G3 <- bigmemory::attach.big.matrix("backingfiles/G-PC.desc")

# Fit a lasso-penalized logistic regression with biglasso on the training
# individuals and evaluate it on the test individuals.
#
# Arguments:
#   G3         Matrix passed to biglasso() and predict(); its trailing
#              ncol(covar.all) columns must be the covariates, all the
#              preceding columns being SNPs.
#   pheno.all  Phenotypes of all individuals.
#   covar.all  Covariates; only its number of columns is used here, to tell
#              SNP coefficients apart from covariate coefficients.
#   ind.train  Row indices used to fit the model.
#   ind.test   Row indices used to predict and to compute the AUCs.
#
# Returns a one-row tibble with columns:
#   method  "biglasso"
#   pred    list-column: predicted probabilities for `ind.test`, taken from
#           the fit of the regularization path with the highest test AUC
#   timing  elapsed time (seconds) of the fit plus the prediction
#   alpha   1
#   set     list-column: indices (columns of G3) of the SNPs with a non-zero
#           coefficient in the selected fit
logit.biglasso <- function(G3, pheno.all, covar.all, ind.train, ind.test) {

  # [3] keeps the "elapsed" component of system.time().
  timing <- system.time({

    # Named `fit` so that the local object does not shadow biglasso().
    fit <- biglasso(G3, pheno.all, ind.train, ncores = nb_cores(),
                    penalty = "lasso", alpha = 1, family = "binomial")

    # Inverse logit of the predictions, one column per fit of the path.
    preds <- 1 / (1 + exp(-predict(fit, G3, ind.test)))
  })[3]

  # The retained fit is the one maximizing the AUC on the test individuals.
  aucs <- apply(preds, 2, bigstatsr::AUC, target = pheno.all[ind.test])
  ind.max <- which.max(aucs)

  # fit$beta has one row per coefficient, the first one being the intercept.
  # Skip it (and the trailing covariates) so that the returned indices refer
  # to SNP columns instead of being shifted by one.
  n.snps <- ncol(G3) - ncol(covar.all)
  beta.snps <- fit$beta[1 + seq_len(n.snps), ind.max]

  tibble(
    method   = "biglasso",
    pred     = list(preds[, ind.max]),
    timing   = timing,
    alpha    = 1,
    set      = list(which(beta.snps != 0))
  )
}

# Parameter grid: one row per simulation (4 x 2 x 100 = 800 rows).
params.grid7 <- expand.grid(
  n.train = 6000,
  # Each element is c(<number of causal SNPs>, <"all" or "HLA">). c() coerces
  # both entries to character, hence the as.integer() when it is read back.
  par.causal = list(c(30, "all"), c(300, "all"), c(3000, "all"), c(30, "HLA")), 
  par.dist   = c("gaussian", "laplace"), 
  par.h2     = c(0.8), 
  par.model  = c("simple"),
  num.simu   = 1:100,
  stringsAsFactors = FALSE
)

if (!dir.exists("results7")) dir.create("results7")

# Run every simulation of the grid, writing one result file per grid row.
for (i in rows_along(params.grid7)) {
  
  # Skip simulations already computed, so that the loop can be resumed.
  res.file <- paste0("results7/simu_", i, ".rds")
  if (file.exists(res.file)) next
  
  params <- params.grid7[i, ]
  # par.causal is a list-column: unwrap the character vector of this row.
  par.causal <- params[["par.causal"]][[1]]
  
  # Simulate phenotypes
  simu_pheno <- get_pheno(
    G,    
    h2 = params[["par.h2"]], 
    M = as.integer(par.causal[1]), 
    # Candidate causal SNPs: every column of G, or only those in ind.HLA.
    ind.possible = `if`(par.causal[2] == "all", cols_along(G), ind.HLA),
    effects.dist = params[["par.dist"]], 
    model = params[["par.model"]]
  )
  pheno.all <- simu_pheno$pheno
  # Stored as a list-column so that it fits in the one-row data frame.
  params[["true_set"]] <- list(simu_pheno$set)
  
  # Split in training/test sets
  # No seed is set in this chunk: the split (and any randomness in
  # get_pheno()) differs between runs unless a seed is set elsewhere.
  ind.train <- sort(sample(n, size = params[["n.train"]]))
  ind.test <- setdiff(1:n, ind.train)
  # Only the phenotypes of the test individuals are saved with the results.
  params[["pheno"]] <- list(pheno.all[ind.test])
    
  # Get results from all methods
  # logit.CMSA() is called with alphas = 1 (presumably to restrict it to the
  # lasso penalty that logit.biglasso() uses -- confirm). logit.biglasso()
  # takes G3, i.e. the genotypes with the covariates appended as columns.
  res <- bind_rows(
    logit.CMSA(G, pheno.all, covar.all, ind.train, ind.test, "logit-simple", alphas = 1),
    logit.biglasso(G3, pheno.all, covar.all, ind.train, ind.test)
  )
  params[["res"]] <- list(res)
  # Expand the nested results so that the parameters are repeated on each
  # result row. NOTE(review): `.drop` belongs to the pre-1.0.0 tidyr interface
  # of unnest() and is deprecated in later versions.
  saveRDS(unnest(params, res, .drop = FALSE), file = res.file)
}

Simulations

Florian Privé

August 28, 2018

Methods’ functions

Simulations

Data

Scenario n°1 (with T-Trees)

Scenario n°1 (without T-Trees)

Scenario n°2 (dataset with only chromosome 6)

Scenario n°3 (varying training size)

Scenario n°1 (comparison with biglasso)