QC and imputation

This was done in another paper.

library(bigsnpr)
## Loading required package: bigstatsr
str(celiac <- snp_attach("backingfiles/celiacQC.rds"), max.level = 1)
## List of 3
##  $ genotypes:Reference class 'FBM.code256' [package "bigstatsr"] with 7 fields
##   ..and 21 methods, of which 7 are  possibly relevant
##  $ fam      :'data.frame':   15155 obs. of  6 variables:
##  $ map      :'data.frame':   281122 obs. of  6 variables:
##  - attr(*, "class")= chr "bigSNP"

Add population from external files

# Get population
pop.files <- list.files(path = "backingfiles", 
                        pattern = "cluster_*", full.names = TRUE)
pop <- snp_getSampleInfos(celiac, pop.files)[[1]]
pop.names <- c("Netherlands", "Italy", "UK1", "UK2", "Finland")
celiac$fam$family.ID <- pop.names[pop]
celiac <- snp_save(celiac)
# Read modified file to check
celiac <- snp_attach("backingfiles/celiacQC.rds")
rle(celiac$fam$family.ID)
## Run Length Encoding
##   lengths: int [1:5] 6736 3325 2436 1623 1035
##   values : chr [1:5] "UK2" "UK1" "Finland" "Netherlands" "Italy"

Filter samples based on population structure

# Compute PCA
obj.svd <- snp_autoSVD(celiac$genotypes, 
                       celiac$map$chromosome,
                       celiac$map$physical.pos, 
                       ncores = nb_cores())
## Phase of clumping at r2 > 0.2.. keep 94520 SNPs.
## 
## Iteration 1:
## Computing SVD..
## 3 long-range LD regions were detected..
## 
## Iteration 2:
## Computing SVD..
## 2 long-range LD regions were detected..
## 
## Iteration 3:
## Computing SVD..
## 
## Converged!
library(ggplot2)
plot(obj.svd, type = "scores", scores = 1:2) + 
  aes(color = celiac$fam$family.ID) + 
  labs(color = "Population")

# Which are UK controls?
ind.UKcontrols <- which(startsWith(celiac$fam$family.ID, "UK") & 
                          celiac$fam$affection == 1)
obj.svd$u[-ind.UKcontrols, ] <- NA
# Compute Mahalanobis distance on PCs
distM <- rep(NA, nrow(celiac$genotypes))
distM[ind.UKcontrols] <- robust::covRob(obj.svd$u[ind.UKcontrols, ])$dist
THR <- 30
# Plot UK controls, coloring with Mahalanobis distance
library(magrittr)
p_list <- lapply(list(1:2, 3:4, 5:6, 7:8), function(scores) {
  plot(obj.svd, type = "scores", scores = scores, coeff = 0.7) + 
    aes(color = (distM > THR)) + 
    labs(color = "Outlier?") +
    scale_color_manual(breaks = c(TRUE, FALSE), values = scales::hue_pal()(2))
}) 
lapply(p_list, function(p) p + theme(legend.position = "none")) %>%
  cowplot::plot_grid(plotlist = ., ncol = 2, align = "hv", scale = 0.95) %>%
  cowplot::plot_grid(cowplot::get_legend(p_list[[1]]),
                     rel_widths = c(1, 0.15))

ggsave("figures/outliers-pop.pdf", scale = 1/90, width = 870, height = 650)
# New dataset with only non-outliers UK controls
length(ind.keep <- which(distM < THR))
## [1] 7100
celiac2 <- snp_attach(subset(celiac, ind.row = ind.keep))
obj.svd2 <- snp_autoSVD(celiac2$genotypes, 
                        celiac2$map$chromosome,
                        celiac2$map$physical.pos,
                        ncores = nb_cores())
## Phase of clumping at r2 > 0.2.. keep 94138 SNPs.
## 
## Iteration 1:
## Computing SVD..
## 4 long-range LD regions were detected..
## 
## Iteration 2:
## Computing SVD..
## 3 long-range LD regions were detected..
## 
## Iteration 3:
## Computing SVD..
## 
## Converged!
plot(obj.svd2)

plot(obj.svd2, type = "scores") 

# Save this new PCA
saveRDS(obj.svd2, "backingfiles/PCA2.rds")
ggsave("figures/no-pop.pdf", scale = 1/80, width = 620, height = 570)