This was done in another paper.
library(bigsnpr)
## Loading required package: bigstatsr
str(celiac <- snp_attach("backingfiles/celiacQC.rds"), max.level = 1)
## List of 3
## $ genotypes:Reference class 'FBM.code256' [package "bigstatsr"] with 7 fields
## ..and 21 methods, of which 7 are possibly relevant
## $ fam :'data.frame': 15155 obs. of 6 variables:
## $ map :'data.frame': 281122 obs. of 6 variables:
## - attr(*, "class")= chr "bigSNP"
# Get population
pop.files <- list.files(path = "backingfiles",
pattern = "cluster_*", full.names = TRUE)
pop <- snp_getSampleInfos(celiac, pop.files)[[1]]
pop.names <- c("Netherlands", "Italy", "UK1", "UK2", "Finland")
celiac$fam$family.ID <- pop.names[pop]
celiac <- snp_save(celiac)
# Read modified file to check
celiac <- snp_attach("backingfiles/celiacQC.rds")
rle(celiac$fam$family.ID)
## Run Length Encoding
## lengths: int [1:5] 6736 3325 2436 1623 1035
## values : chr [1:5] "UK2" "UK1" "Finland" "Netherlands" "Italy"
# Compute PCA
obj.svd <- snp_autoSVD(celiac$genotypes,
celiac$map$chromosome,
celiac$map$physical.pos,
ncores = nb_cores())
## Phase of clumping at r2 > 0.2.. keep 94520 SNPs.
##
## Iteration 1:
## Computing SVD..
## 3 long-range LD regions were detected..
##
## Iteration 2:
## Computing SVD..
## 2 long-range LD regions were detected..
##
## Iteration 3:
## Computing SVD..
##
## Converged!
library(ggplot2)
plot(obj.svd, type = "scores", scores = 1:2) +
aes(color = celiac$fam$family.ID) +
labs(color = "Population")
# Which are UK controls?
ind.UKcontrols <- which(startsWith(celiac$fam$family.ID, "UK") &
celiac$fam$affection == 1)
obj.svd$u[-ind.UKcontrols, ] <- NA
# Compute Mahalanobis distance on PCs
distM <- rep(NA, nrow(celiac$genotypes))
distM[ind.UKcontrols] <- robust::covRob(obj.svd$u[ind.UKcontrols, ])$dist
THR <- 30
# Plot UK controls, coloring with Mahalanobis distance
library(magrittr)
p_list <- lapply(list(1:2, 3:4, 5:6, 7:8), function(scores) {
plot(obj.svd, type = "scores", scores = scores, coeff = 0.7) +
aes(color = (distM > THR)) +
labs(color = "Outlier?") +
scale_color_manual(breaks = c(TRUE, FALSE), values = scales::hue_pal()(2))
})
lapply(p_list, function(p) p + theme(legend.position = "none")) %>%
cowplot::plot_grid(plotlist = ., ncol = 2, align = "hv", scale = 0.95) %>%
cowplot::plot_grid(cowplot::get_legend(p_list[[1]]),
rel_widths = c(1, 0.15))
ggsave("figures/outliers-pop.pdf", scale = 1/90, width = 870, height = 650)
# New dataset with only non-outliers UK controls
length(ind.keep <- which(distM < THR))
## [1] 7100
celiac2 <- snp_attach(subset(celiac, ind.row = ind.keep))
obj.svd2 <- snp_autoSVD(celiac2$genotypes,
celiac2$map$chromosome,
celiac2$map$physical.pos,
ncores = nb_cores())
## Phase of clumping at r2 > 0.2.. keep 94138 SNPs.
##
## Iteration 1:
## Computing SVD..
## 4 long-range LD regions were detected..
##
## Iteration 2:
## Computing SVD..
## 3 long-range LD regions were detected..
##
## Iteration 3:
## Computing SVD..
##
## Converged!
plot(obj.svd2)
plot(obj.svd2, type = "scores")
# Save this new PCA
saveRDS(obj.svd2, "backingfiles/PCA2.rds")
ggsave("figures/no-pop.pdf", scale = 1/80, width = 620, height = 570)