#######This code was pieced together from https://labs.genetics.ucla.edu/horvath/CoexpressionNetwork/Rpackages/WGCNA/ ####### Major portions were provided from Dr. Simon Roux from Dr. Matthew Sullivan's Lab at the Ohio State University. library(ggplot2) library(WGCNA) options(stringsAsFactors = FALSE) # If you have multiple threads enable this. # enableWGCNAThreads() setwd("path/to/directory") ab_matrix_raw<-as.data.frame(t(read.csv("OTU_WGCNA.csv", header=T, row.names=1))) #add this because we are already dealing with normalized data ab_matrix<- ab_matrix_raw # add meta data Metadata<-read.csv("Nut_WGCNA_062116.csv", header=T) ### Our data has already been hellinger transformed ### #ab_matrix_t_raw_hel<-decostand(Metadata_raw,method="hellinger") #Metadata<-log(Metadata + 1) # Natural log (default) #write.csv(Metadata, "Metadata_hellinger.csv") #Metadata<-read.csv("Metadata_hellinger.csv", header=T) ab_matrix_t<- datExpr Metadata<- datTraits # Not necessary to do every time but must be done in R not R studio. ## Identify beta to ensure scale free topology powers = c(seq(4,10,by=1), seq(12,20, by=2)) pickSoftThreshold(ab_matrix_t, powerVector=powers, blockSize = 20000, verbose=2) # We pick the power as soon as it starts to plateau # Create adjacency matrix by raising OTU matrix by beta and identify subnetworks (modules) net = blockwiseModules(ab_matrix_t, power=5, minModuleSize=30, maxBlockSize = 20000, corType = "pearson", saveTOMs = TRUE, saveTOMFileBase = "blockwiseTOM", pamStage=FALSE, verbose=5) # Plot the dendrogram moduleLabels = net$colors moduleColors = net$colors MEs = net$MEs geneTree = net$dendrograms[[1]] pdf("plotDendro.pdf") plotDendroAndColors(net$dendrograms[[1]], moduleColors[net$blockGenes[[1]]],"Module colors",dendroLabels = FALSE, hang = 0.03,addGuide = TRUE, guideHang = 0.05) dev.off() # Save data # Identify Eigenvalues for subnetworks by sample nPop = ncol(ab_matrix_t) nSamples = nrow(ab_matrix_t) MEsO = moduleEigengenes(ab_matrix_t, moduleColors)$eigengenes MEs = orderMEs(MEsO) save(MEs, moduleLabels, moduleColors, geneTree,file = "Module-networkConstruction-auto.RData") # Save data write.csv(file="Module_eigen_values.csv",MEs) write.csv(file="Module_composition.csv",net$colors) ##Correlate Eigenvalues to metadata and create heatmap moduleTraitCor = cor(MEs, Metadata, use = "p") moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples) textMatrix = paste(signif(moduleTraitCor, 2), " (",signif(moduleTraitPvalue, 1), ")", sep = "") dim(textMatrix) = dim(moduleTraitCor) pdf("Correlation.pdf",width=12,height=8) par(mar = c(12, 12, 3, 3)) labeledHeatmap(Matrix = moduleTraitCor,xLabels = names(Metadata),yLabels = names(MEs),ySymbols = names(MEs),colorLabels = FALSE,colors = blueWhiteRed(50),textMatrix = textMatrix,setStdMargins = FALSE,cex.text = 0.5,cex.lab = 0.5,zlim = c(-1,1),main = paste("Module-trait relationships")) dev.off() ## Now make a plot for specific module <-> trait (metadata component) pairings # This allows us to explore the structure of submodule OTU correlations with a given metadata component # Here we will use "PO4" as trait and "turquoise" as module, but these can be altered for whichever pair you wish to investigate # First get the links between all modules and this trait parameter<-"PO4" weight <- as.data.frame(Metadata[,parameter]) names(weight) = "PO4" modNames = substring(names(MEs), 3) geneModuleMembership = as.data.frame(cor(ab_matrix_t, MEs, use = "p")); MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples)); names(geneModuleMembership) = paste("MM", modNames, sep=""); names(MMPvalue) = paste("p.MM", modNames, sep=""); geneTraitSignificance = as.data.frame(cor(ab_matrix_t, weight, use = "p")); GSPvalue = as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples)); names(geneTraitSignificance) = paste("GS.", names(weight), sep=""); names(GSPvalue) = paste("p.GS.", names(weight), sep=""); # Then look at the specific module of interest module<-"turquoise" # module<-"red" column = match(module, modNames); moduleGenes = moduleColors==module pdf(paste(module,"-vs-",parameter,".pdf")) par(mfrow=c(1,1)) verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]),abs(geneTraitSignificance[moduleGenes, 1]),xlab = paste("Module Membership in", module, "module"),ylab = paste("Population significance for ",parameter),main = paste("Module membership vs. population significance\n"),cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module) dev.off() ##Incorporate OTU taxonomy info to identify specific OTUs within a module, coalesce correlative data by OTU for later use in network analysis annot = read.csv("TAX_WGCNA_062116.csv", colClasses = "character"); dim(annot) names(annot) probes = names(ab_matrix_t) probes2annot = match(probes, annot$X) # The following is the number or probes without annotation: sum(is.na(probes2annot)) # Should return 0. # Create the starting data frame geneInfo0 = data.frame(ab_matrix_t = probes, Genus = annot$Kingdom[probes2annot], names = annot$X[probes2annot], moduleColor = moduleColors, geneTraitSignificance, GSPvalue) # Order modules by their significance according to metadata component modOrder = order(-abs(cor(MEs, weight, use = "p"))); # Add module membership information in the chosen order for (mod in 1:ncol(geneModuleMembership)) { oldNames = names(geneInfo0) geneInfo0 = data.frame(geneInfo0, geneModuleMembership[, modOrder[mod]], MMPvalue[, modOrder[mod]]); names(geneInfo0) = c(oldNames, paste("MM.", modNames[modOrder[mod]], sep=""), paste("p.MM.", modNames[modOrder[mod]], sep="")) } # Order the genes in the geneInfo variable first by module color, then by geneTraitSignificance geneOrder = order(geneInfo0$moduleColor, -abs(geneInfo0$GS.PO4)); geneInfo = geneInfo0[geneOrder, ] write.csv(geneInfo, file = paste("OTUInfo_",module)) # Visualize the network # Too big with actual data # library(gplots) # dissTOM = 1-TOMsimilarityFromExpr(ab_matrix_t, power = 6); # Recomputed, could be taken from a previous computation though # popTree = net$dendrograms[[1]]; # plotTOM = dissTOM^7; # diag(plotTOM) = NA; # pdf("heatmap.pdf") # heatmap(as.matrix(plotTOM),Rowv=as.dendrogram(popTree,hang=0.1),Colv=as.dendrogram(popTree,hang=0.1),scale="none",ColSideColors=as.character(moduleColors),labRow=FALSE,labCol=FALSE,RowSideColors = as.character(moduleColors),revC=TRUE) # dev.off() # Get the similarity between eigen values and weight MET = orderMEs(cbind(MEs, weight)) pdf("Adjacency_trait_ME.pdf") par(cex = 1.0) plotEigengeneNetworks(MET, paste("Eigengene dendrogram with ",parameter), marDendro = c(0,4,2,0),plotHeatmaps = FALSE) par(cex = 1.0) plotEigengeneNetworks(MET, paste("Eigengene adjacency heatmap with ",parameter), marHeatmap = c(3,4,2,2),plotDendrograms = FALSE, xLabelsAngle = 90) dev.off() ################################################################################ # Moving to machine learning PLS and VIP scores # ################################################################################ library("pls") source("VIP.R") # metadata component (e.g. PO4, designated "weight" here as above) is the same as before, we just replicate the row names for pls th_r2<-0.3 subnetwork<-ab_matrix_t[,moduleGenes] row.names(weight)<-row.names(subnetwork) pls_result<-plsr(as.matrix(weight) ~ as.matrix(subnetwork), validation="LOO",method="oscorespls") r2_vector<-R2(pls_result) max<-0 max_comp<--1 for (j in 1:length(r2_vector$val)){ if(r2_vector$val[j]>th_r2){ # We will only look at the PLS if the correlation is better than 0.3 if(r2_vector$val[j]>max){ max<-r2_vector$val[j] max_comp<-r2_vector$comp[j] } } } print(paste(" the max r2 is ",max," corresponding to comp ",max_comp,sep="",".pdf")) if(max==0){ # print ("No good correlation, we stop here") } else{ print("Good correlation, we check the VIP !\n") } # Checking the VIP output<-paste("VIP_values_with_",parameter,sep="") vip_result<-VIP(pls_result) vip_components<-sort(vip_result[max_comp,],decreasing=TRUE)[1:220] for (i in 1:220){ cat(paste("Rank ",i," we have ",names(vip_components[i])," with a VIP of ",vip_components[i],"\n",sep=""),file=output,append=TRUE) } weight_2 <- as.data.frame(weight[!is.na(weight)]) df<-data.frame(x=weight_2,y=pls_result$validation$pred[,,max_comp]) colnames(df)<-c("x","y") pdf(paste("measured_vs_predicted_",module,"-vs-",parameter,".pdf")) ggplot(data=df) + geom_point(aes(x=x,y=y)) + geom_smooth(aes(x=x,y=y),method=lm) + xlab("Measured") + ylab("Predicted") + ggtitle(paste("Comparison of ",parameter," measured vs predicted for module ",module)) + theme(axis.text=element_text(color="black",size=10),axis.ticks=element_line(color="black")) dev.off() # Establish the correlation between predicted and modeled cor.test(df$x,df$y) ## Identify node centrality based on co-occurence data for each OTU in the module TOM = TOMsimilarityFromExpr(ab_matrix_t, power = 5, corType = "pearson"); # Select submodule of interest based on high correlation and signficance module<-"turquoise"; # Select module probes probes = names(ab_matrix_t) inModule = (moduleColors==module); modProbes = probes[inModule]; # Select the corresponding Topological Overlap modTOM = TOM[inModule, inModule]; dimnames(modTOM) = list(modProbes, modProbes) write.csv(modTOM, paste("nodeconnections_",parameter,".csv")) # Number of cells above 0.2 threshhold <-- this number is flexible and should change with your data x<- as.data.frame(rowSums(modTOM > 0.2)) write.csv(x, paste("nodes_",parameter,".csv")) # make scatter hive plots # You will need to make the Nodeworksheet by combining OTUinfo table for submodule of interest, VIP socres, and Nodeconnections. See workflow for more details. hive<- read.csv("Nodeworksheet_NO3.csv", header=T) (Figure 5-8) hive$OUT<-factor(hive$OUT, levels = unique(hive$OUT)) p<- ggplot(hive, aes(x= hive$Node_centrality, y= hive$R2)) + geom_point(aes(size = hive$VIP, colour = hive$Phylum)) + scale_size_area(max_size= 10) + theme_bw() p + geom_text(aes(label=hive$Best, hjust=-.05, size=0.5))+ labs(x="Node centrality", y="Correlation to Nitrate", color="Phylum")