## clear existing workspce rm(list=ls()) ## load packages library(tgp) library(stringr) library(Hmisc) # make sure that THERE IS ONLY ONE .csv FILE IN DIRECTORY! files <- list.files(path = getwd(), pattern = "data_for_analysis.csv", all.files = FALSE, full.names = FALSE, recursive = FALSE, ignore.case = TRUE, include.dirs = FALSE) ## import dataset import1 <- read.table(files[1], sep=",", header=TRUE) import2 <- data.frame(lapply(import1, as.character), stringsAsFactors=FALSE, header=FALSE) # find nearest.delta.up and nearest.delta.down and nearest.delta for(i in 1: nrow(import2)){ row <- import2[i,] delta.up <- row$delta.up strand.up <- row$strand.up delta.down <- row$delta.down strand.down <- row$strand.down delta.up.split <- strsplit(as.character(delta.up),"\\|") strand.up.split <- strsplit(as.character(strand.up),"\\|") # make data frame of delta and strand values for upstream PQS for(j in 1:length(delta.up.split[[1]])){ # do this if first time through loop if(j == 1){ u <- data.frame(as.numeric(delta.up.split[[1]][j]),strand.up.split[[1]][j]) } if(j > 1){ u <- rbind(u, data.frame(as.numeric(delta.up.split[[1]][j]),strand.up.split[[1]][j])) } } delta.down.split <- strsplit(as.character(delta.down),"\\|") strand.down.split <- strsplit(as.character(strand.down),"\\|") # make data frame of delta and strand values for downstream PQS for(j in 1:length(delta.down.split[[1]])){ # do this if first time through loop if(j == 1){ d <- data.frame(as.numeric(delta.down.split[[1]][j]),strand.down.split[[1]][j]) } if(j > 1){ d <- rbind(d, data.frame(as.numeric(delta.down.split[[1]][j]),strand.down.split[[1]][j])) } } # change colnames for dataframes colnames(u) <- colnames(d) <- c("delta","strand") # change factors to character designations u$strand <- as.character(u$strand) d$strand <- as.character(d$strand) # change strand designations to indicate template and non-template strands if(row$strand.x[1] == "+"){ u$strand[u$strand == "+"] <- "nt" u$strand[u$strand == "-"] <- "t" d$strand[d$strand == "+"] <- "nt" d$strand[d$strand == "-"] <- "t" } if(row$strand.x[1] == "-"){ u$strand[u$strand == "+"] <- "t" u$strand[u$strand == "-"] <- "nt" d$strand[d$strand == "+"] <- "t" d$strand[d$strand == "-"] <- "nt" } # rank "u" and "d" dataframes by abs(delta) if(nrow(u) > 1){ rank.u <- u[with(u, order(-delta)), ] } if(nrow(d) > 1){ rank.d <- d[with(d, order(delta)), ] } if(nrow(u) <= 1){ rank.u <- u } if(nrow(d) <= 1){ rank.d <- d } nearest.up <- rank.u[1,]$delta strand.up <- rank.u[1,]$strand nearest.down <- rank.d[1,]$delta strand.down <- rank.d[1,]$strand if(is.na(nearest.up) == FALSE & is.na(nearest.down) == FALSE){ if(abs(nearest.up) >= nearest.down){ nearest <- nearest.down strand <- strand.down } if(abs(nearest.up) < nearest.down){ nearest <- nearest.up strand <- strand.up } } if(is.na(nearest.up) == FALSE & is.na(nearest.down) == TRUE){ nearest <- nearest.up strand <- strand.up } if(is.na(nearest.up) == TRUE & is.na(nearest.down) == FALSE){ nearest <- nearest.down strand <- strand.down } if(is.na(nearest.up) == TRUE & is.na(nearest.down) == TRUE){ nearest <- NA strand <- NA } if(i == 1){ nearest.df <- data.frame(nearest, strand) } if(i > 1){ nearest.df <- rbind(nearest.df, data.frame(nearest, strand)) } print(i) } colnames(nearest.df) <- c("nearest","nearest.strand") import <- cbind(import2, nearest.df) no.quad <- import[with(import, (as.numeric(import$quad.num.up)+as.numeric(import$quad.num.down)) == 0), ] quad <- import[with(import, (as.numeric(import$quad.num.up)+as.numeric(import$quad.num.down)) >= 1), ] cq <- quad[,c(1,3,25,34,36,45,61,62)] colnames(cq)[1] <- "tss" ## split 0.quad data into training and test sets splitdf <- function(dat, seed=NULL) { if (!is.null(seed)) set.seed(seed) index <- 1:nrow(dat) trainindex <- sample(index, trunc(length(index)/2)) trainset <- dat[trainindex, ] testset <- dat[-trainindex, ] list(trainset=trainset,testset=testset) } dat <- splitdf(no.quad, seed=13) train <- dat$trainset test1 <- dat$testset # add test1 dataset back to quad dataset test <- rbind(test1, quad) ## extract n X 1 matrix for "Y" for training Z.train1 <- as.numeric(as.vector(train$log.exp)) Z.train2 <- Z.train1/sd(Z.train1) Z.sd <- sd(Z.train1) Z.train3 <- Z.train2-mean(Z.train2) Z.mean <- mean(Z.train2) Z.train <- as.matrix(Z.train3) ## extract n x d matrix for "X" for training l <- length(train) train.list <- c(46,seq(from=49,to=59, by=1)) # scaletraining data so that distribution is centered on 0 and STDEV=1 names <- colnames(train)[train.list] sd.out <- NULL mean.out <- NULL for(j in 1:12){ data <- as.numeric(as.vector(train[,train.list[j]])) # adjust so that STDEV of data is 1 scale1 <- data/sd(data) sd.out <- c(sd.out, sd(data)) # adjust so that mean of data is 0 scale2 <- scale1-mean(scale1) mean.out <- c(mean.out, mean(scale1)) scale.df <- as.data.frame(scale2) if(j == 1){ scale.out.df <- scale.df } if(j > 1){ scale.out.df <- cbind(scale.out.df, scale.df) } print(j) } colnames(scale.out.df) <- names X.df.train <- scale.out.df X.train <- as.matrix(X.df.train) improve <- FALSE krige <- FALSE Ds2x <- FALSE trace <- FALSE zcov <- FALSE export.ls <- list() get.rmse <- list() get.quantile <- list() get.quantile.div.pe <- list() select <- test # scale the input datasets so that mean is 0 and STDEV is 1 for(j in 1:12){ data <- as.numeric(as.vector(select[,train.list[j]])) # adjust so that STDEV of data is 1 scale1 <- data/sd(data) sd.out <- c(sd.out, sd(data)) # adjust so that mean of data is 0 scale2 <- scale1-mean(scale1) mean.out <- c(mean.out, mean(scale1)) scale.df <- as.data.frame(scale2) if(j == 1){ scale.out.df <- scale.df } if(j > 1){ scale.out.df <- cbind(scale.out.df, scale.df) } print(j) } colnames(scale.out.df) <- names X.df.test <- scale.out.df X.test <- as.matrix(X.df.test) ## extract n X 1 matrix for "Y" for test dataset Z.test1 <- as.numeric(as.vector(select$log.exp)) Z.test2 <- Z.test1/Z.sd Z.test3 <- Z.test2-Z.mean Z.test <- as.matrix(Z.test3) # Fitting ----------------------------------------------------------------- ## Baysian linear regression model1 <- blm(X.train, Z.train, XX = X.test, meanfn = "linear", bprior = "bflat",BTE = c(1000, 4000, 3), R = 1, m0r1 = TRUE, itemps = NULL, pred.n = TRUE, krige = krige, zcov = TRUE, Ds2x = Ds2x, improv = improve, sens.p = NULL, trace = trace, verb = 1) ## Bayesian treed linear model # model2 <- btlm(X.train, Z.train, XX = X.test, meanfn = "linear", bprior = "bflat", # tree = c(0.5, 2), BTE = c(2000, 7000, 2), R = 1, m0r1 = TRUE, # itemps = NULL, pred.n = TRUE, krige = krige, zcov = TRUE, # Ds2x = Ds2x, improv = improve, sens.p = NULL, trace = trace, verb= 1) # # ## Treed constant model # model3 <- bcart(X.train, Z.train, XX = X.test, bprior = "bflat", tree = c(0.5, 2), # BTE = c(2000, 7000, 2), R = 1, m0r1 = TRUE, itemps = NULL, # pred.n = TRUE, krige = krige, zcov = TRUE, Ds2x = Ds2x, # improv=improve, sens.p = NULL, trace = trace, verb = 1) # # # ## Gaussian process regression # model4 <- bgp(X.train, Z.train, XX = X.test, meanfn = "linear", bprior = "bflat", # corr = "expsep", BTE = c(1000, 4000, 2), R = 1, m0r1 = TRUE, # itemps = NULL, pred.n = TRUE, krige = krige, zcov = TRUE, # Ds2x = Ds2x, improv = improve, sens.p = NULL, nu = 1.5, # trace = trace, verb = 1) # # ## Gaussian process with jumps to limiting linear model # model5 <- bgpllm(X.train, Z.train, XX = X.test, meanfn = "linear", bprior = "bflat", # corr = "expsep", gamma=c(10,0.2,0.7), BTE = c(1000, 4000, 2), # R = 1, m0r1 = TRUE, itemps = NULL, pred.n = TRUE, # krige = krige, zcov = TRUE, Ds2x = Ds2x, improv = improve, # sens.p = NULL, nu = 1.5, trace = trace, verb = 1) # # ## treed Gaussian process # model6 <- btgp(X.train, Z.train, XX = X.test, meanfn = "linear", bprior = "bflat", # corr = "expsep", tree = c(0.5, 2), BTE = c(2000, 7000, 2), # R = 1, m0r1 = TRUE, linburn = FALSE, itemps = NULL, # pred.n = TRUE, krige = krige, zcov = TRUE, Ds2x = Ds2x, # improv = improve, sens.p = NULL, nu = 1.5, trace = trace, # verb = 1) # # ## treed Gaussian process with jumps to limiting linear model # model7 <- btgpllm(X.train, Z.train, XX = X.test, meanfn = "linear", bprior = "bflat", # corr = "expsep", tree = c(0.5, 2), gamma=c(10,0.2,0.7), # BTE = c(2000, 7000, 2), R = 1, m0r1 = TRUE, linburn = FALSE, # itemps = NULL, pred.n = TRUE, krige = krige, zcov = TRUE, # Ds2x = Ds2x, improv = improve, sens.p = NULL, nu = 1.5, # trace = trace, verb = 1) # Analysis Loop ----------------------------------------------------------- model <- model1 ## "zp"= predictions at X (input training locations) zp.mean <- as.data.frame(model$Zp.mean) zp.mean2 <- as.vector(zp.mean[,1]) zp.q <- as.vector(as.data.frame(model$Zp.q)) zp.q1 <- as.vector(as.data.frame(model$Zp.q1)) zp.q2 <- as.vector(as.data.frame(model$Zp.q2)) zp.pe <- as.vector(Z.train)-zp.mean2 ## "zz"= predictions at XX (test dataset) zz.mean <- as.data.frame(model$ZZ.mean) zz.mean2 <- as.vector(zz.mean[,1]) zz.q <- as.vector(as.data.frame(model$ZZ.q)) zz.q1 <- as.vector(as.data.frame(model$ZZ.q1)) zz.q2 <- as.vector(as.data.frame(model$ZZ.q2)) zz.pe <- as.vector(Z.test)-zz.mean2 ## export selected parameters from single fit run to .csv export <- data.frame(Z.train, as.data.frame(zp.mean2), as.data.frame(zp.pe), zp.q, zp.q1, zp.q2) export2 <- data.frame(Z.test, as.data.frame(zz.mean2), as.data.frame(zz.pe), zz.q, zz.q1, zz.q2) export.ls[[i]] <- export2 file.name <- as.character("tgp_data.csv") # plot experimental verses predicted expression values pdf(file="fitting.pdf") plot(export2[,2]*Z.sd, export2[,1]*Z.sd, type= "p",xlab= "Predicted Expression (log2)", ylab= "Measured Expression (log2)", pch=16, xlim=c(-9,10), ylim=c(-9,13),cex.axis=1.5,cex.lab=1.5,font=2,font.lab=2) p <- cor.test(export2[,1], export2[,2], alternative = c("two.sided", "less", "greater"), method = c("pearson", "kendall", "spearman"), exact = NULL, conf.level = 0.95, continuity = FALSE) rmse <- sqrt(sum(zz.pe^2)/length(zz.pe))*Z.sd par(font=2) legend("topleft", legend= c(paste("Correlation=", round(p$estimate,2)), paste("P-Value=",p$p.value), paste("RMSE=",round(rmse,1))), bty="n",cex=1.5) dev.off() # export model hyperparameters par1 <- model$trees par <- par1[[1]][13:24] x <- barplot(as.numeric(par),names.arg=NULL, angle = 45, col = NULL, border = NULL, main = NULL, sub = NULL, ylab = NULL, xlim = NULL, ylim = NULL, xpd = TRUE, axes = TRUE, axisnames = TRUE, cex.axis = 1.3, cex.names = 0.8,cex.lab=1.3, plot = FALSE, axis.lty = 0,las=3) pdf("Regression Model Parameters.pdf") barplot(as.numeric(par),names.arg=NULL, angle = 45, col = NULL, border = NULL, main = NULL, sub = NULL, ylab = "Contribution to Model", xlim = NULL, ylim = c(-0.3,0.3), xpd = TRUE, axes = TRUE, axisnames = TRUE, cex.axis = 1.3, cex.names = 0.8,cex.lab=1.3, plot = TRUE, axis.lty = 0,las=3,font.lab=2,font=2) labs <- c("GC Fraction","DNAse I","H3K79me2","H2A.Z", "H3K4me3","H3K4me2","H4K20me1","H3K27ac","H3K36me3","H3K27me3","H3K9ac","H3K4me1") text(cex=1.3, x=x-.9, y=-0.33, labs, xpd=TRUE, srt=45,cex.lab=1.5,font=2) dev.off() # write model paramters to file write.table(par, file="model parameters.csv",sep=",",col.names=colnames(par),row.names=FALSE) # make dataset containing input and output parameters full.dat <- cbind(export2, test) # split full dataset into TSS with at least 1 PQS within 1 kb of the TSS, and those with none quad.only.dat <- full.dat[with(full.dat, is.na(nearest) == FALSE), ] no.quad.dat <- full.dat[with(full.dat, is.na(nearest) == TRUE), ] # write datasets to file for future reference write.table(full.dat, file = "full_test_dataset.csv", sep=",",row.names = FALSE, col.names = colnames(full.dat)) write.table(quad.only.dat, file = "quad_only_test_dataset.csv", sep=",",row.names = FALSE, col.names = colnames(quad.only.dat)) write.table(no.quad.dat, file = "no.quad_test_dataset.csv", sep=",",row.names = FALSE, col.names = colnames(no.quad.dat)) # create histograms showing the prediction error for each of these datasets no.quad.h <- hist(no.quad.dat$zz.pe*Z.sd,breaks=seq(from=-40,to=40,by=0.2),plot=FALSE) quad.h <- hist(quad.only.dat$zz.pe*Z.sd,breaks=seq(from=-40,to=40,by=0.2),plot=FALSE) # calculate means and STDEV for these distributions no.quad.m <- mean(no.quad.dat$zz.pe)*Z.sd no.quad.sd <- sd(no.quad.dat$zz.pe)*Z.sd quad.m <- mean(quad.only.dat$zz.pe)*Z.sd quad.sd <- sd(quad.only.dat$zz.pe)*Z.sd # calculate RMSE for the predictions rmse.no.quad <- sqrt(sum((no.quad.dat$zz.mean2-no.quad.dat$Z.test)^2)/nrow(no.quad.dat))*Z.sd rmse.quad <- sqrt(sum((quad.only.dat$zz.mean2-quad.only.dat$Z.test)^2)/nrow(quad.only.dat))*Z.sd pdf("PE histograms.pdf") par(mfrow=c(2,1)) c <- 1.5 c2 <- 1.5 c3 <- 1.5 plot(no.quad.h, xlab="Prediction Error (log2)", main="Prediction Errors: TSS Near No PQS",xlim=c(-10,10),col="gray", cex.axis=c,cex.lab=c,cex.main=c3,font.lab=2,font.main=2,font=2) par(font=2) legend("topleft", legend=c(paste("Mean=",round(no.quad.m,2)), paste("STDEV=",round(no.quad.sd,1)), paste("RMSE=",round(rmse.no.quad,1))), bty="n",cex=c2) plot(quad.h, xlab="Prediction Error (log2)", main="Prediction Errors: TSS Near at Least One PQS",xlim=c(-10,10),col="gray", cex.axis=c,cex.lab=c,cex.main=c3,font.lab=2,font.main=2,font=2) par(font=2) legend("topleft", legend=c(paste("Mean=",round(quad.m,2)), paste("STDEV=",round(quad.sd,1)), paste("RMSE=",round(rmse.quad,1))), bty="n",cex=c2) dev.off() par(mfrow=c(1,1)) # extract zero quad test dataset for calculating prediction error adjustment pe <- (no.quad.dat$Z.test-no.quad.dat$zz.mean2) pe.adjust <- pe-mean(pe) # plot PE dataset dat1 <- quad.only.dat[with(quad.only.dat, abs(nearest) < 2000), ] dat <- dat1[with(dat1, (as.numeric(quad.num.up)+as.numeric(quad.num.down)) > 0), ] # Y is the prediction error for the selected dataset "dat" minus the pe adjustment # calculated for the zero.quad data Y <- ((dat$Z.test-dat$zz.mean2)-mean(pe))*Z.sd X <- dat$nearest pdf(file="Raw_PE_Dataset.pdf") plot(X,Y, type="p", pch=16, cex=0.6,ylab="Prediction Error (log2)", xlab="Distance Downstream of TSS (bp)",cex.axis=1.3,cex.lab=1.3) abline(h=0) dev.off() ############################################################################################################# # map of quadruplex regulatory inflection points affecting gene expression - analyze template and non-template strands together width <- 200 by=10 map.points <- seq(from=-2000+width/2, to=2000-width/2, by=by) min.p.out <- NULL ctl.p.out.out <- NULL # generate dataframe of zero.quad PE and assign a "1" classifier ctlY <- data.frame(pe.adjust, rep(1,length(pe.adjust))) colnames(ctlY) <- c("prediction.error","classifier") for(j in 1:100){ print("loop number=") print(j) p2.out <- NULL pe.out <- NULL ctl.p.out <- NULL # generate control dataset (for judging statistical significance) ctl.dat <- quad.only.dat # replace the PE value in ctl.dat with randomly picked PE values from the no.quad dataset ctl.dat$zz.pe <- sample(pe.adjust, nrow(ctl.dat), replace=TRUE) # loop through the PQS bins specified above for(i in map.points){ # pick portion of dataset within desired bin dat1 <- quad.only.dat[with(quad.only.dat, abs(nearest-i) < width/2), ] dat <- dat1[with(dat1, (as.numeric(quad.num.up)+as.numeric(quad.num.down)) > 0), ] # Y is the prediction error for the selected dataset "dat" minus the pe adjustment calculated for the zero.quad data Y <- (dat$Z.test-dat$zz.mean2)-mean(pe) # export prediction errors pe.out <- c(pe.out, mean(Y)) # 1 way ANOVA comparing PE in this bin to the PE of the no.quad dataset testY <- data.frame(Y, rep(2, length(Y))) colnames(testY) <- c("prediction.error","classifier") anova.df <- rbind(ctlY, testY) p2 <- oneway.test(prediction.error ~ classifier, data=anova.df) p2.out <- c(p2.out, p2$p.value) # control sample of the same size as dat dat2 <- ctl.dat[with(ctl.dat, abs(nearest-i) < width/2), ] Y2 <- dat2$zz.pe ctl.sampleY <- data.frame(Y2, rep(2, length(Y2))) colnames(ctl.sampleY) <- c("prediction.error","classifier") anova.ctl.df <- rbind(ctlY, ctl.sampleY) ctl.p <- oneway.test(prediction.error ~ classifier, data=anova.ctl.df) ctl.p.out <- c(ctl.p.out, ctl.p$p.value) #print(i) } print(min(ctl.p.out)) plot(map.points, ctl.p.out, log="y",ylim=c(1E-5,1)) points(map.points, p2.out, pch=4) abline(h=0.01) min.p.out <- c(min.p.out, min(ctl.p.out)) ctl.p.out.out <- c(ctl.p.out.out, ctl.p.out) } h <- hist(ctl.p.out.out, breaks=seq(from=0, to=1, by=0.01), plot=FALSE) # plot histogram of min p values plot(h$mids,h$counts, xlim=c(0,0.2),type="l") # calculate p-value threshold for 1% chance of a p-value being randomly below this value fdr.thresh <- 0.01 fdr1 <- sort(ctl.p.out.out, decreasing=FALSE) fdr.coord <- floor(length(fdr1)*fdr.thresh) # p.cut is the p-value threshold, below which there is a 1% chance of a a point in a # randomly-sampled dataset achieving significance p.cut <- fdr1[fdr.coord] print(p.cut) # calculate # of discoveries in test dataset discoveries <- p2.out[p2.out <= p.cut] disc.rate <- length(discoveries)/length(p2.out) # calculate FDR (proportion of discoveries that are likely "false") fdr <- fdr.thresh/disc.rate # export FDR and threshold P-Value to file for future use sink("FDR_thresh-strand aggregates.txt") print("FD fraction of data points=") print(fdr.thresh) print("threshold p-value for specified FDR=") print(p.cut) print("number of false discoveries=") print(length(map.points)*fdr.thresh) print("number of discoveries in test dataset=") print(length(discoveries)) print("discovery fraction=") print(disc.rate) print("FDR=") print(fdr) sink() # calculate subset of map.points for which the p-value is below the specified threshold for FDR=0.01 # goal is to plot shaded areas on PE plot where data is statistically significant sig1.df <- data.frame(seq(from=1, to=length(map.points), by=1),map.points, pe.out, p2.out) # keep only entries below the significance threshold sig2.df <- sig1.df[with(sig1.df, p2.out <= p.cut),] colnames(sig2.df) <- c("number","map.points","pe.out","p2.out") # identify continuous regions (plot each continuous region separately) cont.start <- NULL for(i in 1:nrow(sig2.df)){ if(i ==1){ sig.start <- sig2.df[i,]$number cont.start <- i sig.run <- sig.start } if(i > 1){ num <- sig2.df[i,]$number # do this if the row is the beginning of a new run if(num != sig.run+1){ sig.start <- sig2.df[i,]$number cont.start <- c(cont.start,i) sig.run <- sig.start } # do this if the row is part of a continuous run if(num == sig.run+1){ sig.run <- sig2.df[i,]$number } } print(i) } # split sig2.df into continuous regions regions <- list() for(i in 1:length(cont.start)){ if(i < length(cont.start)){ df <- sig2.df[(cont.start[i]:(cont.start[i+1]-1)),] regions[i][[1]] <- df } if(i == length(cont.start)){ df <- sig2.df[(cont.start[i]:nrow(sig2.df)),] regions[i][[1]] <- df } } # define series of x and y coordinates for plotting shaded regions cord.xl <- list() cord.yl <- list() for(i in 1:length(regions)){ df <- regions[i][[1]] # do this if the df does not start at the leftmost moundary of the map or end at the rightmost boundary of the map if(df[1,]$map.points != -2000+width/2 & df[nrow(df),]$map.points != 2000-width/2){ min.x <- as.numeric(df[1,]$map.points)-by/2 max.x <- as.numeric(df[nrow(df),]$map.points)+by/2 cord.x <- c(min.x,min.x,as.numeric(df$map.points),max.x,max.x) cord.xl[i][[1]] <- cord.x low.outside <- as.numeric(sig1.df[with(sig1.df, map.points == (min.x-by/2)),]$pe.out) high.outside <- as.numeric(sig1.df[with(sig1.df, map.points == (max.x+by/2)),]$pe.out) cord.y <- c(0,mean(c(low.outside,as.numeric(df[1,]$pe.out))),as.numeric(df$pe.out),mean(c(high.outside,as.numeric(df[nrow(df),]$pe.out))),0) cord.yl[i][[1]] <- cord.y } # do this if the region if touching the lefmost boundary of the map if(df[1,]$map.points == -2000+width/2){ min.x <- as.numeric(df[1,]$map.points) max.x <- as.numeric(df[nrow(df),]$map.points)+by/2 cord.x <- c(min.x,as.numeric(df$map.points),max.x,max.x) cord.xl[i][[1]] <- cord.x low.outside <- as.numeric(sig1.df[with(sig1.df, map.points == (min.x-by/2)),]$pe.out) high.outside <- as.numeric(sig1.df[with(sig1.df, map.points == (max.x+by/2)),]$pe.out) cord.y <- c(0,as.numeric(df$pe.out),mean(c(high.outside,as.numeric(df[nrow(df),]$pe.out))),0) cord.yl[i][[1]] <- cord.y } # do this if the region is touching the rightmost boundary of the map if(df[nrow(df),]$map.points == 2000-width/2){ min.x <- as.numeric(df[1,]$map.points)-by/2 max.x <- as.numeric(df[nrow(df),]$map.points) cord.x <- c(min.x,min.x,as.numeric(df$map.points),max.x) cord.xl[i][[1]] <- cord.x low.outside <- as.numeric(sig1.df[with(sig1.df, map.points == (min.x-by/2)),]$pe.out) high.outside <- as.numeric(sig1.df[with(sig1.df, map.points == (max.x+by/2)),]$pe.out) cord.y <- c(0,mean(c(low.outside,as.numeric(df[1,]$pe.out))),as.numeric(df$pe.out),0) cord.yl[i][[1]] <- cord.y } } df.export <- data.frame(map.points, pe.out*Z.sd, p2.out) write.table(df.export, file="PE mapping (strand aggregate).csv",sep=",",col.names=colnames(df.export),row.names=FALSE) cq2 <- quad.only.dat[,c(7,9,31,40,42,51,67,68)] colnames(cq2) <- c("tss","tss.s","delta.up","strand.up","delta.down","strand.down","nearest","nearest.s") ############################################################################################################# # analyze + and - strands SEPARATELY!!!!! width <- 200 by=10 plus <- quad.only.dat[with(quad.only.dat, nearest.strand == "t"), ] minus <- quad.only.dat[with(quad.only.dat, nearest.strand == "nt"), ] map.points <- seq(from=-2000+width/2, to=2000-width/2, by=by) min.p.plus.out <- NULL min.p.minus.out <- NULL ctl.p.plus.out.out <- NULL ctl.p.minus.out.out <- NULL # generate dataframe of zero.quad PE and assign a "1" classifier ctlY <- data.frame(pe.adjust, rep(1,length(pe.adjust))) colnames(ctlY) <- c("prediction.error","classifier") # i=1 for plus strand analysis; i=2 for minus strand analysis for(i in 1:2){ # run 100 simulations for calculating p-value threshold for FDR=0.01 for(j in 1:100){ if(i ==1){ ctl.plus.dat <- plus ctl.plus.dat$zz.pe <- sample(pe.adjust, nrow(ctl.plus.dat), replace=TRUE) p.plus.out <- NULL p.plus.ctl.out <- NULL pe.plus.out <- NULL } if(i == 2){ ctl.minus.dat <- minus ctl.minus.dat$zz.pe <- sample(pe.adjust, nrow(ctl.minus.dat), replace=TRUE) p.minus.out <- NULL p.minus.ctl.out <- NULL pe.minus.out <- NULL } # calcualte p-value and PE along the TSS map for(k in map.points){ if(i == 1){ dat1 <- plus[with(plus, abs(nearest-k) < width/2), ] dat <- dat1[with(dat1, (as.numeric(quad.num.up)+as.numeric(quad.num.down)) > 0), ] Y <- (dat$Z.test-dat$zz.mean2)-mean(pe) pe.plus.out <- c(pe.plus.out, mean(Y, na.rm=TRUE)) testY <- data.frame(Y, rep(2, length(Y))) colnames(testY) <- c("prediction.error","classifier") anova.df <- rbind(ctlY, testY) p <- oneway.test(prediction.error ~ classifier, data=anova.df) p.plus.out <- c(p.plus.out, p$p.value) # control sample of the same size as dat dat2 <- ctl.plus.dat[with(ctl.plus.dat, abs(nearest-k) < width/2), ] Y2 <- dat2$zz.pe ctl.sampleY <- data.frame(Y2, rep(2, length(Y2))) colnames(ctl.sampleY) <- c("prediction.error","classifier") anova.ctl.df <- rbind(ctlY, ctl.sampleY) ctl.p.plus <- oneway.test(prediction.error ~ classifier, data=anova.ctl.df) p.plus.ctl.out <- c(p.plus.ctl.out, ctl.p.plus$p.value) } if(i == 2){ dat1 <- minus[with(minus, abs(nearest-k) < width/2), ] # set number of PQS per gene that is acceptable for inclusion dat <- dat1[with(dat1, (as.numeric(quad.num.up)+as.numeric(quad.num.down)) > 0), ] Y <- (dat$Z.test-dat$zz.mean2)-mean(pe) pe.minus.out <- c(pe.minus.out, mean(Y, na.rm=TRUE)) testY <- data.frame(Y, rep(2, length(Y))) colnames(testY) <- c("prediction.error","classifier") anova.df <- rbind(ctlY, testY) p <- oneway.test(prediction.error ~ classifier, data=anova.df) p.minus.out <- c(p.minus.out, p$p.value) # control sample of the same size as dat dat2 <- ctl.minus.dat[with(ctl.minus.dat, abs(nearest-k) < width/2), ] Y2 <- dat2$zz.pe ctl.sampleY <- data.frame(Y2, rep(2, length(Y2))) colnames(ctl.sampleY) <- c("prediction.error","classifier") anova.ctl.df <- rbind(ctlY, ctl.sampleY) ctl.p.minus <- oneway.test(prediction.error ~ classifier, data=anova.ctl.df) p.minus.ctl.out <- c(p.minus.ctl.out, ctl.p.minus$p.value) } } print("strand=") print(i) print("simulation=") print(j) print("................................") if(i == 1){ min.p.plus.out <- c(min.p.plus.out, min(p.plus.ctl.out)) ctl.p.plus.out.out <- c(ctl.p.plus.out.out, p.plus.ctl.out) plot(map.points, p.plus.ctl.out, log="y",ylim=c(1E-5,1), main=c(paste("plus",j))) points(map.points, p.plus.out, pch=4) } if(i == 2){ min.p.minus.out <- c(min.p.minus.out, min(p.minus.ctl.out)) ctl.p.minus.out.out <- c(ctl.p.minus.out.out, p.minus.ctl.out) plot(map.points, p.minus.ctl.out, log="y",ylim=c(1E-5,1), main=c(paste("minus",j))) points(map.points, p.minus.out, pch=4) } } } # calculate p-value threshold for FDR=0.01 (1% chance of a p-value in the random dataser being below this value) fdr.thresh <- 0.01 fdr.plus1 <- sort(ctl.p.plus.out.out, decreasing=FALSE) fdr.minus1 <- sort(ctl.p.minus.out.out, decreasing=FALSE) fdr.plus.coord <- floor(length(fdr.plus1)*fdr.thresh) fdr.minus.coord <- floor(length(fdr.minus1)*fdr.thresh) # p.cut is the p-value threshold, below which there is a 1% chance of a a point in a # randomly-sampled dataset achieving significance ... so 1% of the points that achieve signifance # in the test dataset are "false positives" p.plus.cut <- fdr.plus1[fdr.plus.coord] p.minus.cut <- fdr.minus1[fdr.minus.coord] print(p.plus.cut) print(p.minus.cut) # calculate # of discoveries in test dataset discoveries.plus <- p.plus.out[p.plus.out <= p.plus.cut] disc.rate.plus <- length(discoveries.plus)/length(p.plus.out) discoveries.minus <- p.minus.out[p.minus.out <= p.minus.cut] disc.rate.minus <- length(discoveries.minus)/length(p.minus.out) # calculate FDR (proportion of discoveries that are likely "false") fdr.plus <- fdr.thresh/disc.rate.plus fdr.minus <- fdr.thresh/disc.rate.minus # export FDR and threshold P-Value to file for future use sink("FDR_thresh-separate strands.txt") print("FD fraction of data points=") print(fdr.thresh) print("threshold p-value for specified FDR (+ strand)=") print(p.plus.cut) print("number of false discoveries (+ strand)=") print(length(map.points)*fdr.thresh) print("number of discoveries in test dataset (+ strand)=") print(length(discoveries.plus)) print("discovery fraction (+ strand)=") print(disc.rate.plus) print("FDR (+ strand)=") print(fdr.plus) print("") print("FD fraction of data points (- strand)=") print(fdr.thresh) print("threshold p-value for specified FDR (- strand)=") print(p.minus.cut) print("number of false discoveries (- strand)=") print(length(map.points)*fdr.thresh) print("number of discoveries in test dataset (- strand)=") print(length(discoveries.minus)) print("discovery fraction (- strand)=") print(disc.rate.minus) print("FDR (- strand)=") print(fdr.minus) sink() # calculate subset of map.points for which the p-value is below the specified threshold for p-value # goal is to plot shaded areas on PE plot where data is statistically significant sig1.plus.df <- data.frame(seq(from=1, to=length(map.points), by=1),map.points, pe.plus.out, p.plus.out) sig1.minus.df <- data.frame(seq(from=1, to=length(map.points), by=1),map.points, pe.minus.out, p.minus.out) # keep only entries below the significance threshold sig2.plus.df <- sig1.plus.df[with(sig1.plus.df, p.plus.out <= p.plus.cut),] sig2.minus.df <- sig1.minus.df[with(sig1.minus.df, p.minus.out <= p.minus.cut),] colnames(sig2.plus.df) <- colnames(sig2.minus.df) <- c("number","map.points","pe.out","p2.out") # identify continuous regions (plot each continuous region separately) cont.start.plus <- NULL for(i in 1:nrow(sig2.plus.df)){ if(i ==1){ sig.start <- sig2.plus.df[i,]$number cont.start.plus <- i sig.run <- sig.start } if(i > 1){ num <- sig2.plus.df[i,]$number # do this if the row is the beginning of a new run if(num != sig.run+1){ sig.start <- sig2.plus.df[i,]$number cont.start.plus <- c(cont.start.plus,i) sig.run <- sig.start } # do this if the row is part of a continuous run if(num == sig.run+1){ sig.run <- sig2.plus.df[i,]$number } } print(i) } cont.start.minus <- NULL for(i in 1:nrow(sig2.minus.df)){ if(i ==1){ sig.start <- sig2.minus.df[i,]$number cont.start.minus <- i sig.run <- sig.start } if(i > 1){ num <- sig2.minus.df[i,]$number # do this if the row is the beginning of a new run if(num != sig.run+1){ sig.start <- sig2.minus.df[i,]$number cont.start.minus <- c(cont.start.minus,i) sig.run <- sig.start } # do this if the row is part of a continuous run if(num == sig.run+1){ sig.run <- sig2.minus.df[i,]$number } } print(i) } # split sig2.df into continuous regions regions.plus <- list() for(i in 1:length(cont.start.plus)){ if(i < length(cont.start.plus)){ df <- sig2.plus.df[(cont.start.plus[i]:(cont.start.plus[i+1]-1)),] regions.plus[i][[1]] <- df } if(i == length(cont.start.plus)){ df <- sig2.plus.df[(cont.start.plus[i]:nrow(sig2.plus.df)),] regions.plus[i][[1]] <- df } } regions.minus <- list() for(i in 1:length(cont.start.minus)){ if(i < length(cont.start.minus)){ df <- sig2.minus.df[(cont.start.minus[i]:(cont.start.minus[i+1]-1)),] regions.minus[i][[1]] <- df } if(i == length(cont.start.minus)){ df <- sig2.minus.df[(cont.start.minus[i]:nrow(sig2.minus.df)),] regions.minus[i][[1]] <- df } } # define series of x and y coordinates for plotting shaded regions cord.xl.plus <- list() cord.yl.plus <- list() cord.xl.minus <- list() cord.yl.minus <- list() for(i in 1:length(regions.plus)){ df <- regions.plus[i][[1]] # do this if the df does not start at the leftmost moundary of the map or end at the rightmost boundary of the map if(df[1,]$map.points != -2000+width/2 & df[nrow(df),]$map.points != 2000-width/2){ min.x <- as.numeric(df[1,]$map.points)-by/2 max.x <- as.numeric(df[nrow(df),]$map.points)+by/2 cord.x <- c(min.x,min.x,as.numeric(df$map.points),max.x,max.x) cord.xl.plus[i][[1]] <- cord.x low.outside <- as.numeric(sig1.plus.df[with(sig1.plus.df, map.points == (min.x-by/2)),]$pe.out) high.outside <- as.numeric(sig1.plus.df[with(sig1.plus.df, map.points == (max.x+by/2)),]$pe.out) cord.y <- c(0,mean(c(low.outside,as.numeric(df[1,]$pe.out))),as.numeric(df$pe.out),mean(c(high.outside,as.numeric(df[nrow(df),]$pe.out))),0) cord.yl.plus[i][[1]] <- cord.y } # do this if the region if touching the lefmost boundary of the map if(df[1,]$map.points == -2000+width/2){ min.x <- as.numeric(df[1,]$map.points) max.x <- as.numeric(df[nrow(df),]$map.points)+by/2 cord.x <- c(min.x,as.numeric(df$map.points),max.x,max.x) cord.xl.plus[i][[1]] <- cord.x low.outside <- as.numeric(sig1.plus.df[with(sig1.plus.df, map.points == (min.x-by/2)),]$pe.out) high.outside <- as.numeric(sig1.plus.df[with(sig1.plus.df, map.points == (max.x+by/2)),]$pe.out) cord.y <- c(0,as.numeric(df$pe.out),mean(c(high.outside,as.numeric(df[nrow(df),]$pe.out))),0) cord.yl.plus[i][[1]] <- cord.y } # do this if the region is touching the rightmost boundary of the map if(df[nrow(df),]$map.points == 2000-width/2){ min.x <- as.numeric(df[1,]$map.points)-by/2 max.x <- as.numeric(df[nrow(df),]$map.points) cord.x <- c(min.x,min.x,as.numeric(df$map.points),max.x) cord.xl.plus[i][[1]] <- cord.x low.outside <- as.numeric(sig1.plus.df[with(sig1.plus.df, map.points == (min.x-by/2)),]$pe.out) high.outside <- as.numeric(sig1.plus.df[with(sig1.plus.df, map.points == (max.x+by/2)),]$pe.out) cord.y <- c(0,mean(c(low.outside,as.numeric(df[1,]$pe.out))),as.numeric(df$pe.out),0) cord.yl.plus[i][[1]] <- cord.y } } for(i in 1:length(regions.minus)){ df <- regions.minus[i][[1]] # do this if the df does not start at the leftmost moundary of the map or end at the rightmost boundary of the map if(df[1,]$map.points != -2000+width/2 & df[nrow(df),]$map.points != 2000-width/2){ min.x <- as.numeric(df[1,]$map.points)-by/2 max.x <- as.numeric(df[nrow(df),]$map.points)+by/2 cord.x <- c(min.x,min.x,as.numeric(df$map.points),max.x,max.x) cord.xl.minus[i][[1]] <- cord.x low.outside <- as.numeric(sig1.minus.df[with(sig1.minus.df, map.points == (min.x-by/2)),]$pe.out) high.outside <- as.numeric(sig1.minus.df[with(sig1.minus.df, map.points == (max.x+by/2)),]$pe.out) cord.y <- c(0,mean(c(low.outside,as.numeric(df[1,]$pe.out))),as.numeric(df$pe.out),mean(c(high.outside,as.numeric(df[nrow(df),]$pe.out))),0) cord.yl.minus[i][[1]] <- cord.y } # do this if the region if touching the lefmost boundary of the map if(df[1,]$map.points == -2000+width/2){ min.x <- as.numeric(df[1,]$map.points) max.x <- as.numeric(df[nrow(df),]$map.points)+by/2 cord.x <- c(min.x,as.numeric(df$map.points),max.x,max.x) cord.xl.minus[i][[1]] <- cord.x low.outside <- as.numeric(sig1.minus.df[with(sig1.minus.df, map.points == (min.x-by/2)),]$pe.out) high.outside <- as.numeric(sig1.minus.df[with(sig1.minus.df, map.points == (max.x+by/2)),]$pe.out) cord.y <- c(0,as.numeric(df$pe.out),mean(c(high.outside,as.numeric(df[nrow(df),]$pe.out))),0) cord.yl.minus[i][[1]] <- cord.y } # do this if the region is touching the rightmost boundary of the map if(df[nrow(df),]$map.points == 2000-width/2){ min.x <- as.numeric(df[1,]$map.points)-by/2 max.x <- as.numeric(df[nrow(df),]$map.points) cord.x <- c(min.x,min.x,as.numeric(df$map.points),max.x) cord.xl.minus[i][[1]] <- cord.x low.outside <- as.numeric(sig1.minus.df[with(sig1.minus.df, map.points == (min.x-by/2)),]$pe.out) high.outside <- as.numeric(sig1.minus.df[with(sig1.minus.df, map.points == (max.x+by/2)),]$pe.out) cord.y <- c(0,mean(c(low.outside,as.numeric(df[1,]$pe.out))),as.numeric(df$pe.out),0) cord.yl.minus[i][[1]] <- cord.y } } df.export <- data.frame(map.points, pe.plus.out*Z.sd, p.plus.out, pe.minus.out*Z.sd, p.minus.out) colnames(df.export) <- c("points","plus.pe","plus.p","minus.pe","minus.p") write.table(df.export, file="PE mapping (separate strands).csv",sep=",",col.names=colnames(df.export),row.names=FALSE) sink("dataset stats.txt") print("nrow(quad)") print(nrow(quad)) print("nrow(no.quad)") print(nrow(no.quad)) print("nrow(plus)") print(nrow(plus)) print("nrow(minus)") print(nrow(minus)) sink() ########### # massive plot of all six figures pdf("Running_PE (collective).pdf") par(mfcol=c(2,3)) # mar: margins- bottom, left, top, right; oma: c(bottom, left, top) size of the outer margins in lines of text par(mar = c(0, 0, 4, 0), oma = c(4, 8, 4, 1)) # tcl: length of tick marks as a fraction of the height ofa line of text, defaults to -0.5 par(tcl = -0.25) # mgp: margin line for axis title, axis labels, and axis lines (default c(3,1,0) par(mgp = c(3, 1, 0)) clegend <- 1.2 cmain <- 1.8 clabel <- 1 caxis <- 1.2 fline <- "azure3" vline <- 1 bcol <- "white" #################################################################################################################### ### plot the strand aggregate data plot(map.points,p2.out, type="l",log="y",lty=1,lwd=2,main="Both Strands", ylab="P-Value",xlab="Distance Downstream of TSS (bp)",xaxt="n",yaxt="n",cex.axis=caxis,cex.lab=clabel, font=2,font.main=2,font.lab=2,cex.main=cmain,ylim=c(1E-6,1)) abline(v=seq(-2000,2000,200),col=fline,lwd=1) axis(2, at=c(0.00001,0.0001,0.001,0.01,0.1,1),cex.axis=caxis,las=1,font=2) abline(h=p.cut,lty=3, lwd=2) abline(v=0,lty=1,lwd=vline) par(font=2) legend("bottomright",legend=c(paste("Threshold=",round(p.cut,3)," ")),lty=3,lwd=2,cex=clegend,bg="white",box.col=bcol) box(lwd=2) par(mar = c(4, 0, 0, 0)) plot(map.points, pe.out*Z.sd, type="l",lty=1,lwd=2,main="",xlab="", ylab="Prediction Error (log2)",yaxt="n",xaxt="n",cex.axis=caxis,cex.lab=clabel, ylim=c(-3,1.5),font=2,font.main=2,font.lab=2,cex.main=cmain) abline(v=seq(-2000,2000,200),col=fline,lwd=1) for(i in 1:length(regions)){ polygon(cord.xl[i][[1]],cord.yl[i][[1]]*Z.sd,col="black") } axis(2, at=c(-2.5,-2,-1.5,-1,-0.5,0,0.5,1,1.5),cex.axis=caxis,las=1,font=2) axis(1, at=seq(from=-1600, to=1600, by=400),cex.axis=caxis,font=2,las=2) abline(h=0,lty=1,lwd=2) abline(v=0,lty=1,lwd=vline) legend("bottomright",legend=c(paste("FDR=",round(fdr,3)," ")),cex=clegend,bty="n") box(lwd=2) ##################################################################################################################### ## plot template strand data par(mar = c(0, 0, 4, 0)) plot(map.points,p.plus.out, type="l",log="y",lty=1,lwd=2,main="TS", ylab="",xlab="",xaxt="n",yaxt="n",cex.axis=caxis,cex.lab=clabel,ylim=c(1E-6,1), font=2,font.lab=2,font.main=2,cex.main=cmain) abline(v=seq(-2000,2000,200),col=fline,lwd=1) par(font=2) abline(h=p.plus.cut,lty=3, lwd=2) abline(v=0,lty=1,lwd=vline) legend("bottomright",legend=c(paste("Threshold=",round(p.plus.cut,3)," ")),lty=3,lwd=2,cex=clegend, bg="white",box.col=bcol) box(lwd=2) par(mar = c(4, 0, 0, 0)) plot(map.points, pe.plus.out*Z.sd, type="l",lty=1,lwd=2,main="", xlab="",ylab="",yaxt="n",xaxt="n",cex.axis=caxis,cex.lab=clabel, ylim=c(-3,1.5),font=2,font.lab=2,font.main=2,cex.main=cmain) abline(v=seq(-2000,2000,200), col=fline,lwd=1) for(i in 1:length(regions.plus)){ polygon(cord.xl.plus[i][[1]],cord.yl.plus[i][[1]]*Z.sd,col="black") } axis(1, at=seq(from=-1600, to=1600, by=400),cex.axis=caxis,font=2,las=2) abline(h=0,lty=1,lwd=2) abline(v=0,lty=1,lwd=vline) legend("bottomright",legend=c(paste("FDR=",round(fdr.plus,3)," ")),cex=clegend, bty="n") box(lwd=2) ################################################################################################################## ################### now plot non-template strand par(mar = c(0, 0, 4, 0)) plot(map.points,p.minus.out, type="l",log="y",lty=1,lwd=2,main="Non-TS", ylab="",xlab="",xaxt="n",yaxt="n",cex.axis=caxis,cex.lab=clabel,ylim=c(1E-6,1), font=2,font.lab=2,font.main=2,cex.main=cmain) abline(v=seq(-2000,2000,200),col=fline,lwd=1) par(font=2) abline(h=p.minus.cut,lty=3, lwd=2) abline(v=0,lty=1,lwd=vline) legend("bottomright",legend=c(paste("Threshold=",round(p.minus.cut,3)," ")),lty=3,lwd=2,cex=clegend, bg="white", box.col=bcol) box(lwd=2) par(mar = c(4, 0, 0, 0)) plot(map.points, pe.minus.out*Z.sd, type="l",lty=1,lwd=2,main="", xlab="",ylab="",yaxt="n",xaxt="n",cex.axis=caxis,cex.lab=clabel, ylim=c(-3,1.5),font=2,font.lab=2,font.main=2,cex.main=cmain) abline(v=seq(-2000,2000,200), col=fline,lwd=1) for(i in 1:length(regions.minus)){ polygon(cord.xl.minus[i][[1]],cord.yl.minus[i][[1]]*Z.sd,col="black") } axis(1, at=seq(from=-1600, to=1600, by=400),cex.axis=caxis,font=2,las=2) abline(h=0,lty=1,lwd=2) abline(v=0,lty=1,lwd=vline) legend("bottomright",legend=c(paste("FDR=",round(fdr.minus,3)," ")),cex=clegend, bty="n") box(lwd=2) mtext("Distance Downstream of TSS (bp)", side = 1, line = 0, outer = TRUE, cex=1.3,padj=1) mtext(" Prediction Error (log2) P-Value", side = 2, line = 5, outer = TRUE, adj=0,cex=1.3) dev.off() ########### # massive plot, only four panels pdf("Running_PE (collective, 4 panel).pdf") par(mfcol=c(2,2)) # mar: margins- bottom, left, top, right; oma: c(bottom, left, top, right) size of the outer margins in lines of text par(mar = c(0, 0, 4, 0), oma = c(11, 8, 11, 1)) # tcl: length of tick marks as a fraction of the height ofa line of text, defaults to -0.5 par(tcl = -0.25) # mgp: margin line for axis title, axis labels, and axis lines (default c(3,1,0) par(mgp = c(3, 1, 0)) clegend <- 1.2 cmain <- 1.8 clabel <- 1 caxis <- 1.2 fline <- "azure3" vline <- 1 bcol <- 0 ##################################################################################################################### ## plot template strand data par(mar = c(0.5, 0, 4, 0)) plot(map.points,p.plus.out, type="l",log="y",lty=1,lwd=2,main="TS", ylab="",xlab="",xaxt="n",yaxt="n",cex.axis=caxis,cex.lab=clabel,ylim=c(1E-7,1), font=2,font.lab=2,font.main=2,cex.main=cmain) abline(v=seq(-2000,2000,200),col=fline,lwd=1) axis(2, at=c(0.000001,0.0001,0.01,1),cex.axis=caxis,las=1,font=2) par(font=2) abline(h=p.plus.cut,lty=3, lwd=2) abline(v=0,lty=1,lwd=vline) #legend("bottomright",legend=c(paste("Threshold=",round(p.plus.cut,3)," ")),lty=0,lwd=2,cex=clegend, bg=NULL,box.col=bcol) box(lwd=2) par(mar = c(4, 0, 0.5, 0)) plot(map.points, pe.plus.out*Z.sd, type="l",lty=1,lwd=2,main="", xlab="",ylab="",yaxt="n",xaxt="n",cex.axis=caxis,cex.lab=clabel, ylim=c(-2,1.5),font=2,font.lab=2,font.main=2,cex.main=cmain) abline(v=seq(-2000,2000,200), col=fline,lwd=1) for(i in 1:length(regions.plus)){ polygon(cord.xl.plus[i][[1]],cord.yl.plus[i][[1]]*Z.sd,col="black") } axis(2, at=c(-2,-1,0,1,2),cex.axis=caxis,las=1,font=2) axis(1, at=seq(from=-1600, to=1600, by=400),cex.axis=caxis,font=2,las=2) abline(h=0,lty=1,lwd=1) abline(v=0,lty=1,lwd=vline) legend("bottomright",legend=c(paste("FDR=",round(fdr.plus,3)," ")),cex=clegend, bty="n") box(lwd=2) ################################################################################################################## ################### now plot non-template strand par(mar = c(0.5, 0, 4, 0)) plot(map.points,p.minus.out, type="l",log="y",lty=1,lwd=2,main="NTS", ylab="",xlab="",xaxt="n",yaxt="n",cex.axis=caxis,cex.lab=clabel,ylim=c(1E-7,1), font=2,font.lab=2,font.main=2,cex.main=cmain) abline(v=seq(-2000,2000,200),col=fline,lwd=1) par(font=2) abline(h=p.minus.cut,lty=3, lwd=2) abline(v=0,lty=1,lwd=vline) #legend("bottomright",legend=c(paste("Threshold=",round(p.minus.cut,3)," ")),lty=0,lwd=2,cex=clegend, bg=NULL, box.col=bcol) box(lwd=2) par(mar = c(4, 0, 0.5, 0)) plot(map.points, pe.minus.out*Z.sd, type="l",lty=1,lwd=2,main="", xlab="",ylab="",yaxt="n",xaxt="n",cex.axis=caxis,cex.lab=clabel, ylim=c(-2,1.5),font=2,font.lab=2,font.main=2,cex.main=cmain) abline(v=seq(-2000,2000,200), col=fline,lwd=1) for(i in 1:length(regions.minus)){ polygon(cord.xl.minus[i][[1]],cord.yl.minus[i][[1]]*Z.sd,col="black") } axis(1, at=seq(from=-1600, to=1600, by=400),cex.axis=caxis,font=2,las=2) abline(h=0,lty=1,lwd=1) abline(v=0,lty=1,lwd=vline) legend("bottomright",legend=c(paste("FDR=",round(fdr.minus,3)," ")),cex=clegend, bty="n") box(lwd=2) mtext("Distance Downstream of TSS (bp)", side = 1, line = -1, outer = TRUE, cex=1.3,padj=1) mtext(" P-Value", side = 2, line = 4.5, outer = TRUE, adj=0,cex=1.3) mtext(" Prediction Error", side = 2, line = 3, outer = TRUE, adj=0,cex=1.3) dev.off()