################ Potential coinfection analyses ################ #### 1. Data Sources and Preparing Data #### 2. Sources explaining variation in the level of potential coinfection #### 3. Plotting factors affecting the proportion of tested phage infecting hosts #### 4. Summary Statistics on Potential Coinfection #Load libraries library(ggplot2) library(XLConnect) library(reshape2) library(gridExtra) library(dplyr) #Simple function to calculate standar error of the mean source("sem.R") #### 1. Data Sources and Preparing Data #### #Analyzing data from Flores et al. 2011 Reference: #Flores, C. O., Meyer, J. R., Valverde, S., Farr, L., & Weitz, J. S. (2011). Statistical structure of host-phage interactions. #PNAS, 108(28), E288-E297. doi:10.1073/pnas.1101595108/ #Import phage-host infection matrices excel <- loadWorkbook(filename = "data/unprocessed/Flores_et_al_2011_data.xls") #Now want to offload these sheets into R workable format sheets <- readWorksheet(excel, sheet = getSheets(excel), header = FALSE) #getSheets(excel) gives you a vector with the names of the sheets in the excel file #Now sheets is a list with 38 data frames, each corresponding to a sheet in the file, i.e. # the infection matrix for each study. But these are data frames, not matrices, so need to change #lapply will apply a function to each element of a list, let's try with as.matrix() flores2011 <- lapply(sheets, as.matrix) is.matrix(sheets$'Zinno 2010') #[1] FALSE is.matrix(flores2011$'Zinno 2010') #[1] TRUE #Looks good. #Get the sum of all Rows from all the studies. #In this data set H=rows (one for each bacterial host) and P=columns (one for each phage) # I will primarily be interested in analyzing the rows, because these will indicate the # number of phages that can potentially interact within a cell, i.e. coinfect all_row_sums <- lapply(flores2011, rowSums) #Now make a data frame to manipulate and plot data phages_infecting <- data.frame(all_row_sums) #Doesn't work, see below #Data unequal lengths, maybe this isn't best way to plot, let's work on # summary statistics in list form and then take those summary stats into a data frame #First the name of the studies studies <- names(lapply(all_row_sums, max)) #Also going to use a numeric id for each study, to link with other metadata elsewhere in Flores et al. 2011 id <- 1:length(studies) #reorganize the list to make a 'tidy' data frame phages_infecting <- melt(all_row_sums) #Adjust the names names(phages_infecting) <- c("phages_infecting", "studies") #Creating a Summary Data Frame #number of phages tested? phage_tested <- as.numeric(lapply(flores2011, ncol)) #number of bacteria tested? bacteria_tested <- as.numeric(lapply(flores2011, nrow)) #Summary Stats of the number of phage infecting a host, H in Flores et al 2011 min <- as.numeric(lapply(all_row_sums, min)) mean <- as.numeric(lapply(all_row_sums, mean)) max <- as.numeric(lapply(all_row_sums, max)) sd <- as.numeric(lapply(all_row_sums, sd)) sem <- as.numeric(lapply(all_row_sums, sem)) #this is the summary DF for number of coinfecting phages phages <- data.frame(id, studies, mean, sd, sem, max, min, bacteria_tested, phage_tested) #Add Numeric ID by doing an inner join with the studies as the ID. This might be interesting. phages_infecting_id <- inner_join(phages_infecting, subset(phages, select = c(studies, id, bacteria_tested, phage_tested)), by = "studies") #Actually worked well names(phages_infecting_id) <- c("phages_infecting", "studies", "ID", "bacteria_tested", "phage_tested") #Import the metadata #Import the metadata #Study metadata is taken from pages 20-21 of Flores et al. 2011 Supplementary PDF # These pages were OCR'd and manually curated into a CSV 'data/unprocessed/study_metadata.csv' # A processed version of that file 'data/flores_et_al_2011_metadata_updated.csv' # was manually curated to consolidate categories and remove blanks. metadata <- read.csv("data/flores_et_al_2011_metadata_updated.csv") View(metadata) #Now another join to get the metadata added phages_infecting_complete <- inner_join(phages_infecting_id, metadata, by = "ID") #Since different numbers of phages were tested for each study need to use proportion of tested phage infecting #to graph and maybe for analysis (more on that below) phages_infecting_complete <- mutate(phages_infecting_complete, prop_infecting = phages_infecting/phage_tested) #Now adding metadata to conform to JGI-GOLD Database guidelines and match the Culture Coinfection analyses #Need to add just ECOSYSTEM, because ENERGY_SOURCE is listed here as Bacterialtrophy already #Copying values of Isolation_Habitat_alt just to make new row phages_infecting_complete <- mutate(phages_infecting_complete, ECOSYSTEM = Isolation_Habitat_alt) #Clear factors so I can add new data phages_infecting_complete$ECOSYSTEM <- as.character(phages_infecting_complete$ECOSYSTEM) #Add metadata by study. This was hand curated based on the Flores et al. metadata and the original # publications. When in conflict, I categorized according to the original reference. Note that these # categories refer to the *host bacteria* which were not always isolated from the same place as the phages phages_infecting_complete[phages_infecting_complete$studies == "Abe 2007", "ECOSYSTEM"] <- "Host-associated" phages_infecting_complete[phages_infecting_complete$studies == "Barrangou 2002", "ECOSYSTEM"] <- "Engineered" phages_infecting_complete[phages_infecting_complete$studies == "Braun-Breton 1981", "ECOSYSTEM"] <- "Engineered" phages_infecting_complete[phages_infecting_complete$studies == "Campbell 1995", "ECOSYSTEM"] <- "Host-associated" phages_infecting_complete[phages_infecting_complete$studies == "Capparelli 2010", "ECOSYSTEM"] <- "Host-associated" phages_infecting_complete[phages_infecting_complete$studies == "Caso 1995", "ECOSYSTEM"] <- "Engineered" phages_infecting_complete[phages_infecting_complete$studies == "Ceyssens 2009", "ECOSYSTEM"] <- "Host-associated" phages_infecting_complete[phages_infecting_complete$studies == "Comeau 2005", "ECOSYSTEM"] <- "Environmental" phages_infecting_complete[phages_infecting_complete$studies == "Comeau 2006", "ECOSYSTEM"] <- "Environmental" phages_infecting_complete[phages_infecting_complete$studies == "DePaola 1998", "ECOSYSTEM"] <- "Environmental" phages_infecting_complete[phages_infecting_complete$studies == "Doi 2003", "ECOSYSTEM"] <- "Engineered" phages_infecting_complete[phages_infecting_complete$studies == "Duplessis 2001", "ECOSYSTEM"] <- "Engineered" phages_infecting_complete[phages_infecting_complete$studies == "Gamage 2004", "ECOSYSTEM"] <- "Host-associated" phages_infecting_complete[phages_infecting_complete$studies == "Goodridge 2003", "ECOSYSTEM"] <- "Host-associated" phages_infecting_complete[phages_infecting_complete$studies == "Hansen 2007", "ECOSYSTEM"] <- "Host-associated" phages_infecting_complete[phages_infecting_complete$studies == "Holmfeldt 2007", "ECOSYSTEM"] <- "Environmental" phages_infecting_complete[phages_infecting_complete$studies == "Kankila 1994", "ECOSYSTEM"] <- "Host-associated" phages_infecting_complete[phages_infecting_complete$studies == "Krylov 2006", "ECOSYSTEM"] <- "Engineered" phages_infecting_complete[phages_infecting_complete$studies == "Kudva 1999", "ECOSYSTEM"] <- "Host-associated" phages_infecting_complete[phages_infecting_complete$studies == "Langley 2003", "ECOSYSTEM"] <- "Environmental" phages_infecting_complete[phages_infecting_complete$studies == "McLaughlin 2008", "ECOSYSTEM"] <- "Engineered" phages_infecting_complete[phages_infecting_complete$studies == "Meyer unpub", "ECOSYSTEM"] <- "Engineered" phages_infecting_complete[phages_infecting_complete$studies == "Middelboe 2009", "ECOSYSTEM"] <- "Environmental" phages_infecting_complete[phages_infecting_complete$studies == "Miklic 2003", "ECOSYSTEM"] <- "Engineered" phages_infecting_complete[phages_infecting_complete$studies == "Mizoguchi 2003", "ECOSYSTEM"] <- "Engineered" phages_infecting_complete[phages_infecting_complete$studies == "Pantucek 1998", "ECOSYSTEM"] <- "Host-associated" phages_infecting_complete[phages_infecting_complete$studies == "Paterson 2010", "ECOSYSTEM"] <- "Engineered" phages_infecting_complete[phages_infecting_complete$studies == "Poullain 2008", "ECOSYSTEM"] <- "Engineered" phages_infecting_complete[phages_infecting_complete$studies == "Quiberoni 2003", "ECOSYSTEM"] <- "Engineered" phages_infecting_complete[phages_infecting_complete$studies == "Rybniker 2006", "ECOSYSTEM"] <- "Host-associated" phages_infecting_complete[phages_infecting_complete$studies == "Seed 2005", "ECOSYSTEM"] <- "Host-associated" phages_infecting_complete[phages_infecting_complete$studies == "Stenholm 2009", "ECOSYSTEM"] <- "Environmental" phages_infecting_complete[phages_infecting_complete$studies == "Sullivan 2003", "ECOSYSTEM"] <- "Environmental" phages_infecting_complete[phages_infecting_complete$studies == "Suttle 1993", "ECOSYSTEM"] <- "Environmental" phages_infecting_complete[phages_infecting_complete$studies == "Synott 2009", "ECOSYSTEM"] <- "Engineered" phages_infecting_complete[phages_infecting_complete$studies == "Wang 2008", "ECOSYSTEM"] <- "Environmental" phages_infecting_complete[phages_infecting_complete$studies == "Wichels 1998", "ECOSYSTEM"] <- "Environmental" phages_infecting_complete[phages_infecting_complete$studies == "Zinno 2010", "ECOSYSTEM"] <- "Engineered" #Recalculate factors factor(phages_infecting_complete$ECOSYSTEM) #### 2. Sources explaining variation in the level of potential coinfection #### #Analyzing the factors responsible for observed variation in potential coinfection: Can't use the # number of phages infecting directly, because studies tested different numbers of phages ### 2a. One approach is to conduct an ANOVA directly on the proportions (bear with me before you groan, # I will address limitations of this approach below) model_all_prop <- aov(prop_infecting ~ Bacterialtrophy + Majority.source + Bacterial.Association + Isolation_Habitat_new + Bacteria_new, data = phages_infecting_complete) summary(model_all_prop) # Df Sum Sq Mean Sq F value Pr(>F) # Bacterialtrophy 1 1.62 1.6173 19.373 1.19e-05 *** # Majority.source 2 4.63 2.3165 27.749 1.91e-12 *** # Bacterial.Association 7 8.40 1.2005 14.381 < 2e-16 *** # Isolation_Habitat_new 22 29.29 1.3315 15.951 < 2e-16 *** # Bacteria_new 3 1.83 0.6115 7.325 7.38e-05 *** # Residuals 969 80.89 0.0835 summary.lm(model_all_prop) #Residual standard error: 0.2889 on 969 degrees of freedom #Multiple R-squared: 0.3614, Adjusted R-squared: 0.3384 #F-statistic: 15.67 on 35 and 969 DF, p-value: < 2.2e-166 (summary.aov(model_all_prop)[[1]]$'Sum Sq' / sum(summary.aov(model_all_prop)[[1]]$'Sum Sq'))*100 # Trophy #Source #Bact.Assoc #Isolation.Hab #Taxa Residuals #Percentage explained: 1.276707 3.657382 6.634173 23.125500 1.448197 63.858042 #Reduced model model_all_prop <- step(model_all_prop) summary(model_all_prop) # Df Sum Sq Mean Sq F value Pr(>F) # Bacterial.Association 7 10.65 1.5214 18.24 < 2e-16 *** # Isolation_Habitat_new 22 30.85 1.4024 16.81 < 2e-16 *** # Bacteria_new 5 4.26 0.8518 10.21 1.5e-09 *** # Residuals 970 80.91 0.0834 #--- # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 #Residual standard error: 0.2888 on 970 degrees of freedom #Multiple R-squared: 0.3613, Adjusted R-squared: 0.3389 #F-statistic: 16.14 on 34 and 970 DF, p-value: < 2.2e-16 #Extract Percentage of Variance Explained (summary.aov(model_all_prop)[[1]]$'Sum Sq' / sum(summary.aov(model_all_prop)[[1]]$'Sum Sq'))*100 # #Bact.Assoc #Isolation.Hab #Taxa #Residuals #Percentage explained: 8.407191 24.356492 3.362131 63.874187 plot(model_all_prop) #These diagnostic plots actually look pretty good. Now get all plots in one. #Figure S1. ANOVA Plot diagnostics par(mfrow=c(2,2)) plot(model_all_prop) ### 2b. Objections to ANOVA analysis on proportions say arc-sine transform can take care of issues model_all_prop_arcsin <- lm(asin(prop_infecting) ~ Bacterialtrophy + Majority.source + Bacterial.Association + Isolation_Habitat_new + Bacteria_new, data = phages_infecting_complete) summary.aov(model_all_prop_arcsin) # Df Sum Sq Mean Sq F value Pr(>F) # Bacterialtrophy 1 3.04 3.040 17.98 2.44e-05 *** # Majority.source 2 16.12 8.060 47.67 < 2e-16 *** # Bacterial.Association 7 16.42 2.346 13.88 < 2e-16 *** # Isolation_Habitat_new 22 70.49 3.204 18.95 < 2e-16 *** # Bacteria_new 3 2.77 0.923 5.46 0.00101 ** # Residuals 969 163.84 0.169 #--- # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 summary.lm(model_all_prop_arcsin) #Residual standard error: 0.4112 on 969 degrees of freedom #Multiple R-squared: 0.3992, Adjusted R-squared: 0.3775 #F-statistic: 18.39 on 35 and 969 DF, p-value: < 2.2e-16 #Extract Percentage of Variance Explained (summary.aov(model_all_prop_arcsin)[[1]]$'Sum Sq' / sum(summary.aov(model_all_prop_arcsin)[[1]]$'Sum Sq'))*100 #Percentage explained: 1.114948 5.911753 6.023291 25.850664 1.015682 60.083662 #Slightly more variance explained than prop model plot(model_all_prop) #These diagnostic plots look a little better than without the transform ### 2c. Now the "right" way to do this is a logistic regression because I have counts and totals #Make vector of failures (to infect) failures <- phages_infecting_complete$phage_tested - phages_infecting_complete$phages_infecting #Model glm_all <- glm(cbind(phages_infecting, failures) ~ Bacterialtrophy + Majority.source + Bacterial.Association + Isolation_Habitat_new + Bacteria_new, family = binomial, data = phages_infecting_complete) summary(glm_all) anova(glm_all, test = "Chisq") #Analysis of Deviance Table #Model: binomial, link: logit #Response: cbind(phages_infecting, failures) #Terms added sequentially (first to last) #Df Deviance Resid. Df Resid. Dev Pr(>Chi) #NULL 1004 6033.6 # Bacterialtrophy 1 69.29 1003 5964.3 < 2.2e-16 *** # Majority.source 2 45.70 1001 5918.6 1.194e-10 *** # Bacterial.Association 7 506.57 994 5412.1 < 2.2e-16 *** # Isolation_Habitat_new 22 1348.66 972 4063.4 < 2.2e-16 *** # Bacteria_new 3 149.74 969 3913.7 < 2.2e-16 *** #Model is overdispersed, so using quasibinomial family, per Crawley # source http://www.math.chs.nihon-u.ac.jp/~tanaka/files/kenkyuu/ProportionData.pdf glm_all_quasibinaomial <- glm(cbind(phages_infecting, failures) ~ Bacterialtrophy + Majority.source + Bacterial.Association + Isolation_Habitat_new + Bacteria_new, family = quasibinomial, data = phages_infecting_complete) summary(glm_all_quasibinaomial) anova(glm_all_quasibinaomial, test = "Chisq") #Analysis of Deviance Table #Model: quasibinomial, link: logit #Response: cbind(phages_infecting, failures) #Terms added sequentially (first to last) # Df Deviance Resid. Df Resid. Dev Pr(>Chi) # NULL 1004 6033.6 # Bacterialtrophy 1 69.29 1003 5964.3 1.174e-05 *** # Majority.source 2 45.70 1001 5918.6 0.001777 ** # Bacterial.Association 7 506.57 994 5412.1 < 2.2e-16 *** # Isolation_Habitat_new 22 1348.66 972 4063.4 < 2.2e-16 *** # Bacteria_new 3 149.74 969 3913.7 5.118e-09 *** # --- # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 #Fit of the model with(glm_all_quasibinaomial, null.deviance - deviance) #[1] 2378.563 with(glm_all_quasibinaomial, df.null - df.residual) #[1] 35 with(glm_all_quasibinaomial, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = FALSE)) #[1] 0 #### 3. Plotting factors affecting the proportion of tested phage infecting hosts #### #Isolation Habitat #Isolation Habitat ("Isolation_Habitat_new" = corrected: inserted missing category see data sheet) gr1 <- ggplot(data=phages_infecting_complete, aes(x=Isolation_Habitat_new, y=prop_infecting)) + geom_boxplot() + geom_point(aes(colour=studies), position=position_jitter(width=0.2, height=0.02), alpha=0.25) + coord_flip() + theme(legend.position="none") + xlab("Isolation Habitat") + ylab("Prop. of tested phage infecting") #use this one #Isolation Habitat alternate classification collapsing too many categories ggplot(data=phages_infecting_complete, aes(x=Isolation_Habitat_alt, y=prop_infecting)) + geom_boxplot() + geom_point(aes(colour=studies), position=position_jitter(width=0.2, height=0.02), alpha=0.75) + coord_flip() + theme(legend.position="none") #Bacterial Association gr2 <- ggplot(data=phages_infecting_complete, aes(x=Bacterial.Association, y=prop_infecting)) + geom_boxplot() + geom_point(aes(colour=studies), position=position_jitter(width=0.2, height=0.02), alpha=0.25) + coord_flip() + theme(legend.position="none") + xlab("Bacterial Association") + ylab("Prop. of tested phage infecting") #Don't use Bacterial_Association_new, previous categories seem adequate #Type of Study (Majority Source) gr3 <- ggplot(data=phages_infecting_complete, aes(x=Majority.source, y=prop_infecting)) + geom_boxplot() + geom_point(aes(colour=studies), position=position_jitter(width=0.3, height=0.02), alpha=0.25) + coord_flip() + theme(legend.position="none") + xlab("Study Type Source") + ylab("Prop. of tested phage infecting") #Bacterial Trophy gr4 <- ggplot(data=phages_infecting_complete, aes(x=Bacterialtrophy, y=prop_infecting)) + geom_boxplot() + geom_point(aes(colour=studies), position=position_jitter(width=0.3, height=0.02), alpha=0.25) + coord_flip() + theme(legend.position="none") +xlab("Bacterial Trophy") + ylab("Prop. of tested phage infecting") #Bacteria ggplot(data=phages_infecting_complete, aes(x=Bacteria, y=prop_infecting)) + geom_boxplot() + geom_point(aes(colour=studies), position=position_jitter(width=0.1, height=0.02), alpha=0.75) + coord_flip() + theme(legend.position="none") #Bacteria, cleaning up and consolidating categories gr5 <- ggplot(data=phages_infecting_complete, aes(x=Bacteria_new, y=prop_infecting)) + geom_boxplot() + geom_point(aes(colour=studies), position=position_jitter(width=0.1, height=0.02), alpha=0.25) + coord_flip() + theme(legend.position="none") + xlab("Bacterial Taxa") + ylab("Prop. of tested phage infecting") #This seems like the most adequate way to describe variability #Bacteria, using families instead ggplot(data=phages_infecting_complete, aes(x=Bacteria_family, y=prop_infecting)) + geom_boxplot() + geom_point(aes(colour=studies), position=position_jitter(width=0.1, height=0.02), alpha=0.75) + coord_flip() + theme(legend.position="none") #All plots full and reduced model Figure S2 #Bring plots together. Requires package GridExtra grid.arrange(gr1, gr2, gr5, gr3, gr4, nrow = 2) #Now draw both ANOVA tables for the remaining open panel full_table <-anova(lm(prop_infecting ~ Bacterialtrophy + Majority.source + Bacterial.Association + Isolation_Habitat_new + Bacteria_new, data = phages_infecting_complete)) ftable <- tableGrob(round(full_table, digits=4)) #Now with step-AIC reduced model reduced_table <-anova(lm(prop_infecting ~ Bacterial.Association + Isolation_Habitat_new + Bacteria_new, data = phages_infecting_complete)) rtable <- tableGrob(round(reduced_table, digits=4)) grid.arrange(ftable, rtable, nrow=2) #Only reduced model Plots for Figure 1 grid.arrange(gr1, gr2, gr5, nrow = 2) reduced_table <-anova(lm(prop_infecting ~ Bacterial.Association + Isolation_Habitat_new + Bacteria_new, data = phages_infecting_complete)) rtable <- tableGrob(round(reduced_table, digits=4)) grid.arrange(rtable)