--- title: "Wood Modification and Life Cycle Assessment Bibliometric analysis" authors: | | Michael Burnard | University of Primorska, Koper, Slovenia [www.upr.si ] | InnoRenew CoE, Izola, Slovenia [ www.innorenew.eu ] | michael.burnard@iam.upr.si or michael.burnard@innorenew.eu output: pdf_document: df_print: kable toc: TRUE toc_depth: 3 bibliography: biblio.bib --- \newpage # Introduction This document contains the steps to reproduce the portion the biobliometric analysis published in [ link to journal article coming soon] and conducted in R. Please note: 1. Many variable names are used more than once and are overwritten throughout the document. This may lead to errors if running sections out of order. 2. Many times files are written, then re-read. This indicates the documents were edited manually. Typically this was to unify names (e.g., equating Burnard, M.D. and Burnard, M.) or combine similar keywords (e.g., equating acid and acids). # Setting the environment ```{r message=FALSE} library(tidyr) library(dplyr) library(ggplot2) library(stringr) library(tm) library(wordcloud2) library(igraph) ``` # Reading the data ```{r} #LCA data lca <- read.delim("EnvImpactWood.txt", strip.white=TRUE, stringsAsFactors=FALSE, fileEncoding="UTF-16") #Wood Mod data mod <- read.delim("ModWood.txt", strip.white=TRUE, stringsAsFactors=FALSE, fileEncoding="UTF-16") ``` # Analysis We take two approaches to extracting summary info from text data. One, using the the tm package, and one using the tidyr, dplyr, stringr suite. tm is better for real text data, and even remains more effective for getting a list of keywords from keyword data. However, it's rather cumbersome and overpowered for that task. tm is used for the word clouds here (I learned about tm in a tutorial for text data mining that included creating word clouds). Both approaches are fine for the type of analysis we're doing here for keywords. The tidyr, dplyr, stringr is used for simplicity creating the network edge lists. ## Word Clouds Are word clouds really analysis? Regardless, they do give us a quick way to see the prominence of certain keywords. We'll look at two time periods. Everything up to 2005, and everything from 2006 to current. 
### LCA

```{r}
set.seed(12345678)

wd.l05 <- lca %>%
  select(AuthorKeywords, Year) %>%
  filter(Year %in% 1977:2005, !is.na(AuthorKeywords)) %>%
  separate_rows(AuthorKeywords, sep=";")
wd.l05$AuthorKeywords <- str_replace(trimws(wd.l05$AuthorKeywords, "b"), "-", " ")
wd.l05 <- wd.l05 %>% separate_rows(AuthorKeywords, sep=" ")

txt05 <- Corpus(VectorSource(paste(wd.l05$AuthorKeywords, collapse=" ")))
txt05 <- tm_map(txt05, removePunctuation)
txt05 <- tm_map(txt05, stripWhitespace)
txt05 <- tm_map(txt05, removeWords, stopwords('english'))
txt05 <- as.matrix(DocumentTermMatrix(txt05))

frequency <- colSums(txt05)
frequency <- sort(frequency, decreasing=TRUE)
wrds <- names(frequency)

lca05.df <- as.data.frame(cbind(wrds, frequency))
lca05.df$wrds <- as.character(lca05.df$wrds)
lca05.df$frequency <- as.numeric(as.character(lca05.df$frequency))

#write.csv(file="LCAWords7705.csv", lca05.df, row.names = FALSE)
# here we do some manual combinations of like words in excel
lca05.df <- read.csv("LCAWords7705-corr.csv", stringsAsFactors=FALSE)

lca05.df <- lca05.df %>%
  filter(frequency > 3, !wrds %in% c("life", "cycle", "assessment", "lca"))

wordcloud2(lca05.df, color = 'random-dark')

wd.l16 <- lca %>%
  select(AuthorKeywords, Year) %>%
  filter(Year %in% 2006:2017, !is.na(AuthorKeywords)) %>%
  separate_rows(AuthorKeywords, sep=";")
wd.l16$AuthorKeywords <- str_replace(trimws(wd.l16$AuthorKeywords, "b"), "-", " ")
wd.l16 <- wd.l16 %>% separate_rows(AuthorKeywords, sep=" ")

txt16 <- Corpus(VectorSource(paste(wd.l16$AuthorKeywords, collapse=" ")))
txt16 <- tm_map(txt16, removePunctuation)
txt16 <- tm_map(txt16, stripWhitespace)
txt16 <- tm_map(txt16, removeWords, stopwords('english'))
txt16 <- as.matrix(DocumentTermMatrix(txt16))

frequency <- colSums(txt16)
frequency <- sort(frequency, decreasing=TRUE)
wrds <- names(frequency)

lca16.df <- as.data.frame(cbind(wrds, frequency))
lca16.df$wrds <- as.character(lca16.df$wrds)
lca16.df$frequency <- as.numeric(as.character(lca16.df$frequency))

#write.csv(file="LCAWords0616.csv", lca16.df, row.names=FALSE)
# here we do some manual combinations of like words in excel
lca16.df <- read.csv("LCAWords0616-corr.csv", stringsAsFactors=FALSE)

lca16.df <- lca16.df %>%
  filter(frequency > 10, !wrds %in% c("life", "cycle", "assessment", "lca"))

wordcloud2(lca16.df, color = 'random-dark')
```

### Modwood

```{r}
wd.l05 <- mod %>%
  select(AuthorKeywords, Year) %>%
  filter(Year %in% 1955:2005, !is.na(AuthorKeywords)) %>%
  separate_rows(AuthorKeywords, sep=";")
wd.l05$AuthorKeywords <- str_replace(trimws(wd.l05$AuthorKeywords, "b"), "-", " ")
wd.l05 <- wd.l05 %>% separate_rows(AuthorKeywords, sep=" ")

txt05 <- Corpus(VectorSource(paste(wd.l05$AuthorKeywords, collapse=" ")))
txt05 <- tm_map(txt05, removePunctuation)
txt05 <- tm_map(txt05, stripWhitespace)
txt05 <- tm_map(txt05, removeWords, stopwords('english'))
txt05 <- as.matrix(DocumentTermMatrix(txt05))

frequency <- colSums(txt05)
frequency <- sort(frequency, decreasing=TRUE)
wrds <- names(frequency)

mod05.df <- as.data.frame(cbind(wrds, frequency))
mod05.df$wrds <- as.character(mod05.df$wrds)
mod05.df$frequency <- as.numeric(as.character(mod05.df$frequency))

#write.csv(file="ModWords7705.csv", mod05.df, row.names = FALSE)
# here we do some manual combinations of like words in excel
mod05.df <- read.csv("ModWords7705-corr.csv", stringsAsFactors=FALSE)

mod05.df <- mod05.df %>%
  group_by(wrds) %>%
  summarise(frequency = sum(frequency)) %>%
  filter(frequency > 3, wrds != "wood")

wordcloud2(mod05.df, color = 'random-dark')

wd.l16 <- mod %>%
  select(AuthorKeywords, Year) %>%
  filter(Year %in% 2006:2017, !is.na(AuthorKeywords)) %>%
  separate_rows(AuthorKeywords, sep=";")
wd.l16$AuthorKeywords <- str_replace(trimws(wd.l16$AuthorKeywords, "b"), "-", " ")
wd.l16 <- wd.l16 %>% separate_rows(AuthorKeywords, sep=" ")

txt16 <- Corpus(VectorSource(paste(wd.l16$AuthorKeywords, collapse=" ")))
txt16 <- tm_map(txt16, removePunctuation)
txt16 <- tm_map(txt16, stripWhitespace)
txt16 <- tm_map(txt16, removeWords, stopwords('english'))
txt16 <- as.matrix(DocumentTermMatrix(txt16))

frequency <- colSums(txt16)
frequency <- sort(frequency, decreasing=TRUE)
wrds <- names(frequency)

mod16.df <- as.data.frame(cbind(wrds, frequency))
mod16.df$wrds <- as.character(mod16.df$wrds)
mod16.df$frequency <- as.numeric(as.character(mod16.df$frequency))

#write.csv(file="ModWords0616a.csv", mod16.df, row.names=FALSE)
# here we do some manual combinations of like words in excel
mod16.df <- read.csv("ModWords0616-corr.csv", stringsAsFactors=FALSE)

mod16.df <- mod16.df %>%
  group_by(wrds) %>%
  summarise(frequency = sum(frequency)) %>%
  filter(frequency > 10, !wrds %in% c("wood", "modifi-"))

wordcloud2(mod16.df, color = 'random-dark')
```

## LCA - Common Keyword trends

Here we add a relative frequency column (relative to the number of papers published that year) and remove the searched terms (lca, wood, etc.).

```{r}
kw <- lca
kw$AuthorKeywords <- str_replace_all(kw$AuthorKeywords, "[[:punct:]]", ";")
kw$AuthorKeywords <- str_replace_all(kw$AuthorKeywords, " ", ";")

kw <- kw %>%
  select(AuthorKeywords, Year) %>%
  filter(!is.na(AuthorKeywords)) %>%
  separate_rows(AuthorKeywords, sep=";")
kw$AuthorKeywords <- trimws(str_to_lower(kw$AuthorKeywords))

kw <- kw %>%
  filter(AuthorKeywords != "") %>%
  group_by(AuthorKeywords, Year) %>%
  summarise(Frequency = n())

#write.csv(kw, file="LCAKeywords.csv", row.names = FALSE)
kw <- read.csv("LCAKeywords-corr.csv", stringsAsFactors=FALSE)

no.pbs <- lca %>% group_by(Year) %>% summarise(PubCount = n())

kw <- kw %>% group_by(AuthorKeywords, Year) %>% summarise(Frequency = sum(Frequency))
kw <- kw %>% full_join(no.pbs, by="Year")

kw.sum <- kw %>%
  group_by(AuthorKeywords) %>%
  summarise(Frequency = sum(Frequency)) %>%
  arrange(desc(Frequency))
kw.sum <- kw.sum %>%
  filter(!AuthorKeywords %in% c("lca", "life", "cycle", "assessment", "wood", "lcc"))

kw <- kw %>%
  group_by(Year) %>%
  filter(!AuthorKeywords %in% c("lca", "life", "cycle", "assessment", "wood", "lcc")) %>%
  mutate(Freq.rel = Frequency / PubCount)
```

### Some figures

```{r}
#pubs per year
ggplot(data=no.pbs %>% filter(Year < 2017), aes(y=PubCount, x=Year)) +
  theme_bw() +
  geom_point() +
  geom_line() +
  scale_x_continuous(breaks=seq(1980,2015,5)) +
  labs(title="LCA publications per year 1977 to 2016", y="Publications")

#selected keywords
selected <- c("biomass", "bioenergy", "material", "product", "building", "fuel",
              "recycle", "waste", "carbon", "energy", "sustainab-", "emission")
kw.sel <- kw %>% filter(AuthorKeywords %in% selected)

ggplot(data=kw.sel, aes(x=Year, y=Freq.rel)) +
  theme_bw() +
  geom_line(colour="#999999") +
  geom_point(size=.5) +
  facet_wrap(~AuthorKeywords) +
  scale_x_continuous(breaks=seq(1995,2015,5)) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
  labs(title="Rate of keyword occurrence in LCA publications by year", y="Keyword occurrence rate")
```
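The same keyword-rate computation is repeated for the wood modification set in the next section. A minimal sketch of the shared steps as a helper function; the hypothetical `keyword_rates()`, its arguments, and its name are illustrative assumptions and it is not called in the chunks that follow.

```{r}
# Illustrative helper (assumption: not part of the original analysis).
# Joins per-year publication counts and adds the relative frequency column.
keyword_rates <- function(kw, pubs, drop_terms) {
  kw %>%
    group_by(AuthorKeywords, Year) %>%
    summarise(Frequency = sum(Frequency)) %>%
    full_join(pubs, by = "Year") %>%
    filter(!AuthorKeywords %in% drop_terms) %>%
    mutate(Freq.rel = Frequency / PubCount)
}
```

For example, `keyword_rates(kw, no.pbs, c("lca", "life", "cycle", "assessment", "wood", "lcc"))` would approximately reproduce the `kw` table used for the figures above.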
## Modified wood - Common Keyword trends

Here we add a relative frequency column (relative to the number of papers published that year) and remove the searched terms (wood, modification, etc.).

```{r}
kw <- mod
kw$AuthorKeywords <- str_replace_all(kw$AuthorKeywords, "[[:punct:]]", ";")
kw$AuthorKeywords <- str_replace_all(kw$AuthorKeywords, " ", ";")

kw <- kw %>%
  select(AuthorKeywords, Year) %>%
  filter(!is.na(AuthorKeywords)) %>%
  separate_rows(AuthorKeywords, sep=";")
kw$AuthorKeywords <- trimws(str_to_lower(kw$AuthorKeywords))

kw <- kw %>%
  filter(AuthorKeywords != "") %>%
  group_by(AuthorKeywords, Year) %>%
  summarise(Frequency = n())

#write.csv(kw, file="ModKeywords.csv", row.names = FALSE)
kw <- read.csv("MODKeywords-corr.csv", stringsAsFactors=FALSE)

no.pbs <- mod %>% group_by(Year) %>% summarise(PubCount = n())

kw <- kw %>% group_by(AuthorKeywords, Year) %>% summarise(Frequency = sum(Frequency))
kw <- kw %>% full_join(no.pbs, by="Year")

kw.sum <- kw %>%
  group_by(AuthorKeywords) %>%
  summarise(Frequency = sum(Frequency)) %>%
  arrange(desc(Frequency))
kw.sum <- kw.sum %>%
  filter(!AuthorKeywords %in% c("wood", "modification", "modified"))

kw <- kw %>%
  group_by(Year) %>%
  filter(!AuthorKeywords %in% c("wood", "modification", "modified")) %>%
  mutate(Freq.rel = Frequency / PubCount)
```

### Some figures

```{r}
#pubs per year
ggplot(data=no.pbs %>% filter(Year < 2017), aes(y=PubCount, x=Year)) +
  theme_bw() +
  geom_point() +
  geom_line() +
  scale_x_continuous(breaks=seq(1955,2015,5)) +
  labs(title="Modified wood publications per year 1955 to 2016", y="Publications")

#selected keywords
selected <- c("thermal", "chemical", "mechanical", "composite", "acetylation", "densification",
              "heat", "surface", "stability", "moisture", "properties", "resistance")
kw.sel <- kw %>% filter(AuthorKeywords %in% selected)

ggplot(data=kw.sel[which(kw.sel$Year > 1989),], aes(x=Year, y=Freq.rel)) +
  theme_bw() +
  geom_line(colour="#999999") +
  geom_point(size=.5) +
  facet_wrap(~AuthorKeywords) +
  scale_x_continuous(breaks=seq(1990,2015,5)) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
  labs(title="Keyword occurrence in wood modification publications 1990 to 2016", y="Keyword occurrence rate")
```

### Edge lists for networks

Here we create the edge lists for the co-author and keyword co-word networks.

#### Keyword co-word analysis

This requires the rather cumbersome task of splitting the keyword list, checking for spelling variants, recombining, and then splitting into edges (the same approach is used for the co-author networks below). The sequence is: add a unique ID to each paper, split the keywords into rows, export, check manually, reimport, and then build the edge pairs.
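Before the full chunk, a toy illustration of the pairing step: the `toy` data frame below is made up for illustration and is not part of the data set. `combn()` generates every unordered pair of keywords within a single paper, which is what the loop in the next chunk accumulates across papers with `rbind()`.

```{r}
# Hypothetical example data for illustration only
toy <- data.frame(uid = c(1001, 1001, 1001),
                  AuthorKeywords = c("acetylation", "durability", "pine"),
                  stringsAsFactors = FALSE)

# all unordered keyword pairs for this paper, one pair per row
t(combn(toy$AuthorKeywords, 2))
```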
```{r}
#kw.e <- mod %>% select(AuthorKeywords) %>% filter(AuthorKeywords != "")
#kw.e$uid <- seq(1001, 1000+nrow(kw.e), 1)
#kw.e <- kw.e %>% separate_rows(AuthorKeywords, sep=";")
#kw.e$AuthorKeywords <- str_replace_all(str_to_lower(trimws(kw.e$AuthorKeywords)), "[[:punct:]\\s]", "-")
#kw.e$AuthorKeywords <- str_replace_all(kw.e$AuthorKeywords, "--", "-")
#write.csv(kw.e, "MODKWNodes.csv", fileEncoding = "UTF-8", row.names = FALSE)

kw.e <- read.csv("MODKWNodes-corr.csv", stringsAsFactors=FALSE)

# clear any leftovers from earlier runs (warnings are harmless if the objects do not exist)
rm("out", "tb1", "x")

# for combn to work in the for loop below, each entry must have more than one keyword
# if count = 1, investigate and fix/delete
kw.e %>% group_by(uid) %>% summarise(count = n()) %>% arrange(count)

# this takes the keywords for each article and makes all of the pairs for them
for(i in min(kw.e$uid):max(kw.e$uid)) {
  x <- kw.e %>% filter(uid == i)
  tb1 <- t(combn(x$AuthorKeywords, 2))
  if(exists("out")) {
    out <- rbind(out, tb1)
  } else {
    out <- tb1
  }
}

kw.el <- as.data.frame(out)
rm("out", "tb1", "x")

names(kw.el) <- c("Source", "Target")
kw.el$Weight <- 1
kw.el <- aggregate(Weight ~ Source+Target, kw.el, FUN=sum)
kw.el$Type <- "Undirected"

write.csv(kw.el, file="MODKWEdges.csv", row.names=FALSE) #use this in gephi!
```

### Co-Author Networks

#### Mod wood

```{r}
#au.e <- mod %>% select(Authors) %>% filter(Authors != "")
#au.e$uid <- seq(1001, 1000+nrow(au.e), 1)
#au.e <- au.e %>% separate_rows(Authors, sep=",")
#au.e$Authors <- trimws(au.e$Authors)
#write.csv(au.e, "MODAuthNodes.csv", fileEncoding = "UTF-16", row.names = FALSE)

au.e <- read.csv("MODAuthNodes-corr.csv", stringsAsFactors = FALSE)

au.e %>% group_by(uid) %>% summarise(count = n()) %>% arrange(count)

au.e <- au.e %>% group_by(Authors) %>% mutate(pubs = n())

au.solo <- au.e %>%
  group_by(uid) %>%
  mutate(pubAuthorCount = n()) %>%
  ungroup() %>%
  filter(pubAuthorCount == 1)

au.e <- au.e %>%
  group_by(uid) %>%
  mutate(pubAuthorCount = n()) %>%
  ungroup() %>%
  filter(pubAuthorCount > 1)

# this takes the authors for each article and makes all of the pairs for them
rm("out", "tb1", "x")
for(i in min(au.e$uid):max(au.e$uid)) {
  if(i %in% au.e$uid) {
    x <- au.e %>% filter(uid == i)
    tb1 <- t(combn(x$Authors, 2))
    if(exists("out")) {
      out <- rbind(out, tb1)
    } else {
      out <- tb1
    }
  }
}

auth.el <- as.data.frame(out)
rm("out", "tb1", "x")

names(auth.el) <- c("Source", "Target")
auth.el$Weight <- 1
auth.el <- aggregate(Weight ~ Source+Target, auth.el, FUN=sum)
auth.el$Type <- "Undirected"

write.csv(auth.el, file="MODAuthEdges-g.csv", row.names=FALSE) #use this in gephi!
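# build the node table for Gephi: one row per unique author, labelled with
# that author's total publication count across the data set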
auth.n <- unique(au.e %>% select(uid, Authors, pubs))
auth.n <- unique(rbind(auth.n, (au.solo %>% select(uid, Authors, pubs))))
auth.n <- unique(auth.n %>% select(Authors, pubs))
auth.n$Label <- paste(auth.n$Authors, " | ", "Pubs: ", auth.n$pubs, sep="")
names(auth.n)[1] <- "ID"

write.csv(auth.n, file="ModAuthNodes-g.csv", row.names=FALSE)
```

#### LCA

```{r}
#au.e <- lca %>% select(Authors) %>% filter(Authors != "")
#au.e$uid <- seq(1001, 1000+nrow(au.e), 1)
#au.e <- au.e %>% separate_rows(Authors, sep=",")
#au.e$Authors <- trimws(au.e$Authors)
#write.csv(au.e, "LCAAuthNodes.csv", fileEncoding = "UTF-16", row.names = FALSE)

au.e <- read.csv("LCAAuthNodes-corr.csv", stringsAsFactors = FALSE)

au.e %>% group_by(uid) %>% summarise(count = n()) %>% arrange(count)

au.e <- au.e %>% group_by(Authors) %>% mutate(pubs = n())

au.solo <- au.e %>%
  group_by(uid) %>%
  mutate(pubAuthorCount = n()) %>%
  ungroup() %>%
  filter(pubAuthorCount == 1)

au.e <- au.e %>%
  group_by(uid) %>%
  mutate(pubAuthorCount = n()) %>%
  ungroup() %>%
  filter(pubAuthorCount > 1)

# this takes the authors for each article and makes all of the pairs for them
rm("out", "tb1", "x")
for(i in min(au.e$uid):max(au.e$uid)) {
  if(i %in% au.e$uid) {
    x <- au.e %>% filter(uid == i)
    tb1 <- t(combn(x$Authors, 2))
    if(exists("out")) {
      out <- rbind(out, tb1)
    } else {
      out <- tb1
    }
  }
}

auth.el <- as.data.frame(out)
rm("out", "tb1", "x")

names(auth.el) <- c("Source", "Target")
auth.el$Weight <- 1
auth.el <- aggregate(Weight ~ Source+Target, auth.el, FUN=sum)
auth.el$Type <- "Undirected"

write.csv(auth.el, file="LCAAuthEdges-g.csv", row.names=FALSE) #use this in gephi!

auth.n <- unique(au.e %>% select(uid, Authors, pubs))
auth.n <- unique(rbind(auth.n, (au.solo %>% select(uid, Authors, pubs))))
auth.n <- unique(auth.n %>% select(Authors, pubs))
auth.n$Label <- paste(auth.n$Authors, " | ", "Pubs: ", auth.n$pubs, sep="")
names(auth.n)[1] <- "ID"

write.csv(auth.n, file="LCAAuthNodes-g.csv", row.names=FALSE)
```

## Author and publication stats

```{r}
lca.a <- read.csv("LCAAuthNodes-g.csv", stringsAsFactors = FALSE)
mod.a <- read.csv("MODAuthNodes-g.csv", stringsAsFactors = FALSE)

# most prolific authors in each set
head(lca.a %>% arrange(desc(pubs)), 35)
head(mod.a %>% arrange(desc(pubs)), 25)

# authors appearing in both sets
nrow(intersect((lca.a %>% select(ID)), (mod.a %>% select(ID)))) #86 authors in both sets.

# file of the unique IDs given by SCOPUS
# it contains all the IDs from both data sets
uids <- read.csv("UniqueIDs.csv", stringsAsFactors=FALSE)
nrow(uids) - n_distinct(uids$EID)

n_distinct(trimws(mod$SourceTitle))
n_distinct(trimws(lca$SourceTitle))

min(mod$Year)
min(lca$Year)

nrow(mod.a)
nrow(lca.a)

nrow(mod)
nrow(lca)

# source titles appearing in both sets
nrow(intersect((lca %>% select(SourceTitle)), (mod %>% select(SourceTitle))))
```
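For reporting, the individual counts above can be collected into a single table. A minimal sketch, assuming the objects created in the chunk above; the `summary_tbl` name and layout are illustrative, not part of the original analysis.

```{r}
# Illustrative summary table (assumption: not part of the original analysis)
summary_tbl <- data.frame(
  Dataset      = c("Wood modification", "LCA"),
  Publications = c(nrow(mod), nrow(lca)),
  Authors      = c(nrow(mod.a), nrow(lca.a)),
  Sources      = c(n_distinct(trimws(mod$SourceTitle)), n_distinct(trimws(lca$SourceTitle))),
  FirstYear    = c(min(mod$Year), min(lca$Year))
)
summary_tbl
```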