PBDB data processing

These code blocks were used to assign international stages to the PBDB data set, to assign missing skeletal mineralogies and taxonomic information and to remove unresolved occurrences.

Getting the data

We used occurrence data from the Paleobiology Database (PBDB, https://paleobiodb.org/) to evaluate the success of marine calcifiers. We downloaded all Phanerozoic occurrence data on 24 January 2017 with standard settings.In the output options, all boldfaced output blocks were selected, as well as “stratigraphy”, “lithology” and “paleoenvironment”.

To run this script, the downloaded data should be stored as a variable named “pbdb”:

#setwd("YOUR WORKING DIRECTORY") # set your appropriate working directory, e.g. ("C://Users/keichenseer/Manuscripts/Nature_Geoscience/Figshare_Uploads/CSVs") which contains all the CSV files from this Figshare project
pbdb <- read.csv(file="pbdb_data.csv", header=TRUE, sep=",", skip=18) # read the PBDB data

Assignment of geological stages to the PBDB data

Based on International stages of Gradstein et al. (2012), updated based on Cohen et al. (2013)

Read in a table with the stages

The table “stages_to_bin.csv” needs to be in the working directory.

stages2 <- read.csv("stages_to_bin.csv") # file with geological stages

stages <- as.character(stages2$stage)
sstages <- as.character(stages2$stage)
subepochs <- as.character(levels(factor(stages2$subepoch)))
subepochs <- subepochs[-length(subepochs)] # remove the "xxx" entry
epochs <- as.character(levels(factor(stages2$epoch)))
periods <- as.character(levels(factor(stages2$period)))

pbdb$early_interval <- as.character(pbdb$early_interval)
pbdb$late_interval <- as.character(pbdb$late_interval)

allnames <- unique(c(pbdb$early_interval, pbdb$late_interval)) # all pbdb stratigraphic interval names

If the column “late_interval” is empty, copy information from “early_interval”

pbdb$late_interval[which(pbdb$late_interval == "")] <- pbdb$early_interval[which(pbdb$late_interval == "")]

Use internal stratigraphic information to assign first and last possible stages

early_int <- rep("YYY",nrow(pbdb)) # vector to store the international earliest stage
late_int <- rep("ZZZ",nrow(pbdb)) # vector to store the international latest

# loop to assign international first and last stages based on stages, subepochs, epochs and periods
# - can be slow with large data sets -
for (i in 1:nrow(stages2)) {
  early_int[which(pbdb$early_interval %in% allnames[
    which(sapply(allnames,function(x) grepl(as.character(stages2$stage[i]),x)))])] <- as.character(stages2$stage[i])
  
  early_int[which(pbdb$early_interval %in% subepochs & pbdb$early_interval %in% allnames[
    which(sapply(allnames,function(x) as.character(stages2$subepoch[i]) %in% x))])] <- as.character(stages2$stage[
      which(stages2$subepoch == stages2$subepoch[i])][which.max(stages2$age[which(stages2$subepoch == stages2$subepoch[i])])] ) 
  
  early_int[which(pbdb$early_interval %in% epochs & pbdb$early_interval %in% allnames[
    which(sapply(allnames,function(x) as.character(stages2$epoch[i]) %in% x))])] <- as.character(stages2$stage[
      which(stages2$epoch == stages2$epoch[i])][which.max(stages2$age[which(stages2$epoch == stages2$epoch[i])])])
  
  early_int[which(pbdb$early_interval %in% periods & pbdb$early_interval %in% allnames[
    which(sapply(allnames,function(x) as.character(stages2$epoch[i]) %in% x))])] <- as.character(stages2$stage[
      which(stages2$period == stages2$period[i])][which.max(stages2$age[which(stages2$period == stages2$period[i])])])
  
  late_int[which(pbdb$late_interval %in% allnames[
    which(sapply(allnames,function(x) grepl(as.character(stages2$stage[i]),x)))])] <- as.character(stages2$stage[i])
  
  late_int[which(pbdb$late_interval %in% subepochs & pbdb$late_interval %in% allnames[
    which(sapply(allnames,function(x) as.character(stages2$subepoch[i]) %in% x))])] <- as.character(stages2$stage[
      which(stages2$subepoch == stages2$subepoch[i])][which.min(stages2$age[which(stages2$subepoch == stages2$subepoch[i])])] ) 
  
  late_int[which(pbdb$late_interval %in% epochs & pbdb$late_interval %in% allnames[
    which(sapply(allnames,function(x) as.character(stages2$epoch[i]) %in% x))])] <- as.character(stages2$stage[
      which(stages2$subepoch == stages2$subepoch[i])][which.min(stages2$age[which(stages2$subepoch == stages2$subepoch[i])])] ) 
  
  late_int[which(pbdb$late_interval %in% periods & pbdb$late_interval %in% allnames[
    which(sapply(allnames,function(x) as.character(stages2$epoch[i]) %in% x))])] <- as.character(stages2$stage[
      which(stages2$period == stages2$period[i])][which.min(stages2$age[which(stages2$period == stages2$period[i])])])
}

Occurrences with regional or old stratigraphic names need to be assigned to international stage. Uses the table “stage_assignment_stratigraphic_names.csv”.

mis_stages <- read.csv(file="stage_assignment_stratigraphic_names.csv", header=TRUE, sep=",") # file with all old / regional strat. names and corresponding first and last stages

early_stage_id <- mis_stages$early_stage
late_stage_id <- mis_stages$late_stage

# get rid of leading spaces:
trim.leading <- function (x)  sub("^\\s+", "", x) # function to delete leading spaces from strings
early_stage_id  <- trim.leading(early_stage_id)
late_stage_id <- trim.leading(late_stage_id)

Assign stage names to pbdb occurrences based on the table

for (i in 1:nrow(mis_stages)) {
  early_int[which(pbdb$early_interval == mis_stages$ori_name[i])] <- as.character(early_stage_id[i])
  late_int[which(pbdb$late_interval == mis_stages$ori_name[i])] <- as.character(late_stage_id[i])
}

Combine the Early Jurassic and the Pleistocene stages, respectively

Realise this by including the Olenekian in the Induan, and the entire Pleistocene in the Gelasian, to avoid very short time intervals

thestage <- rep("SSS",nrow(pbdb))

for(i in 1:length(sstages)){
  thestage[which(early_int == sstages[i] & late_int == sstages[i])] <- sstages[i]
  if(i == 49) thestage[which(early_int == sstages[i] & late_int == sstages[i+1])] <- sstages[i]
  if(i %in% c(96,97,98,99)) thestage[which(early_int == sstages[i] & late_int %in% sstages[c(96,97,98,99)])] <- sstages[i]
}


sstages2 <- c(sstages[11:48], "Induan",
              sstages[51:95],"Gelasian")
thestage[which(thestage %in% c("Induan", "Olenekian"))] <- "Induan"
thestage[which(thestage %in% c("Gelasian", "Calabrian", "Middle Pleistocene", "Late Pleistocene"))] <- "Gelasian"

Include a new column “stage” in the occurrence data

stage <- thestage
pbdb <- cbind(pbdb,stage)

Delete occurrences which could not be assigned to a geological stage

pbdb <- pbdb[which(pbdb$stage != "SSS"),]

Exclude non-marine data

Keep only marine and marginally / likely marine data

marine <- c("basin reef","basinal (carbonate)", "basinal (siliceous)", "basinal (siliciclastic)", "carbonate indet.",
            "coastal indet.", "deep-water indet.", "deep subtidal indet.", "deep subtidal ramp", "deep subtidal shelf",
            "delta front", "estuary/bay", "fissure fill", "foreshore", "interdistributary bay", "intrashelf/intraplatform reef",
            "lagoonal", "lagoonal/restricted shallow subtidal", "marginal marine indet.", "marine indet.", "offshore",
            "offshore indet.", "offshore ramp", "offshore shelf", "open shallow subtidal", "paralic indet.", "perireef or subreef",
            "peritidal", "platform/shelf-margin reef", "prodelta", "reef, buildup or bioherm", "sand shoal", "shallow subtidal indet.",
            "shoreface", "slope", "slope/ramp reef", "submarine fan", "transition zone/lower shoreface")

# exclude nonmarine occurrences
pbdb <- pbdb[which(pbdb$environment %in% marine),]

Fill incomplete taxonomic hierarchies

Using internal information from other PBDB occurrences and the WoRMs database (http://www.marinespecies.org/), where possible

library(taxize) # needs to be installed before the first use: install.packages("taxize")
library(ritis) # needs to be installed before the first use: install.packages("ritis")

pbdb$phylum <- as.character(pbdb$phylum)
pbdb$class <- as.character(pbdb$class)
pbdb$order <- as.character(pbdb$order)
pbdb$family <- as.character(pbdb$family)
pbdb$genus <- as.character(pbdb$genus)
pbdb$phylum[which(pbdb$order %in% c("Craniida","Trimerellida"))] <- "Brachiopoda"


taxa_exclude <- c("Magnoliophyta","Spermatophyta", "Angiospermae",
                  "Bryophyta", "Chordata", "Ciliophora", "Craniata", "Ctenophora",
                  "Coniferophyta", "Cycadeoideophyta", "Cycadophyta", 
                  "Equisetophyta", "Ginkgophyta", "Gnetophyta", "Hemichordata", 
                  "Lobopodia", "Lycophyta", "Lycopodiophyta", "Lycopodophyta", "Nematoda",
                  "Nematomorpha", "Nemertina", "Peltaspermophyta", "Phoronida",
                  "Pinophyta", "Psilophytophyta", "Pteridophyta", "Pteridospermophyta",
                  "Rhizopodea", "Sarcomastigophora", "Sipuncula", "Tracheophyta", 
                  "Zosterophyllophyta")


pbdb <- pbdb[which(!(pbdb$phylum %in% taxa_exclude)),]



# function to retrieve taxonomic names from the WoRMS database 
gettaxname <- function(x,t) {
  temp = classification(strsplit(as.character(x), " ")[[1]][1], db="worms", ask=FALSE)[[1]]
  if (!(is.na(temp))) {
    sapply(t, function(t) { 
      if (is.element(t,unlist(temp[,2]))) {
        temp[which(temp[,2]==t),1]
      } else NA 
    }
    )
  } else rep(NA,length(t))
}


# formate genera to only include genus, not subgenus
pbdb$genus <- sapply(pbdb$genus,function(x) strsplit(as.character(x), " ")[[1]][1])
pbdb$accepted_name <- as.character(pbdb$accepted_name)

sub_genera <- unique(pbdb$genus[grep("\\(",pbdb$genus)])
sub_genera_in_acc <- pbdb$accepted_name[which(pbdb$accepted_rank %in% c("species", "genus"))]
sub_genera_in_acc <- unique(pbdb$accepted_name[grep("\\(",pbdb$accepted_name)])
sub_genera_in_acc_clean <- sapply(sub_genera_in_acc,function(x) strsplit(as.character(x), " ")[[1]][1])
for(i in 1:length(sub_genera_in_acc_clean)){
  pbdb$accepted_name[which(pbdb$accepted_name == sub_genera_in_acc[i])] <- sub_genera_in_acc_clean[i]
}

# pbdb$accepted_name[grep("\\(",pbdb$accepted_name)] <- sapply(pbdb$accepted_name[grep("\\(",pbdb$accepted_name)],function(x) strsplit(as.character(x), " ")[[1]][1])


# Try and fill in Phylum

pbdb$phylum[which(pbdb$class == "Radiolaria")] <- "Retaria"
pbdb$phylum[which(pbdb$class == "Foraminferea")] <- "Foraminifera"
pbdb$phylum[which(pbdb$class == "Rhodophyceae")] <- "Rhodophyta"
pbdb$phylum[which(pbdb$class == "Bacillariophyceae")] <- "Heterokontophyta" # Now Heterokonta
pbdb$phylum[which(pbdb$class == "Dinophyceae")] <- "Myozoa"
pbdb$phylum[which(pbdb$class == "Rhizopodea")] <- "class Rhizopodea"
pbdb$phylum[which(pbdb$class == "Carpoidea")] <- "Echinodermata"
pbdb$phylum[which(pbdb$class == "Tentaculitoidea")] <- "Mollusca"
pbdb$phylum[which(pbdb$class == "Myxophyceae")] <- "Cyanobacteria"


# get genus based on accepted name - WORMS
acc_gen_mis <- unique(pbdb$accepted_name[which(pbdb$genus == "" & pbdb$accepted_name !="" & pbdb$accepted_rank %in% c("species", "genus"))])
acc_gen_mis_gen <- sapply(acc_gen_mis, function(x) classification(x, db = "worms", ask = FALSE))
acc_gen_mis_gen1 <- rep(NA,length(acc_gen_mis_gen))
for ( i in 1:length(acc_gen_mis_gen)) {
  if(!(is.na(acc_gen_mis_gen[i]))) acc_gen_mis_gen1[i] <- acc_gen_mis_gen[[i]][which(acc_gen_mis_gen[[i]][2]=="Genus"),1]
}
for(i in 1:length(acc_gen_mis_gen1)) {
  if(!(is.na(acc_gen_mis_gen1[[i]]))){
    pbdb$genus[which(pbdb$accepted_name == acc_gen_mis[i])] <- acc_gen_mis_gen1[[i]]
  }
}

stage_get <- function(x) {
  
  maxtemp <- max(match(x,sstages2), na.rm = T)+1
  if(maxtemp == 86) maxtemp <- 85
  mintemp <- min(match(x,sstages2), na.rm = T)-1
  if (mintemp == 0) mintemp <- 1
  
  sstages2[mintemp:maxtemp]
}

# get genus based on accepted name - PBDB
gen_levels <- unique(as.character(pbdb$genus))
acc_gen_mis <- unique(pbdb$accepted_name[which(pbdb$genus == "" & pbdb$accepted_name !="" & pbdb$accepted_rank %in% c("species", "genus"))])
temp <- NULL
famtemp <- NULL
ordtemp <- NULL
classtemp <- NULL
phyltemp <- NULL
timebins <- NULL
for (i in 1:length(acc_gen_mis)) {
  temp =  strsplit(as.character(acc_gen_mis[i]), " ")[[1]][1]
 if (temp %in% gen_levels) {
   timebins <- stage_get(unique(pbdb$stage[which(pbdb$genus == temp)]))
   famtemp <- unique(pbdb$family[which(pbdb$genus == temp)])
   ordtemp <- unique(pbdb$order[which(pbdb$genus == temp)])
   classtemp <- unique(pbdb$class[which(pbdb$genus == temp)])
   phyltemp <- unique(pbdb$phylum[which(pbdb$genus == temp)])
   if (length(famtemp[which(famtemp != "")]) %in% c(0,1) 
       & length(ordtemp[which(ordtemp != "")]) %in% c(0,1)
       & length(classtemp[which(classtemp != "")]) %in% c(0,1)
       & length(phyltemp[which(phyltemp != "")]) %in% c(0,1)) {
     pbdb$genus[which(pbdb$accepted_name == acc_gen_mis[i] & pbdb$stage %in% timebins)] <- temp
   }
 }
}



# get family based on genus - WORMS
no_family <- length(pbdb$family[which(pbdb$family == "")])
gen_names <- names(sort(table(pbdb$genus[which(pbdb$family == "")])))[-length(sort(table(pbdb$genus[which(pbdb$family == "")])))]
gen_familys <- sapply(gen_names, function(x) classification(x, db = "worms", ask = FALSE))
gen_familys1 <- rep(NA,length(gen_familys))
gen_time <- NULL
for ( i in 1:length(gen_familys)) {
  if(!(is.na(gen_familys[i])) ) {
   if ("Family" %in% gen_familys[[i]][[2]]) {
      gen_familys1[i] <- gen_familys[[i]][which(gen_familys[[i]][[2]]=="Family"),1]
    }
  } 
}

for(i in 1:length(gen_names)) {
  if(!(is.na(gen_familys1[[i]])) & length(gen_familys1[[i]] == 1)){
    pbdb$family[which(pbdb$genus == gen_names[i])] <- gen_familys1[[i]]
  }
}


# get family based on other occurrences of the same genus with that family name - PBDB
fam_levels <- unique(as.character(pbdb$family))
gen_fam_mis <- unique(pbdb$genus[which(pbdb$family == "" & pbdb$genus !="")])
fam_of_mis <- sapply(gen_fam_mis, function(x) unique(pbdb$family[which(pbdb$genus == x & pbdb$family != "")]))
temp <- NULL
famtemp <- NULL
ordtemp <- NULL
classtemp <- NULL
phyltemp <- NULL
timebins <- NULL
for (i in 1:length(fam_of_mis)) {
  temp =  unlist(fam_of_mis[i])
  if(!(identical(temp, character(0)))) {
  if (temp %in% fam_levels) {
    timebins <- stage_get(unique(pbdb$stage[which(pbdb$family == temp)]))
    #famtemp <- unique(pbdb$family[which(pbdb$genus == temp)])
    ordtemp <- unique(pbdb$order[which(pbdb$family == temp)])
    classtemp <- unique(pbdb$class[which(pbdb$family == temp)])
    phyltemp <- unique(pbdb$phylum[which(pbdb$family == temp)])
    if (length(ordtemp[which(ordtemp != "")]) %in% c(0,1)
        & length(classtemp[which(classtemp != "")]) %in% c(0,1)
        & length(phyltemp[which(phyltemp != "")]) %in% c(0,1)) {
      pbdb$family[which(pbdb$genus == fam_mis[i] & pbdb$stage %in% timebins)] <- temp
    }
  }
  }
}

# get order based on family - WORMS
no_order <- length(pbdb$order[which(pbdb$order == "")])
fam_names <- unique(pbdb$family[which(pbdb$order == "")])
# fam_names[[1]] <- NULL # check format of fam_names if that is even necessary
fam_orders <- sapply(fam_names, function(x) classification(x, db = "worms", ask = FALSE))
fam_orders1 <- rep(NA,length(fam_orders))
fam_time <- NULL
for ( i in 1:length(fam_orders)) {
  if(!(is.na(fam_orders[i]))) fam_orders1[i] <- fam_orders[[i]][which(fam_orders[[i]][2]=="Order"),1]
}

for(i in 1:length(fam_names)) {
  if(!(is.na(fam_orders1[[i]])) & length(fam_orders1[[i]] == 1)){
    pbdb$order[which(pbdb$family == fam_names[i])] <- fam_orders1[[i]]
  }
}


# get order based on other occurrences of the same family with that order name - PBDB
ord_levels <- unique(as.character(pbdb$order))
fam_ord_mis <- unique(pbdb$family[which(pbdb$order == "" & pbdb$family !="")])
ord_of_mis <- sapply(fam_ord_mis, function(x) unique(pbdb$order[which(pbdb$family == x & pbdb$order != "")]))
temp <- NULL
famtemp <- NULL
ordtemp <- NULL
classtemp <- NULL
phyltemp <- NULL
timebins <- NULL
for (i in 1:length(ord_of_mis)) {
  temp =  unlist(ord_of_mis[i])
  if(!(identical(temp, character(0)))) {
    if(length(temp)==1) {
    if (temp %in% ord_levels) {
      timebins <- stage_get(unique(pbdb$stage[which(pbdb$order == temp)]))
      #famtemp <- unique(pbdb$family[which(pbdb$genus == temp)])
      #ordtemp <- unique(pbdb$order[which(pbdb$family == temp)])
      classtemp <- unique(pbdb$class[which(pbdb$order == temp)])
      phyltemp <- unique(pbdb$phylum[which(pbdb$order == temp)])
      if (length(classtemp[which(classtemp != "")]) %in% c(0,1)
          & length(phyltemp[which(phyltemp != "")]) %in% c(0,1)) {
        pbdb$order[which(pbdb$family == unlist(fam_ord_mis[[i]]) & pbdb$stage %in% timebins)] <- temp
      }
    }
    }
  }
}


# get class based on order - WORMS
order_names[1]
order_names <- names(sort(table(pbdb$order[which(pbdb$class == "")])))# [-length(sort(table(pbdb$order[which(pbdb$class == "")])))] # necessary if first entry is nonsensical
order_class <- sapply(order_names, function(x) classification(x, db = "worms", ask = FALSE))
order_class1 <- rep(NA,length(order_class))
for ( i in 1:length(order_class)) {
  if(!(is.na(order_class[i]))) order_class1[i] <- order_class[[i]][which(order_class[[i]][2]=="Class"),1]
}

for(i in 1:length(order_names)) {
  if(!(is.na(order_class1[[i]])) & length(order_class1[[i]] == 1)){
    pbdb$class[which(pbdb$order == order_names[i])] <- order_class1[[i]]
  }
}



# get class based on other occurrences of the same order with that class name - PBDB
class_levels <- unique(as.character(pbdb$class))
ord_class_mis <- unique(pbdb$order[which(pbdb$class == "" & pbdb$order !="")])
class_of_mis <- sapply(ord_class_mis, function(x) unique(pbdb$class[which(pbdb$order == x & pbdb$class != "")]))
temp <- NULL
famtemp <- NULL
ordtemp <- NULL
classtemp <- NULL
phyltemp <- NULL
timebins <- NULL
for (i in 1:length(class_of_mis)) {
  temp =  unlist(class_of_mis[i])
  if(!(identical(temp, character(0)))) {
    if(length(temp)==1) {
      if (temp %in% class_levels) {
        timebins <- stage_get(unique(pbdb$stage[which(pbdb$class == temp)]))
        #famtemp <- unique(pbdb$family[which(pbdb$genus == temp)])
        #ordtemp <- unique(pbdb$order[which(pbdb$family == temp)])
        #classtemp <- unique(pbdb$class[which(pbdb$order == temp)])
        phyltemp <- unique(pbdb$phylum[which(pbdb$class == temp)])
        if (length(phyltemp[which(phyltemp != "")]) %in% c(0,1)) {
          pbdb$class[which(pbdb$order == unlist(ord_class_mis[[i]]) & pbdb$stage %in% timebins)] <- temp
        }
      }
    }
  }
}

# get phylum based on class - WORMS
class_names <- names(sort(table(pbdb$class[which(pbdb$phylum == "")])))# [-length(sort(table(pbdb$class[which(pbdb$phylum == "")])))] # check if entry 1 is nonsensical
class_names[1]

class_phylum <- sapply(class_names, function(x) classification(x, db = "worms", ask = FALSE))
class_phylum1 <- rep(NA,length(class_phylum))
for ( i in 1:length(class_phylum)) {
  if(!(is.na(class_phylum[i]))) class_phylum1[i] <- class_phylum[[i]][which(class_phylum[[i]][2]=="Phylum"),1]
}

for(i in 1:length(class_names)) {
  if(!(is.na(class_phylum1[[i]])) & length(class_phylum1[[i]] == 1)){
    pbdb$phylum[which(pbdb$class == class_names[i])] <- class_phylum1[[i]]
  }
}


# get phylum based on other occurrences of the same class with that phylum name - PBDB
phylum_levels <- unique(as.character(pbdb$phylum))
class_phylum_mis <- unique(pbdb$class[which(pbdb$phylum == "" & pbdb$class !="")])
phylum_of_mis <- sapply(class_phylum_mis, function(x) unique(pbdb$phylum[which(pbdb$class == x & pbdb$phylum != "")]))
temp <- NULL
famtemp <- NULL
ordtemp <- NULL
classtemp <- NULL
phyltemp <- NULL
timebins <- NULL
for (i in 1:length(phylum_of_mis)) {
  temp =  unlist(phylum_of_mis[i])
  if(!(identical(temp, character(0)))) {
    if(length(temp)==1) {
      if (temp %in% phylum_levels) {
        timebins <- stage_get(unique(pbdb$stage[which(pbdb$phylum == temp)]))
        #famtemp <- unique(pbdb$family[which(pbdb$genus == temp)])
        #ordtemp <- unique(pbdb$order[which(pbdb$family == temp)])
        #classtemp <- unique(pbdb$class[which(pbdb$order == temp)])
        #phyltemp <- unique(pbdb$phylum[which(pbdb$class == temp)])
        
          pbdb$phylum[which(pbdb$class == unlist(class_phylum_mis[[i]]) & pbdb$stage %in% timebins)] <- temp
        
      }
    }
  }
}

### Now check for genera with multiple families, orders, classes, phyla

# replace genus names that are in different families and not in the same order with "genus family"
pbdbgen <- unique(pbdb$genus)
genfams <- as.list(rep(NA,length(pbdbgen)))
genfams <- sapply(pbdbgen, function(x) unique(pbdb$family[which(pbdb$genus == x & pbdb$family != "")]))
multiples <-  names(which(sapply(genfams,function(x) length(x)) == 2))
multiple_order <- sapply(multiples, function(x) length(unique(pbdb$order[which(pbdb$genus==x)])))

for (i in 1:length(multiples)) {
  if (multiple_order[i] == 2) {
    pbdb$genus[which(pbdb$genus==multiples[i])] <- paste(pbdb$genus[which(pbdb$genus==multiples[i])], pbdb$family[which(pbdb$genus==multiples[i])])
  }
  else print("No")
}

# check that also for orders
genord <- as.list(rep(NA,length(pbdbgen)))
genord <- sapply(pbdbgen, function(x) unique(pbdb$order[which(pbdb$genus == x & pbdb$order != "")]))
multiples <-  names(which(sapply(genord,function(x) length(x)) == 2))

View(pbdb[which(pbdb$genus=="Onychoceras Bassleroceratidae"),])
paste(pbdb$genus[which(pbdb$genus==multiples[i])], pbdb$family[which(pbdb$genus==multiples[i])])


table(pbdb$order[which(pbdb$genus==multiples[4])])
table(sapply(genord,function(x) length(x)))

Assignment of mineralogy and ecological parameters

param <- c("composition", "vision", "motility", "life_habit", "diet", "reproduction", 
           "ontogeny", "architecture", "thickness", "reinforcement")

pbdb[,c(param, "genus", "family", "order", "class", "phylum")] <- apply(
  pbdb[,c(param, "genus", "family", "order", "class", "phylum")], 2, as.character)

phyla <- c("Brachiopoda", "Mollusca", "Echinodermata", "Bryozoa", "Foraminifera", 
           "Coelenterata", "Porifera", "Arthropoda", "Hyolitha",
           "Rhodophyta", "Chlorophyta", "Haptophyta", "Heterokontophyta", "Annelida")
pbdb$phylum[which(pbdb$phylum == "Cnidaria")] <- "Coelenterata"
pbdb$phylum[which(pbdb$phylum == "Craniata")] <- "Brachiopoda"

The following chunk needs to be run seperately for each of the 14 phyla (Adjust the table name by specifying the group in the first code line)

group <- phyla[6] # phylum that's currently investigated
# pbdb taxonomy table for that group
group_inf <- read.csv(file=paste(group,".csv", sep = ""), header=TRUE, sep=",", skip=16) # Brachiopoda skip = 15, Mollusca and the rest = 16
group_inf[,c(param, "genus", "family", "order", "class", "phylum")] <- apply(
  group_inf[,c(param, "genus", "family", "order", "class", "phylum")], 2, as.character)


pbd_group <- pbdb[which(pbdb$phylum == group),]

gen <- unique(pbd_group$genus)
fam <- unique(pbd_group$family)
ord <- unique(pbd_group$order)
cla <- unique(pbd_group$class)

# get parameter based on genus
composition <- rep("", length(gen))
vision <- rep("", length(gen))
motility <- rep("", length(gen))
life_habit <- rep(NA, length(gen))
diet <- rep("", length(gen))
reproduction <- rep("", length(gen))
ontogeny <- rep("", length(gen))
architecture <- rep("", length(gen))
thickness <- rep("", length(gen))
reinforcement <- rep("", length(gen))

# create a vector for each parameter with a unique attribute for each genus

for (i in 1: length(gen)){
  if(length(unique(group_inf$composition[which(group_inf$genus == gen[i] & group_inf$composition != "")]))  == 1
  ) composition[i] <- unique(group_inf$composition[which(group_inf$genus == gen[i] & group_inf$composition != "")])
  
  if(length(unique(group_inf$vision[which(group_inf$genus == gen[i] & group_inf$vision != "")]))  == 1
  ) vision[i] <- unique(group_inf$vision[which(group_inf$genus == gen[i] & group_inf$vision != "")])
  
  if(length(unique(group_inf$motility[which(group_inf$genus == gen[i] & group_inf$motility != "")]))  == 1
  ) motility[i] <- unique(group_inf$motility[which(group_inf$genus == gen[i] & group_inf$motility != "")])
  
  if(length(unique(group_inf$life_habit[which(group_inf$genus == gen[i] & group_inf$life_habit != "")]))  == 1
  ) life_habit[i] <- unique(group_inf$life_habit[which(group_inf$genus == gen[i] & group_inf$life_habit != "")])
  
  if(length(unique(group_inf$diet[which(group_inf$genus == gen[i] & group_inf$diet != "")]))  == 1
  ) diet[i] <- unique(group_inf$diet[which(group_inf$genus == gen[i] & group_inf$diet != "")])
  
  if(length(unique(group_inf$reproduction[which(group_inf$genus == gen[i] & group_inf$reproduction != "")]))  == 1
  ) reproduction[i] <- unique(group_inf$reproduction[which(group_inf$genus == gen[i] & group_inf$reproduction != "")])

  if(length(unique(group_inf$ontogeny[which(group_inf$genus == gen[i] & group_inf$ontogeny != "")]))  == 1
  ) ontogeny[i] <- unique(group_inf$ontogeny[which(group_inf$genus == gen[i] & group_inf$ontogeny != "")])
  
  if(length(unique(group_inf$architecture[which(group_inf$genus == gen[i] & group_inf$architecture != "")]))  == 1
  ) architecture[i] <- unique(group_inf$architecture[which(group_inf$genus == gen[i] & group_inf$architecture != "")])
  
  if(length(unique(group_inf$thickness[which(group_inf$genus == gen[i] & group_inf$thickness != "")]))  == 1
  ) thickness[i] <- unique(group_inf$thickness[which(group_inf$genus == gen[i] & group_inf$thickness != "")])
  
  if(length(unique(group_inf$reinforcement[which(group_inf$genus == gen[i] & group_inf$reinforcement != "")]))  == 1
  ) reinforcement[i] <- unique(group_inf$reinforcement[which(group_inf$genus == gen[i] & group_inf$reinforcement != "")])
  
}



# fill in the group_pbd file based on the attribute vectors

for (i in 1:length(gen)) {
pbd_group$composition[which(pbd_group$genus == gen[i] & pbd_group$composition =="")] <- composition[i]
pbd_group$vision[which(pbd_group$genus == gen[i] & pbd_group$vision =="")] <- vision[i]
pbd_group$motility[which(pbd_group$genus == gen[i] & pbd_group$motility =="")] <- motility[i]
pbd_group$life_habit[which(pbd_group$genus == gen[i] & pbd_group$life_habit =="")] <- life_habit[i]
pbd_group$diet[which(pbd_group$genus == gen[i] & pbd_group$diet =="")] <- diet[i]
pbd_group$reproduction[which(pbd_group$genus == gen[i] & pbd_group$reproduction =="")] <- reproduction[i]
pbd_group$ontogeny[which(pbd_group$genus == gen[i] & pbd_group$ontogeny =="")] <- ontogeny[i]
pbd_group$architecture[which(pbd_group$genus == gen[i] & pbd_group$architecture =="")] <- architecture[i]
pbd_group$thickness[which(pbd_group$genus == gen[i] & pbd_group$thickness =="")] <- thickness[i]
pbd_group$reinforcement[which(pbd_group$genus == gen[i] & pbd_group$reinforcement =="")] <- reinforcement[i]
}

# repeat this with families instead of genera


# get parameter based on family
composition <- rep("", length(fam))
vision <- rep("", length(fam))
motility <- rep("", length(fam))
life_habit <- rep("", length(fam))
diet <- rep("", length(fam))
reproduction <- rep("", length(fam))
ontogeny <- rep("", length(fam))
architecture <- rep("", length(fam))
thickness <- rep("", length(fam))
reinforcement <- rep("", length(fam))

# create a vector for each parameter with a unique attribute for each family

for (i in 1: length(fam)){
  if(length(unique(group_inf$composition[which(group_inf$family == fam[i] & group_inf$composition != "")]))  == 1
  ) composition[i] <- unique(group_inf$composition[which(group_inf$family == fam[i] & group_inf$composition != "")])
  
  if(length(unique(group_inf$vision[which(group_inf$family == fam[i] & group_inf$vision != "")]))  == 1
  ) vision[i] <- unique(group_inf$vision[which(group_inf$family == fam[i] & group_inf$vision != "")])
  
  if(length(unique(group_inf$motility[which(group_inf$family == fam[i] & group_inf$motility != "")]))  == 1
  ) motility[i] <- unique(group_inf$motility[which(group_inf$family == fam[i] & group_inf$motility != "")])
  
  if(length(unique(group_inf$life_habit[which(group_inf$family == fam[i] & group_inf$life_habit != "")]))  == 1
  ) life_habit[i] <- unique(group_inf$life_habit[which(group_inf$family == fam[i] & group_inf$life_habit != "")])
  
  if(length(unique(group_inf$diet[which(group_inf$family == fam[i] & group_inf$diet != "")]))  == 1
  ) diet[i] <- unique(group_inf$diet[which(group_inf$family == fam[i] & group_inf$diet != "")])
  
  if(length(unique(group_inf$reproduction[which(group_inf$family == fam[i] & group_inf$reproduction != "")]))  == 1
  ) reproduction[i] <- unique(group_inf$reproduction[which(group_inf$family == fam[i] & group_inf$reproduction != "")])
  
  if(length(unique(group_inf$ontogeny[which(group_inf$family == fam[i] & group_inf$ontogeny != "")]))  == 1
  ) ontogeny[i] <- unique(group_inf$ontogeny[which(group_inf$family == fam[i] & group_inf$ontogeny != "")])
  
  if(length(unique(group_inf$architecture[which(group_inf$family == fam[i] & group_inf$architecture != "")]))  == 1
  ) architecture[i] <- unique(group_inf$architecture[which(group_inf$family == fam[i] & group_inf$architecture != "")])
  
  if(length(unique(group_inf$thickness[which(group_inf$family == fam[i] & group_inf$thickness != "")]))  == 1
  ) thickness[i] <- unique(group_inf$thickness[which(group_inf$family == fam[i] & group_inf$thickness != "")])
  
  if(length(unique(group_inf$reinforcement[which(group_inf$family == fam[i] & group_inf$reinforcement != "")]))  == 1
  ) reinforcement[i] <- unique(group_inf$reinforcement[which(group_inf$family == fam[i] & group_inf$reinforcement != "")])
  
}

# fill in the group_pbd file based on the attribute vectors

for (i in 1:length(fam)) {
  pbd_group$composition[which(pbd_group$family == fam[i] & pbd_group$composition =="")] <- composition[i]
  pbd_group$vision[which(pbd_group$family == fam[i] & pbd_group$vision =="")] <- vision[i]
  pbd_group$motility[which(pbd_group$family == fam[i] & pbd_group$motility =="")] <- motility[i]
  pbd_group$life_habit[which(pbd_group$family == fam[i] & pbd_group$life_habit =="")] <- life_habit[i]
  pbd_group$diet[which(pbd_group$family == fam[i] & pbd_group$diet =="")] <- diet[i]
  pbd_group$reproduction[which(pbd_group$family == fam[i] & pbd_group$reproduction =="")] <- reproduction[i]
  pbd_group$ontogeny[which(pbd_group$family == fam[i] & pbd_group$ontogeny =="")] <- ontogeny[i]
  pbd_group$architecture[which(pbd_group$family == fam[i] & pbd_group$architecture =="")] <- architecture[i]
  pbd_group$thickness[which(pbd_group$family == fam[i] & pbd_group$thickness =="")] <- thickness[i]
  pbd_group$reinforcement[which(pbd_group$family == fam[i] & pbd_group$reinforcement =="")] <- reinforcement[i]

}

# if everything is ok, insert the new data into the pbdb file
table(pbd_group$composition)
sum(table(pbd_group$composition))
table(pbd_group$order)

table(pbd_group$class[which(pbd_group$composition == "")])
sum(table(pbd_group$composition))
length(pbd_group$composition)

pbdb[which(pbdb$phylum == group),] <- pbd_group # do not forget this
table(pbdb$composition[which(pbdb$phylum==group)])

Specific changes

Run these after the chunk above has been run for each of the 14 phyla

# 1. Brachiopods

table(pbdb$composition[which(pbdb$phylum==group)])

pbdb$composition[which(pbdb$order %in% c("Craniida", "Craniopsida"))] <- "high Mg calcite"
table(pbdb$composition[which(pbdb$order %in% c("Craniida", "Craniopsida"))])
table(pbdb$composition[which(pbdb$order %in% c("Craniida", "Craniopsida") & pbdb$phylum %in% c("Brachiopoda"))])

pbdb$composition[which(pbdb$family == c("Craniidae"))] <- "high Mg calcite"
pbdb$composition[which(pbdb$order == "Orthotetida")] <- "low Mg calcite"
table(pbdb$composition[which(pbdb$phylum==group)])


# potentially aragonite:
# class "Kutorginata" and family "Isogrammiidae"

# 2. Mollusca


table(pbdb$class[which(pbdb$phylum =="Mollusca" & pbdb$composition == "")])
table(pbdb$composition[which(pbdb$class =="Monoplacophora")])

pbdb$composition[which(pbdb$class == c("Helcionelloida"))] <- "aragonite"
pbdb$composition[which(pbdb$class == c("Diplacophora"))] <- "aragonite"
pbdb$composition[which(pbdb$class == c("Paragastropoda"))] <- "aragonite"
pbdb$composition[which(pbdb$class == c("Rostroconchia"))] <- "aragonite"
pbdb$composition[which(pbdb$class == c("Tergomya"))] <- "aragonite"
pbdb$composition[which(pbdb$class =="Monoplacophora")] <- "aragonite"
pbdb$class[which(pbdb$class == c("Monoplacophora"))] <- "Tergomya"
pbdb$composition[which(pbdb$class == c("Stenothecoida"))] <- "low Mg calcite"
pbdb$composition[which(pbdb$class == c("Tentaculitoidea"))] <- "low Mg calcite"
pbdb$class[which(pbdb$class == c("Tentaculitoidea"))] <- "Tentaculita"

table(pbdb$order[which(pbdb$phylum =="Mollusca" & pbdb$composition == "" & pbdb$class != "")])
table(pbdb$genus[which(pbdb$phylum =="Mollusca" & pbdb$family == "Pterineidae")])



table(pbdb$genus[which(pbdb$class == "" & pbdb$phylum =="Mollusca" & pbdb$composition == "")])


# correct Tentaculitida
pbdb$phylum[which(pbdb$phylum == "Lophotrochozoa" & pbdb$order == "Tentaculitida")] <- "Mollusca"
pbdb$class[which(pbdb$order == "Tentaculitida")] <- "Tentaculita"

# 3. Echinodermata

pbdb$composition[which(pbdb$order == "Soluta")] <- "high Mg calcite" # 8 entries of the order "Soluta" remained unclassified in composition


# 4. Bryozoa
# 
pbdb$composition[which(pbdb$genus == "Dianulites")] <- "high magnesium calcite"

 

# 5. Foraminifera

# continue checking the missing composition of foram families

# 6. Coelenterata
# careful in the dataset are some with the now accepted phylum name "Cnidaria", PBDB and this script still  uses "Coelenterata"


# 7. Porifera

pbdb$composition[which(pbdb$order == "Labechiida")] <- "high Mg calcite"
pbdb$comp[which(pbdb$order == "Labechiida")]
pbdb$composition[which(pbdb$order == "Chaetetida" & pbdb$tenmbin %in% as.character(bin$name[1:25]))] <- "high Mg calcite"
pbdb$composition[which(pbdb$order == "Chaetetida" & pbdb$tenmbin %in% as.character(bin$name[26:49]))] <- "aragonite"
pbdb$composition[which(pbdb$class == "Archaeocyatha")] <- "high Mg calcite"
pbdb$composition[which(pbdb$order == "Archaeocyathida")] <- "high Mg calcite"

# 8. Arthropoda

# 9. Hyolitha

# 10. Rhodophyta
pbdb$composition[which(pbdb$genus == "Tetradium")] <- "aragonite"

# some are aragonitic - really?! 

# 11. Chlorophyta

# 12. Haptophyta

# 13. Heterokontophyta

# 14. Annelida

pbdb$comp[which(pbdb$class == "Machaeridia")] <- "low Mg calcite"

Now create a column with categories for composition, latitude, environemnt and some ecological categories

feeding <- read.csv(file="feeding.csv", header=TRUE, sep=",")
motility <- read.csv(file="motility.csv", header=TRUE, sep=",")
tiering <- read.csv(file="tiering.csv", header=TRUE, sep=",")
environment <-  read.csv(file="pbdb_environments.csv", header=TRUE, sep=",")
composition <- read.csv(file="composition.csv", header=TRUE, sep=",")
latitude <- read.csv(file="latitude.csv", header=TRUE, sep=",")

feed <- rep("",nrow(pbdb))
  for (i in 1:length(unique(feeding$category))) {
    feed[which(pbdb$diet %in% feeding$diet[which(feeding$category == unique(feeding$category)[i])])] <- as.character(unique(feeding$category)[i])
  }

mot <- rep("",nrow(pbdb))
for (i in 1:length(unique(motility$category))) {
  mot[which(pbdb$motility %in% motility$motility[which(motility$category == unique(motility$category)[i])])] <- as.character(unique(motility$category)[i])
}

tier <- rep("",nrow(pbdb))
for (i in 1:length(unique(tiering$category))) {
  tier[which(pbdb$life_habit %in% tiering$life_habit[which(tiering$category == unique(tiering$category)[i])])] <- as.character(unique(tiering$category)[i])
}

depth <- rep("",nrow(pbdb))
for (i in 1:length(unique(environment$depth))) {
  depth[which(pbdb$environment %in% environment$environment[which(environment$depth == unique(environment$depth)[i])])] <- as.character(unique(environment$depth)[i])
}

depth2 <- rep("", nrow(pbdb))
depth2[which(depth %in% c("shallow", "medium", "shallow/medium"))] <- "aboveSWB"
depth2[which(depth == "deep")] <- "belowSWB"


sett <- rep("",nrow(pbdb))
for (i in 1:length(unique(environment$depositional_setting))) {
  sett[which(pbdb$environment %in% environment$environment[which(environment$depositional_setting == unique(environment$depositional_setting)[i])])] <- as.character(unique(environment$depositional_setting)[i])
}

comp <- rep("",nrow(pbdb))
for (i in 1:length(unique(composition$category))) {
  comp[which(pbdb$composition %in% composition$composition[which(composition$category == unique(composition$category)[i])])] <- as.character(unique(composition$category)[i])
}


lati <- rep("",nrow(pbdb))
lati[which(pbdb$paleolat >35 | pbdb$paleolat < -35)] <- "high latitude"
lati[which(pbdb$paleolat <= 35 & pbdb$paleolat >= -35)] <- "tropical"

reef <- rep("",nrow(pbdb))
reef[which(pbdb$environment %in% environment$environment[which(environment$reef == 1)])] <- "reef"


pbdb <- cbind(pbdb,comp,sett,depth,depth2,tier,mot,feed,reef,lati)

The pbdb data set is now ready for further use.