Bundling Simões et al 2024 to a DwC Archive
This is an R Markdown Notebook for converting the eDNA data found in the following reference to Darwin Core format for upload into OBIS:
Call the necessary libraries and variables. Suppresses loading messages.
suppressMessages(library(dplyr)) # To clean input data
library(stringr) # To clean input data
suppressMessages(library(taxize)) # To get WoRMS IDs
Read source data
Read in the source tables (two Excel workbooks and one semicolon-separated CSV file)
# Locations of the three source files.
input_file <- "input/OBIS-UNESCO_final.xlsx"
input_sequences <- "input/occurrence_seqeunces.csv"
input_reads <- "input/Number of reads-data processing.xlsx"

# Main occurrence table, held as a plain data frame.
input_data <- read_excel(input_file) %>%
  as.data.frame()

# Semicolon-delimited sequence table, reduced to the unique
# taxonID / Sequence combinations.
input_sequence_data <- distinct(
  read.csv(input_sequences, sep = ";"),
  taxonID, Sequence
)

# Read counts per sample: keep only the column read in as `...1` and the
# `filtered` column, renaming both in the same step.
input_read_data <- read_excel(input_reads) %>%
  as.data.frame() %>%
  select(CleanedData.Sample_name = ...1, Filtered_reads = filtered)
#to preview pretty table
Sample_name | Amount of water | DNA_source | Collection method | Date | Month | Season | Year | Bathymetry | Geographical location N | Geographical location W | nº reads | OTU_number | kingdom | phylum | class | order | family | genus | species | associatedSequences |
B00 | 2 L | eDNA | Research Team | 01.2020 | Jan | Winter | 2020 | 10-20 m | 39°25’31.3’’N | 009°30’54.1’’W | 189 | OTU212 | Animalia | Chordata | Actinopteri | Clupeiformes | Clupeidae | Sardina | Sardina pilchardus | NCBI - PRJNA1110393 |
B00 | 2 L | eDNA | Research Team | 01.2020 | Jan | Winter | 2020 | 10-20 m | 39°25’31.3’’N | 009°30’54.1’’W | 269 | OTU213 | Animalia | Chordata | Actinopteri | NA | Moronidae | Dicentrarchus | Dicentrarchus labrax | NCBI - PRJNA1110393 |
B00 | 2 L | eDNA | Research Team | 01.2020 | Jan | Winter | 2020 | 10-20 m | 39°25’31.3’’N | 009°30’54.1’’W | 172 | OTU243 | Animalia | Chordata | Actinopteri | Clupeiformes | Clupeidae | Sardina | Sardina pilchardus | NCBI - PRJNA1110393 |
B00 | 2 L | eDNA | Research Team | 01.2020 | Jan | Winter | 2020 | 10-20 m | 39°25’31.3’’N | 009°30’54.1’’W | 1832 | OTU257 | Animalia | Chordata | Actinopteri | Gadiformes | Gadidae | Gadus | Gadus morhua | NCBI - PRJNA1110393 |
B00 | 2 L | eDNA | Research Team | 01.2020 | Jan | Winter | 2020 | 10-20 m | 39°25’31.3’’N | 009°30’54.1’’W | 294 | OTU262 | Animalia | Chordata | Actinopteri | Clupeiformes | Clupeidae | Sardina | Sardina pilchardus | NCBI - PRJNA1110393 |
B00 | 2 L | eDNA | Research Team | 01.2020 | Jan | Winter | 2020 | 10-20 m | 39°25’31.3’’N | 009°30’54.1’’W | 149 | OTU296 | Animalia | Chordata | Actinopteri | Spariformes | Sparidae | Diplodus | Diplodus puntazzo | NCBI - PRJNA1110393 |
Split source data into occurrence fields and DNA fields
add occurrenceID
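occurrenceID is an identifier for the occurrence record and should be persistent and globally unique. The recommended form is a combination of dataset-shortname:occurrence: and a hash. Note that the code below currently generates only the hash part: an MD5 hash of the OTU number, the geographical coordinates and the date of each record (not of the scientific name).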
# digest() hashes its whole argument as a single object, so a vector input
# yields one hash for the entire vector. Wrapping it with Vectorize() gives
# one hash per element instead.
vdigest <- Vectorize(digest)

# Build occurrenceID as the MD5 hash of each row's OTU number, both
# coordinates and date. The outer paste() receives a single vector, so its
# `sep` argument does not add anything to the resulting strings.
input_data <- input_data %>%
  mutate(
    occurrenceID = paste(
      vdigest(
        paste(
          OTU_number,
          `Geographical location N`,
          `Geographical location W`,
          Date
        ),
        algo = "md5"
      ),
      sep = ":"
    )
  )
# Split the source table into occurrence fields and DNA fields.

# Give column 12 the plain name "n reads" so it is easy to reference below.
input_data <- input_data %>%
  rename(`n reads` = 12)

# Occurrence-related columns, one row per distinct combination.
occurrence_input_data <- distinct(
  select(
    input_data,
    all_of(c(
      "occurrenceID",
      "Collection method",
      "Geographical location N",
      "Geographical location W",
      "n reads",
      "species"
    ))
  )
)

# DNA-related columns, one row per distinct combination.
edna_input_data <- distinct(
  select(input_data, all_of(c("occurrenceID", "Amount of water")))
)
Auto matching
We first try to match the names automatically: the species names are cleaned with gnparser, and the taxize library is then used to query the WoRMS database.
# Run the species names through gnparser; the parsed result separates the
# canonical name from any authorship.
species_names <- occurrence_input_data[["species"]]
parsed_names <- rgnparser::gn_parse(species_names)
# Look up the WoRMS ID for one parsed-name element.
#
# `element` is one entry of the gnparser output; only element$canonical$full
# (the canonical name string) is read here.
#
# Strategy visible in this block:
#   1. Query get_wormsid() with fuzzy matching and accepted = TRUE.
#   2. If the "match" attribute of the result is "not found", query again,
#      this time without passing `accepted`.
#   3. If that is also "not found", the ID is set to NA.
# NOTE(review): a fallback to the worrms package was mentioned for this
# function, but the lines shown here only assign NA - confirm.
get_worms_id_from_element <- function(element) {
# First attempt: accepted names only
worms_id <- get_wormsid(element$canonical$full, searchtype="scientific", fuzzy=TRUE, messages = FALSE, accepted = TRUE)
if (attr(worms_id, "match") == "not found") {
# Second attempt: same query without the accepted = TRUE argument
worms_id <- get_wormsid(element$canonical$full, searchtype="scientific", messages = FALSE, fuzzy=TRUE)
if (attr(worms_id, "match") == "not found") {
# Nothing found in either query
worms_id <- NA
# Build worms_ids: a list with one entry per element of parsed_names, in the
# same order (lapply preserves order), so worms_ids[[i]] lines up with
# parsed_names[[i]].
# The anonymous function branches on element$parsed, the flag on each
# gnparser result.
# NOTE(review): as written here neither branch contains any statements -
# presumably the parsed branch calls get_worms_id_from_element(); confirm.
worms_ids <- lapply(parsed_names, function(element) {
if (element$parsed) {
} else {
# Combine the original occurrence rows, the parsed canonical names and the
# WoRMS IDs into one data frame, built up one row per occurrence record.
combined_dataframe <- data.frame()
# Rows of occurrence_input_data, parsed_names and worms_ids are matched purely
# by position i.
# NOTE(review): 1:nrow(...) iterates over c(1, 0) when the table is empty, and
# rbind() inside a loop copies the accumulated frame on every iteration.
for (i in 1:nrow(occurrence_input_data)) {
# Entire i-th input row (a one-row data frame)
cleaned_value <- occurrence_input_data[i,]
canonical_value <- parsed_names[[i]]$canonical$full
# Only the first element of the lookup result is kept
worms_id_value <- worms_ids[[i]][1]
# A missing canonical name is stored as NA so data.frame() below still gets a
# value for the column
if (is.null(canonical_value)){
canonical_value <- NA
# Passing the one-row data frame under the name CleanedData makes data.frame()
# prefix each of its columns with "CleanedData."
temp_row <- data.frame(CleanedData = cleaned_value, CanonicalFull = canonical_value, WormsIDs = worms_id_value)
combined_dataframe <- rbind(combined_dataframe, temp_row)
Darwin Core Occurrence Mapping
OBIS currently has eight required DwC terms: scientificName, scientificNameID, occurrenceID, eventDate, decimalLongitude, decimalLatitude, occurrenceStatus, basisOfRecord.
Format locality information to decimal degrees
# Convert a degrees-minutes-seconds coordinate string (e.g. 39°25’31.3’’N)
# to decimal degrees.
#
# dms: character coordinate string(s) of the form
#      <degrees>°<minutes>’<seconds>’’<N|S|E|W>.
# The value computed below is unsigned: the hemisphere letter is not used in
# the arithmetic.
dms_to_dd <- function(dms) {
# Extract degrees, minutes and seconds using regex. Column 1 of `parts` is the
# full match; columns 2-5 are the four capture groups.
parts <- str_match(dms, "([0-9]+)[°º]([0-9]+)['’]([0-9.]+)(['’‘’\"]{0,2})[NSEW]")
degrees <- as.numeric(parts[,2])
minutes <- as.numeric(parts[,3])
seconds <- as.numeric(parts[,4])
# NOTE(review): the fourth capture group is the quote mark(s) after the
# seconds; the trailing [NSEW] is not captured, so `direction` holds the quote
# characters rather than the hemisphere letter. It is not used in the
# conversion below.
direction <- parts[,5]
# Convert to decimal degrees
decimal_degrees <- degrees + (minutes / 60) + (seconds / 3600)
# Apply dms_to_dd to the latitude and longitude columns, one value at a time.
combined_dataframe$decimalLatitude <- sapply(combined_dataframe$CleanedData.Geographical.location..N, dms_to_dd)
# The longitude is negated unconditionally here.
# NOTE(review): presumably because every record lies west of Greenwich -
# confirm before reusing this with other data.
combined_dataframe$decimalLongitude <- -sapply(combined_dataframe$CleanedData.Geographical.location.W, dms_to_dd)
# Add missing locality information: the same constant value is assigned to
# every record.
combined_dataframe$coordinateUncertaintyInMeters <- 50
combined_dataframe$country <- "Portugal"
combined_dataframe$locality <- "Berlengas Biosphere Reserve"
combined_dataframe$geodeticDatum <- "WGS84"
# Drop the original DMS coordinate column(s) now that decimal values exist
combined_dataframe <- combined_dataframe %>% select(-CleanedData.Geographical.location..N,
# Rename and restructure the WoRMS IDs to OBIS requirements:
#   - CanonicalFull becomes scientificName
#   - WormsIDs becomes scientificNameID, rewritten as an LSID by prefixing the
#     ID with "urn:lsid:marinespecies.org:taxname:"
# Records without a WoRMS ID keep NA instead of receiving a prefix-only LSID.
occurrence <- occurrence %>%
rename(scientificName = CanonicalFull) %>%
rename(scientificNameID = WormsIDs) %>%
mutate(scientificNameID = ifelse(!is.na(scientificNameID), paste("urn:lsid:marinespecies.org:taxname:", scientificNameID, sep = ""), NA)) %>%
# Build eventDate as "<year>-<two-digit month>" from the year and month
# columns.
occurrence <- occurrence %>%
# "Sept" is not one of the abbreviations in month.abb, so map it to "Sep"
CleanedData.Month = ifelse(CleanedData.Month == "Sept", "Sep", CleanedData.Month),
# Month abbreviation -> month number (position in month.abb); NA if the
# abbreviation is not found
eventDate = match(CleanedData.Month, month.abb),
# Zero-pad the month number to two digits.
# NOTE(review): an unmatched month is NA here and would be formatted as the
# text "NA" - confirm all month values are covered.
eventDate = sprintf("%02d", eventDate),
eventDate = paste0(CleanedData.Year, "-", eventDate)
) %>%
Total number of reads in the sample post processing.
# Set materialSampleID to the NCBI BioSample URL of the sample each record
# came from, keyed on the sample name.
# There is no default branch, so any sample name not listed below gets NA.
# The list has no entries for GM12, GM17 or RM03.
# NOTE(review): presumably those samples do not exist in the data - confirm.
occurrence <- occurrence %>%
mutate(materialSampleID = case_when(
CleanedData.Sample_name == 'B00' ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345430',
CleanedData.Sample_name == 'B01' ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345431',
CleanedData.Sample_name == 'B02' ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345432',
CleanedData.Sample_name == 'B03' ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345433',
CleanedData.Sample_name == 'B04' ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345434',
CleanedData.Sample_name == 'B05' ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345435',
CleanedData.Sample_name == 'B06' ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345436',
CleanedData.Sample_name == 'B07' ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345437',
CleanedData.Sample_name == 'B08' ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345438',
CleanedData.Sample_name == 'B09' ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345439',
CleanedData.Sample_name == 'B10' ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345440',
CleanedData.Sample_name == 'B11' ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345441',
CleanedData.Sample_name == 'GM01' ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345442',
CleanedData.Sample_name == 'GM02' ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345443',
CleanedData.Sample_name == 'GM03' ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345444',
CleanedData.Sample_name == 'GM04' ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345445',
CleanedData.Sample_name == 'GM05' ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345446',
CleanedData.Sample_name == 'GM06' ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345447',
CleanedData.Sample_name == 'GM07' ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345448',
CleanedData.Sample_name == 'GM08' ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345449',
CleanedData.Sample_name == 'GM09' ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345450',
CleanedData.Sample_name == 'GM10' ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345451',
CleanedData.Sample_name == 'GM11' ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345452',
CleanedData.Sample_name == 'GM13' ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345453',
CleanedData.Sample_name == 'GM14' ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345454',
CleanedData.Sample_name == 'GM15' ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345455',
CleanedData.Sample_name == 'GM16' ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345456',
CleanedData.Sample_name == 'GM18' ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345457',
CleanedData.Sample_name == 'GM19' ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345458',
CleanedData.Sample_name == 'GM20' ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345459',
CleanedData.Sample_name == 'GM21' ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345460',
CleanedData.Sample_name == 'GM22' ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345461',
CleanedData.Sample_name == 'RM01' ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345462',
CleanedData.Sample_name == 'RM02' ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345463',
CleanedData.Sample_name == 'RM04' ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345464',
CleanedData.Sample_name == 'RM05' ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345465'
)) %>%
# Free-text description of how the taxonomic identifications were made; the
# same text is written to every occurrence record.
identificationRemarks <- "Taxonomic classification confidence (at lowest specified taxon): 0.97 at species level, 0.90 at genus level, using BLAST in QIIME 2 against NCBI and MitoFish reference databases. Unassigned ASVs and non-fish species excluded. Cross-validation performed using WoRMS, OBIS, and FishBase."
occurrence <- occurrence %>%
  mutate(identificationRemarks = identificationRemarks)