Bundling Simões et al 2024 to a DwC Archive

This is an R Markdown Notebook for converting the eDNA data found in the following reference to Darwin Core format for upload into OBIS:

Simões M, Costa C, da Luz Calado M, Vasco-Rodrigues N, Campos MJ, Leandro SM, Antunes A. Unveiling the UNESCO Biosphere Reserve of the Berlengas Archipelago in Portugal as a Hotspot of Fish Species Using eDNA Metabarcoding and the Collaboration of Fishing Crews. Journal of Marine Science and Engineering. 2025; 13(1):60.

Setup

Call the necessary libraries and variables. Suppresses loading messages.

library(magrittr)                       # To use %<>% pipes
library(readxl)                         # To read excel file
## Warning: package 'readxl' was built under R version 4.3.3
suppressMessages(library(dplyr))        # To clean input data
library(stringr)                        # To clean input data
suppressMessages(library(taxize))       # To get WoRMS IDs
## Warning: package 'taxize' was built under R version 4.3.3
library(worrms)                         # To get WoRMS IDs
## Warning: package 'worrms' was built under R version 4.3.3
library(digest)                         # To generate hashes

Read source data

Read in source csv table

input_file <- "input/OBIS-UNESCO_final.xlsx"
input_sequences <- "input/occurrence_seqeunces.csv"
input_reads <- "input/Number of reads-data processing.xlsx"

input_data <- as.data.frame(read_excel(input_file))
input_sequence_data <- read.csv(input_sequences, sep = ";")
input_sequence_data <- input_sequence_data %>% distinct(taxonID, Sequence)
input_read_data <- as.data.frame(read_excel(input_reads))
## New names:
## • `` -> `...1`
input_read_data <- input_read_data %>%
  rename(CleanedData.Sample_name = ...1, Filtered_reads = filtered) %>%
  select(CleanedData.Sample_name, Filtered_reads)

#to preview pretty table
knitr::kable(head(input_data))
Sample_name Amount of water DNA_source Collection method Date Month Season Year Bathymetry Geographical location N Geographical location W nº reads OTU_number kingdom phylum class order family genus species associatedSequences
B00 2 L eDNA Research Team 01.2020 Jan Winter 2020 10-20 m 39°25’31.3’’N 009°30’54.1’’W 189 OTU212 Animalia Chordata Actinopteri Clupeiformes Clupeidae Sardina Sardina pilchardus NCBI - PRJNA1110393
B00 2 L eDNA Research Team 01.2020 Jan Winter 2020 10-20 m 39°25’31.3’’N 009°30’54.1’’W 269 OTU213 Animalia Chordata Actinopteri NA Moronidae Dicentrarchus Dicentrarchus labrax NCBI - PRJNA1110393
B00 2 L eDNA Research Team 01.2020 Jan Winter 2020 10-20 m 39°25’31.3’’N 009°30’54.1’’W 172 OTU243 Animalia Chordata Actinopteri Clupeiformes Clupeidae Sardina Sardina pilchardus NCBI - PRJNA1110393
B00 2 L eDNA Research Team 01.2020 Jan Winter 2020 10-20 m 39°25’31.3’’N 009°30’54.1’’W 1832 OTU257 Animalia Chordata Actinopteri Gadiformes Gadidae Gadus Gadus morhua NCBI - PRJNA1110393
B00 2 L eDNA Research Team 01.2020 Jan Winter 2020 10-20 m 39°25’31.3’’N 009°30’54.1’’W 294 OTU262 Animalia Chordata Actinopteri Clupeiformes Clupeidae Sardina Sardina pilchardus NCBI - PRJNA1110393
B00 2 L eDNA Research Team 01.2020 Jan Winter 2020 10-20 m 39°25’31.3’’N 009°30’54.1’’W 149 OTU296 Animalia Chordata Actinopteri Spariformes Sparidae Diplodus Diplodus puntazzo NCBI - PRJNA1110393

Split source data into occurrence fields and DNA fields

add occurrenceID

OccurrenceID is an identifier for the occurrence record and should be persistent and globally unique. It is a combination of dataset-shortname:occurrence: and a hash based on the scientific name.

# Vectorize the digest function (The digest() function isn't vectorized. So if you pass in a vector, you get one value for the whole vector rather than a digest for each element of the vector):
vdigest <- Vectorize(digest)

# Generate occurrenceID:
input_data %<>% mutate(occurrenceID = paste(vdigest (paste(OTU_number, `Geographical location  N`, `Geographical location W`, Date), algo="md5"), sep=":"))

#split data
input_data <- input_data %>% rename_with(~"n reads", .cols = 12)
occurrence_input_data <- input_data %>% select(occurrenceID,
                                               Sample_name,
                                               `Collection method`,
                                               Date,
                                               Month,
                                               Season,
                                               Year,
                                               Bathymetry,
                                               `Geographical location  N`,
                                               `Geographical location W`,
                                               `n reads`,
                                               OTU_number,
                                               kingdom,
                                               phylum,
                                               class,
                                               order,
                                               family,
                                               genus,
                                               species) %>% distinct()
edna_input_data <- input_data %>% select(occurrenceID,
                                         Sample_name,
                                         OTU_number,
                                         `Amount of water`) %>% distinct()

Get WoRMS IDs

Auto matching

First we will try to do this automatically by first cleaning the species names using gnparser and then using the taxise library to call the WoRMS database.

#Parse author names out
parsed_names <- rgnparser::gn_parse(occurrence_input_data[,"species"])

#Function to get WoRMS IDs. Search for accepted names first and if not found, search for unaccepted. If still not found, use the worrms package to search.
get_worms_id_from_element <- function(element) {
  worms_id <- get_wormsid(element$canonical$full, searchtype="scientific", fuzzy=TRUE, messages = FALSE, accepted = TRUE)
  if (attr(worms_id, "match") == "not found") {
    worms_id <- get_wormsid(element$canonical$full, searchtype="scientific", messages = FALSE, fuzzy=TRUE)
    if (attr(worms_id, "match") == "not found") {
      worms_id <- NA
    }
  }
  return(worms_id)
}

#Call the function
worms_ids <- lapply(parsed_names, function(element) {
  if (element$parsed) {
    return(get_worms_id_from_element(element))
  } else {
    return(NA)
  }
})

#combine original names, parsed data and WoRMS ID into one data frame
combined_dataframe <- data.frame()

for (i in 1:nrow(occurrence_input_data)) {
  cleaned_value <- occurrence_input_data[i,]
  canonical_value <- parsed_names[[i]]$canonical$full
  worms_id_value <- worms_ids[[i]][1]
  if (is.null(canonical_value)){
    canonical_value <- NA
  }
  temp_row <- data.frame(CleanedData = cleaned_value, CanonicalFull = canonical_value, WormsIDs = worms_id_value)
  combined_dataframe <- rbind(combined_dataframe, temp_row)
}

knitr::kable(head(combined_dataframe))
CleanedData.occurrenceID CleanedData.Sample_name CleanedData.Collection.method CleanedData.Date CleanedData.Month CleanedData.Season CleanedData.Year CleanedData.Bathymetry CleanedData.Geographical.location..N CleanedData.Geographical.location.W CleanedData.n.reads CleanedData.OTU_number CleanedData.kingdom CleanedData.phylum CleanedData.class CleanedData.order CleanedData.family CleanedData.genus CleanedData.species CanonicalFull WormsIDs
ac09967beb89ae73a6f52e7ba48355e7 B00 Research Team 01.2020 Jan Winter 2020 10-20 m 39°25’31.3’’N 009°30’54.1’’W 189 OTU212 Animalia Chordata Actinopteri Clupeiformes Clupeidae Sardina Sardina pilchardus Sardina pilchardus 126421
9133f272bc630842f5dd503bed24b679 B00 Research Team 01.2020 Jan Winter 2020 10-20 m 39°25’31.3’’N 009°30’54.1’’W 269 OTU213 Animalia Chordata Actinopteri NA Moronidae Dicentrarchus Dicentrarchus labrax Dicentrarchus labrax 126975
8d62db8bda4b8bce476ecca23a5b4c43 B00 Research Team 01.2020 Jan Winter 2020 10-20 m 39°25’31.3’’N 009°30’54.1’’W 172 OTU243 Animalia Chordata Actinopteri Clupeiformes Clupeidae Sardina Sardina pilchardus Sardina pilchardus 126421
8f002f4451111e3d0ded3bb775a08fe5 B00 Research Team 01.2020 Jan Winter 2020 10-20 m 39°25’31.3’’N 009°30’54.1’’W 1832 OTU257 Animalia Chordata Actinopteri Gadiformes Gadidae Gadus Gadus morhua Gadus morhua 126436
c70a84d5b596fa9ff20de671bca82cbf B00 Research Team 01.2020 Jan Winter 2020 10-20 m 39°25’31.3’’N 009°30’54.1’’W 294 OTU262 Animalia Chordata Actinopteri Clupeiformes Clupeidae Sardina Sardina pilchardus Sardina pilchardus 126421
e2d0d29f13c17cdeb5a9a7849c891d48 B00 Research Team 01.2020 Jan Winter 2020 10-20 m 39°25’31.3’’N 009°30’54.1’’W 149 OTU296 Animalia Chordata Actinopteri Spariformes Sparidae Diplodus Diplodus puntazzo Diplodus puntazzo 127052

Human Verification

Added taxa not identified to species

combined_dataframe[c(48, 81, 130, 150, 176, 320, 338, 349, 350, 557, 657, 765, 777), 
                   c("CanonicalFull", "WormsIDs")] <- 
  matrix(rep(c("Sparidae", "125564"), 13), nrow = 13, byrow = TRUE)

Darwin Core Occurrence Mapping

OBIS currently has eight required DwC terms: scientificName, scientificNameID, occurrenceID, eventDate, decimalLongitude, decimalLatitude, occurrenceStatus, basisOfRecord.

locality

Format locality information to decimal degrees

dms_to_dd <- function(dms) {
  # Extract degrees, minutes, seconds, and direction using regex
  parts <- str_match(dms, "([0-9]+)[°º]([0-9]+)['’]([0-9.]+)(['’‘’\"]{0,2})[NSEW]")
  
  degrees <- as.numeric(parts[,2])
  minutes <- as.numeric(parts[,3])
  seconds <- as.numeric(parts[,4])
  direction <- parts[,5]
  
  # Convert to decimal degrees
  decimal_degrees <- degrees + (minutes / 60) + (seconds / 3600)
  
  return(decimal_degrees)
}

# Apply function to Latitude and Longitude columns
combined_dataframe$decimalLatitude <- sapply(combined_dataframe$CleanedData.Geographical.location..N, dms_to_dd)
combined_dataframe$decimalLongitude <- -sapply(combined_dataframe$CleanedData.Geographical.location.W, dms_to_dd)

# Add missing locality information
combined_dataframe$coordinateUncertaintyInMeters <- 50
combined_dataframe$country <- "Portugal"
combined_dataframe$locality <- "Berlengas Biosphere Reserve"
combined_dataframe$geodeticDatum <- "WGS84"

combined_dataframe <- combined_dataframe %>% select(-CleanedData.Geographical.location..N, 
                                                    -CleanedData.Geographical.location.W)

occurrenceID

occurrence <- combined_dataframe %>%
  rename(occurrenceID = CleanedData.occurrenceID)

scientificName/scientificNameID

#rename and restructure WoRMSIDs to OBIS requirements
occurrence <- occurrence %>%
  rename(scientificName = CanonicalFull) %>%
  rename(scientificNameID = WormsIDs) %>%
  mutate(scientificNameID = ifelse(!is.na(scientificNameID), paste("urn:lsid:marinespecies.org:taxname:", scientificNameID, sep = ""), NA)) %>% 
  select(-CleanedData.kingdom,
         -CleanedData.phylum,
         -CleanedData.class,
         -CleanedData.order,
         -CleanedData.family,
         -CleanedData.genus,
         -CleanedData.species)

eventDate

occurrence <- occurrence %>%
  mutate(
    CleanedData.Month = ifelse(CleanedData.Month == "Sept", "Sep", CleanedData.Month),
    eventDate = match(CleanedData.Month, month.abb),
    eventDate = sprintf("%02d", eventDate),
    eventDate = paste0(CleanedData.Year, "-", eventDate)
  ) %>% 
  select(-CleanedData.Date,
         -CleanedData.Month,
         -CleanedData.Season,
         -CleanedData.Year)

depth

occurrence <- occurrence %>%
  mutate(
    verbatimDepth = CleanedData.Bathymetry,
    minimumDepthInMeters = as.numeric(str_extract(CleanedData.Bathymetry, "^[0-9]+")),
    maximumDepthInMeters = as.numeric(str_extract(CleanedData.Bathymetry, "(?<=-)[0-9]+"))
  ) %>% 
  select(-CleanedData.Bathymetry)

recordedBy

occurrence <- occurrence %>%
  mutate(
    recordedBy = CleanedData.Collection.method
  ) %>% 
  select(-CleanedData.Collection.method)

organismQuantity

occurrence <- occurrence %>%
  mutate(
    organismQuantity = CleanedData.n.reads
  ) %>% 
  select(-CleanedData.n.reads)

occurrenceStatus

occurrenceStatus <- "present"
occurrence %<>% mutate(occurrenceStatus)

basisOfRecord

basisOfRecord <- "MaterialSample"
occurrence %<>% mutate(basisOfRecord)

organismQuantityType

organismQuantityType <- "DNA sequence reads"
occurrence %<>% mutate(organismQuantityType)

sampleSizeUnit

sampleSizeUnit <- "DNA sequence reads"
occurrence %<>% mutate(sampleSizeUnit)

sampleSizeValue

Total number of reads in the sample post processing.

occurrence <- occurrence %>%
  left_join(input_read_data, by = "CleanedData.Sample_name") %>%
  rename(sampleSizeValue = Filtered_reads)

materialSampleID

occurrence <- occurrence %>%
  mutate(materialSampleID = case_when(
    CleanedData.Sample_name == 'B00'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345430',
    CleanedData.Sample_name == 'B01'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345431',
    CleanedData.Sample_name == 'B02'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345432',
    CleanedData.Sample_name == 'B03'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345433',
    CleanedData.Sample_name == 'B04'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345434',
    CleanedData.Sample_name == 'B05'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345435',
    CleanedData.Sample_name == 'B06'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345436',
    CleanedData.Sample_name == 'B07'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345437',
    CleanedData.Sample_name == 'B08'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345438',
    CleanedData.Sample_name == 'B09'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345439',
    CleanedData.Sample_name == 'B10'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345440',
    CleanedData.Sample_name == 'B11'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345441',
    CleanedData.Sample_name == 'GM01'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345442',
    CleanedData.Sample_name == 'GM02'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345443',
    CleanedData.Sample_name == 'GM03'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345444',
    CleanedData.Sample_name == 'GM04'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345445',
    CleanedData.Sample_name == 'GM05'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345446',
    CleanedData.Sample_name == 'GM06'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345447',
    CleanedData.Sample_name == 'GM07'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345448',
    CleanedData.Sample_name == 'GM08'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345449',
    CleanedData.Sample_name == 'GM09'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345450',
    CleanedData.Sample_name == 'GM10'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345451',
    CleanedData.Sample_name == 'GM11'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345452',
    CleanedData.Sample_name == 'GM13'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345453',
    CleanedData.Sample_name == 'GM14'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345454',
    CleanedData.Sample_name == 'GM15'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345455',
    CleanedData.Sample_name == 'GM16'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345456',
    CleanedData.Sample_name == 'GM18'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345457',
    CleanedData.Sample_name == 'GM19'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345458',
    CleanedData.Sample_name == 'GM20'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345459',
    CleanedData.Sample_name == 'GM21'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345460',
    CleanedData.Sample_name == 'GM22'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345461',
    CleanedData.Sample_name == 'RM01'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345462',
    CleanedData.Sample_name == 'RM02'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345463',
    CleanedData.Sample_name == 'RM04'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345464',
    CleanedData.Sample_name == 'RM05'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345465'
  )) %>% 
  select(-CleanedData.Sample_name)

associatedSequences

associatedSequences <- "https://www.ncbi.nlm.nih.gov/bioproject/PRJNA1110393"
occurrence %<>% mutate(associatedSequences)

identificationRemarks

identificationRemarks <- "Taxonomic classification confidence (at lowest specified taxon): 0.97 at species level, 0.90 at genus level, using BLAST in QIIME 2 against NCBI and MitoFish reference databases. Unassigned ASVs and non-fish species excluded. Cross-validation performed using WoRMS, OBIS, and FishBase."
occurrence %<>% mutate(identificationRemarks)

taxonID

occurrence <- occurrence %>%
  mutate(
    taxonID = CleanedData.OTU_number
  ) %>% 
  select(-CleanedData.OTU_number)

Darwin Core eDNA Mapping

dna_derived_data <- edna_input_data

DNA_sequence

dna_derived_data <- dna_derived_data %>%
  left_join(input_sequence_data, by = c("OTU_number" = "taxonID")) %>%
  rename(DNA_sequence = Sequence) %>% 
  select(-OTU_number)

target_gene

target_gene <- "12S rRNA"
dna_derived_data %<>% mutate(target_gene)

pcr_primer_forward

pcr_primer_forward <- "GTCGGTAAAACTCGTGCC"
dna_derived_data %<>% mutate(pcr_primer_forward)

pcr_primer_reverse

pcr_primer_reverse <- "CATAGTGGGGTATCTAATCCCAGTTTG"
dna_derived_data %<>% mutate(pcr_primer_reverse)

pcr_primer_name_forward

pcr_primer_name_forward <- "MiFish-U-F"
dna_derived_data %<>% mutate(pcr_primer_name_forward)

pcr_primer_name_reverse

pcr_primer_name_reverse <- "MiFish-U-R"
dna_derived_data %<>% mutate(pcr_primer_name_reverse)

pcr_primer_reference

pcr_primer_reference <- "https://doi.org/10.1098/rsos.150088"
dna_derived_data %<>% mutate(pcr_primer_reference)

env_broad_scale

env_broad_scale <- "marine biome [ENVO:00000447]"
dna_derived_data %<>% mutate(env_broad_scale)

env_local_scale

env_local_scale <- "coastal water [ENVO:00001250]"
dna_derived_data %<>% mutate(env_local_scale)

env_medium

env_medium <- "ocean water [ENVO:00002149]"
dna_derived_data %<>% mutate(env_medium)

lib_layout

lib_layout <- "Paired"
dna_derived_data %<>% mutate(lib_layout)

seq_meth

seq_meth <- "lllumina MiSeq"
dna_derived_data %<>% mutate(seq_meth)

otu_class_appr

otu_class_appr <- "DADA2; QIIME 2 v2022.2; ASV"
dna_derived_data %<>% mutate(otu_class_appr)

otu_seq_comp_appr

otu_seq_comp_appr <- "BLAST; QIIME 2 v2022.2; similarity thresholds: 97% (species), 90% (genus)"
dna_derived_data %<>% mutate(otu_seq_comp_appr)

otu_db

otu_db <- "NCBI (via Entrez Direct); MitoFish"
dna_derived_data %<>% mutate(otu_db)

samp_size

samp_size <- "2 liter"
dna_derived_data %<>% mutate(samp_size) %>% 
  select(-`Amount of water`)

source_mat_id

dna_derived_data <- dna_derived_data %>%
  mutate(
    source_mat_id = Sample_name
  ) %>% 
  select(-Sample_name)

Save outputs

dwc_output_dir <- "output"

write.csv(occurrence, paste(dwc_output_dir, "/occurrence.csv", sep = ""), na = "", row.names=FALSE)
write.csv(dna_derived_data, paste(dwc_output_dir, "/dna_derived_data.csv", sep = ""), na = "", row.names=FALSE)