Bundling Simões et al 2024 to a DwC Archive

This is an R Markdown Notebook for converting the eDNA data found in the following reference to Darwin Core format for upload into OBIS:

Simões M, Costa C, da Luz Calado M, Vasco-Rodrigues N, Campos MJ, Leandro SM, Antunes A. Unveiling the UNESCO Biosphere Reserve of the Berlengas Archipelago in Portugal as a Hotspot of Fish Species Using eDNA Metabarcoding and the Collaboration of Fishing Crews. Journal of Marine Science and Engineering. 2025; 13(1):60.

Setup

Call the necessary libraries and variables. Suppresses loading messages.

library(magrittr)                       # To use %<>% pipes
library(readxl)                         # To read excel file

## Warning: package 'readxl' was built under R version 4.3.3

suppressMessages(library(dplyr))        # To clean input data
library(stringr)                        # To clean input data
suppressMessages(library(taxize))       # To get WoRMS IDs

## Warning: package 'taxize' was built under R version 4.3.3

library(worrms)                         # To get WoRMS IDs

## Warning: package 'worrms' was built under R version 4.3.3

library(digest)                         # To generate hashes

Read source data

Read in source csv table

input_file <- "input/OBIS-UNESCO_final.xlsx"
input_sequences <- "input/occurrence_seqeunces.csv"
input_reads <- "input/Number of reads-data processing.xlsx"

input_data <- as.data.frame(read_excel(input_file))
input_sequence_data <- read.csv(input_sequences, sep = ";")
input_sequence_data <- input_sequence_data %>% distinct(taxonID, Sequence)
input_read_data <- as.data.frame(read_excel(input_reads))

## New names:
## • `` -> `...1`

input_read_data <- input_read_data %>%
  rename(CleanedData.Sample_name = ...1, Filtered_reads = filtered) %>%
  select(CleanedData.Sample_name, Filtered_reads)

#to preview pretty table
knitr::kable(head(input_data))

Sample_name	Amount of water	DNA_source	Collection method	Date	Month	Season	Year	Bathymetry	Geographical location N	Geographical location W	nº reads	OTU_number	kingdom	phylum	class	order	family	genus	species	associatedSequences
B00	2 L	eDNA	Research Team	01.2020	Jan	Winter	2020	10-20 m	39°25’31.3’’N	009°30’54.1’’W	189	OTU212	Animalia	Chordata	Actinopteri	Clupeiformes	Clupeidae	Sardina	Sardina pilchardus	NCBI - PRJNA1110393
B00	2 L	eDNA	Research Team	01.2020	Jan	Winter	2020	10-20 m	39°25’31.3’’N	009°30’54.1’’W	269	OTU213	Animalia	Chordata	Actinopteri	NA	Moronidae	Dicentrarchus	Dicentrarchus labrax	NCBI - PRJNA1110393
B00	2 L	eDNA	Research Team	01.2020	Jan	Winter	2020	10-20 m	39°25’31.3’’N	009°30’54.1’’W	172	OTU243	Animalia	Chordata	Actinopteri	Clupeiformes	Clupeidae	Sardina	Sardina pilchardus	NCBI - PRJNA1110393
B00	2 L	eDNA	Research Team	01.2020	Jan	Winter	2020	10-20 m	39°25’31.3’’N	009°30’54.1’’W	1832	OTU257	Animalia	Chordata	Actinopteri	Gadiformes	Gadidae	Gadus	Gadus morhua	NCBI - PRJNA1110393
B00	2 L	eDNA	Research Team	01.2020	Jan	Winter	2020	10-20 m	39°25’31.3’’N	009°30’54.1’’W	294	OTU262	Animalia	Chordata	Actinopteri	Clupeiformes	Clupeidae	Sardina	Sardina pilchardus	NCBI - PRJNA1110393
B00	2 L	eDNA	Research Team	01.2020	Jan	Winter	2020	10-20 m	39°25’31.3’’N	009°30’54.1’’W	149	OTU296	Animalia	Chordata	Actinopteri	Spariformes	Sparidae	Diplodus	Diplodus puntazzo	NCBI - PRJNA1110393

Split source data into occurrence fields and DNA fields

add occurrenceID

OccurrenceID is an identifier for the occurrence record and should be persistent and globally unique. It is a combination of dataset-shortname:occurrence: and a hash based on the scientific name.

# Vectorize the digest function (The digest() function isn't vectorized. So if you pass in a vector, you get one value for the whole vector rather than a digest for each element of the vector):
vdigest <- Vectorize(digest)

# Generate occurrenceID:
input_data %<>% mutate(occurrenceID = paste(vdigest (paste(OTU_number, `Geographical location  N`, `Geographical location W`, Date), algo="md5"), sep=":"))

#split data
input_data <- input_data %>% rename_with(~"n reads", .cols = 12)
occurrence_input_data <- input_data %>% select(occurrenceID,
                                               Sample_name,
                                               `Collection method`,
                                               Date,
                                               Month,
                                               Season,
                                               Year,
                                               Bathymetry,
                                               `Geographical location  N`,
                                               `Geographical location W`,
                                               `n reads`,
                                               OTU_number,
                                               kingdom,
                                               phylum,
                                               class,
                                               order,
                                               family,
                                               genus,
                                               species) %>% distinct()
edna_input_data <- input_data %>% select(occurrenceID,
                                         Sample_name,
                                         OTU_number,
                                         `Amount of water`) %>% distinct()

Get WoRMS IDs

Auto matching

First we will try to do this automatically by first cleaning the species names using gnparser and then using the taxise library to call the WoRMS database.

#Parse author names out
parsed_names <- rgnparser::gn_parse(occurrence_input_data[,"species"])

#Function to get WoRMS IDs. Search for accepted names first and if not found, search for unaccepted. If still not found, use the worrms package to search.
get_worms_id_from_element <- function(element) {
  worms_id <- get_wormsid(element$canonical$full, searchtype="scientific", fuzzy=TRUE, messages = FALSE, accepted = TRUE)
  if (attr(worms_id, "match") == "not found") {
    worms_id <- get_wormsid(element$canonical$full, searchtype="scientific", messages = FALSE, fuzzy=TRUE)
    if (attr(worms_id, "match") == "not found") {
      worms_id <- NA
    }
  }
  return(worms_id)
}

#Call the function
worms_ids <- lapply(parsed_names, function(element) {
  if (element$parsed) {
    return(get_worms_id_from_element(element))
  } else {
    return(NA)
  }
})

#combine original names, parsed data and WoRMS ID into one data frame
combined_dataframe <- data.frame()

for (i in 1:nrow(occurrence_input_data)) {
  cleaned_value <- occurrence_input_data[i,]
  canonical_value <- parsed_names[[i]]$canonical$full
  worms_id_value <- worms_ids[[i]][1]
  if (is.null(canonical_value)){
    canonical_value <- NA
  }
  temp_row <- data.frame(CleanedData = cleaned_value, CanonicalFull = canonical_value, WormsIDs = worms_id_value)
  combined_dataframe <- rbind(combined_dataframe, temp_row)
}

knitr::kable(head(combined_dataframe))

CleanedData.occurrenceID	CleanedData.Sample_name	CleanedData.Collection.method	CleanedData.Date	CleanedData.Month	CleanedData.Season	CleanedData.Year	CleanedData.Bathymetry	CleanedData.Geographical.location..N	CleanedData.Geographical.location.W	CleanedData.n.reads	CleanedData.OTU_number	CleanedData.kingdom	CleanedData.phylum	CleanedData.class	CleanedData.order	CleanedData.family	CleanedData.genus	CleanedData.species	CanonicalFull	WormsIDs
ac09967beb89ae73a6f52e7ba48355e7	B00	Research Team	01.2020	Jan	Winter	2020	10-20 m	39°25’31.3’’N	009°30’54.1’’W	189	OTU212	Animalia	Chordata	Actinopteri	Clupeiformes	Clupeidae	Sardina	Sardina pilchardus	Sardina pilchardus	126421
9133f272bc630842f5dd503bed24b679	B00	Research Team	01.2020	Jan	Winter	2020	10-20 m	39°25’31.3’’N	009°30’54.1’’W	269	OTU213	Animalia	Chordata	Actinopteri	NA	Moronidae	Dicentrarchus	Dicentrarchus labrax	Dicentrarchus labrax	126975
8d62db8bda4b8bce476ecca23a5b4c43	B00	Research Team	01.2020	Jan	Winter	2020	10-20 m	39°25’31.3’’N	009°30’54.1’’W	172	OTU243	Animalia	Chordata	Actinopteri	Clupeiformes	Clupeidae	Sardina	Sardina pilchardus	Sardina pilchardus	126421
8f002f4451111e3d0ded3bb775a08fe5	B00	Research Team	01.2020	Jan	Winter	2020	10-20 m	39°25’31.3’’N	009°30’54.1’’W	1832	OTU257	Animalia	Chordata	Actinopteri	Gadiformes	Gadidae	Gadus	Gadus morhua	Gadus morhua	126436
c70a84d5b596fa9ff20de671bca82cbf	B00	Research Team	01.2020	Jan	Winter	2020	10-20 m	39°25’31.3’’N	009°30’54.1’’W	294	OTU262	Animalia	Chordata	Actinopteri	Clupeiformes	Clupeidae	Sardina	Sardina pilchardus	Sardina pilchardus	126421
e2d0d29f13c17cdeb5a9a7849c891d48	B00	Research Team	01.2020	Jan	Winter	2020	10-20 m	39°25’31.3’’N	009°30’54.1’’W	149	OTU296	Animalia	Chordata	Actinopteri	Spariformes	Sparidae	Diplodus	Diplodus puntazzo	Diplodus puntazzo	127052

Human Verification

Added taxa not identified to species

combined_dataframe[c(48, 81, 130, 150, 176, 320, 338, 349, 350, 557, 657, 765, 777), 
                   c("CanonicalFull", "WormsIDs")] <- 
  matrix(rep(c("Sparidae", "125564"), 13), nrow = 13, byrow = TRUE)

Darwin Core Occurrence Mapping

OBIS currently has eight required DwC terms: scientificName, scientificNameID, occurrenceID, eventDate, decimalLongitude, decimalLatitude, occurrenceStatus, basisOfRecord.

locality

Format locality information to decimal degrees

dms_to_dd <- function(dms) {
  # Extract degrees, minutes, seconds, and direction using regex
  parts <- str_match(dms, "([0-9]+)[°º]([0-9]+)['’]([0-9.]+)(['’‘’\"]{0,2})[NSEW]")
  
  degrees <- as.numeric(parts[,2])
  minutes <- as.numeric(parts[,3])
  seconds <- as.numeric(parts[,4])
  direction <- parts[,5]
  
  # Convert to decimal degrees
  decimal_degrees <- degrees + (minutes / 60) + (seconds / 3600)
  
  return(decimal_degrees)
}

# Apply function to Latitude and Longitude columns
combined_dataframe$decimalLatitude <- sapply(combined_dataframe$CleanedData.Geographical.location..N, dms_to_dd)
combined_dataframe$decimalLongitude <- -sapply(combined_dataframe$CleanedData.Geographical.location.W, dms_to_dd)

# Add missing locality information
combined_dataframe$coordinateUncertaintyInMeters <- 50
combined_dataframe$country <- "Portugal"
combined_dataframe$locality <- "Berlengas Biosphere Reserve"
combined_dataframe$geodeticDatum <- "WGS84"

combined_dataframe <- combined_dataframe %>% select(-CleanedData.Geographical.location..N, 
                                                    -CleanedData.Geographical.location.W)

occurrenceID

occurrence <- combined_dataframe %>%
  rename(occurrenceID = CleanedData.occurrenceID)

scientificName/scientificNameID

#rename and restructure WoRMSIDs to OBIS requirements
occurrence <- occurrence %>%
  rename(scientificName = CanonicalFull) %>%
  rename(scientificNameID = WormsIDs) %>%
  mutate(scientificNameID = ifelse(!is.na(scientificNameID), paste("urn:lsid:marinespecies.org:taxname:", scientificNameID, sep = ""), NA)) %>% 
  select(-CleanedData.kingdom,
         -CleanedData.phylum,
         -CleanedData.class,
         -CleanedData.order,
         -CleanedData.family,
         -CleanedData.genus,
         -CleanedData.species)

eventDate

occurrence <- occurrence %>%
  mutate(
    CleanedData.Month = ifelse(CleanedData.Month == "Sept", "Sep", CleanedData.Month),
    eventDate = match(CleanedData.Month, month.abb),
    eventDate = sprintf("%02d", eventDate),
    eventDate = paste0(CleanedData.Year, "-", eventDate)
  ) %>% 
  select(-CleanedData.Date,
         -CleanedData.Month,
         -CleanedData.Season,
         -CleanedData.Year)

depth

occurrence <- occurrence %>%
  mutate(
    verbatimDepth = CleanedData.Bathymetry,
    minimumDepthInMeters = as.numeric(str_extract(CleanedData.Bathymetry, "^[0-9]+")),
    maximumDepthInMeters = as.numeric(str_extract(CleanedData.Bathymetry, "(?<=-)[0-9]+"))
  ) %>% 
  select(-CleanedData.Bathymetry)

recordedBy

occurrence <- occurrence %>%
  mutate(
    recordedBy = CleanedData.Collection.method
  ) %>% 
  select(-CleanedData.Collection.method)

organismQuantity

occurrence <- occurrence %>%
  mutate(
    organismQuantity = CleanedData.n.reads
  ) %>% 
  select(-CleanedData.n.reads)

occurrenceStatus

occurrenceStatus <- "present"
occurrence %<>% mutate(occurrenceStatus)

basisOfRecord

basisOfRecord <- "MaterialSample"
occurrence %<>% mutate(basisOfRecord)

organismQuantityType

organismQuantityType <- "DNA sequence reads"
occurrence %<>% mutate(organismQuantityType)

sampleSizeUnit

sampleSizeUnit <- "DNA sequence reads"
occurrence %<>% mutate(sampleSizeUnit)

sampleSizeValue

Total number of reads in the sample post processing.

occurrence <- occurrence %>%
  left_join(input_read_data, by = "CleanedData.Sample_name") %>%
  rename(sampleSizeValue = Filtered_reads)

materialSampleID

occurrence <- occurrence %>%
  mutate(materialSampleID = case_when(
    CleanedData.Sample_name == 'B00'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345430',
    CleanedData.Sample_name == 'B01'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345431',
    CleanedData.Sample_name == 'B02'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345432',
    CleanedData.Sample_name == 'B03'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345433',
    CleanedData.Sample_name == 'B04'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345434',
    CleanedData.Sample_name == 'B05'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345435',
    CleanedData.Sample_name == 'B06'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345436',
    CleanedData.Sample_name == 'B07'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345437',
    CleanedData.Sample_name == 'B08'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345438',
    CleanedData.Sample_name == 'B09'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345439',
    CleanedData.Sample_name == 'B10'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345440',
    CleanedData.Sample_name == 'B11'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345441',
    CleanedData.Sample_name == 'GM01'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345442',
    CleanedData.Sample_name == 'GM02'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345443',
    CleanedData.Sample_name == 'GM03'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345444',
    CleanedData.Sample_name == 'GM04'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345445',
    CleanedData.Sample_name == 'GM05'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345446',
    CleanedData.Sample_name == 'GM06'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345447',
    CleanedData.Sample_name == 'GM07'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345448',
    CleanedData.Sample_name == 'GM08'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345449',
    CleanedData.Sample_name == 'GM09'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345450',
    CleanedData.Sample_name == 'GM10'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345451',
    CleanedData.Sample_name == 'GM11'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345452',
    CleanedData.Sample_name == 'GM13'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345453',
    CleanedData.Sample_name == 'GM14'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345454',
    CleanedData.Sample_name == 'GM15'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345455',
    CleanedData.Sample_name == 'GM16'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345456',
    CleanedData.Sample_name == 'GM18'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345457',
    CleanedData.Sample_name == 'GM19'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345458',
    CleanedData.Sample_name == 'GM20'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345459',
    CleanedData.Sample_name == 'GM21'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345460',
    CleanedData.Sample_name == 'GM22'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345461',
    CleanedData.Sample_name == 'RM01'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345462',
    CleanedData.Sample_name == 'RM02'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345463',
    CleanedData.Sample_name == 'RM04'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345464',
    CleanedData.Sample_name == 'RM05'  ~ 'https://www.ncbi.nlm.nih.gov/biosample/SAMN41345465'
  )) %>% 
  select(-CleanedData.Sample_name)

associatedSequences

associatedSequences <- "https://www.ncbi.nlm.nih.gov/bioproject/PRJNA1110393"
occurrence %<>% mutate(associatedSequences)

identificationRemarks

identificationRemarks <- "Taxonomic classification confidence (at lowest specified taxon): 0.97 at species level, 0.90 at genus level, using BLAST in QIIME 2 against NCBI and MitoFish reference databases. Unassigned ASVs and non-fish species excluded. Cross-validation performed using WoRMS, OBIS, and FishBase."
occurrence %<>% mutate(identificationRemarks)

taxonID

occurrence <- occurrence %>%
  mutate(
    taxonID = CleanedData.OTU_number
  ) %>% 
  select(-CleanedData.OTU_number)

Darwin Core eDNA Mapping

dna_derived_data <- edna_input_data

DNA_sequence

dna_derived_data <- dna_derived_data %>%
  left_join(input_sequence_data, by = c("OTU_number" = "taxonID")) %>%
  rename(DNA_sequence = Sequence) %>% 
  select(-OTU_number)

target_gene

target_gene <- "12S rRNA"
dna_derived_data %<>% mutate(target_gene)

pcr_primer_forward

pcr_primer_forward <- "GTCGGTAAAACTCGTGCC"
dna_derived_data %<>% mutate(pcr_primer_forward)

pcr_primer_reverse

pcr_primer_reverse <- "CATAGTGGGGTATCTAATCCCAGTTTG"
dna_derived_data %<>% mutate(pcr_primer_reverse)

pcr_primer_name_forward

pcr_primer_name_forward <- "MiFish-U-F"
dna_derived_data %<>% mutate(pcr_primer_name_forward)

pcr_primer_name_reverse

pcr_primer_name_reverse <- "MiFish-U-R"
dna_derived_data %<>% mutate(pcr_primer_name_reverse)

pcr_primer_reference

pcr_primer_reference <- "https://doi.org/10.1098/rsos.150088"
dna_derived_data %<>% mutate(pcr_primer_reference)

env_broad_scale

env_broad_scale <- "marine biome [ENVO:00000447]"
dna_derived_data %<>% mutate(env_broad_scale)

env_local_scale

env_local_scale <- "coastal water [ENVO:00001250]"
dna_derived_data %<>% mutate(env_local_scale)

env_medium

env_medium <- "ocean water [ENVO:00002149]"
dna_derived_data %<>% mutate(env_medium)

lib_layout

lib_layout <- "Paired"
dna_derived_data %<>% mutate(lib_layout)

seq_meth

seq_meth <- "lllumina MiSeq"
dna_derived_data %<>% mutate(seq_meth)

otu_class_appr

otu_class_appr <- "DADA2; QIIME 2 v2022.2; ASV"
dna_derived_data %<>% mutate(otu_class_appr)

otu_seq_comp_appr

otu_seq_comp_appr <- "BLAST; QIIME 2 v2022.2; similarity thresholds: 97% (species), 90% (genus)"
dna_derived_data %<>% mutate(otu_seq_comp_appr)

otu_db

otu_db <- "NCBI (via Entrez Direct); MitoFish"
dna_derived_data %<>% mutate(otu_db)

samp_size

samp_size <- "2 liter"
dna_derived_data %<>% mutate(samp_size) %>% 
  select(-`Amount of water`)

source_mat_id

dna_derived_data <- dna_derived_data %>%
  mutate(
    source_mat_id = Sample_name
  ) %>% 
  select(-Sample_name)

Save outputs

dwc_output_dir <- "output"

write.csv(occurrence, paste(dwc_output_dir, "/occurrence.csv", sep = ""), na = "", row.names=FALSE)
write.csv(dna_derived_data, paste(dwc_output_dir, "/dna_derived_data.csv", sep = ""), na = "", row.names=FALSE)