Bundling Simões et al 2024 to a DwC Archive
This is an R Markdown Notebook for converting the eDNA data found in the following reference to Darwin Core format for upload into OBIS:
Setup
Call the necessary libraries and variables. Suppresses loading messages.
## Warning: package 'readxl' was built under R version 4.3.3
suppressMessages(library(dplyr)) # To clean input data
library(stringr) # To clean input data
suppressMessages(library(taxize)) # To get WoRMS IDs
## Warning: package 'taxize' was built under R version 4.3.3
## Warning: package 'worrms' was built under R version 4.3.3
Read source data
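Read in the source tables: two Excel workbooks (the occurrence table and the per-sample read counts) and one semicolon-separated CSV file (the sequences).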
# Locations of the three source files
input_file <- "input/OBIS-UNESCO_final.xlsx"
input_sequences <- "input/occurrence_seqeunces.csv"
input_reads <- "input/Number of reads-data processing.xlsx"

# Occurrence table (Excel), coerced to a plain data frame
input_data <- read_excel(input_file) %>%
  as.data.frame()

# Sequence table (semicolon-separated CSV), reduced to the unique
# taxonID / Sequence combinations
input_sequence_data <- read.csv(input_sequences, sep = ";") %>%
  distinct(taxonID, Sequence)

# Read-count table (Excel), coerced to a plain data frame
input_read_data <- read_excel(input_reads) %>%
  as.data.frame()
## New names:
## • `` -> `...1`
# Keep only the sample name (imported as the unnamed first column `...1`) and
# the filtered read count, renaming both in the same step.
input_read_data <- select(
  input_read_data,
  CleanedData.Sample_name = ...1,
  Filtered_reads = filtered
)
# Preview the first rows of the source table as a formatted table
input_data %>%
  head() %>%
  knitr::kable()
Sample_name | Amount of water | DNA_source | Collection method | Date | Month | Season | Year | Bathymetry | Geographical location N | Geographical location W | nº reads | OTU_number | kingdom | phylum | class | order | family | genus | species | associatedSequences |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
B00 | 2 L | eDNA | Research Team | 01.2020 | Jan | Winter | 2020 | 10-20 m | 39°25’31.3’’N | 009°30’54.1’’W | 189 | OTU212 | Animalia | Chordata | Actinopteri | Clupeiformes | Clupeidae | Sardina | Sardina pilchardus | NCBI - PRJNA1110393 |
B00 | 2 L | eDNA | Research Team | 01.2020 | Jan | Winter | 2020 | 10-20 m | 39°25’31.3’’N | 009°30’54.1’’W | 269 | OTU213 | Animalia | Chordata | Actinopteri | NA | Moronidae | Dicentrarchus | Dicentrarchus labrax | NCBI - PRJNA1110393 |
B00 | 2 L | eDNA | Research Team | 01.2020 | Jan | Winter | 2020 | 10-20 m | 39°25’31.3’’N | 009°30’54.1’’W | 172 | OTU243 | Animalia | Chordata | Actinopteri | Clupeiformes | Clupeidae | Sardina | Sardina pilchardus | NCBI - PRJNA1110393 |
B00 | 2 L | eDNA | Research Team | 01.2020 | Jan | Winter | 2020 | 10-20 m | 39°25’31.3’’N | 009°30’54.1’’W | 1832 | OTU257 | Animalia | Chordata | Actinopteri | Gadiformes | Gadidae | Gadus | Gadus morhua | NCBI - PRJNA1110393 |
B00 | 2 L | eDNA | Research Team | 01.2020 | Jan | Winter | 2020 | 10-20 m | 39°25’31.3’’N | 009°30’54.1’’W | 294 | OTU262 | Animalia | Chordata | Actinopteri | Clupeiformes | Clupeidae | Sardina | Sardina pilchardus | NCBI - PRJNA1110393 |
B00 | 2 L | eDNA | Research Team | 01.2020 | Jan | Winter | 2020 | 10-20 m | 39°25’31.3’’N | 009°30’54.1’’W | 149 | OTU296 | Animalia | Chordata | Actinopteri | Spariformes | Sparidae | Diplodus | Diplodus puntazzo | NCBI - PRJNA1110393 |
Split source data into occurrence fields and DNA fields
add occurrenceID
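OccurrenceID is an identifier for the occurrence record and should be persistent and globally unique. The intended form is a combination of dataset-shortname:occurrence: and a hash. In the code below the hash is an MD5 digest of the OTU number, the latitude, the longitude and the date (not of the scientific name), and no dataset prefix is prepended to it.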
# digest() hashes its whole argument as one object, so a vector would yield a
# single hash. Vectorize() wraps it to return one hash per element instead.
vdigest <- Vectorize(digest)

# Generate occurrenceID: the MD5 digest of OTU number, latitude, longitude and
# date, pasted together with single spaces. unname() strips the names that
# Vectorize() attaches to its result.
# NOTE(review): no "dataset-shortname:occurrence:" prefix is added here, so the
# ID is the bare hash -- confirm whether a prefix was intended.
input_data <- input_data %>%
  mutate(
    occurrenceID = unname(
      vdigest(
        paste(
          OTU_number,
          `Geographical location N`,
          `Geographical location W`,
          Date
        ),
        algo = "md5"
      )
    )
  )
# Split the source table into occurrence fields and eDNA fields.

# Give column 12 (the read-count column) the plain name "n reads"
input_data <- rename_with(input_data, ~"n reads", .cols = 12)

# Occurrence fields: identifiers, sampling context, read count and taxonomy
occurrence_input_data <- distinct(
  select(
    input_data,
    occurrenceID,
    Sample_name,
    `Collection method`,
    Date, Month, Season, Year,
    Bathymetry,
    `Geographical location N`,
    `Geographical location W`,
    `n reads`,
    OTU_number,
    kingdom, phylum, class, order, family, genus, species
  )
)

# eDNA fields: identifiers plus the volume of water sampled
edna_input_data <- distinct(
  select(
    input_data,
    occurrenceID,
    Sample_name,
    OTU_number,
    `Amount of water`
  )
)
Get WoRMS IDs
Auto matching
First we will try to do this automatically: the species names are cleaned with gnparser, and the taxize package is then used to query the WoRMS database.
# Run the species strings through gnparser so that author names are parsed out
parsed_names <- rgnparser::gn_parse(occurrence_input_data[["species"]])
# Look up the WoRMS ID for one parsed name.
#
# `element` is one entry of the gnparser result; its `canonical$full` field is
# used as the search term. Accepted names are searched first; if nothing is
# found the search is repeated without the accepted-only restriction. If that
# also finds nothing, NA is returned.
get_worms_id_from_element <- function(element) {
  accepted_hit <- get_wormsid(
    element$canonical$full,
    searchtype = "scientific",
    fuzzy = TRUE,
    messages = FALSE,
    accepted = TRUE
  )
  if (attr(accepted_hit, "match") != "not found") {
    return(accepted_hit)
  }

  # Second attempt: same query, not restricted to accepted names
  any_hit <- get_wormsid(
    element$canonical$full,
    searchtype = "scientific",
    messages = FALSE,
    fuzzy = TRUE
  )
  if (attr(any_hit, "match") == "not found") {
    return(NA)
  }
  any_hit
}
# Look up a WoRMS ID for every parsed name; names that gnparser could not
# parse get NA without querying WoRMS.
worms_ids <- lapply(parsed_names, function(parsed_name) {
  if (!parsed_name$parsed) {
    return(NA)
  }
  get_worms_id_from_element(parsed_name)
})
# Combine the original rows, the parsed canonical names and the WoRMS IDs into
# one data frame. The columns are built in one pass rather than appending one
# row at a time, which also keeps the step valid for a zero-row table.

# Canonical name for each parsed record; NA where the parser returned none
canonical_full <- vapply(
  parsed_names,
  function(parsed_name) {
    canonical_value <- parsed_name$canonical$full
    if (is.null(canonical_value)) {
      NA_character_
    } else {
      as.character(canonical_value)
    }
  },
  character(1),
  USE.NAMES = FALSE
)

# First WoRMS ID for each record as plain text; NA where the lookup gave NA
worms_id_values <- vapply(
  worms_ids,
  function(worms_id) as.character(worms_id[1]),
  character(1),
  USE.NAMES = FALSE
)

# Passing the data frame under the name `CleanedData` prefixes each of its
# columns with "CleanedData." in the result.
combined_dataframe <- data.frame(
  CleanedData = occurrence_input_data,
  CanonicalFull = canonical_full,
  WormsIDs = worms_id_values
)
knitr::kable(head(combined_dataframe))
CleanedData.occurrenceID | CleanedData.Sample_name | CleanedData.Collection.method | CleanedData.Date | CleanedData.Month | CleanedData.Season | CleanedData.Year | CleanedData.Bathymetry | CleanedData.Geographical.location..N | CleanedData.Geographical.location.W | CleanedData.n.reads | CleanedData.OTU_number | CleanedData.kingdom | CleanedData.phylum | CleanedData.class | CleanedData.order | CleanedData.family | CleanedData.genus | CleanedData.species | CanonicalFull | WormsIDs |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
ac09967beb89ae73a6f52e7ba48355e7 | B00 | Research Team | 01.2020 | Jan | Winter | 2020 | 10-20 m | 39°25’31.3’’N | 009°30’54.1’’W | 189 | OTU212 | Animalia | Chordata | Actinopteri | Clupeiformes | Clupeidae | Sardina | Sardina pilchardus | Sardina pilchardus | 126421 |
9133f272bc630842f5dd503bed24b679 | B00 | Research Team | 01.2020 | Jan | Winter | 2020 | 10-20 m | 39°25’31.3’’N | 009°30’54.1’’W | 269 | OTU213 | Animalia | Chordata | Actinopteri | NA | Moronidae | Dicentrarchus | Dicentrarchus labrax | Dicentrarchus labrax | 126975 |
8d62db8bda4b8bce476ecca23a5b4c43 | B00 | Research Team | 01.2020 | Jan | Winter | 2020 | 10-20 m | 39°25’31.3’’N | 009°30’54.1’’W | 172 | OTU243 | Animalia | Chordata | Actinopteri | Clupeiformes | Clupeidae | Sardina | Sardina pilchardus | Sardina pilchardus | 126421 |
8f002f4451111e3d0ded3bb775a08fe5 | B00 | Research Team | 01.2020 | Jan | Winter | 2020 | 10-20 m | 39°25’31.3’’N | 009°30’54.1’’W | 1832 | OTU257 | Animalia | Chordata | Actinopteri | Gadiformes | Gadidae | Gadus | Gadus morhua | Gadus morhua | 126436 |
c70a84d5b596fa9ff20de671bca82cbf | B00 | Research Team | 01.2020 | Jan | Winter | 2020 | 10-20 m | 39°25’31.3’’N | 009°30’54.1’’W | 294 | OTU262 | Animalia | Chordata | Actinopteri | Clupeiformes | Clupeidae | Sardina | Sardina pilchardus | Sardina pilchardus | 126421 |
e2d0d29f13c17cdeb5a9a7849c891d48 | B00 | Research Team | 01.2020 | Jan | Winter | 2020 | 10-20 m | 39°25’31.3’’N | 009°30’54.1’’W | 149 | OTU296 | Animalia | Chordata | Actinopteri | Spariformes | Sparidae | Diplodus | Diplodus puntazzo | Diplodus puntazzo | 127052 |
Darwin Core Occurrence Mapping
OBIS currently has eight required DwC terms: scientificName, scientificNameID, occurrenceID, eventDate, decimalLongitude, decimalLatitude, occurrenceStatus, basisOfRecord.
locality
Convert the latitude and longitude from degrees-minutes-seconds strings to decimal degrees.
# Convert degrees-minutes-seconds strings (e.g. 39°25’31.3’’N) to decimal
# degrees.
#
# dms:    character vector of coordinates: degrees, a degree sign, minutes, a
#         quote mark, seconds, up to two quote marks, then one of N, S, E, W.
# signed: if FALSE (the default) the unsigned magnitude is returned and the
#         caller applies the hemisphere sign; if TRUE, values for S and W are
#         returned as negative numbers.
#
# Returns a numeric vector the same length as `dms`; entries that do not match
# the expected pattern are NA.
dms_to_dd <- function(dms, signed = FALSE) {
  # The hemisphere letter is the fourth capture group. The quote marks after
  # the seconds are matched but not captured.
  parts <- str_match(
    dms,
    "([0-9]+)[°º]([0-9]+)['’]([0-9.]+)['’‘’\"]{0,2}([NSEW])"
  )
  degrees <- as.numeric(parts[, 2])
  minutes <- as.numeric(parts[, 3])
  seconds <- as.numeric(parts[, 4])
  direction <- parts[, 5]

  # Convert to decimal degrees
  decimal_degrees <- degrees + (minutes / 60) + (seconds / 3600)

  if (signed) {
    is_negative <- direction %in% c("S", "W")
    decimal_degrees[is_negative] <- -decimal_degrees[is_negative]
  }
  decimal_degrees
}
# Convert the latitude and longitude columns. dms_to_dd() already works on a
# whole vector, so it is called directly on each column. The longitudes are
# negated after conversion.
combined_dataframe$decimalLatitude <-
  dms_to_dd(combined_dataframe$CleanedData.Geographical.location..N)
combined_dataframe$decimalLongitude <-
  -dms_to_dd(combined_dataframe$CleanedData.Geographical.location.W)

# Add missing locality information (the same constant value for every record)
combined_dataframe$coordinateUncertaintyInMeters <- 50
combined_dataframe$country <- "Portugal"
combined_dataframe$locality <- "Berlengas Biosphere Reserve"
combined_dataframe$geodeticDatum <- "WGS84"

# The original coordinate strings are no longer needed
combined_dataframe <- combined_dataframe %>%
  select(
    -CleanedData.Geographical.location..N,
    -CleanedData.Geographical.location.W
  )
scientificName/scientificNameID
# Rename the name and ID columns to their Darwin Core terms, turn each WoRMS ID
# into an LSID string (missing IDs stay NA), and drop the source taxonomy
# columns.
# NOTE(review): `occurrence` is not created in the code visible here;
# presumably it is derived from `combined_dataframe` -- confirm.
occurrence <- occurrence %>%
  rename(
    scientificName = CanonicalFull,
    scientificNameID = WormsIDs
  ) %>%
  mutate(
    scientificNameID = ifelse(
      !is.na(scientificNameID),
      paste0("urn:lsid:marinespecies.org:taxname:", scientificNameID),
      NA
    )
  ) %>%
  select(
    -c(
      CleanedData.kingdom,
      CleanedData.phylum,
      CleanedData.class,
      CleanedData.order,
      CleanedData.family,
      CleanedData.genus,
      CleanedData.species
    )
  )
eventDate
# Build eventDate as "<year>-<two-digit month>" from the year and the month
# abbreviation, then drop the source date columns.
occurrence <- occurrence %>%
  mutate(
    # "Sept" is not one of R's month.abb values, so map it to "Sep" first
    CleanedData.Month = ifelse(
      CleanedData.Month == "Sept",
      "Sep",
      CleanedData.Month
    ),
    eventDate = paste0(
      CleanedData.Year,
      "-",
      sprintf("%02d", match(CleanedData.Month, month.abb))
    )
  ) %>%
  select(
    -c(
      CleanedData.Date,
      CleanedData.Month,
      CleanedData.Season,
      CleanedData.Year
    )
  )
depth
recordedBy
organismQuantity
organismQuantityType
sampleSizeValue
Total number of reads in the sample post processing.
materialSampleID
# Lookup table: sample name -> NCBI BioSample URL
biosample_urls <- c(
  B00 = "https://www.ncbi.nlm.nih.gov/biosample/SAMN41345430",
  B01 = "https://www.ncbi.nlm.nih.gov/biosample/SAMN41345431",
  B02 = "https://www.ncbi.nlm.nih.gov/biosample/SAMN41345432",
  B03 = "https://www.ncbi.nlm.nih.gov/biosample/SAMN41345433",
  B04 = "https://www.ncbi.nlm.nih.gov/biosample/SAMN41345434",
  B05 = "https://www.ncbi.nlm.nih.gov/biosample/SAMN41345435",
  B06 = "https://www.ncbi.nlm.nih.gov/biosample/SAMN41345436",
  B07 = "https://www.ncbi.nlm.nih.gov/biosample/SAMN41345437",
  B08 = "https://www.ncbi.nlm.nih.gov/biosample/SAMN41345438",
  B09 = "https://www.ncbi.nlm.nih.gov/biosample/SAMN41345439",
  B10 = "https://www.ncbi.nlm.nih.gov/biosample/SAMN41345440",
  B11 = "https://www.ncbi.nlm.nih.gov/biosample/SAMN41345441",
  GM01 = "https://www.ncbi.nlm.nih.gov/biosample/SAMN41345442",
  GM02 = "https://www.ncbi.nlm.nih.gov/biosample/SAMN41345443",
  GM03 = "https://www.ncbi.nlm.nih.gov/biosample/SAMN41345444",
  GM04 = "https://www.ncbi.nlm.nih.gov/biosample/SAMN41345445",
  GM05 = "https://www.ncbi.nlm.nih.gov/biosample/SAMN41345446",
  GM06 = "https://www.ncbi.nlm.nih.gov/biosample/SAMN41345447",
  GM07 = "https://www.ncbi.nlm.nih.gov/biosample/SAMN41345448",
  GM08 = "https://www.ncbi.nlm.nih.gov/biosample/SAMN41345449",
  GM09 = "https://www.ncbi.nlm.nih.gov/biosample/SAMN41345450",
  GM10 = "https://www.ncbi.nlm.nih.gov/biosample/SAMN41345451",
  GM11 = "https://www.ncbi.nlm.nih.gov/biosample/SAMN41345452",
  GM13 = "https://www.ncbi.nlm.nih.gov/biosample/SAMN41345453",
  GM14 = "https://www.ncbi.nlm.nih.gov/biosample/SAMN41345454",
  GM15 = "https://www.ncbi.nlm.nih.gov/biosample/SAMN41345455",
  GM16 = "https://www.ncbi.nlm.nih.gov/biosample/SAMN41345456",
  GM18 = "https://www.ncbi.nlm.nih.gov/biosample/SAMN41345457",
  GM19 = "https://www.ncbi.nlm.nih.gov/biosample/SAMN41345458",
  GM20 = "https://www.ncbi.nlm.nih.gov/biosample/SAMN41345459",
  GM21 = "https://www.ncbi.nlm.nih.gov/biosample/SAMN41345460",
  GM22 = "https://www.ncbi.nlm.nih.gov/biosample/SAMN41345461",
  RM01 = "https://www.ncbi.nlm.nih.gov/biosample/SAMN41345462",
  RM02 = "https://www.ncbi.nlm.nih.gov/biosample/SAMN41345463",
  RM04 = "https://www.ncbi.nlm.nih.gov/biosample/SAMN41345464",
  RM05 = "https://www.ncbi.nlm.nih.gov/biosample/SAMN41345465"
)

# Map every record's sample name to its BioSample URL (NA for any sample name
# not in the table), then drop the sample-name column. as.character() makes
# the lookup go by name rather than by position.
occurrence <- occurrence %>%
  mutate(
    materialSampleID = unname(
      biosample_urls[as.character(CleanedData.Sample_name)]
    )
  ) %>%
  select(-CleanedData.Sample_name)
associatedSequences
identificationRemarks
# One fixed remark describing the taxonomic assignment, added to every record
identificationRemarks <- "Taxonomic classification confidence (at lowest specified taxon): 0.97 at species level, 0.90 at genus level, using BLAST in QIIME 2 against NCBI and MitoFish reference databases. Unassigned ASVs and non-fish species excluded. Cross-validation performed using WoRMS, OBIS, and FishBase."
occurrence <- occurrence %>%
  mutate(identificationRemarks = identificationRemarks)