Translating Antonsen 2020 to Darwin Core
Introduction
This is an R Markdown Notebook for converting the data found in the following reference to Darwin Core format for upload into GBIF:
Introduction to project data
Setup
Call the necessary libraries and variables. Suppresses loading messages.
library(magrittr) # To use %<>% pipes
suppressMessages(library(dplyr)) # To clean input data
library(stringr) # To clean input data
library(digest) # To generate hashes## Warning: package 'digest' was built under R version 4.3.3
Read source data
Read in source csv tables
input_butterfly <- "inputs/butterfly_data.csv"
input_flower <- "inputs/flower_data.csv"
input_site_coords <- "inputs/site_coordinates.csv"
input_site_vegetation <- "inputs/vegetation_data.csv"
input_species_codes <- "inputs/species_codes.csv"
input_site_name_mappings <- "inputs/site_name_mappings.csv"
butterfly_data <- read.csv(input_butterfly)
flower_data <- read.csv(input_flower)
site_coords_data <- read.csv(input_site_coords)
site_vegetation_data <- read.csv(input_site_vegetation)
species_codes_data <- read.csv(input_species_codes)
site_name_mappings <- read.csv(input_site_name_mappings)
species_codes_data <- rbind(
species_codes_data,
data.frame(
Code = "grass",
Scientific.Name = "Poaceae",
Common.Name = "True grasses"
)
)
species_codes_data <- rbind(
species_codes_data,
data.frame(
Code = "Grass",
Scientific.Name = "Poaceae",
Common.Name = "True grasses"
)
)
species_codes_data <- rbind(
species_codes_data,
data.frame(
Code = "GRASS",
Scientific.Name = "Poaceae",
Common.Name = "True grasses"
)
)
# preview pretty table
#knitr::kable(head(butterfly_data))Clean data
#standardize site names using mappings
butterfly_data$Original.Site <- butterfly_data$Site
butterfly_data$Site <- site_name_mappings$Clean.Site.Abbr[
match(butterfly_data$Site, site_name_mappings$Dirty.Site)
]
flower_data$Original.Site <- flower_data$Site
flower_data$Site <- site_name_mappings$Clean.Site.Abbr[
match(flower_data$Site, site_name_mappings$Dirty.Site)
]
site_coords_data$Original.Site <- site_coords_data$Site.Name
site_coords_data$Site.Name <- site_name_mappings$Clean.Site.Abbr[
match(site_coords_data$Site.Name, site_name_mappings$Dirty.Site)
]
site_vegetation_data$Original.Site <- site_vegetation_data$Site
site_vegetation_data$Site <- site_name_mappings$Clean.Site.Abbr[
match(site_vegetation_data$Site, site_name_mappings$Dirty.Site)
]
# Does BEN White Horse Hill NWR = BEN Sully's Lake NWR = BEN Sully's Hill NWR?
#clean specific cells
flower_data$Date[
flower_data$Site == "CAS Alice WPA 1" &
flower_data$Code %in% c("FLTDS-A0176", "FLTDS-A0177", "FLTDS-A0178", "FLTDS-A0179", "FLTDS-A0180", "FLTDS-A0181", "FLTDS-A0182")
] <- "8/22/17"
flower_data$Code[
flower_data$Site == "CAS Alice WPA 1" &
flower_data$Code %in% c("FLTDS-A0176", "FLTDS-A0177", "FLTDS-A0178", "FLTDS-A0179", "FLTDS-A0180", "FLTDS-A0181", "FLTDS-A0182")
] <- "FLTDS-A0175"
flower_data$Code[
flower_data$Code %in% c("FT-2B0837")
] <- "FT2-B0837"
butterfly_data$Code[
butterfly_data$Code %in% c("Lt1-B0712")
] <- "LT1-B0712"
butterfly_data$Code[
butterfly_data$Site == "DUN Lake Ilo NWR" &
butterfly_data$Code %in% c("LT1-A0910")
] <- "LT1-A0917"
butterfly_data$Code[
butterfly_data$Site == "DUN Zahn" &
butterfly_data$Code %in% c("FT2-A0910")
] <- "FT2-A0914"
#set count = 0 where empty
flower_data$Count[flower_data$Count == ""] <- 0
#remove commas from numbers
flower_data$Count <- gsub(",", "", flower_data$Count)
#remove rows where there is a species but count is 0, count is NOT ENTERED and species is NOT ENTERED
flower_data <- flower_data %>%
dplyr::filter(!(Flower.Species != "none" & Count == 0)) %>%
dplyr::filter(Count != "NOT ENTERED") %>%
dplyr::filter(Flower.Species != "NOT ENTERED")
#remove rows where species codes are NA or empty
butterfly_data <- butterfly_data %>%
dplyr::filter(!is.na(Butterfly.Code) & Butterfly.Code != "")
#group data counts the same way across rows
butterfly_data <- butterfly_data %>%
dplyr::group_by(Year, Code, Observer, County, Site, Date, Time, Temperature, Wind.Speed, Humidity, Cloud.Cover, Butterfly.Code, Behavior, Plant.Species, Original.Site) %>%
dplyr::summarise(Count = sum(Count, na.rm = TRUE), .groups = "drop")
flower_data <- flower_data %>%
dplyr::group_by(Year, Code, Observer, County, Site, Date, Flower.Species, X) %>%
dplyr::summarise(Count = as.integer(sum(as.numeric(Count), na.rm = TRUE)), .groups = "drop")
#add unique occurrenceID for each observation
vdigest <- Vectorize(digest)
butterfly_data %<>% mutate(occurrenceID = paste(vdigest (paste(Code, Site, Date, Observer, Behavior, Butterfly.Code, Plant.Species), algo="md5"), sep=":"))
flower_data %<>% mutate(occurrenceID = paste(vdigest (paste(Code, Site, Date, Observer, Flower.Species, X), algo="md5"), sep=":"))
#remove rows with duplicate occurrenceIDs - mostly incorrect Codes or Times
butterfly_data <- butterfly_data %>%
dplyr::distinct(occurrenceID, .keep_all = TRUE)Map Data
Map Event Core & Humboldt Data
Project Event
The project event describes the overall project. It defines the scope of the survey (taxonomic, geographic, temporal), the general protocol, and whether absence, abundance, and material samples are reported. The data for the Event Core and Humboldt extension are split into two separate files.
Mapping Event core fields for the project event level:
NDbutterflies_project_event_core <- data.frame(eventID = "NDbutterflies_flowers",
parentEventID = NA,
fieldNumber = NA,
habitat = NA,
locationID = NA,
countryCode = "US",
decimalLatitude = 47.4501,
decimalLongitude = -100.4659,
coordinateUncertaintyInMeters = 335000,
geodeticDatum = "WGS84",
country = "United States",
stateProvince = "North Dakota",
county = NA,
locality = NA,
sampleSizeValue = NA,
sampleSizeUnit = NA,
footprintWKT = NA,
footprintSRS = NA,
eventDate = paste(
format(min(as.Date(butterfly_data$Date, format = "%m/%d/%y"), na.rm = TRUE), "%Y-%m-%d"),
format(max(as.Date(butterfly_data$Date, format = "%m/%d/%y"), na.rm = TRUE), "%Y-%m-%d"),
sep = "/"
),
eventTime = NA,
eventType = "project",
samplingEffort = "Two surveys annually at three sites per county across 53 counties",
samplingProtocol = "Butterfly fixed transect survey | Butterfly visual encounter survey | Floral fixed transect survey | Floral visual encounter survey",
dataGeneralizations = NA,
informationWithheld = NA,
fieldNotes = NA,
eventRemarks = NA,
identifiedBy = NA
)Mapping Humboldt extension fields for the project event level:
NDbutterflies_project_event_Humboldt <- data.frame(eventID = "NDbutterflies_flowers",
siteNestingDescription = "Three grassland survey sites within each of North Dakota's 53 counties were surveyed twice annually for butterflies and flowering plants via transects and encounter surveys; Individual site locations could vary among years.",
siteCount = length(unique(site_coords_data$Site.Name)),
verbatimSiteNames = paste(sort(unique(site_coords_data$Site.Name)), collapse = " | "),
verbatimSiteDescriptions = NA,
reportedWeather = NA,
reportedExtremeConditions = NA,
totalAreaSampledValue = NA,
totalAreaSampledUnit = NA,
geospatialScopeAreaValue = 178694, #full geographic extent of ND
geospatialScopeAreaUnit = "km2",
isVegetationCoverReported = NA,
eventDurationValue = NA,
eventDurationUnit = NA,
inventoryTypes = NA,
compilationTypes = NA,
compilationSourceTypes = NA,
protocolNames = "Butterfly fixed transect survey | Butterfly visual encounter survey | Floral fixed transect survey | Floral visual encounter survey",
protocolDescriptions = "Butterflies were surveyed at grassland sites across North Dakota using fixed transect and visual encounter surveys conducted by two observers under suitable weather conditions. In 2017–2018, surveys included both fixed transects and roaming visual encounter surveys. In 2019–2020, protocols switched to entirely fixed transect surveys. Floral resources were surveyed using corresponding floral transect and visual encounter survey methods, recording flowering ramets and nectar resources within survey areas.",
protocolReferences = "",
isAbsenceReported = "true",
absentTaxa = NA,
isAbundanceReported = "true",
isAbundanceCapReported = "false",
abundanceCap = NA,
hasMaterialSamples = "false",
materialSampleTypes = NA,
hasVouchers = "false",
voucherInstitutions = NA,
isLeastSpecificTargetCategoryQuantityInclusive = "true",
verbatimTargetScope = "butterflies | flowering plants",
targetTaxonomicScope = "Lepidoptera | Magnoliophyta",
excludedTaxonomicScope = NA,
isTaxonomicScopeFullyReported = "true",
taxonCompletenessReported = "notReported",
taxonCompletenessProtocols = NA,
hasNonTargetTaxa = "true",
areNonTargetTaxaFullyReported = "false",
nonTargetTaxa = "non-flowering plants",
targetLifeStageScope = "adult",
excludedLifeStageScope = "egg | larva | pupa",
isLifeStageScopeFullyReported = "true",
targetDegreeOfEstablishmentScope = NA,
excludedDegreeOfEstablishmentScope = NA,
isDegreeOfEstablishmentScopeFullyReported = NA,
targetGrowthFormScope = "flowering individuals",
excludedGrowthFormScope = NA,
isGrowthFormScopeFullyReported = NA,
hasNonTargetOrganisms = NA,
targetHabitatScope = NA,
excludedHabitatScope = NA,
isSamplingEffortReported = "true",
samplingEffortProtocol = "Two observers separately and simultaneously conducted butterfly and floral surveys at each site. In 2017–2018, each observer completed one fixed transect survey (100 m, 10 minutes) and one visual encounter survey (4 ha, 30 minutes). In 2019–2020, surveys switched entirely to fixed transect methods, with each observer completing two fixed transect surveys (100 m, 10 minutes each).",
samplingEffortValue = "40",
samplingEffortUnit = "minutes per site visit per observer",
samplingPerformedBy = "Field staff"
)County Events
53 counties
Setup:
Mapping Event core fields for the county event level:
added_counties_data_core <- data.frame(eventID = county_list,
parentEventID = "NDbutterflies_flowers",
fieldNumber = NA,
habitat = NA,
locationID = NA,
countryCode = "US",
decimalLatitude = NA, #need
decimalLongitude = NA, #need
coordinateUncertaintyInMeters = NA, #need
geodeticDatum = "WGS84",
country = "United States",
stateProvince = "North Dakota",
county = county_list,
locality = paste0(county_list, " County"),
sampleSizeValue = NA,
sampleSizeUnit = NA,
footprintWKT = NA,
footprintSRS = NA,
eventDate = tapply(
as.Date(butterfly_data$Date, format = "%m/%d/%y"),
butterfly_data$County,
function(x) paste(
format(min(x, na.rm = TRUE), "%Y-%m-%d"),
format(max(x, na.rm = TRUE), "%Y-%m-%d"),
sep = "/"
)
)[county_list],
eventTime = NA,
eventType = "county",
samplingEffort = "Two surveys annually at three sites",
samplingProtocol = "Butterfly fixed transect survey | Butterfly visual encounter survey | Floral fixed transect survey | Floral visual encounter survey",
dataGeneralizations = NA,
informationWithheld = NA,
fieldNotes = NA,
eventRemarks = NA,
identifiedBy = NA
)Mapping Humboldt extension fields for the county event level:
added_county_data_humboldt <- data.frame(eventID = county_list,
siteNestingDescription = "Three grassland survey sites within each of North Dakota's 53 counties were surveyed twice annually for butterflies and flowering plants via transects and encounter surveys; Individual site locations could vary among years.",
siteCount = tapply(
butterfly_data$Site,
butterfly_data$County,
function(x) length(unique(x))
)[county_list],
verbatimSiteNames = tapply(
butterfly_data$Site,
butterfly_data$County,
function(x) paste(sort(unique(x)), collapse = " | ")
)[county_list],
verbatimSiteDescriptions = NA,
reportedWeather = NA,
reportedExtremeConditions = NA,
geospatialScopeAreaValue = NA, #need
geospatialScopeAreaUnit = NA, #area of county
totalAreaSampledValue = as.character(
tapply(
butterfly_data$Site,
butterfly_data$County,
function(x) length(unique(x)) * 4
)[county_list]
),
totalAreaSampledUnit = "hectares",
isVegetationCoverReported = NA,
eventDurationValue = NA,
eventDurationUnit = NA,
inventoryTypes = NA,
compilationTypes = NA,
compilationSourceTypes = NA,
protocolNames = "Butterfly fixed transect survey | Butterfly visual encounter survey | Floral fixed transect survey | Floral visual encounter survey",
protocolDescriptions = "Butterflies were surveyed at grassland sites across North Dakota using fixed transect and visual encounter surveys conducted by two observers under suitable weather conditions. In 2017–2018, surveys included both fixed transects and roaming visual encounter surveys. In 2019–2020, protocols switched to entirely fixed transect surveys. Floral resources were surveyed using corresponding floral transect and visual encounter survey methods, recording flowering ramets and nectar resources within survey areas.",
protocolReferences = "",
isAbsenceReported = "true",
absentTaxa = NA,
isAbundanceReported = "true",
isAbundanceCapReported = "false",
abundanceCap = NA,
hasMaterialSamples = "false",
materialSampleTypes = NA,
hasVouchers = "false",
voucherInstitutions = NA,
isLeastSpecificTargetCategoryQuantityInclusive = "true",
verbatimTargetScope = "butterflies | flowering plants",
targetTaxonomicScope = "Lepidoptera | Magnoliophyta",
excludedTaxonomicScope = NA,
isTaxonomicScopeFullyReported = "true",
taxonCompletenessReported = "notReported",
taxonCompletenessProtocols = NA,
hasNonTargetTaxa = "true",
areNonTargetTaxaFullyReported = "false",
nonTargetTaxa = "non-flowering plants",
targetLifeStageScope = "adult",
excludedLifeStageScope = "egg | larva | pupa",
isLifeStageScopeFullyReported = "true",
targetDegreeOfEstablishmentScope = NA,
excludedDegreeOfEstablishmentScope = NA,
isDegreeOfEstablishmentScopeFullyReported = NA,
targetGrowthFormScope = "flowering individuals",
excludedGrowthFormScope = NA,
isGrowthFormScopeFullyReported = NA,
hasNonTargetOrganisms = NA,
targetHabitatScope = NA,
excludedHabitatScope = NA,
isSamplingEffortReported = "true",
samplingEffortProtocol = "Two observers separately and simultaneously conducted butterfly and floral surveys at each site. In 2017–2018, each observer completed one fixed transect survey (100 m, 10 minutes) and one visual encounter survey (4 ha, 30 minutes). In 2019–2020, surveys switched entirely to fixed transect methods, with each observer completing two fixed transect surveys (100 m, 10 minutes each).",
samplingEffortValue = "40",
samplingEffortUnit = "minutes per site visit per observer",
samplingPerformedBy = "Field staff"
)Site Events
3 sites per county per year - can be different sites
Setup:
Mapping Event core fields for the site event level:
added_site_data_core <- data.frame(eventID = site_list,
parentEventID = butterfly_data$County[match(site_list, butterfly_data$Site)],
fieldNumber = site_list,
habitat = "grassland",
locationID = site_list,
countryCode = "US",
decimalLatitude = tapply(site_coords_data$Y, site_coords_data$Site.Name, mean, na.rm = TRUE)[site_list],
decimalLongitude = tapply(site_coords_data$X, site_coords_data$Site.Name, mean, na.rm = TRUE)[site_list],
coordinateUncertaintyInMeters = 500,
geodeticDatum = "WGS84",
country = "United States",
stateProvince = "North Dakota",
county = butterfly_data$County[match(site_list, butterfly_data$Site)],
locality = site_name_mappings$Clean.Site[
match(site_list, site_name_mappings$Clean.Site.Abbr)
],
sampleSizeValue = NA,
sampleSizeUnit = NA,
footprintWKT = NA,
footprintSRS = NA,
eventDate = tapply(
as.Date(butterfly_data$Date, format = "%m/%d/%y"),
butterfly_data$Site,
function(x) paste(
format(min(x, na.rm = TRUE), "%Y-%m-%d"),
format(max(x, na.rm = TRUE), "%Y-%m-%d"),
sep = "/"
)
)[site_list],
eventTime = NA,
eventType = "site",
samplingEffort = "Two surveys annually at each site",
samplingProtocol = "Butterfly fixed transect survey | Butterfly visual encounter survey | Floral fixed transect survey | Floral visual encounter survey",
dataGeneralizations = NA,
informationWithheld = NA,
fieldNotes = NA,
eventRemarks = NA,
identifiedBy = NA
)Mapping Humboldt extension fields for the site event level:
added_site_data_humboldt <- data.frame(eventID = site_list,
siteNestingDescription = "Three grassland survey sites within each of North Dakota's 53 counties were surveyed twice annually for butterflies and flowering plants via transects and encounter surveys; Individual site locations could vary among years.",
siteCount = 1,
verbatimSiteNames = site_list,
verbatimSiteDescriptions = NA,
reportedWeather = NA,
reportedExtremeConditions = NA,
geospatialScopeAreaValue = 4,
geospatialScopeAreaUnit = "hectares",
totalAreaSampledValue = "100 | 4",
totalAreaSampledUnit = "m transect | ha survey area",
isVegetationCoverReported = NA,
eventDurationValue = NA,
eventDurationUnit = NA,
inventoryTypes = NA,
compilationTypes = NA,
compilationSourceTypes = NA,
protocolNames = "Butterfly fixed transect survey | Butterfly visual encounter survey | Floral fixed transect survey | Floral visual encounter survey",
protocolDescriptions = "Butterflies were surveyed at grassland sites across North Dakota using fixed transect and visual encounter surveys conducted by two observers under suitable weather conditions. In 2017–2018, surveys included both fixed transects and roaming visual encounter surveys. In 2019–2020, protocols switched to entirely fixed transect surveys. Floral resources were surveyed using corresponding floral transect and visual encounter survey methods, recording flowering ramets and nectar resources within survey areas.",
protocolReferences = "", #need
isAbsenceReported = "true",
absentTaxa = NA,
isAbundanceReported = "true",
isAbundanceCapReported = "false",
abundanceCap = NA,
hasMaterialSamples = "false",
materialSampleTypes = NA,
hasVouchers = "false",
voucherInstitutions = NA,
isLeastSpecificTargetCategoryQuantityInclusive = "true",
verbatimTargetScope = "butterflies | flowering plants",
targetTaxonomicScope = "Lepidoptera | Magnoliophyta",
excludedTaxonomicScope = NA,
isTaxonomicScopeFullyReported = "true",
taxonCompletenessReported = "notReported",
taxonCompletenessProtocols = NA,
hasNonTargetTaxa = "true",
areNonTargetTaxaFullyReported = "false",
nonTargetTaxa = "non-flowering plants",
targetLifeStageScope = "adult",
excludedLifeStageScope = "egg | larva | pupa",
isLifeStageScopeFullyReported = "true",
targetDegreeOfEstablishmentScope = NA,
excludedDegreeOfEstablishmentScope = NA,
isDegreeOfEstablishmentScopeFullyReported = NA,
targetGrowthFormScope = "flowering individuals",
excludedGrowthFormScope = NA,
isGrowthFormScopeFullyReported = NA,
hasNonTargetOrganisms = NA,
targetHabitatScope = NA,
excludedHabitatScope = NA,
isSamplingEffortReported = "true",
samplingEffortProtocol = "Two observers separately and simultaneously conducted butterfly and floral surveys at each site. In 2017–2018, each observer completed one fixed transect survey (100 m, 10 minutes) and one visual encounter survey (4 ha, 30 minutes). In 2019–2020, surveys switched entirely to fixed transect methods, with each observer completing two fixed transect surveys (100 m, 10 minutes each).",
samplingEffortValue = "40",
samplingEffortUnit = "minutes per site visit per observer",
samplingPerformedBy = "Field staff"
)Site Visit Events
Each site visit - 2 people; twice a year for each site
Setup:
site_visit_list <- dplyr::bind_rows(
butterfly_data %>%
dplyr::group_by(Site, Date, Year) %>%
dplyr::summarise(
Observers = paste(sort(unique(Observer)), collapse = " | "),
Code = first(Code),
.groups = "drop"
),
flower_data %>%
dplyr::group_by(Site, Date, Year) %>%
dplyr::summarise(
Observers = paste(sort(unique(Observer)), collapse = " | "),
Code = first(Code),
.groups = "drop"
)
) %>%
dplyr::group_by(Site, Date, Year) %>%
dplyr::summarise(
Observers = paste(sort(unique(unlist(strsplit(Observers, " \\| ")))), collapse = " | "),
Code = first(Code),
.groups = "drop"
) %>%
dplyr::mutate(eventID = sub("^[A-Z0-9]+-[AB]", "", Code)) %>%
dplyr::select(-Code)
site_visit_list <- site_visit_list %>%
dplyr::group_by(Site, eventID) %>%
dplyr::summarise(
Year = paste(sort(unique(Year)), collapse = " | "),
Date = if(length(unique(Date)) > 1) {
paste(
format(min(as.Date(Date, format = "%m/%d/%y")), "%Y-%m-%d"),
format(max(as.Date(Date, format = "%m/%d/%y")), "%Y-%m-%d"),
sep = "/"
)
} else {
format(as.Date(first(Date), format = "%m/%d/%y"), "%Y-%m-%d")
},
Observers = paste(sort(unique(unlist(strsplit(paste(Observers, collapse = " | "), " \\| ")))), collapse = " | "),
.groups = "drop"
)
site_visit_list <- site_visit_list %>%
dplyr::arrange(eventID)Mapping Event core fields for the site visit event level:
added_site_visit_data_core <- data.frame(eventID = site_visit_list$eventID,
parentEventID = site_visit_list$Site,
fieldNumber = site_visit_list$Site,
habitat = "grassland",
locationID = site_visit_list$Site,
countryCode = "US",
decimalLatitude = site_coords_data$Y[match(paste(site_visit_list$Site, site_visit_list$Year), paste(site_coords_data$Site.Name, site_coords_data$Year))],
decimalLongitude = site_coords_data$X[match(paste(site_visit_list$Site, site_visit_list$Year), paste(site_coords_data$Site.Name, site_coords_data$Year))],
coordinateUncertaintyInMeters = 200,
geodeticDatum = "WGS84",
country = "United States",
stateProvince = "North Dakota",
county = butterfly_data$County[match(site_visit_list$Site, butterfly_data$Site)],
locality = site_name_mappings$Clean.Site[match(site_visit_list$Site, site_name_mappings$Clean.Site.Abbr)],
sampleSizeValue = NA,
sampleSizeUnit = NA,
footprintWKT = NA,
footprintSRS = NA,
eventDate = site_visit_list$Date,
eventTime = NA,
eventType = "site visit",
samplingProtocol = dplyr::case_when(
site_visit_list$Year %in% c(2017, 2018) ~ "Fixed transect butterfly survey | Visual encounter survey | Floral fixed transect survey | Floral visual encounter survey",
site_visit_list$Year %in% c(2019, 2020) ~ "Fixed transect butterfly survey | Floral fixed transect survey",
TRUE ~ NA
),
dataGeneralizations = NA,
informationWithheld = NA,
fieldNotes = NA,
eventRemarks = NA,
identifiedBy = site_visit_list$Observers
)Mapping Humboldt extension fields for the site visit event level:
added_site_visit_data_humboldt <- data.frame(eventID = site_visit_list$eventID,
siteNestingDescription = "Three grassland survey sites within each of North Dakota's 53 counties were surveyed twice annually for butterflies and flowering plants via transects and encounter surveys; Individual site locations could vary among years.",
siteCount = 1,
verbatimSiteNames = site_visit_list$Site,
verbatimSiteDescriptions = NA,
reportedWeather = NA,
reportedExtremeConditions = NA,
totalAreaSampledValue = dplyr::case_when(
site_visit_list$Year %in% c(2017, 2018) ~ "100 | 4",
site_visit_list$Year %in% c(2019, 2020) ~ "100",
TRUE ~ NA_character_
),
totalAreaSampledUnit = dplyr::case_when(
site_visit_list$Year %in% c(2017, 2018) ~ "m transect | ha survey area",
site_visit_list$Year %in% c(2019, 2020) ~ "m transect",
TRUE ~ NA_character_
),
geospatialScopeAreaValue = 4,
geospatialScopeAreaUnit = "hectares",
isVegetationCoverReported = "true",
eventDurationValue = 40,
eventDurationUnit = "minutes",
inventoryTypes = NA,
compilationTypes = NA,
compilationSourceTypes = NA,
protocolNames = dplyr::case_when(
site_visit_list$Year %in% c(2017, 2018) ~ "Butterfly fixed transect survey | Butterfly visual encounter survey | Floral fixed transect survey | Floral visual encounter survey",
site_visit_list$Year %in% c(2019, 2020) ~ "Butterfly fixed transect survey | Floral fixed transect survey",
TRUE ~ NA
),
protocolDescriptions = "Butterflies were surveyed at grassland sites across North Dakota using fixed transect and visual encounter surveys conducted by two observers under suitable weather conditions. In 2017–2018, surveys included both fixed transects and roaming visual encounter surveys. In 2019–2020, protocols switched to entirely fixed transect surveys. Floral resources were surveyed using corresponding floral transect and visual encounter survey methods, recording flowering ramets and nectar resources within survey areas.",
protocolReferences = "", #need
isAbsenceReported = "true",
absentTaxa = NA,
isAbundanceReported = "true",
isAbundanceCapReported = "false",
abundanceCap = NA,
hasMaterialSamples = "false",
materialSampleTypes = NA,
hasVouchers = "false",
voucherInstitutions = NA,
isLeastSpecificTargetCategoryQuantityInclusive = "true",
verbatimTargetScope = "butterflies | flowering plants",
targetTaxonomicScope = "Lepidoptera | Magnoliophyta",
excludedTaxonomicScope = NA,
isTaxonomicScopeFullyReported = "true",
taxonCompletenessReported = "notReported",
taxonCompletenessProtocols = NA,
hasNonTargetTaxa = "true",
areNonTargetTaxaFullyReported = "false",
nonTargetTaxa = "non-flowering plants",
targetLifeStageScope = "adult",
excludedLifeStageScope = "egg | larva | pupa",
isLifeStageScopeFullyReported = "true",
targetDegreeOfEstablishmentScope = NA,
excludedDegreeOfEstablishmentScope = NA,
isDegreeOfEstablishmentScopeFullyReported = NA,
targetGrowthFormScope = "flowering individuals",
excludedGrowthFormScope = NA,
isGrowthFormScopeFullyReported = NA,
hasNonTargetOrganisms = NA,
targetHabitatScope = NA,
excludedHabitatScope = NA,
isSamplingEffortReported = "true",
samplingEffortProtocol = "Two observers separately and simultaneously conducted butterfly and floral surveys at each site. In 2017–2018, each observer completed one fixed transect survey (100 m, 10 minutes) and one visual encounter survey (4 ha, 30 minutes). In 2019–2020, surveys switched entirely to fixed transect methods, with each observer completing two fixed transect surveys (100 m, 10 minutes each).",
samplingEffortValue = "80",
samplingEffortUnit = "observer minutes",
samplingPerformedBy = site_visit_list$Observers
)Survey Events
Surveys at each visit - transects and visual encounters
Setup:
survey_list <- dplyr::bind_rows(
butterfly_data %>% dplyr::distinct(Code, Site, Date, Year, Observer, Time, Temperature, Wind.Speed, Humidity, Cloud.Cover),
flower_data %>% dplyr::distinct(Code, Site, Date, Year, Observer)
) %>% dplyr::distinct(Code, .keep_all = TRUE)Mapping Event core fields for the survey event level:
added_survey_data_core <- data.frame(eventID = survey_list$Code,
parentEventID = sub("^[A-Z0-9]+-[AB]", "", survey_list$Code),
fieldNumber = survey_list$Site,
habitat = "grassland",
locationID = survey_list$Site,
countryCode = "US",
decimalLatitude = site_coords_data$Y[match(paste(survey_list$Site, survey_list$Year), paste(site_coords_data$Site.Name, site_coords_data$Year))],
decimalLongitude = site_coords_data$X[match(paste(survey_list$Site, survey_list$Year), paste(site_coords_data$Site.Name, site_coords_data$Year))],
coordinateUncertaintyInMeters = 200,
geodeticDatum = "WGS84",
country = "United States",
stateProvince = "North Dakota",
county = butterfly_data$County[match(survey_list$Site, butterfly_data$Site)],
locality = site_name_mappings$Clean.Site[match(survey_list$Site, site_name_mappings$Clean.Site.Abbr)],
sampleSizeValue = dplyr::case_when(
grepl("LTDS|FLTDS|LT1|LT2|FT1|FT2", survey_list$Code) ~ 100,
grepl("VES|FVES", survey_list$Code) ~ 4,
TRUE ~ NA_real_
),
sampleSizeUnit = dplyr::case_when(
grepl("LTDS|FLTDS|LT1|LT2|FT1|FT2", survey_list$Code) ~ "m",
grepl("VES|FVES", survey_list$Code) ~ "ha",
TRUE ~ NA_character_
),
footprintWKT = NA,
footprintSRS = NA,
eventDate = format(as.Date(as.character(survey_list$Date), format = "%m/%d/%y"), "%Y-%m-%d"),
eventTime = survey_list$Time,
eventType = "survey",
samplingProtocol = dplyr::case_when(
grepl("^LTDS", survey_list$Code) ~ "Butterfly fixed transect survey",
grepl("^VES", survey_list$Code) ~ "Butterfly visual encounter survey",
grepl("^FLTDS", survey_list$Code) ~ "Floral fixed transect survey",
grepl("^FVES", survey_list$Code) ~ "Floral visual encounter survey",
grepl("^LT[12]", survey_list$Code) ~ "Butterfly fixed transect survey",
grepl("^FT[12]", survey_list$Code) ~ "Floral fixed transect survey",
TRUE ~ NA
),
dataGeneralizations = NA,
informationWithheld = NA,
fieldNotes = NA,
eventRemarks = NA,
identifiedBy = survey_list$Observer
)
added_survey_data_core <- added_survey_data_core %>%
dplyr::arrange(parentEventID)Mapping Humboldt extension fields for the survey event level:
added_survey_data_humboldt <- data.frame(eventID = survey_list$Code,
siteNestingDescription = "Three grassland survey sites within each of North Dakota's 53 counties were surveyed twice annually for butterflies and flowering plants via transects and encounter surveys; Individual site locations could vary among years.",
siteCount = NA,
verbatimSiteNames = NA,
verbatimSiteDescriptions = NA,
reportedWeather = apply(
survey_list[, c("Temperature", "Wind.Speed", "Humidity", "Cloud.Cover")],
1,
function(x) {
vals <- c(
if(!is.na(x[1])) paste("Temperature:", x[1]),
if(!is.na(x[2])) paste("Wind speed:", x[2]),
if(!is.na(x[3])) paste("Humidity:", x[3]),
if(!is.na(x[4])) paste("Cloud cover:", x[4])
)
if(length(vals) == 0) NA else paste(vals, collapse = " | ")
}
),
reportedExtremeConditions = NA,
totalAreaSampledValue = dplyr::case_when(
grepl("LTDS|FLTDS|LT1|LT2|FT1|FT2", survey_list$Code) ~ "100",
grepl("VES|FVES", survey_list$Code) ~ "4",
TRUE ~ NA_character_
),
totalAreaSampledUnit = dplyr::case_when(
grepl("LTDS|FLTDS|LT1|LT2|FT1|FT2", survey_list$Code) ~ "m",
grepl("VES|FVES", survey_list$Code) ~ "ha",
TRUE ~ NA_character_
),
geospatialScopeAreaValue = dplyr::case_when(
grepl("LTDS|FLTDS|LT1|LT2|FT1|FT2", survey_list$Code) ~ 100,
grepl("VES|FVES", survey_list$Code) ~ 4,
TRUE ~ NA_real_
),
geospatialScopeAreaUnit = dplyr::case_when(
grepl("LTDS|FLTDS|LT1|LT2|FT1|FT2", survey_list$Code) ~ "m",
grepl("VES|FVES", survey_list$Code) ~ "ha",
TRUE ~ NA_character_
),
isVegetationCoverReported = NA,
eventDurationValue = dplyr::case_when(
grepl("LTDS|FLTDS|LT1|LT2|FT1|FT2", survey_list$Code) ~ 10,
grepl("VES|FVES", survey_list$Code) ~ 30,
TRUE ~ NA_real_
),
eventDurationUnit = "minutes",
inventoryTypes = NA,
compilationTypes = NA,
compilationSourceTypes = NA,
protocolNames = dplyr::case_when(
grepl("^LTDS|^LT[12]", survey_list$Code) ~ "Butterfly fixed transect survey",
grepl("^VES", survey_list$Code) ~ "Butterfly visual encounter survey",
grepl("^FLTDS|^FT[12]", survey_list$Code) ~ "Floral fixed transect survey",
grepl("^FVES", survey_list$Code) ~ "Floral visual encounter survey",
TRUE ~ NA
),
protocolDescriptions = "Butterflies were surveyed at grassland sites across North Dakota using fixed transect and visual encounter surveys conducted by two observers under suitable weather conditions. In 2017–2018, surveys included both fixed transects and roaming visual encounter surveys. In 2019–2020, protocols switched to entirely fixed transect surveys. Floral resources were surveyed using corresponding floral transect and visual encounter survey methods, recording flowering ramets and nectar resources within survey areas.",
protocolReferences = "", #need
isAbsenceReported = "true",
absentTaxa = NA,
isAbundanceReported = "true",
isAbundanceCapReported = "false",
abundanceCap = NA,
hasMaterialSamples = "false",
materialSampleTypes = NA,
hasVouchers = "false",
voucherInstitutions = NA,
isLeastSpecificTargetCategoryQuantityInclusive = "true",
verbatimTargetScope = "butterflies | flowering plants",
targetTaxonomicScope = "Lepidoptera | Magnoliophyta",
excludedTaxonomicScope = NA,
isTaxonomicScopeFullyReported = "true",
taxonCompletenessReported = "notReported",
taxonCompletenessProtocols = NA,
hasNonTargetTaxa = "true",
areNonTargetTaxaFullyReported = "false",
nonTargetTaxa = "non-flowering plants",
targetLifeStageScope = "adult",
excludedLifeStageScope = "egg | larva | pupa",
isLifeStageScopeFullyReported = "true",
targetDegreeOfEstablishmentScope = NA,
excludedDegreeOfEstablishmentScope = NA,
isDegreeOfEstablishmentScopeFullyReported = NA,
targetGrowthFormScope = "flowering individuals",
excludedGrowthFormScope = NA,
isGrowthFormScopeFullyReported = NA,
hasNonTargetOrganisms = NA,
targetHabitatScope = NA,
excludedHabitatScope = NA,
isSamplingEffortReported = "true",
samplingEffortProtocol = "Two observers separately and simultaneously conducted butterfly and floral surveys at each site. In 2017–2018, each observer completed one fixed transect survey (100 m, 10 minutes) and one visual encounter survey (4 ha, 30 minutes). In 2019–2020, surveys switched entirely to fixed transect methods, with each observer completing two fixed transect surveys (100 m, 10 minutes each).",
samplingEffortValue = "40",
samplingEffortUnit = "minutes per site visit per observer",
samplingPerformedBy = survey_list$Observer
)Merge Event Tables
event_data_core <- bind_rows(
NDbutterflies_project_event_core,
added_counties_data_core,
added_site_data_core,
added_site_visit_data_core,
added_survey_data_core
)
event_data_humboldt <- bind_rows(
NDbutterflies_project_event_Humboldt,
added_county_data_humboldt,
added_site_data_humboldt,
added_site_visit_data_humboldt,
added_survey_data_humboldt
)
rownames(event_data_core) <- NULL
event_data_core <- event_data_core %>% select(where(~ !all(is.na(.)))) #Remove columns that are entirely NA
rownames(event_data_humboldt) <- NULL
event_data_humboldt <- event_data_humboldt %>% select(where(~ !all(is.na(.)))) #Remove columns that are entirely NAMap Occurrence Data
Butterfly Occurrences
#map species codes
butterfly_data$scientificName <- species_codes_data$Scientific.Name[
match(butterfly_data$Butterfly.Code, species_codes_data$Code)
]
butterfly_data$vernacularName <- species_codes_data$Common.Name[
match(butterfly_data$Butterfly.Code, species_codes_data$Code)
]added_butterfly_occurrence_data <- data.frame(eventID = butterfly_data$Code,
occurrenceID = butterfly_data$occurrenceID,
basisOfRecord = "HumanObservation",
kingdom = "Animalia",
scientificName = butterfly_data$scientificName,
vernacularName = butterfly_data$vernacularName,
taxonRank = ifelse(
is.na(butterfly_data$scientificName),
NA,
ifelse(grepl(" spp$", butterfly_data$scientificName), "genus", "species")
),
occurrenceStatus = ifelse(butterfly_data$Butterfly.Code == "None", "absent", "present"),
individualCount = butterfly_data$Count,
organismQuantity = butterfly_data$Count,
organismQuantityType = "Individuals",
recordedBy = butterfly_data$Observer,
behavior = ifelse(
butterfly_data$Behavior == "" & butterfly_data$Plant.Species == "",
NA,
ifelse(
grepl("ground|litter|rock", butterfly_data$Plant.Species, ignore.case = TRUE),
paste(butterfly_data$Behavior, butterfly_data$Plant.Species),
butterfly_data$Behavior
)
),
lifeStage = ifelse(butterfly_data$Butterfly.Code == "None", NA, "adult"),
vitality = ifelse(butterfly_data$Butterfly.Code == "None", NA, "alive"),
identifiedBy = ifelse(butterfly_data$Butterfly.Code == "None", NA, butterfly_data$Observer)
)Plant Occurrences from Butterfly Data
Add plant occurrences from butterfly_data where there is no matching flower occurrence
butterfly_data$associatedOccurrences <- flower_data$occurrenceID[
match(
paste(
dplyr::case_when(
grepl("^VES", butterfly_data$Code) ~ sub("^VES", "FVES", butterfly_data$Code),
grepl("^LTDS", butterfly_data$Code) ~ sub("^LTDS", "FLTDS", butterfly_data$Code),
grepl("^LT1", butterfly_data$Code) ~ sub("^LT1", "FT1", butterfly_data$Code),
grepl("^LT2", butterfly_data$Code) ~ sub("^LT2", "FT2", butterfly_data$Code),
TRUE ~ butterfly_data$Code
),
butterfly_data$Plant.Species
),
paste(flower_data$Code, flower_data$Flower.Species)
)
]
missing_matches <- is.na(butterfly_data$associatedOccurrences) & butterfly_data$Plant.Species != ""
alt_codes <- dplyr::case_when(
grepl("^VES", butterfly_data$Code[missing_matches]) ~ sub("^VES", "FVES", butterfly_data$Code[missing_matches]),
grepl("^LTDS", butterfly_data$Code[missing_matches]) ~ sub("^LTDS", "FLTDS", butterfly_data$Code[missing_matches]),
grepl("^LT1", butterfly_data$Code[missing_matches]) ~ sub("^LT1", "FT1", butterfly_data$Code[missing_matches]),
grepl("^LT2", butterfly_data$Code[missing_matches]) ~ sub("^LT2", "FT2", butterfly_data$Code[missing_matches]),
TRUE ~ butterfly_data$Code[missing_matches]
)
alt_codes <- ifelse(grepl("-A", alt_codes),
sub("-A", "-B", alt_codes),
sub("-B", "-A", alt_codes))
butterfly_data$associatedOccurrences[missing_matches] <- flower_data$occurrenceID[
match(
paste(alt_codes, butterfly_data$Plant.Species[missing_matches]),
paste(flower_data$Code, flower_data$Flower.Species)
)
]
plants_to_add <- butterfly_data %>%
dplyr::filter(
Plant.Species != "",
is.na(associatedOccurrences)
)
#map species codes
plants_to_add$scientificName <- species_codes_data$Scientific.Name[
match(plants_to_add$Plant.Species, species_codes_data$Code)
]
plants_to_add$vernacularName <- species_codes_data$Common.Name[
match(plants_to_add$Plant.Species, species_codes_data$Code)
]
#remove rows where could not map
plants_to_add <- plants_to_add %>%
dplyr::filter(!is.na(scientificName))
#redo occurrenceID
plants_to_add <- plants_to_add %>%
dplyr::select(-Butterfly.Code, -Temperature, -Wind.Speed, -Cloud.Cover, -Behavior, -Time, -Count, -occurrenceID) %>%
dplyr::distinct()
plants_to_add %<>% mutate(occurrenceID = paste(vdigest (paste(Code, Site, Date, Observer, Plant.Species), algo="md5"), sep=":"))added_plant_occurrence_data <- data.frame(eventID = plants_to_add$Code,
occurrenceID = plants_to_add$occurrenceID,
basisOfRecord = "HumanObservation",
kingdom = "Plantae",
scientificName = plants_to_add$scientificName,
vernacularName = plants_to_add$vernacularName,
taxonRank = ifelse(
is.na(plants_to_add$scientificName),
NA,
ifelse(grepl(" spp$", plants_to_add$scientificName), "family", "species")
),
occurrenceStatus = "present",
recordedBy = plants_to_add$Observer,
vitality = "alive",
identifiedBy = plants_to_add$Observer,
occurrenceRemarks = "Plant added as an associated occurrence noted during butterfly surveys"
)Flower Occurrences
#map species codes
flower_data$scientificName <- species_codes_data$Scientific.Name[
match(flower_data$Flower.Species, species_codes_data$Code)
]
flower_data$vernacularName <- species_codes_data$Common.Name[
match(flower_data$Flower.Species, species_codes_data$Code)
]
#remove rows where could not map
flower_data <- flower_data %>%
dplyr::filter(Flower.Species == "none" | !is.na(scientificName))added_flower_occurrence_data <- data.frame(eventID = flower_data$Code,
occurrenceID = flower_data$occurrenceID,
basisOfRecord = "HumanObservation",
kingdom = "Plantae",
scientificName = flower_data$scientificName,
vernacularName = flower_data$vernacularName,
taxonRank = ifelse(
is.na(flower_data$scientificName),
NA,
ifelse(grepl(" spp$", flower_data$scientificName), "genus", "species")
),
occurrenceStatus = ifelse(flower_data$Flower.Species == "none", "absent", "present"),
organismQuantity = flower_data$Count,
organismQuantityType = "Flowering ramets (stems)",
recordedBy = flower_data$Observer,
vitality = ifelse(flower_data$Flower.Species == "none", NA, "alive"),
identifiedBy = ifelse(flower_data$Flower.Species == "none", NA, butterfly_data$Observer)
)Map Resource Relationship Data
missing_matches <- is.na(butterfly_data$associatedOccurrences) & butterfly_data$Plant.Species != ""
butterfly_data$associatedOccurrences[missing_matches] <- plants_to_add$occurrenceID[
match(
paste(butterfly_data$Code[missing_matches], butterfly_data$Plant.Species[missing_matches]),
paste(plants_to_add$Code, plants_to_add$Plant.Species)
)
]
butterfly_plant_interactions <- butterfly_data %>%
dplyr::filter(!is.na(associatedOccurrences), Plant.Species != "") %>%
dplyr::transmute(
resourceID = occurrenceID,
relatedResourceID = associatedOccurrences,
relationshipOfResource = Behavior
)
butterfly_plant_interactions <- butterfly_plant_interactions %>%
dplyr::filter(!is.na(relatedResourceID))butterfly_plant_interactions <- butterfly_plant_interactions %>%
dplyr::filter(relationshipOfResource != "")
resource_relationship_data <- data.frame(resourceID = butterfly_plant_interactions$resourceID,
relatedResourceID = butterfly_plant_interactions$relatedResourceID,
relationshipOfResource = tolower(butterfly_plant_interactions$relationshipOfResource)
)Map Extended Measurement or Fact Data
Certain details do not fit into the core event or occurrence tables. These are represented in the ExtendedMeasurementOrFact (eMoF) table.
Site Vegetation
site_vegetation_data$eventID <- site_visit_list$eventID[
match(
paste(site_vegetation_data$Site, site_vegetation_data$Year),
paste(site_visit_list$Site, site_visit_list$Year)
)
]
emof_vegetation_data <- site_vegetation_data %>%
tidyr::pivot_longer(
cols = -c(Year, Site, Original.Site, eventID),
names_to = "measurementType",
values_to = "measurementValue"
) %>%
dplyr::filter(!is.na(eventID)) %>%
dplyr::select(eventID, measurementType, measurementValue)Site Agency
butterfly_data <- read.csv(input_butterfly)
flower_data <- read.csv(input_flower)
butterfly_data$Original.Site <- butterfly_data$Site
butterfly_data$Site <- site_name_mappings$Clean.Site.Abbr[
match(butterfly_data$Site, site_name_mappings$Dirty.Site)
]
flower_data$Original.Site <- flower_data$Site
flower_data$Site <- site_name_mappings$Clean.Site.Abbr[
match(flower_data$Site, site_name_mappings$Dirty.Site)
]
emof_agency_data <- dplyr::bind_rows(
butterfly_data %>% dplyr::select(Site, Agency),
flower_data %>% dplyr::select(Site, Agency)
) %>%
dplyr::distinct() %>%
dplyr::mutate(
measurementType = "agency",
measurementValue = Agency
) %>%
dplyr::select(
eventID = Site,
measurementType,
measurementValue
)Ecoregion Number
Once we have the shape file, fill this data out more
emof_ecoregion_data <- dplyr::bind_rows(
butterfly_data %>% dplyr::select(Site, Ecoregion.Number),
flower_data %>% dplyr::select(Site, Ecoregion.Number)
) %>%
dplyr::distinct() %>%
dplyr::mutate(
measurementType = "ecoregion number",
measurementValue = Ecoregion.Number
) %>%
dplyr::select(
eventID = Site,
measurementType,
measurementValue
) %>%
dplyr::filter(!is.na(measurementValue))Visit Number
Once we have the shape file, fill this data out more
Merge Extended Measurement or Fact Tables
emof_vegetation_data$measurementValue <- as.character(emof_vegetation_data$measurementValue)
emof_agency_data$measurementValue <- as.character(emof_agency_data$measurementValue)
emof_ecoregion_data$measurementValue <- as.character(emof_ecoregion_data$measurementValue)
emof_visit_data$measurementValue <- as.character(emof_visit_data$measurementValue)
emof_data <- bind_rows(emof_vegetation_data,
emof_agency_data,
emof_ecoregion_data,
emof_visit_data
)
rownames(emof_data) <- NULLSave Data
Full Dataset
The full processed event and occurrence tables generated through this workflow are available in the project repository. NDButterfliesDwC/full.
write.csv(event_data_core, "outputs/event.csv", row.names = FALSE)
write.csv(event_data_humboldt, "outputs/humboldtecologicalinventory.csv", row.names = FALSE)
write.csv(occurrence_data, "outputs/occurrence.csv", row.names = FALSE)
write.csv(resource_relationship_data, "outputs/resourceRelationship.csv", row.names = FALSE)
write.csv(emof_data, "outputs/extendedMeasurementOrFact.csv", row.names = FALSE)Some fun plots because why not
Butterfly/Flower Species Richness
library(dplyr)
library(ggplot2)
library(maps)
library(patchwork)
## Butterfly richness
butterfly_richness <- occurrence_data %>%
filter(
kingdom == "Animalia",
occurrenceStatus == "present",
taxonRank == "species"
) %>%
left_join(
event_data_core %>%
select(eventID, county),
by = "eventID"
) %>%
group_by(county) %>%
summarise(species_richness = n_distinct(scientificName))
butterfly_richness$county <- dplyr::recode(
butterfly_richness$county,
"LaMoure" = "La Moure",
"McHenry" = "Mchenry",
"McIntosh" = "Mcintosh",
"McKenzie" = "Mckenzie",
"McLean" = "Mclean"
)
## Flower richness
flower_richness <- occurrence_data %>%
filter(
kingdom == "Plantae",
is.na(individualCount),
occurrenceStatus == "present",
taxonRank == "species"
) %>%
left_join(
event_data_core %>%
select(eventID, county),
by = "eventID"
) %>%
group_by(county) %>%
summarise(species_richness = n_distinct(scientificName))
flower_richness$county <- dplyr::recode(
flower_richness$county,
"LaMoure" = "La Moure",
"McHenry" = "Mchenry",
"McIntosh" = "Mcintosh",
"McKenzie" = "Mckenzie",
"McLean" = "Mclean"
)
## Map base
nd_map <- map_data("county", region = "north dakota")
nd_map$county <- tools::toTitleCase(gsub("north dakota,", "", nd_map$subregion))
## Join data
butterfly_map <- left_join(nd_map, butterfly_richness, by = "county")
flower_map <- left_join(nd_map, flower_richness, by = "county")
## Butterfly plot
p1 <- ggplot(butterfly_map, aes(long, lat, group = group, fill = species_richness)) +
geom_polygon(color = "black", linewidth = 0.2) +
coord_fixed(1.3) +
scale_fill_gradient(
low = "lightyellow",
high = "darkred",
name = "Butterfly\nRichness",
na.value = "white"
) +
theme_minimal() +
labs(
title = "Butterfly Species Richness"
)
## Flower plot
p2 <- ggplot(flower_map, aes(long, lat, group = group, fill = species_richness)) +
geom_polygon(color = "black", linewidth = 0.2) +
coord_fixed(1.3) +
scale_fill_gradient(
low = "lightyellow",
high = "darkred",
name = "Flower\nRichness",
na.value = "white"
) +
theme_minimal() +
labs(
title = "Flower Species Richness"
)
p1 + p2Scatterplot butterfly v flower richness
library(dplyr)
library(ggplot2)
butterfly_richness <- occurrence_data %>%
filter(
kingdom == "Animalia",
occurrenceStatus == "present",
taxonRank == "species"
) %>%
left_join(
event_data_core %>%
select(eventID, parentEventID),
by = "eventID"
) %>%
group_by(parentEventID) %>%
summarise(butterfly_richness = n_distinct(scientificName), .groups = "drop")
flower_richness <- occurrence_data %>%
filter(
kingdom == "Plantae",
occurrenceStatus == "present",
taxonRank == "species"
) %>%
left_join(
event_data_core %>%
select(eventID, parentEventID),
by = "eventID"
) %>%
group_by(parentEventID) %>%
summarise(flower_richness = n_distinct(scientificName), .groups = "drop")
richness_data <- left_join(
butterfly_richness,
flower_richness,
by = "parentEventID"
)
ggplot(richness_data, aes(x = flower_richness, y = butterfly_richness)) +
geom_point(size = 3, alpha = 0.7) +
geom_smooth(method = "lm", se = TRUE) +
theme_minimal() +
labs(
x = "Flower Species Richness",
y = "Butterfly Species Richness",
title = "Butterfly vs Flower Species Richness",
subtitle = "One point per visit"
)## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
Heatmap
library(dplyr)
library(ggplot2)
interaction_data <- butterfly_plant_interactions %>%
filter(
relationshipOfResource == "nectaring"
) %>%
left_join(
occurrence_data %>%
select(occurrenceID, butterfly = scientificName),
by = c("resourceID" = "occurrenceID")
) %>%
left_join(
occurrence_data %>%
select(occurrenceID, plant = scientificName),
by = c("relatedResourceID" = "occurrenceID")
) %>%
filter(!is.na(butterfly), !is.na(plant)) %>%
count(butterfly, plant, sort = TRUE)
butterfly_order <- interaction_data %>%
group_by(butterfly) %>%
summarise(total = sum(n)) %>%
arrange(total) %>%
pull(butterfly)
plant_order <- interaction_data %>%
group_by(plant) %>%
summarise(total = sum(n)) %>%
arrange(total) %>%
pull(plant)
interaction_data$butterfly <- factor(interaction_data$butterfly, levels = butterfly_order)
interaction_data$plant <- factor(interaction_data$plant, levels = plant_order)
ggplot(interaction_data, aes(x = plant, y = butterfly, fill = n)) +
geom_tile(color = "gray90") +
scale_fill_viridis_c(option = "plasma", trans = "log10") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
panel.grid = element_blank()
) +
labs(
x = "Plant Species",
y = "Butterfly Species",
fill = "Interactions",
title = "Butterfly–Plant Nectaring Heatmap"
)Map for a species
library(dplyr)
library(ggplot2)
library(maps)
target_species <- "Phyciodes batesii"
species_points <- occurrence_data %>%
filter(scientificName == target_species) %>%
left_join(
event_data_core %>%
select(eventID, decimalLongitude, decimalLatitude),
by = "eventID"
) %>%
distinct(decimalLongitude, decimalLatitude)
nd_map <- map_data("state")
ggplot() +
geom_polygon(
data = subset(nd_map, region == "north dakota"),
aes(long, lat, group = group),
fill = "gray95",
color = "black"
) +
geom_point(
data = species_points,
aes(decimalLongitude, decimalLatitude),
color = "red",
size = 3,
alpha = 0.7
) +
coord_fixed(1.3) +
theme_minimal() +
labs(
title = paste("Occurrences of", target_species),
x = "Longitude",
y = "Latitude"
)