--- title: "Basic workflow" output: rmarkdown::html_vignette: vignette: > %\VignetteIndexEntry{Basic workflow} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r libraryChunk, load-packages, include=FALSE} # markdown packages library(rgnparser) library(magrittr) library(knitr) library(rmarkdown) library(rmdformats) library(prettydoc) library(htmltools) library(pkgdown) # Load core packages library(devtools) library(BiocManager) library(purrr) library(here) library(renv) library(bdc) library(CoordinateCleaner) library(dplyr) library(readr) library(stringr) library(lubridate) library(tidyselect) library(R.utils) library(tidyr) library(ggplot2) library(forcats) library(emld) library(rlang) library(xml2) library(mgsub) library(rvest) library(rnaturalearth) library(rnaturalearthdata) library(countrycode) library(janitor) library(circlize) library(paletteer) library(cowplot) library(igraph) library(ggspatial) library(sf) library(parallel) library(terra) # Dont detect cores to avoid GitHbub error old <- options() # code line i on.exit(options(old)) # code line i+1 options(mc.cores = parallel::detectCores()) ``` ```{r secretRootPath, include=FALSE} # Set the RootPath to tempdir RootPath <- tempdir() if (!dir.exists(paste0(RootPath, "/Data_acquisition_workflow"))) { dir.create(paste0(RootPath, "/Data_acquisition_workflow"), recursive = TRUE) } ``` ```{r global-options, include=FALSE} knitr::opts_chunk$set(error = TRUE, eval = TRUE, tidy = TRUE, warning = FALSE, root.dir = normalizePath(tempdir())) ``` This workflow is meant to be a basic example workflow of how a user might take a flagged version of our (or some other) occurrence dataset and filter for specific taxa or countries, re-apply flagging functions, re-filter the data, or make maps based of those data. # 0.0 Script preparation ## 0.1 Working directory Choose the path to the root folder in which all other folders can be found. ```{r falseRootPath, eval=FALSE} RootPath <- paste0("/your/path/here") ``` ```{r CreateRootPath, warning=FALSE, collapse = TRUE} # Create the working directory in the RootPath if it doesn't exist already if (!dir.exists(paste0(RootPath, "/Data_acquisition_workflow"))) { dir.create(paste0(RootPath, "/Data_acquisition_workflow"), recursive = TRUE) } # Set the working directory setwd(paste0(RootPath,"/Data_acquisition_workflow")) ``` For the first time that you run BeeBDC, and if you want to use the renv package to manage your packages, you can install renv... install.packages("renv", repos = "http://cran.us.r-project.org") and then initialise renv the project. renv::init(project = paste0(RootPath,"/Data_acquisition_workflow")) If you have already initialised a project, you can instead just activate it. ```{r activate, collapse = TRUE} renv::activate(project = paste0(RootPath,"/Data_acquisition_workflow")) ``` ## 0.2 Install packages (if needed) You may need to install gdal on your computer. This can be done on a Mac by using Homebrew in the terminal and the command "brew install gdal". To start out, you will need to install **BiocManager**, **devtools**, **ComplexHeatmap**, and **rnaturalearthhires** to then install and fully use **BeeBDC**. ```{r installPackages, message=FALSE, warning=FALSE, results=FALSE, collapse = TRUE, eval = FALSE} if (!require("BiocManager", quietly = TRUE)) install.packages("BiocManager", repos = "http://cran.us.r-project.org") BiocManager::install("ComplexHeatmap") ``` ```{r rnaturalearthhires, eval=FALSE} # Install remotes if needed if (!require("remotes", quietly = TRUE)) install.packages("remotes", repos = "http://cran.us.r-project.org") # Download and then load rnaturalearthhires remotes::install_github("ropensci/rnaturalearthhires") install.packages("rnaturalearthhires", repos = "https://ropensci.r-universe.dev", type = "source") library(rnaturalearthhires) ``` Now install **BeeBDC**. ```{r installBeeBDC, results=TRUE, message=TRUE, eval = FALSE, collapse = TRUE} install.packages("BeeBDC") library(BeeBDC) ``` Snapshot the renv environment. ```{r snapshot, collapse = TRUE} renv::snapshot(project = paste0(RootPath,"/Data_acquisition_workflow"), prompt = FALSE) ``` Set up the directories used by **BeeBDC**. These directories include where the data, figures, reports, etc. will be saved. The RDoc needs to be a path RELATIVE to the RootPath; i.e., the file path from which the two diverge. ```{r dirMaker, collapse = TRUE, eval = FALSE} BeeBDC::dirMaker( RootPath = RootPath, RDoc = "vignettes/BeeBDC_main.Rmd") %>% # Add paths created by this function to the environment() list2env(envir = parent.env(environment())) ``` ```{r dirMakerSECRETELY, include = FALSE} # For the sake of this tutorial, we will not use here::i_am in dirMaker, because we aren't allowed # to mess with package directories in this way. This will work-around to use the tempdir() DataPath <- paste0(RootPath, "/Data_acquisition_workflow") OutPath_Check <- paste0(RootPath, "/Data_acquisition_workflow/Output/Check") OutPath_Figures <- paste0(RootPath, "/Data_acquisition_workflow/Output/Figures") OutPath_Intermediate <- paste0(RootPath, "/Data_acquisition_workflow/Output/Intermediate") OutPath_Report <- paste0(RootPath, "/Data_acquisition_workflow/Output/Report") # Create these files if (!dir.exists(DataPath)) { dir.create(DataPath, recursive = TRUE)} if (!dir.exists(OutPath_Check)) { dir.create(OutPath_Check, recursive = TRUE)} if (!dir.exists(OutPath_Figures)) { dir.create(OutPath_Figures, recursive = TRUE)} if (!dir.exists(OutPath_Intermediate)) { dir.create(OutPath_Intermediate, recursive = TRUE)} if (!dir.exists(OutPath_Report)) { dir.create(OutPath_Report, recursive = TRUE)} ``` ## 0.3 Load packages Load packages. ```{r lapply_library, results=FALSE, collapse = TRUE} lapply(c("ComplexHeatmap", "magrittr"), library, character.only = TRUE) ``` # 2.0 Taxon example If you want to filter the dataset to a particular taxon of interest, you can do so quite easily using **dplyr** from the **tidyverse** group of packages. To filter to a selected bee genus, in our case Anthophorini... ```{r 2.0, eval = FALSE} # Load some package data — the taxonomy and a flagged example dataset # Download the full beesTaxonomy file taxonomyFile <- BeeBDC::beesTaxonomy() ``` ```{r 2.0secret, collapse = TRUE, eval = TRUE} # load in the small test dataset in the background system.file("extdata", "testTaxonomy.rda", package="BeeBDC") |> load() # Rename the file taxonomyFile <- testTaxonomy rm(testTaxonomy) ``` ```{r 2.0ii} # Load the example beesFlagged dataset beesFlagged <- BeeBDC::beesFlagged selectedGenera <- taxonomyFile %>% # Select only tribe anthophorini (for example) dplyr::filter(tolower(tribe) == tolower("anthophorini")) %>% distinct(genus) # Filter the data taxonData <- beesFlagged %>% dplyr::filter(genus %in% selectedGenera$genus) # View the data taxonData ``` # 3.0 Country example Similarly to the above you can filter for only countries of interest. Keep in mind, that sometimes the *country* column may not hold all of the records that fall in that country, if it, or the coordinates, have been entered incorrectly. ```{r 3.0} # Select your study area studyArea <- c("Canada", "United states", "Mexico", "Guatemala") # Filter the data to that area countryData <- beesFlagged %>% dplyr::filter(country %in% studyArea) # View the data countryData ``` # 4.0 Filtering example ## 4.1 Simple filter The **BeeBDC** package provides a simple function that can re-build the *.summary* column based off of the filtering columns that are present in the dataset (those starting with "."). you can also choose which filters you DO NOT want to implement using the dontFilterThese argument. In this example, we are also removing all of the filtering columns in the output dataset (removeFilterColumns = TRUE) and filtering to only completely clean occurrences (filterClean = TRUE). For the latter, we are only keeping *.summary* == TRUE. ```{r 4.1} filteredData <- BeeBDC::summaryFun(data = beesFlagged, # Choose the columns to NOT filter (or NULL to filter all columns) dontFilterThese = c(".gridSummary", ".lonFlag", ".latFlag", ".uncer_terms", ".uncertaintyThreshold"), # In the output, do you want to REMOVE all filtering columns (TRUE), or keep them (FALSE) removeFilterColumns = TRUE, # In the output, do you want to only keep clean data according to your filtering (TRUE), # Or keep all data and simply update the .summary column (FALSE) filterClean = TRUE) ``` ## 4.2 Uncertainty threshold You may also want to change the *.uncertaintyThreshold* as we have chosen a somewhat strict default of 1 km in our dataset. Here, we will instead flag to 10 km (threshold = 10000 [m]). Additionally, we use the **magrittr** package pipe (%>%) to feed the outputs directly into `summaryFun()` to filter our data in one action! ```{r 4.2} filteredData <- beesFlagged %>% # Remove any exiting .uncertaintyThreshold column dplyr::select(!tidyselect::any_of(".uncertaintyThreshold")) %>% # Chose the coordinate uncertainty to filter to... BeeBDC::coordUncerFlagR(data = ., uncerColumn = "coordinateUncertaintyInMeters", # 10 km here threshold = 10000) %>% # Now re-do the .summary column and filter the data using this new value BeeBDC::summaryFun( data = ., dontFilterThese = c(".gridSummary", ".lonFlag", ".latFlag", ".uncer_terms"), removeFilterColumns = TRUE, filterClean = TRUE) ``` ## 4.2 Date filter ### a. bdc_year_outOfRange Another column that users are likely to want to pay close attention to is the *.year_outOfRange* column that is set at 1950 in our dataset. In this case, **bdc** provides the function where users can change the year_threshold argument to, in this case, 1970. As with above, we then use `summaryFun()` to get results in one go. ```{r 4.2a} filteredData <- beesFlagged %>% # Remove any exisitng .year_outOfRange column dplyr::select(!".year_outOfRange") %>% # Chose the minimum year to filter to... bdc::bdc_year_outOfRange(data = ., eventDate = "year", year_threshold = 1970) %>% # Now re-do the .summary column and filter the data using this new value BeeBDC::summaryFun( data = ., dontFilterThese = c(".gridSummary", ".lonFlag", ".latFlag", ".uncer_terms", ".uncertaintyThreshold"), removeFilterColumns = TRUE, filterClean = TRUE) ``` ### b. year range Or, if you're interested in a particular time period, again **dplyr** comes to the rescue with some very straight forward filtering within a year range. ```{r 4.2b} filteredData <- # The input dataset beesFlagged %>% # Chose the year range... dplyr::filter(year > 1950 & year < 1970) %>% # Now re-do the .summary column and filter the data using this new value BeeBDC::summaryFun( # Select the input dataset to filter data = ., # Choose the columns to NOT filter (or NULL to filter all columns) dontFilterThese = c(".gridSummary", ".lonFlag", ".latFlag", ".uncer_terms", ".uncertaintyThreshold"), # In the output, do you want to REMOVE all filtering columns (TRUE), or keep them (FALSE) removeFilterColumns = TRUE, # In the output, do you want to only keep clean data according to your filtering (TRUE), # Or keep all data and simply update the .summary column (FALSE) filterClean = TRUE) ``` Users may choose any number of filtering steps form the main workflow to include above `summaryFun()`, just use pipes '%>%' between the function and use '.' as the data input because this will feed in the data aoutput from the above function into the proceeding one. # 5. Summary figures Now, if you wanted to rebuild some figures, say after you've added or filtered data, then you can use some of the below processes. ## 5.1 Duplicate chordDiagrams Our `chordDiagramR()` function is very useful and it relies on two great packages, **circlize** and **ComplexHeatmap**. Unfortunately, the latter is not available on CRAN and so must be downloaded using **BiocManager**. ```{r 5.1, eval = FALSE} if(!require("BiocManager", quietly = TRUE)){ install.packages("BiocManager")} BiocManager::install("ComplexHeatmap", force = TRUE) renv::snapshot() ``` We don't actually have an example duplicates dataset with the package, so I'll magic one up behind the scences! ```{r 5.1ii, eval = FALSE} duplicates <- fileFinder(path = "PATH TO A FOLDER CONTAINING THE duplicateRun_ — could be supp. materials folder", fileName = "duplicateRun_") %>% readr::read_csv() %>% # Select only the stingless bee data dplyr::filter(database_id %in% stinglessData$database_id | database_id_match %in% stinglessData$database_id) ``` Then, set some parameters for figure borders and run your data through `chordDiagramR()`. ```{r 5.1on.exit, include = FALSE} oldpar <- par(no.readonly = TRUE) on.exit(oldpar) ``` ```{r 5.1iii, eval = FALSE} # Choose the global figure parameters par(mar = c(2, 2, 2, 2)/2, mfrow = c(1,1)) # Create the chorDiagram. You can leave many of the below values out but we show here # the defaults BeeBDC::chordDiagramR( # The duplicate data from the dupeSummary function output dupeData = duplicates, outPath = OutPath_Figures, fileName = "ChordDiagram.pdf", # These can be modified to help fit the final pdf that's exported. width = 9, height = 7.5, bg = "white", # How few distinct dataSources should a group have to be listed as "other" smallGrpThreshold = 3, title = "Duplicated record sources", # The default list of colour palettes to choose from usign the paleteer package palettes = c("cartography::blue.pal", "cartography::green.pal", "cartography::sand.pal", "cartography::orange.pal", "cartography::red.pal", "cartography::purple.pal", "cartography::brown.pal"), canvas.ylim = c(-1.0,1.0), canvas.xlim = c(-0.6, 0.25), text.col = "black", legendX = grid::unit(6, "mm"), legendY = grid::unit(18, "mm"), legendJustify = c("left", "bottom"), niceFacing = TRUE) ``` ## 5.2 Duplicate histogram In this example, we will use one of the example datasets to show you how this works. We will use beesFlagged, which has been filtered from a larger dataset and contains duplicates from that larger dataset. To print the plot in R, you need to specify returnPlot = TRUE, otherwise it will only save to the disk ```{r 5.2} data("beesFlagged", package = "BeeBDC") # Create a figure shoring the total number of duplicates, kept duplicates, and unique # records for each datasource (simplified to the text before the first underscore) and # the proportion of the above for each data source BeeBDC::dupePlotR( data = beesFlagged, # The outPath to save the plot as outPath = tempdir(), fileName = "Fig3_duplicatePlot.pdf", # Colours in order: duplicate, kept duplicate, unique dupeColours = c("#F2D2A2","#B9D6BC", "#349B90"), # Plot size and height base_height = 7, base_width = 7, legend.position = c(0.85, 0.8), # Extra variables can be fed into forcats::fct_recode() to change names on plot GBIF = "GBIF", SCAN = "SCAN", iDigBio = "iDigBio", USGS = "USGS", ALA = "ALA", ASP = "ASP", returnPlot = TRUE ) ``` ## 5.3 Flags by source The `plotFlagSummary()` function is one of the most important for quickly summarising and checking that your data and flags have worked together correctly. It can be a good starting point for error-checking. You will also see in `plotFlagSummary()` that you can filter to particular species and also output quick point maps of those species. ### a. all taxa in dataset #### # Visualise all flags for each dataSource (simplified to the text before the first underscore) BeeBDC::plotFlagSummary( data = beesFlagged, # Colours in order of pass (TRUE), fail (FALSE), and NA flagColours = c("#127852", "#A7002D", "#BDBABB"), fileName = paste0("Fig4_FlagsPlot_", Sys.Date(),".pdf"), outPath = tempdir(), width = 15, height = 9, # Extra variables can be fed into forcats::fct_recode() to change names on plot GBIF = "GBIF", SCAN = "SCAN", iDigBio = "iDigBio", USGS = "USGS", ALA = "ALA", ASP = "ASP", returnPlot = TRUE ) ``` ###### b. Single sp. summary #### In fact, lets build one of these single-species example below using the same data and the omnipresent *Apis mellifera*. ```{r 5.3b} # Visualise all flags for each dataSource (simplified to the text before the first underscore) # A clever user might also realise the potential to summarise and produce outputs in other columns BeeBDC::plotFlagSummary( # WARNING: alternate path if wanting to produce figures for the selected taxonData (2.0 above) # Select only the taxonData data data = beesFlagged, # Colours in order of pass (TRUE), fail (FALSE), and NA flagColours = c("#127852", "#A7002D", "#BDBABB"), fileName = paste0("FlagsPlot_Amell", Sys.Date(),".pdf"), outPath = tempdir(), width = 15, height = 9, # OPTIONAL: # # Filter to species speciesName = "Apis mellifera Linnaeus, 1758", # column to look in nameColumn = "scientificName", # Save the filtered data saveFiltered = FALSE, # Filter column to display on map filterColumn = ".summary", plotMap = TRUE, # amount to jitter points if desired, e.g. 0.25 or NULL jitterValue = NULL, # Map opacity value for points between 0 and 1 mapAlpha = 1, returnPlot = TRUE, # Extra variables can be fed into forcats::fct_recode() to change names on plot GBIF = "GBIF", SCAN = "SCAN", iDigBio = "iDigBio", USGS = "USGS", ALA = "ALA", ASP = "ASP", CAES = "CAES", 'B. Mont.' = "BMont", 'B. Minkley' = "BMin", Ecd = "Ecd", Gaiarsa = "Gai", EPEL = "EPEL" ) ``` ## 5.4 Maps We can also make some overall summary maps at the country level using `summaryMaps()`. If you get an error about breaks not being unique, then reduce class_n. ```{r 5.4} BeeBDC::summaryMaps( data = beesFlagged, width = 10, height = 10, class_n = 3, class_Style = "jenks", outPath = tempdir(), fileName = "CountryMaps_jenks.pdf", returnPlot = TRUE ) ``` # 6.0 Save data ```{r 6.0, eval = FALSE} mapData %>% readr::write_excel_csv(paste0(DataPath, "/Output/Intermediate/", "cleanTaxon_", Sys.Date(), ".csv")) ``` ```{r cleanup, include=FALSE, collapse = TRUE} # Remove the webpage folder unlink(paste0(dirname(getwd()), "/inst/extdata/WebDir"), recursive = TRUE) ```