Nomilo Fishpond Case Study
  • About
  • Data Processing
  • Data Explorer Dashboard

Steps:

  • Helper Functions
  • Data Importation
    • Data Sources
    • Procedure
  • Data Tidying
  • Data Exportation










Data Processing

Authors

Lysbeth Koster

Alemarie Ceria

Published

February 28, 2024

Helper Functions

Function to load XLSX file to a large list
load_xlsx_files_to_list <- function(dir_path) {
  fs::dir_ls(path = dir_path) %>%
  keep(str_detect(., ".xlsx$")) %>%
  set_names(nm = ~ gsub("data/raw/2024-02-28_|\\.xlsx$|\\.xls$", "", .)) %>%
  map(import_list) %>%
  set_names(~ gsub("-", "_", .)) %>%
  map(~ set_names(.x, gsub("-", "_", names(.x)))) %>% 
  map(~ set_names(.x, gsub(" ", "_", tolower(names(.x)))))
}
Function to export large list of dataframes to a RData file for more efficient data loading
export_to_rdata <- function(dfs_list, dir_path) {
  current_date <- format(Sys.Date(), "%Y-%m-%d")
  save(dfs_list, file = paste0(dir_path, current_date, "_dfs.RData"))
}
Function to load specific Excel sheets for each dataset
load_xlsx_sheet <- function(dfs_list, df, sheet) {
  dfs_list %>% 
    pluck(df, sheet)
}
Function to export data object to CSV
export_to_csv <- function(df, df_name, dir_path) {
  current_date <- format(Sys.Date(), "%Y-%m-%d")
  file_name <- gsub("_", "-", df_name)
  full_file_path <- paste0(dir_path, "/", current_date, "_", file_name, ".csv")
  write.csv(df, full_file_path, row.names = FALSE)
}

Data Importation

Data Sources

Data Producer Dataset Dataset Description
Lysbeth Water Samples Measurements of flow cytometry, nutrients, and chlorophyll-a to represent the biogeochemical dynamics of Nomilo Fishpond
Lysbeth Water Profiles Measurements of temperature, dissolved oxygen, salinity, and visibility to determine water column profiles.
Kauaʻi Sea Farm Clam Growth Growth data of Mercenaria mercenaria clams at Nomilo fishpond gathered by Kauaʻi Sea Farm.
Kauaʻi Sea Farm Oyster Growth Growth data of Crassostrea gigas oysters at Nomilo fishpond gathered by Kauaʻi Sea Farm.
Kauaʻi Sea Farm Weather Meteorological measurements and weather data reports that were kept on-site by Kauaʻi Sea Farm accessed through weather.Kauaʻiseafarm.com
Kauaʻi Sea Farm Kauaʻi Sea Farm Compiled Data Various daily water quality metrics at the surface level (approximately 0.5m) at the Production Dock location of Nomilo fishpond measured by Kauaʻi Sea Farm.

Procedure

Step 1: Efficiently install packages and load libraries
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}

pacman::p_load(
  renv, 
  yaml, 
  rmarkdown, 
  rio, 
  tidyverse, 
  janitor, 
  lubridate, 
  fs,
  hms, 
  zoo, 
  kableExtra, 
  ggplot2, 
  shiny, 
  rsconnect, 
  packrat, 
  plotly, 
  htmlwidgets
  )
Expand To Learn When to Run Step 2a

Run when there is new raw data added. Since the following code chunk is slow to execute, run step 2b.

Step 2a: Load all XLSX files into a large dataframe list and clean dataframe and sheet names then export as an .RData file
dfs <- slowly_load_xlsx_files_to_list("data/raw")
export_to_rdata(dfs, "data/raw/rdata-files/")
Step 2b: Efficiently load all dataframes into a large list
base::load("data/raw/rdata-files/2024-05-05_dfs.RData")
Step 3: Load the specific Excel sheet of each dataframe and store in their respective data objects
water_samples_data <- load_xlsx_sheet(dfs_list, "water_samples", "data_overview")

water_profiles_data <- map_dfr(c("l1", "l2", "l3", "l4"), ~ dfs_list$profile_data[[.x]])

clam_growth_data <- load_xlsx_sheet(dfs_list, "ksf_clam_growth", "sheet1")

oyster_growth_data <- load_xlsx_sheet(dfs_list, "ksf_oyster_cylinder_growth", "sheet1")

weather_data <- load_xlsx_sheet(dfs_list, "weather_data", "weather_ksf")

ksf_data <- load_xlsx_sheet(dfs_list, "ksf_compiled_data", "full_data")

Data Tidying

  • Water Samples
  • Water Profiles
  • Clam Growth
  • Oyster Growth
  • Weather Data
  • Kauaʻi Sea Farm Data
Steps to clean data
water_samples_data_tidied <- water_samples_data %>%
  slice(-c(44:52)) %>% 
  rename_with(~gsub("\\s*\\([^\\)]+\\)", "", .x)) %>%
  janitor::clean_names() %>%
  mutate(
    date = if_else(date == "44074",
            as.character(as.Date("2024-01-09")),
            format(dmy(date), "%Y-%m-%d"))
  ) %>%
  mutate(sample_id = 1:nrow(.)) %>%
  mutate(date = as.Date(date),
         across(c(nomilo_id, location, round, depth), as.factor)) %>%
  select(-c(sample_id, nomilo_id, tube_name)) %>% 
  mutate(location = str_to_title(location), 
         location = case_when(
           location == "Auwei" ~ "Auwai",
           TRUE ~ as.character(location)
         ),
         location = factor(location, 
                           levels = c("Mid Buoy", "Back Buoy", "Auwai", "Production Dock"))
  ) %>% 
  filter(!is.na(location))

paged_table(water_samples_data_tidied)
Steps to clean data
new_profile_var_names <- c("depth", "water_temperature", "dissolved_oxygen", "salinity", "conductivity", "visibility", "location", "date")

water_profiles_data_tidied <- water_profiles_data %>%
  select(-c(6, 8)) %>%
  mutate(
    temp_column1 = NA_character_,
    temp_column2 = NA_character_
  ) %>%
  setNames(new_profile_var_names) %>%
  mutate(
    location = ifelse(depth == "Location", water_temperature, NA_character_), 
    date = ifelse(depth == "Date",  water_temperature, NA_character_)
  ) %>%
  fill(location, date, .direction = "down") %>%
  filter(depth != "Location", depth != "Date") %>%
  mutate(
    location = case_when(
      location == "L1 Northwest buoy" ~ "back buoy",
      location == "L2 Middle Buoy" ~ "mid buoy",
      location == "L3 Production Dock" ~ "production dock",
      location == "L4 Auwai" ~ "Auwai",
      TRUE ~ location
    ),
    date = case_when(
      date %in% c("45258", "2023-11-28") ~ "2023-11-28",
      date %in% c("45282", "2023-12-21") ~ "2023-12-21",
      date %in% c("45536", "2024-01-09") ~ "2024-01-09",
      date %in% c("30/1/24", "30/01/24") ~ "2024-01-30",
      date %in% c("20/02/24", "20/2/24") ~ "2024-02-20",
      TRUE ~ date
    )) %>%
  mutate(
    date = as.Date(date, format = "%Y-%m-%d"),
     conductivity = case_when(
      row_number() %in% c(1:11, 53:62, 128:133, 159:161) ~ NA_character_,
      TRUE ~ as.character(conductivity)
    )
  ) %>%
  filter(!(depth %in% c("Samples", "Depth"))) %>%
  mutate(date = as.Date(date),
         across(c(depth, location), as.factor),
         across(c(water_temperature,  dissolved_oxygen, salinity, 
                  conductivity,visibility),  as.numeric)) %>%
   fill(visibility, .direction = "down") %>%
  mutate(visibility = if_else(date == "2023-11-28",  NA_real_, visibility)) %>%
  mutate(location = str_to_title(location)) %>%
  mutate(location = factor(location, 
                           levels = c("Mid Buoy", "Back Buoy", "Auwai", "Production Dock"))) %>% 
  mutate(conductivity = coalesce(conductivity, salinity)) %>% 
  rename("salinity1" = salinity,
         "salinity" = conductivity) %>% 
  select(-salinity1)

paged_table(water_profiles_data_tidied)
Steps to clean data
new_clam_var_names <- c(
  "sort_date", "color", "clams_in_count", "clams_in_lbs",  "clams_in_avg_per_lb",
  "clams_out_count", "clams_out_lbs", "clams_out_avg_per_lb", "growth_in_lbs", 
  "growth_pct", "sr", "days_btwn_sort"
  )

new_clam_date_col <- c(
  "2023-10-17", "2023-12-06", "2023-12-12", "2024-01-02",  "2024-01-10",
  "2024-01-24", "2024-01-31", "2024-02-08", "2024-02-13", "2024-02-07",
  "2024-02-13", "2024-03-14", "2024-03-19"
  )

clam_growth_data_tidied <- clam_growth_data %>%
  slice(-1) %>%
  setNames(new_clam_var_names) %>%
  mutate(date = as.Date(new_clam_date_col)) %>%
  dplyr::select(-sort_date) %>%
   pivot_longer(
    cols = c(
      clams_in_count, clams_in_lbs, clams_in_avg_per_lb,   clams_out_count, 
      clams_out_lbs, clams_out_avg_per_lb
      ),
    names_to = c("stage", ".value"),
    names_prefix = "clams_",
    names_sep = "_",
    values_to = "value"
  ) %>%
  mutate(stage = if_else(str_detect(stage, "in"), "In", "Out")) %>%
  rename(avg_per_lbs = avg) %>%
  mutate(across(c(color, stage), as.factor)) %>%
  mutate(across(c(count, lbs, avg_per_lbs, growth_in_lbs, growth_pct, sr),
                ~as.numeric(gsub("%", "", .)))) %>%
  arrange(date, color, stage) %>%
  dplyr::select(date, days_btwn_sort, color, stage, count, lbs, avg_per_lbs,
                growth_in_lbs, growth_pct, sr) %>%
  rename("days_btwn_clams_sort" = days_btwn_sort,
         "clams_color" = color,
         "clams_stage" = stage,
         "clams_count" = count,
         "weight" = lbs,
         "avg_weight" = avg_per_lbs,
         "clams_growth" = growth_in_lbs,
         "clams_sr" = sr, 
          "date_out" = date
         ) %>% 
    mutate(date_in = date_out - days_btwn_clams_sort) %>%
relocate(c(date_in, date_out), .before = days_btwn_clams_sort) %>% 
  mutate(
    date = case_when(
      clams_stage == 'In' ~ date_in,
      clams_stage == 'Out' ~ date_out
    )
  ) %>%
  select(-date_in, -date_out) %>% 
   mutate(grouping_variable = rep(1:(n() / 2), each = 2))


paged_table(clam_growth_data_tidied)
Steps to clean data
oyster_var_names <- c(
  "date", "oyster_large_weight", "oyster_large_gain", "oyster_small_weight",
  "oyster_small_gain", "oyster_chlorophyll"
  )

oyster_growth_data_tidied <- oyster_growth_data %>% 
  dplyr::select(c(1, 4, 5, 8, 9, 12)) %>%
  slice(-1) %>%
  setNames(oyster_var_names) %>%
  pivot_longer(
    cols = c(oyster_large_weight, oyster_large_gain,
             oyster_small_gain,
             oyster_small_weight),
    names_to = c("oyster_size", ".value"),
    names_prefix = "oyster_",
    names_sep = "_",
    values_to = "value"
  ) %>%
  mutate(oyster_size = if_else(str_detect(oyster_size, "small"), "Small", "Large")) %>%
  mutate(date = as.Date(date),
         oyster_size = as.factor(oyster_size),
         across(c(weight, gain), as.numeric)
        ) %>%
  filter(date >= as.Date("2023-11-20") & date <=
           as.Date("2024-02-20")) %>%
  mutate(weight = weight * 0.00220462) %>% 
  rename("growth_pct" = gain) %>%
  select(-growth_pct) %>% 
  filter(!is.na(weight) & oyster_size != "Large")

paged_table(oyster_growth_data_tidied)
Steps to clean data
weather_data_tidied <- weather_data %>% 

janitor::clean_names() %>%
   unite(date, year, month, day, sep = "-") %>%
  mutate(date = ymd(date)) %>%
   select(-c(1, 3)) %>%
  rename("outdoor_temperature" = outdoor_temp_f) %>%
   mutate(outdoor_temperature = (outdoor_temperature - 32) * (5/9)) %>%
  group_by(date) %>%
  summarise(across(where(is.numeric), \(x) mean(x, na.rm = TRUE))) %>%
  slice(-1)

paged_table(weather_data_tidied)
Steps to clean data
ksf_data_tidied <- ksf_data %>% 
  rename_with(~gsub("\\s*\\([^\\)]+\\)", "", .x)) %>%
  janitor::clean_names() %>%
  rename(date = date_time) %>%
  mutate(date = as.Date(date)) %>%
  filter(date >= as.Date("2023-11-20") & date <= as.Date("2024-02-20")) %>%
  arrange(date) %>%
  dplyr::select(-c(external_voltage, wk_num, wind_dir,
                   spadd, outdoor_temperature, hourly_rain,
                   solar_radiation, resistivity, battery_capacity,
                   hour, daynum, data_pt, wind_sp, diradd,
                   wind_speed, wind_direction, tide, day, month, year)
                ) %>%
  dplyr::select(where(~ !anyNA(.))) %>%
  group_by(date) %>%
  summarise(across(where(is.numeric), \(x) mean(x, na.rm = TRUE))) %>%
  rename("ksf_salinity" = salinity,
         "ksf_rdo_saturation" = rdo_saturation,
         "ksf_rdo_concentration" = rdo_concentration,
         "ksf_actual_conductivity" = actual_conductivity,
         "ksf_total_dissolved_solids" = total_dissolved_solids,
         "ksf_ammonium" = ammonium,
         "ksf_barometric_pressure" = barometric_pressure,
         "ksf_oxygen_partial_pressure" = oxygen_partial_pressure,
         "ksf_specific_conductivity" = specific_conductivity,
         "ksf_density" = density,
         "ksf_chlorophyll_a_fluorescence" = chlorophyll_a_fluorescence,
         "ksf_ammonium_m_v" = ammonium_m_v)

paged_table(ksf_data_tidied)

Data Exportation

Efficiently export tidied datasets to data/tidied folder
dfs_to_export <- list( 
  water_samples_data_tidied = water_samples_data_tidied,
  water_profiles_data_tidied = water_profiles_data_tidied,
  clam_growth_data_tidied = clam_growth_data_tidied,
  oyster_growth_data_tidied = oyster_growth_data_tidied,
  weather_data_tidied = weather_data_tidied,
  ksf_data_tidied = ksf_data_tidied
) 

imap(dfs_to_export, ~ export_to_csv(.x, .y, "data/tidied"))
Back to top