Data Processing

Authors

Lysbeth Koster

Alemarie Ceria

Published

February 28, 2024

Helper Functions

Function to load XLSX file to a large list

load_xlsx_files_to_list <- function(dir_path) {
  fs::dir_ls(path = dir_path) %>%
  keep(str_detect(., ".xlsx$")) %>%
  set_names(nm = ~ gsub("data/raw/2024-02-28_|\\.xlsx$|\\.xls$", "", .)) %>%
  map(import_list) %>%
  set_names(~ gsub("-", "_", .)) %>%
  map(~ set_names(.x, gsub("-", "_", names(.x)))) %>% 
  map(~ set_names(.x, gsub(" ", "_", tolower(names(.x)))))
}

Function to export large list of dataframes to a RData file for more efficient data loading

export_to_rdata <- function(dfs_list, dir_path) {
  current_date <- format(Sys.Date(), "%Y-%m-%d")
  save(dfs_list, file = paste0(dir_path, current_date, "_dfs.RData"))
}

Function to load specific Excel sheets for each dataset

load_xlsx_sheet <- function(dfs_list, df, sheet) {
  dfs_list %>% 
    pluck(df, sheet)
}

Function to export data object to CSV

export_to_csv <- function(df, df_name, dir_path) {
  current_date <- format(Sys.Date(), "%Y-%m-%d")
  file_name <- gsub("_", "-", df_name)
  full_file_path <- paste0(dir_path, "/", current_date, "_", file_name, ".csv")
  write.csv(df, full_file_path, row.names = FALSE)
}

Data Importation

Data Sources

Data Producer	Dataset	Dataset Description
Lysbeth	Water Samples	Measurements of flow cytometry, nutrients, and chlorophyll-a to represent the biogeochemical dynamics of Nomilo Fishpond
Lysbeth	Water Profiles	Measurements of temperature, dissolved oxygen, salinity, and visibility to determine water column profiles.
Kauaʻi Sea Farm	Clam Growth	Growth data of Mercenaria mercenaria clams at Nomilo fishpond gathered by Kauaʻi Sea Farm.
Kauaʻi Sea Farm	Oyster Growth	Growth data of Crassostrea gigas oysters at Nomilo fishpond gathered by Kauaʻi Sea Farm.
Kauaʻi Sea Farm	Weather	Meteorological measurements and weather data reports that were kept on-site by Kauaʻi Sea Farm accessed through weather.Kauaʻiseafarm.com
Kauaʻi Sea Farm	Kauaʻi Sea Farm Compiled Data	Various daily water quality metrics at the surface level (approximately 0.5m) at the Production Dock location of Nomilo fishpond measured by Kauaʻi Sea Farm.

Procedure

Step 1: Efficiently install packages and load libraries

if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}

pacman::p_load(
  renv, 
  yaml, 
  rmarkdown, 
  rio, 
  tidyverse, 
  janitor, 
  lubridate, 
  fs,
  hms, 
  zoo, 
  kableExtra, 
  ggplot2, 
  shiny, 
  rsconnect, 
  packrat, 
  plotly, 
  htmlwidgets
  )

Expand To Learn When to Run Step 2a

Run when there is new raw data added. Since the following code chunk is slow to execute, run step 2b.

Step 2a: Load all XLSX files into a large dataframe list and clean dataframe and sheet names then export as an .RData file

dfs <- slowly_load_xlsx_files_to_list("data/raw")
export_to_rdata(dfs, "data/raw/rdata-files/")

Step 2b: Efficiently load all dataframes into a large list

base::load("data/raw/rdata-files/2024-05-05_dfs.RData")

Step 3: Load the specific Excel sheet of each dataframe and store in their respective data objects

water_samples_data <- load_xlsx_sheet(dfs_list, "water_samples", "data_overview")

water_profiles_data <- map_dfr(c("l1", "l2", "l3", "l4"), ~ dfs_list$profile_data[[.x]])

clam_growth_data <- load_xlsx_sheet(dfs_list, "ksf_clam_growth", "sheet1")

oyster_growth_data <- load_xlsx_sheet(dfs_list, "ksf_oyster_cylinder_growth", "sheet1")

weather_data <- load_xlsx_sheet(dfs_list, "weather_data", "weather_ksf")

ksf_data <- load_xlsx_sheet(dfs_list, "ksf_compiled_data", "full_data")

Data Tidying

Steps to clean data

water_samples_data_tidied <- water_samples_data %>%
  slice(-c(44:52)) %>% 
  rename_with(~gsub("\\s*\\([^\\)]+\\)", "", .x)) %>%
  janitor::clean_names() %>%
  mutate(
    date = if_else(date == "44074",
            as.character(as.Date("2024-01-09")),
            format(dmy(date), "%Y-%m-%d"))
  ) %>%
  mutate(sample_id = 1:nrow(.)) %>%
  mutate(date = as.Date(date),
         across(c(nomilo_id, location, round, depth), as.factor)) %>%
  select(-c(sample_id, nomilo_id, tube_name)) %>% 
  mutate(location = str_to_title(location), 
         location = case_when(
           location == "Auwei" ~ "Auwai",
           TRUE ~ as.character(location)
         ),
         location = factor(location, 
                           levels = c("Mid Buoy", "Back Buoy", "Auwai", "Production Dock"))
  ) %>% 
  filter(!is.na(location))

paged_table(water_samples_data_tidied)

Steps to clean data

new_profile_var_names <- c("depth", "water_temperature", "dissolved_oxygen", "salinity", "conductivity", "visibility", "location", "date")

water_profiles_data_tidied <- water_profiles_data %>%
  select(-c(6, 8)) %>%
  mutate(
    temp_column1 = NA_character_,
    temp_column2 = NA_character_
  ) %>%
  setNames(new_profile_var_names) %>%
  mutate(
    location = ifelse(depth == "Location", water_temperature, NA_character_), 
    date = ifelse(depth == "Date",  water_temperature, NA_character_)
  ) %>%
  fill(location, date, .direction = "down") %>%
  filter(depth != "Location", depth != "Date") %>%
  mutate(
    location = case_when(
      location == "L1 Northwest buoy" ~ "back buoy",
      location == "L2 Middle Buoy" ~ "mid buoy",
      location == "L3 Production Dock" ~ "production dock",
      location == "L4 Auwai" ~ "Auwai",
      TRUE ~ location
    ),
    date = case_when(
      date %in% c("45258", "2023-11-28") ~ "2023-11-28",
      date %in% c("45282", "2023-12-21") ~ "2023-12-21",
      date %in% c("45536", "2024-01-09") ~ "2024-01-09",
      date %in% c("30/1/24", "30/01/24") ~ "2024-01-30",
      date %in% c("20/02/24", "20/2/24") ~ "2024-02-20",
      TRUE ~ date
    )) %>%
  mutate(
    date = as.Date(date, format = "%Y-%m-%d"),
     conductivity = case_when(
      row_number() %in% c(1:11, 53:62, 128:133, 159:161) ~ NA_character_,
      TRUE ~ as.character(conductivity)
    )
  ) %>%
  filter(!(depth %in% c("Samples", "Depth"))) %>%
  mutate(date = as.Date(date),
         across(c(depth, location), as.factor),
         across(c(water_temperature,  dissolved_oxygen, salinity, 
                  conductivity,visibility),  as.numeric)) %>%
   fill(visibility, .direction = "down") %>%
  mutate(visibility = if_else(date == "2023-11-28",  NA_real_, visibility)) %>%
  mutate(location = str_to_title(location)) %>%
  mutate(location = factor(location, 
                           levels = c("Mid Buoy", "Back Buoy", "Auwai", "Production Dock"))) %>% 
  mutate(conductivity = coalesce(conductivity, salinity)) %>% 
  rename("salinity1" = salinity,
         "salinity" = conductivity) %>% 
  select(-salinity1)

paged_table(water_profiles_data_tidied)

Steps to clean data

new_clam_var_names <- c(
  "sort_date", "color", "clams_in_count", "clams_in_lbs",  "clams_in_avg_per_lb",
  "clams_out_count", "clams_out_lbs", "clams_out_avg_per_lb", "growth_in_lbs", 
  "growth_pct", "sr", "days_btwn_sort"
  )

new_clam_date_col <- c(
  "2023-10-17", "2023-12-06", "2023-12-12", "2024-01-02",  "2024-01-10",
  "2024-01-24", "2024-01-31", "2024-02-08", "2024-02-13", "2024-02-07",
  "2024-02-13", "2024-03-14", "2024-03-19"
  )

clam_growth_data_tidied <- clam_growth_data %>%
  slice(-1) %>%
  setNames(new_clam_var_names) %>%
  mutate(date = as.Date(new_clam_date_col)) %>%
  dplyr::select(-sort_date) %>%
   pivot_longer(
    cols = c(
      clams_in_count, clams_in_lbs, clams_in_avg_per_lb,   clams_out_count, 
      clams_out_lbs, clams_out_avg_per_lb
      ),
    names_to = c("stage", ".value"),
    names_prefix = "clams_",
    names_sep = "_",
    values_to = "value"
  ) %>%
  mutate(stage = if_else(str_detect(stage, "in"), "In", "Out")) %>%
  rename(avg_per_lbs = avg) %>%
  mutate(across(c(color, stage), as.factor)) %>%
  mutate(across(c(count, lbs, avg_per_lbs, growth_in_lbs, growth_pct, sr),
                ~as.numeric(gsub("%", "", .)))) %>%
  arrange(date, color, stage) %>%
  dplyr::select(date, days_btwn_sort, color, stage, count, lbs, avg_per_lbs,
                growth_in_lbs, growth_pct, sr) %>%
  rename("days_btwn_clams_sort" = days_btwn_sort,
         "clams_color" = color,
         "clams_stage" = stage,
         "clams_count" = count,
         "weight" = lbs,
         "avg_weight" = avg_per_lbs,
         "clams_growth" = growth_in_lbs,
         "clams_sr" = sr, 
          "date_out" = date
         ) %>% 
    mutate(date_in = date_out - days_btwn_clams_sort) %>%
relocate(c(date_in, date_out), .before = days_btwn_clams_sort) %>% 
  mutate(
    date = case_when(
      clams_stage == 'In' ~ date_in,
      clams_stage == 'Out' ~ date_out
    )
  ) %>%
  select(-date_in, -date_out) %>% 
   mutate(grouping_variable = rep(1:(n() / 2), each = 2))


paged_table(clam_growth_data_tidied)

Steps to clean data

oyster_var_names <- c(
  "date", "oyster_large_weight", "oyster_large_gain", "oyster_small_weight",
  "oyster_small_gain", "oyster_chlorophyll"
  )

oyster_growth_data_tidied <- oyster_growth_data %>% 
  dplyr::select(c(1, 4, 5, 8, 9, 12)) %>%
  slice(-1) %>%
  setNames(oyster_var_names) %>%
  pivot_longer(
    cols = c(oyster_large_weight, oyster_large_gain,
             oyster_small_gain,
             oyster_small_weight),
    names_to = c("oyster_size", ".value"),
    names_prefix = "oyster_",
    names_sep = "_",
    values_to = "value"
  ) %>%
  mutate(oyster_size = if_else(str_detect(oyster_size, "small"), "Small", "Large")) %>%
  mutate(date = as.Date(date),
         oyster_size = as.factor(oyster_size),
         across(c(weight, gain), as.numeric)
        ) %>%
  filter(date >= as.Date("2023-11-20") & date <=
           as.Date("2024-02-20")) %>%
  mutate(weight = weight * 0.00220462) %>% 
  rename("growth_pct" = gain) %>%
  select(-growth_pct) %>% 
  filter(!is.na(weight) & oyster_size != "Large")

paged_table(oyster_growth_data_tidied)

Steps to clean data

weather_data_tidied <- weather_data %>% 

janitor::clean_names() %>%
   unite(date, year, month, day, sep = "-") %>%
  mutate(date = ymd(date)) %>%
   select(-c(1, 3)) %>%
  rename("outdoor_temperature" = outdoor_temp_f) %>%
   mutate(outdoor_temperature = (outdoor_temperature - 32) * (5/9)) %>%
  group_by(date) %>%
  summarise(across(where(is.numeric), \(x) mean(x, na.rm = TRUE))) %>%
  slice(-1)

paged_table(weather_data_tidied)

Steps to clean data

ksf_data_tidied <- ksf_data %>% 
  rename_with(~gsub("\\s*\\([^\\)]+\\)", "", .x)) %>%
  janitor::clean_names() %>%
  rename(date = date_time) %>%
  mutate(date = as.Date(date)) %>%
  filter(date >= as.Date("2023-11-20") & date <= as.Date("2024-02-20")) %>%
  arrange(date) %>%
  dplyr::select(-c(external_voltage, wk_num, wind_dir,
                   spadd, outdoor_temperature, hourly_rain,
                   solar_radiation, resistivity, battery_capacity,
                   hour, daynum, data_pt, wind_sp, diradd,
                   wind_speed, wind_direction, tide, day, month, year)
                ) %>%
  dplyr::select(where(~ !anyNA(.))) %>%
  group_by(date) %>%
  summarise(across(where(is.numeric), \(x) mean(x, na.rm = TRUE))) %>%
  rename("ksf_salinity" = salinity,
         "ksf_rdo_saturation" = rdo_saturation,
         "ksf_rdo_concentration" = rdo_concentration,
         "ksf_actual_conductivity" = actual_conductivity,
         "ksf_total_dissolved_solids" = total_dissolved_solids,
         "ksf_ammonium" = ammonium,
         "ksf_barometric_pressure" = barometric_pressure,
         "ksf_oxygen_partial_pressure" = oxygen_partial_pressure,
         "ksf_specific_conductivity" = specific_conductivity,
         "ksf_density" = density,
         "ksf_chlorophyll_a_fluorescence" = chlorophyll_a_fluorescence,
         "ksf_ammonium_m_v" = ammonium_m_v)

paged_table(ksf_data_tidied)

Data Exportation

Efficiently export tidied datasets to data/tidied folder

dfs_to_export <- list( 
  water_samples_data_tidied = water_samples_data_tidied,
  water_profiles_data_tidied = water_profiles_data_tidied,
  clam_growth_data_tidied = clam_growth_data_tidied,
  oyster_growth_data_tidied = oyster_growth_data_tidied,
  weather_data_tidied = weather_data_tidied,
  ksf_data_tidied = ksf_data_tidied
) 

imap(dfs_to_export, ~ export_to_csv(.x, .y, "data/tidied"))